dca_interface  6.3.4
url_samples/urldbsample_remote/main.cpp
1 /* IBM Source Code */
2 /* (C) Copyright IBM Corp. 2009, 2012 */
3 /* Licensed Materials - Property of IBM */
4 /* US Government Users Restricted Rights - Use duplication or disclosure restricted by GSA Schedule Contract with IBM Corp. */
5 
67 #include <string>
68 #include <vector>
69 #include <iostream>
70 #include <fstream>
71 #include <cstdlib>
72 #include <ctime>
73 
74 #ifdef WIN32
75 # include <winsock2.h>
76 #endif
77 
78 #include "dca/dca_base.h"
80 #include "dca/dca_callbacks.h"
81 
82 using namespace dca;
83 
84 const std::string S_ToolName = "urldbsample_remote";
85 const std::string S_ToolVersion = "1.3";
86 
87 std::string G_Locale = "en_US"; // Default locale
88 LogLevel G_LogLevel = LOG_Notice; // Default log level
89 
/**
 * Usage string, displayed if a parameter is missing.
 *
 * The first two literals form the synopsis; every following line describes
 * one argument, using the same name as in the synopsis. The encryption data
 * may be given as plain ASCII text or as a hex string prefixed with "0x"
 * (main() passes it through HexToString()).
 */
const std::string S_UsageString =
"<dca-redist-folder> <ticket> <product> <encryption-data> <encryption-key> "
"<url-list-file> [<locale>] [<log-level>]\n"
 " dca-redist-folder - the folder where the DCA is installed to\n"
 " ticket - a valid ticket\n"
 " product - the product associated with your ticket\n"
 " encryption-data - the encryption data included in your license, either "
"as plain ASCII text or as hex string prefixed with 0x\n"
 " encryption-key - the encryption key included in your license\n"
 " url-list-file - file that includes the URLs to classify\n"
 " locale - optional locale for the categories names, default = en_US\n"
 " log-level - optional log-level, default = 3 (LOG_Notice)\n\n"
 ;
107 
// DCA subdirectory of the DCA binaries; it is appended to the redist folder
// and depends on the platform the sample is built for.
#if defined( WIN32 )
# define DCA_BINDIR "bin/Win32"
#else
# define DCA_BINDIR "bin/linux"
#endif

// DCA subdirectory of the DCA initialization data (appended to the redist
// folder).
#define DCA_INITDIR "init"

// Relative directory for logfile(s).
#define DCA_LOGDIR "./logs"
128 
139 static void SetupInitData( const std::string& redist_folder,
140  InitData& initData )
141 {
142  initData.binDir = redist_folder + DCA_BINDIR;
143  initData.initDir = redist_folder + DCA_INITDIR;
144  initData.logDir = DCA_LOGDIR;
145 }
146 
158 static bool StartupLibraries()
159 {
160 #ifdef WIN32
161  // Windows needs an extra socket-startup for this process to work
162  // correctly with e.g. IP(v6) input IP addresses
163  WORD wVersionRequested = MAKEWORD( 2, 2 );
164  WSADATA wsaData;
165  int err = WSAStartup( wVersionRequested, &wsaData );
166  if ( err != 0 ) {
167  std::cout << "Error on WSAStartup (" << err << ") occured, aborting" <<
168  std::endl;
169  return false;
170  }
171 #endif
172 
173  // init the 3rd party libraries
174  InitCUrl();
176  return true;
177 }
178 
185 static void ShutdownLibraries()
186 {
187  // deinit the 3rd party libraries
189  DeinitCUrl();
190 
191 #ifdef WIN32
192  // Cleanup Windows sockets for this process
193  WSACleanup();
194 #endif
195 }
196 
197 
206 static void SetupLicense( const std::string& ticket, const std::string& product,
207  LicenseData& licenseData )
208 {
209  licenseData.ticket = ticket;
210  licenseData.product = product;
211 }
212 
222 static bool SetupConnectionData( const std::string& encData,
223  const std::string& encKey, DbConnectionData& cData )
224 {
225  const int iEncKey = atoi( encKey.c_str() );
226 
227  if( iEncKey <= 0 )
228  return false;
229 
230  cData.useLocalDatabase = false;
231  cData.dbType = DBT_Url;
232  cData.remoteServerData.encryptionData = encData;
233  cData.remoteServerData.encryptionKey = iEncKey;
234 
235  return true;
236 }
237 
246 static void PrintResults( const CategoriesInfo& catinfos,
247  const UrlClassificationResults& cats )
248 {
249  const DCA_SIZE_TYPE numOfCats = cats.size();
250 
251  if( numOfCats == 0 ) {
252  std::cout << " No categories found." << std::endl;
253  return;
254  }
255 
256  const Categories myCategories = catinfos.getCategories();
257 
258  for( DCA_INDEX_TYPE i = 0; i < numOfCats; ++i ) {
259  const UrlClassificationResult result = cats[ i ];
260  const Category myCategory = myCategories.byId( result );
261  const std::string catname = myCategory.name( G_Locale );
262 
263  if( myCategory != NullCategory ) {
264  std::cout << " " << (i+1) << ".\t Category '" << catname <<
265  "' (id=" << myCategory.id() <<
266  ", groupid=" << myCategory.groupId() <<
267  ")" << std::endl;
268  }
269  }
270 }
271 
276 static void PrintToolHeader()
277 {
278  std::cout << "IBM DCA Sample: " << S_ToolName << " (" << S_ToolVersion
279  << ")" << std::endl;
280 }
281 
286 static void PrintUsage()
287 {
288  std::cout << " usage:" << std::endl;
289  std::cout << S_UsageString << std::endl;
290 }
291 
298 static void PrintDbConnectionInfo( const DbConnection& aDbConnection )
299 {
300  DatabaseInformation databaseInformation =
301  aDbConnection.getDatabaseInformation();
302 
303  std::cout << "URL Database Version: " << databaseInformation.versionString
304  << " as of " << databaseInformation.creationDateUTC << std::endl;
305 }
306 
312 static void PrintLicenseInfo( const License& aLicense )
313 {
314  const time_t expirationDate = aLicense.getExpirationDate();
315  struct tm *expirationTime = localtime( &expirationDate );
316 
317  std::cout << "License Info:" << std::endl;
318  std::cout << " DCA is " << ( aLicense.isLicensed() ? "licensed." :
319  "not licensed." ) << std::endl;
320  std::cout << " MaxUsers:" << aLicense.getMaxUsers() <<
321  std::endl;
322  std::cout << " MaxSessions:" << aLicense.getMaxSessions() <<
323  std::endl;
324  std::cout << " Ticket:" << aLicense.getTicket() <<
325  std::endl;
326  std::cout << " Session:" << aLicense.getSession() <<
327  std::endl;
328  std::cout << " Last Message:" << aLicense.getLastMessage() <<
329  std::endl;
330  std::cout << " Expiration Date:" << asctime( expirationTime ) <<
331  std::endl;
332 }
333 
/**
 * Reads the URLs to classify from a text file, one URL per line.
 *
 * A trailing carriage return (Windows line ending) is stripped from each
 * line and empty lines are skipped. Every remaining line is appended to
 * urlList unchanged.
 *
 * @param fileName  path of the URL list file.
 * @param urlList   receives the URLs; existing entries are kept.
 *
 * If the file cannot be opened a message is written to std::cerr and
 * urlList is left as it was.
 */
static void LoadUrlFile( const std::string& fileName,
    std::vector<std::string>& urlList )
{
    std::ifstream urlFile( fileName.c_str(), std::ios::in );
    if ( !urlFile.is_open() ) {
        std::cerr << "Could not open URL list file '" << fileName << "'."
                  << std::endl;
        return;
    }

    std::string line;

    while ( std::getline( urlFile, line ) )
    {
        // getline() only removes '\n'; drop the '\r' of a "\r\n" line ending
        if( !line.empty() && line[ line.length() - 1 ] == '\r' )
            line.erase( line.length() - 1 );

        if( !line.empty() )
            urlList.push_back( line );
    }
}
358 
378 void TestUrlClassification( const std::string& aUrlListFile,
379  const DcaInstance& myDca, const UrlDbClassifier& myUrlDbClassifier,
380  const CategoriesInfo& myCategoriesInfo )
381 {
382  size_t urlsRequested = 0;
383  size_t unknownUrls = 0;
384  size_t uncategerizedUrls = 0;
385  size_t categoriesFound = 0;
386 
387  std::cout << "Entering URL db classification routine..." << std::endl;
388 
389  UrlClassificationResults myUrlClassificationResults;
390 
391  std::vector<std::string> myUrlList;
392  LoadUrlFile( aUrlListFile, myUrlList );
393 
394  for( std::vector<std::string>::const_iterator U = myUrlList.begin(),
395  UEnd = myUrlList.end(); U != UEnd; ++U ) {
396 
397  const std::string& myUrlString = *U;
398 
399  std::cout << " Starting URL db classification for URL '" <<
400  myUrlString << "'" << std::endl;
401 
402  // setup a URL for given string
403  const Url myUrl = Url::create( myDca, myUrlString );
404 
405  ++urlsRequested;
406 
407  // start the classification
408  FunctionResult myFR =
409  myUrlDbClassifier.classify( myUrl, myUrlClassificationResults );
410  if( !myFR ) {
411  // error occured.
412  std::cerr << " Error from URL db classification. Details: " <<
413  myFR.getDescription() << " (" << myFR.getReturnCode() <<
414  "). Continuing with next URL." << std::endl;
415  continue;
416  }
417 
418  if( myUrlClassificationResults.isUnknownUrl() ) {
419  // the URL is not known in the database
420  std::cout << " Results: URL '" << myUrlString <<
421  "' is not known in the database. " << std::endl;
422  std::cout << " Continuing with next URL." << std::endl;
423  ++unknownUrls;
424  continue;
425  }
426 
427  if( !myUrlClassificationResults.isCategorized() ) {
428  // the URL is known but does not contains any category. This is
429  // either a white-host or includes only categorized sub-folders.
430  std::cout << " Results: URL '" << myUrlString <<
431  "' is not categorized (but known in database). " << std::endl;
432  std::cout << " Continuing with next URL" << std::endl;
433  ++uncategerizedUrls;
434  continue;
435  }
436 
437  // given URL is known and returned matched categories
438  std::cout << " Classification Results for URL '" << myUrlString <<
439  "'" << std::endl;
440  PrintResults( myCategoriesInfo, myUrlClassificationResults );
441  categoriesFound += myUrlClassificationResults.size();
442 
443  }
444 
445  std::cout << " Total Results: " << std::endl;
446  std::cout << " URLs requested:\t\t" << urlsRequested << std::endl;
447  std::cout << " URLs unknown:\t\t" << unknownUrls << std::endl;
448  std::cout << " URLs not categorized:\t" << uncategerizedUrls <<
449  std::endl;
450  std::cout << " URLs categorized:\t\t" <<
451  ( urlsRequested - unknownUrls - uncategerizedUrls ) << std::endl;
452  std::cout << " Categories found for URLs:\t" << categoriesFound <<
453  std::endl;
454 
455  std::cout << "Leaving URL db classification routine." << std::endl;
456 }
457 
/**
 * Returns the value (0..15) of a single hexadecimal digit, or -1 if the
 * character is not a hexadecimal digit.
 */
static int HexDigitValue( char c )
{
    if( c >= '0' && c <= '9' )
        return c - '0';
    if( c >= 'a' && c <= 'f' )
        return c - 'a' + 10;
    if( c >= 'A' && c <= 'F' )
        return c - 'A' + 10;
    return -1;
}

/**
 * Converts a hex string of the form "0xAB02FF..." into the bytes it encodes.
 *
 * @param arg  the text to convert.
 * @return the decoded bytes (which may contain '\0'), or arg unchanged if it
 *         is not a hex string: it does not start with "0x", has an odd
 *         length, or contains a character that is not a hexadecimal digit.
 */
std::string HexToString( const std::string& arg )
{
    if( (arg.length() % 2) != 0 || arg.compare( 0, 2, "0x" ) != 0 )
        return arg; // not a hex string

    std::string result;
    result.reserve( ( arg.length() - 2 ) / 2 );

    // two hex digits per output byte, starting behind the "0x" prefix
    for( std::string::size_type i = 2; i < arg.length(); i += 2 ) {
        const int highNibble = HexDigitValue( arg[ i ] );
        const int lowNibble = HexDigitValue( arg[ i + 1 ] );
        if( highNibble < 0 || lowNibble < 0 )
            return arg; // malformed digits: treat the input as plain text

        result += static_cast<char>(
            static_cast<unsigned char>( ( highNibble << 4 ) | lowNibble ) );
    }
    return result;
}
484 
493 int main( int argc, char *argv[] )
494 {
495  PrintToolHeader();
496 
497  int rc = 5;
498 
499  try {
500  if( argc < 7 ) {
501  PrintUsage();
502  return 5;
503  }
504 
505  std::string myRedistFolder = argv[ 1 ];
506  const std::string myTicket = argv[ 2 ];
507  const std::string myProduct = argv[ 3 ];
508 
509  // encryption data may be entered in hex format 0xAB02FF....
510  // or as a common string if only ASCII characters are used
511  const std::string myEncData = HexToString( argv[ 4 ] );
512 
513  const std::string myEncKey = argv[ 5 ];
514  const std::string myUrlList = argv[ 6 ];
515 
516  if (argc > 7)
517  {
518  G_Locale = argv[7];
519 
520  if (argc > 8)
521  {
522  G_LogLevel = static_cast<LogLevel>( atoi( argv[8] ) );
523  }
524  }
525 
526  if( myRedistFolder.empty() || myTicket.empty() ||
527  myProduct.empty() || myUrlList.empty() ||
528  myEncData.empty() || myEncKey.empty() ) {
529  PrintUsage();
530  return 5;
531  }
532 
533  // check for trailing fileslash - and add if necessary
534  const char c = myRedistFolder[ myRedistFolder.length() - 1 ];
535  if( c != '/' && c != '\\' )
536  myRedistFolder += "/";
537 
538  // Initialize socket on Windows and 3rd party libraries
539  if( !StartupLibraries() )
540  return 5;
541 
542  {
543  // setup DCA directories
544  InitData myInitData;
545  SetupInitData( myRedistFolder, myInitData );
546 
547  // instantiate DCA API
548  DcaInstance myDca;
549  myDca = DcaInstance::create( myInitData );
550 
551  // setup license data
552  LicenseData myLicenseData;
553  SetupLicense( myTicket, myProduct, myLicenseData );
554  const License myLicense = myDca.createLicense( myLicenseData,
555  ProxySettings(), G_LogLevel );
556 
557  PrintLicenseInfo( myLicense );
558 
559  if( myLicense.isLicensed( UrlClassification::ID ) ) {
560  // setup a signature database connection
561  DbConnectionData myDbConnectionData;
562  if ( !SetupConnectionData( myEncData, myEncKey, myDbConnectionData ) ) {
563  std::cout << "Could not setup remote connection because of wrong encryption data!" << std::endl;
564  rc = 5;
565  }
566  else {
567  const DbConnection myDbConnection =
568  myDca.createDbConnection( myLicense, myDbConnectionData,
569  ProxySettings(), G_LogLevel );
570  PrintDbConnectionInfo( myDbConnection );
571 
572  // initialize the URL classification module and create a URL
573  // db classifier
574  // (statistics and unknown url upload disabled)
575  UrlClassification myUrlClassification =
576  UrlClassification::create( myDca, myLicense );
577  myUrlClassification.setLogLevel( G_LogLevel );
578 
579  UrlDbClassifierOptions myUrlDbClassifierOptions;
580  myUrlDbClassifierOptions.enable_EmbeddedUrlDetection = true;
581  myUrlDbClassifierOptions.detect_EmbeddedUrlsInUrlPath = true;
582  myUrlDbClassifierOptions.enable_Feedback = false;
583 
584  UrlDbClassifier myUrlDbClassifier =
585  myUrlClassification.createDbClassifier( myDbConnection, myUrlDbClassifierOptions );
586  myUrlDbClassifier.setLogLevel( G_LogLevel );
587 
588  // create a categories info for printing out the category names
589  // together with the results per URL
590  const CategoriesInfo myCategoriesInfo =
592 
593  // call URL Classification routine
594  TestUrlClassification( myUrlList, myDca, myUrlDbClassifier,
595  myCategoriesInfo );
596 
597  rc = 0;
598  }
599  }
600  }
601  }
602  catch( const ExDca& ex ) {
603  std::cerr << "DCA Exception occured. Details: " << ex.getDescription()
604  << " (" << ex.getReturnCode() << ")." << std::endl;
605  rc = 10;
606  }
607  catch( const std::exception& s ) {
608  std::cerr << "std::exception occured. Details: " << s.what() << "." <<
609  std::endl;
610  rc = 10;
611  }
612  catch(...) {
613  std::cerr << "Unknown exception caught." << std::endl;
614  rc = 10;
615  }
616 
617  // deinit the 3rd party libraries
618  ShutdownLibraries();
619 
620  return rc;
621 }
static DCA_MODULE_ID_TYPE ID
The unique ID of the URL classification module.
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Definition: base_classes.h:547
Exception class used in the DCA.
Definition: base_classes.h:237
bool isCategorized() const
Returns whether or not the URL matched one or more categories.
LogLevel setLogLevel(LogLevel newLevel)
Sets the logging level for the given class instance. The old value will be returned.
static Url create(const DcaInstance &aDcaInstance, const std::string &urlString)
Standard Url creation function.
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_URL
Refers to the internal categories info for URL classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
Definition: base_classes.h:266
int getMaxSessions() const
Returns the maximum allowed sessions associated with your ticket/license.
time_t getExpirationDate() const
Returns the expiration date of the license in UTC.
std::string getDescription() const
Returns a description of the error.
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
DatabaseInformation getDatabaseInformation() const
Returns information about the underlying database.
Contains information about underlying database.
Definition: base_classes.h:834
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
FunctionResult classify(const Url &aUrl, UrlClassificationResults &urlResults) const
Performs the URL classification and returns the results.
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by substituting the given redist_folder with DCA subdirectories.
std::string creationDateUTC
Definition: base_classes.h:842
LogLevel setLogLevel(LogLevel newLevel)
Sets the logging level for the given class instance. The old value will be returned.
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
Definition: base_classes.h:265
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
unsigned int encryptionKey
The encryption key to be used (provided with your license)
Definition: base_classes.h:779
A container class that allows access to the contained Categories, Groups and Locales.
Sets up options for embedded URL detection and provided Feedback mechanism.
@ LOG_Notice
Write notices / important information to the log file.
Definition: base_classes.h:217
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
Stores the connection data for a database.
Definition: base_classes.h:815
void UnsetOpenSslCallbacks()
Unsets the openssl callbacks. Do not call any DCA function after you have called this function.
DCA_CATEGORY_ID_TYPE id() const
The category id.
int getMaxUsers() const
Returns the maximum allowed users associated with your ticket/license.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
bool isUnknownUrl() const
Returns whether a URL is known or unknown. A URL is unknown if it is not contained in the database.
Database connection class for a local or remote database.
Definition: base_classes.h:859
This header includes all header files of the URL Classification Package.
#define DCA_LOGDIR
Relative directory for logfile(s).
Main class for the URL classification.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
bool enable_Feedback
This switches the Feedback feature on or off. This is switched off by default.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
DCA_CATEGORY_ID_TYPE UrlClassificationResult
The item of an URL classification result is typedef'd as DCA_CATEGORY_ID_TYPE.
Results of an URL classification.
bool detect_EmbeddedUrlsInUrlPath
If set to true, you can specify that embedded URL detection is additionally performed in the path par...
URL database classifier class.
static void PrintToolHeader()
Prints out the name and the version of this sample.
LogLevel
This enum is used in all setLogLevel() functions to change the verbosity level of the classes.
Definition: base_classes.h:212
Use a License to initialize a classification package or a toolbox package.
Definition: base_classes.h:560
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
If you are using one or more proxy servers set up this structure and use it for e....
Definition: base_classes.h:275
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
static void PrintLicenseInfo(const License &aLicense)
Prints out the information about the provided License.
std::string ticket
The ticket as provided in the license.
Definition: base_classes.h:548
This header includes all header files of the DCA Base Package.
std::string versionString
Definition: base_classes.h:840
bool useLocalDatabase
Set to true to connect to a local or custom database, set to false to use a remote database.
Definition: base_classes.h:821
Encapsulates the init and deinit of the DCA API.
Definition: base_classes.h:315
Category NullCategory
Defines a constant unassigned Category you can use for checks. if( myCat == NullCategory ) --> myCat ...
std::string product
The product code used with the license.
Definition: base_classes.h:549
const DbType DBT_Url
Used for DbConnection classes for URL classification.
static UrlClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Creates the URL classification module by using the given DcaInstance and License.
std::string logDir
the directory in which the DCA log file should be created
Definition: base_classes.h:267
static void PrintUsage()
Prints out the syntax of the sample.
std::string getLastMessage() const
Returns the last message received from our license server or if none available the last available mes...
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Definition: base_types.h:66
DbType dbType
The type of the database.
Definition: base_classes.h:820
Categories getCategories() const
Returns the contained Categories.
std::string getDescription() const
Returns the description for the error or warning.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
Definition: base_types.h:72
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
DbConnectionRemoteServerData remoteServerData
If you are creating a remote database connection, this structure must be filled out with the encrypti...
Definition: base_classes.h:824
std::string getTicket() const
Returns the ticket of the license as string.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
bool enable_EmbeddedUrlDetection
If set to true embedded URL detection will be used in URL classification. This is switched on by defa...
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition: base_classes.h:148
std::string getSession() const
Returns the session of the license as string.
Encapsulates a URL object.
Definition: base_url.h:44
UrlDbClassifier createDbClassifier(const DbConnection &aDbConnection, const UrlDbClassifierOptions &options=UrlDbClassifierOptions()) const
Create a URL database classifier. The classifier is created by using the provided database connection...
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
DbConnection createDbConnection(const License &aLicense, const DbConnectionData &dbcData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a DbConnection object using the given DbConnectionData.
This structure is used to initialize the DcaInstance.
Definition: base_classes.h:264
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Definition: base_category.h:26
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
std::string encryptionData
The encryption data to be used (provided with your license)
Definition: base_classes.h:778
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
int main(int argc, char *argv[])
The main routine.