dca_interface  6.3.4
url_samples/urldbsample/main.cpp
1 /* IBM Source Code */
2 /* (C) Copyright IBM Corp. 2009, 2012 */
3 /* Licensed Materials - Property of IBM */
4 /* US Government Users Restricted Rights - Use duplication or disclosure restricted by GSA Schedule Contract with IBM Corp. */
5 
40 #include <string>
41 #include <vector>
42 #include <iostream>
43 #include <fstream>
44 #include <ctime>
45 
46 #ifdef WIN32
47 # include <winsock2.h>
48 #endif
49 
50 #include "dca/dca_base.h"
52 #include "dca/dca_callbacks.h"
53 
54 using namespace dca;
55 
/// Name of this sample tool; printed by PrintToolHeader().
const std::string S_ToolName( "urldbsample" );

/// Version of this sample tool; printed by PrintToolHeader().
const std::string S_ToolVersion( "1.3" );

/// Usage string, displayed if a parameter is missing.
const std::string S_UsageString(
    "<dca-redist-folder> <ticket> <product> <url-list-file>\n"
    " dca-redist-folder - the folder where the DCA is installed to\n"
    " ticket - a valid ticket\n"
    " product - the product associated with your ticket\n"
    " url-list-file - file that includes the URLs to classify\n\n"
);
70 
/// DCA subdirectory of the DCA binaries (differs per platform).
#if defined( WIN32 )
#  define DCA_BINDIR "bin/Win32"
#else
#  define DCA_BINDIR "bin/linux"
#endif

/// DCA subdirectory of the DCA initialization data.
#define DCA_INITDIR "init"

/// Relative directory for logfile(s).
#define DCA_LOGDIR "./logs"
91 
100 static void SetupInitData( const std::string& redist_folder, InitData& initData )
101 {
102  initData.binDir = redist_folder + DCA_BINDIR;
103  initData.initDir = redist_folder + DCA_INITDIR;
104  initData.logDir = DCA_LOGDIR;
105 }
106 
118 static bool StartupLibraries()
119 {
120 #ifdef WIN32
121  // Windows needs an extra socket-startup for this process to work
122  // correctly with e.g. IP(v6) input IP addresses
123  WORD wVersionRequested = MAKEWORD( 2, 2 );
124  WSADATA wsaData;
125  int err = WSAStartup( wVersionRequested, &wsaData );
126  if ( err != 0 ) {
127  std::cout << "Error on WSAStartup (" << err << ") occured, aborting" <<
128  std::endl;
129  return false;
130  }
131 #endif
132 
133  // init the 3rd party libraries
134  InitCUrl();
136  return true;
137 }
138 
145 static void ShutdownLibraries()
146 {
147  // deinit the 3rd party libraries
149  DeinitCUrl();
150 
151 #ifdef WIN32
152  // Cleanup Windows sockets for this process
153  WSACleanup();
154 #endif
155 }
156 
164 static void SetupLicense( const std::string& ticket, const std::string& product,
165  LicenseData& licenseData )
166 {
167  licenseData.ticket = ticket;
168  licenseData.product = product;
169 }
170 
178 static void SetupConnectionData( DbConnectionData& cData )
179 {
180  cData.useLocalDatabase = true;
181  cData.dbType = DBT_Url;
182 }
183 
189 static void PrintLicenseInfo( const License& aLicense )
190 {
191  const time_t expirationDate = aLicense.getExpirationDate();
192  struct tm *expirationTime = localtime( &expirationDate );
193 
194  std::cout << "License Info:" << std::endl;
195  std::cout << " DCA is " << ( aLicense.isLicensed() ? "licensed." :
196  "not licensed." ) << std::endl;
197  std::cout << " MaxUsers:" << aLicense.getMaxUsers() <<
198  std::endl;
199  std::cout << " MaxSessions:" << aLicense.getMaxSessions() <<
200  std::endl;
201  std::cout << " Ticket:" << aLicense.getTicket() <<
202  std::endl;
203  std::cout << " Session:" << aLicense.getSession() <<
204  std::endl;
205  std::cout << " Last Message:" << aLicense.getLastMessage() <<
206  std::endl;
207  std::cout << " Expiration Date:" << asctime( expirationTime ) <<
208  std::endl;
209 }
210 
217 static void PrintDbConnectionInfo( const DbConnection& aDbConnection )
218 {
219  DatabaseInformation databaseInformation =
220  aDbConnection.getDatabaseInformation();
221 
222  std::cout << "URL Database Version: " << databaseInformation.versionString
223  << " as of " << databaseInformation.creationDateUTC << std::endl;
224 }
225 
226 
234 static void PrintResults( const CategoriesInfo& catinfos, const UrlClassificationResults& cats )
235 {
236  const DCA_SIZE_TYPE numOfCats = cats.size();
237 
238  if( numOfCats == 0 ) {
239  std::cout << "No categories found." << std::endl;
240  return;
241  }
242 
243  const Categories myCategories = catinfos.getCategories();
244 
245  for( DCA_INDEX_TYPE i = 0; i < numOfCats; ++i ) {
246  const UrlClassificationResult result = cats[ i ];
247  const Category myCategory = myCategories.byId( result );
248  const std::string catname = myCategory.name( "en_US" );
249 
250  if( myCategory != NullCategory ) {
251  std::cout << (i+1) << ".\t Category '" << catname <<
252  "' (id=" << myCategory.id() <<
253  ", groupid=" << myCategory.groupId() <<
254  ")" << std::endl;
255  }
256  }
257 }
258 
263 static void PrintToolHeader()
264 {
265  std::cout << "IBM DCA Sample: " << S_ToolName << " (" << S_ToolVersion << ")" << std::endl;
266 }
267 
273 static void PrintUsage( const char *name )
274 {
275  std::cout << name << " usage:" << std::endl;
276  std::cout << S_UsageString << std::endl;
277 }
278 
/**
 * Reads the URLs to classify from a text file, one URL per line.
 *
 * A trailing carriage return (CRLF line ends) is removed from every line and
 * empty lines are skipped. The URLs are appended to urlList; entries already
 * in the list are kept. If the file cannot be opened, a message is written to
 * std::cerr and urlList is left untouched.
 *
 * @param fileName  path of the URL list file
 * @param urlList   receives the URLs found in the file
 */
static void LoadUrlFile( const std::string& fileName, std::vector<std::string>& urlList )
{
    std::ifstream input( fileName.c_str(), std::ios::in );
    if( !input.is_open() ) {
        // tell the user why nothing gets classified instead of failing silently
        std::cerr << "Could not open URL list file '" << fileName << "'." << std::endl;
        return;
    }

    std::string line;

    while( std::getline( input, line ) ) {
        // files with Windows line ends leave a '\r' at the end of each line
        if( !line.empty() && line[ line.length() - 1 ] == '\r' )
            line.erase( line.length() - 1 );

        if( !line.empty() )
            urlList.push_back( line );
    }
}
301 
320 void TestUrlClassification( const std::string& aUrlListFile, const DcaInstance& myDca,
321  const UrlDbClassifier& myUrlDbClassifier, const CategoriesInfo& myCategoriesInfo)
322 {
323  std::cout << "Starting URL db classification routine..." << std::endl;
324 
325  UrlClassificationResults myUrlClassificationResults;
326 
327  std::vector<std::string> myUrlList;
328  LoadUrlFile( aUrlListFile, myUrlList );
329 
330  for( std::vector<std::string>::const_iterator U = myUrlList.begin(),
331  UEnd = myUrlList.end(); U != UEnd; ++U ) {
332 
333  const std::string& myUrlString = *U;
334 
335  std::cout << " Starting URL db classification for URL '" <<
336  myUrlString << "'" << std::endl;
337 
338  // setup a URL for given string
339  const Url myUrl = Url::create( myDca, myUrlString );
340 
341  // start the classification
342  FunctionResult myFR = myUrlDbClassifier.classify( myUrl, myUrlClassificationResults );
343 
344  if( !myFR ) {
345  // error occured.
346  if( myFR.getReturnCode() == ERR_URL_INVALID ) {
347  std::cerr << "Invalid URL. ";
348  }
349  else if( myFR.getReturnCode() == ERR_URL_UNSUPPORTED_PROTOCOL ) {
350  std::cerr << "Unsupported URL protocol. ";
351  }
352  else {
353  std::cerr << "Error from URL db classification. Details: " << myFR.getDescription() <<
354  " (" << myFR.getReturnCode() << ").";
355  }
356  std::cerr << "Continuing with next URL." << std::endl;
357  continue;
358  }
359 
360  if( myUrlClassificationResults.isUnknownUrl() ) {
361  // the URL is not known in the database
362  std::cout << "Results: URL '" << myUrlString << "' is not known in the database. " <<
363  "Continuing with next URL." << std::endl;
364  continue;
365  }
366 
367  if( !myUrlClassificationResults.isCategorized() ) {
368  // the URL is known but does not contains any category. This is either a white-host or
369  // includes only categorized sub-folders.
370  std::cout << "Results: URL '" << myUrlString << "' is not categorized (but known in the database). "
371  << "Continuing with next URL" << std::endl;
372  continue;
373  }
374 
375  // given URL is known and returned matched categories
376  std::cout << "Classification Results for URL '" << myUrlString << "'" << std::endl;
377  PrintResults( myCategoriesInfo, myUrlClassificationResults );
378  }
379  std::cout << "Leaving URL db classification routine." << std::endl;
380 }
381 
389 int main( int argc, char *argv[] )
390 {
391  PrintToolHeader();
392 
393  int rc = 5;
394 
395  try {
396  if( argc < 5 ) {
397  PrintUsage( argv[0] );
398  return 5;
399  }
400 
401  std::string myRedistFolder = argv[ 1 ];
402  const std::string myTicket = argv[ 2 ];
403  const std::string myProduct = argv[ 3 ];
404  const std::string myUrlList = argv[ 4 ];
405 
406  if( myRedistFolder.empty() || myTicket.empty() ||
407  myProduct.empty() || myUrlList.empty() ) {
408  PrintUsage( argv[0] );
409  return 5;
410  }
411 
412  // check for trailing fileslash - and add if necessary
413  const char c = myRedistFolder[ myRedistFolder.length() - 1 ];
414  if( c != '/' && c != '\\' )
415  myRedistFolder += "/";
416 
417  // Initialize socket on Windows and 3rd party libraries
418  if( !StartupLibraries() )
419  return 5;
420 
421  {
422  // setup DCA directories
423  InitData myInitData;
424  SetupInitData( myRedistFolder, myInitData );
425 
426  // instantiate DCA API
427  DcaInstance myDca;
428  myDca = DcaInstance::create( myInitData );
429 
430  // setup license data
431  LicenseData myLicenseData;
432  SetupLicense( myTicket, myProduct, myLicenseData );
433  const License myLicense = myDca.createLicense( myLicenseData );
434 
435  PrintLicenseInfo( myLicense );
436 
437  if( myLicense.isLicensed( UrlClassification::ID ) ) {
438  // setup a signature database connection
439  DbConnectionData myDbConnectionData;
440  SetupConnectionData( myDbConnectionData );
441  const DbConnection myDbConnection = myDca.createDbConnection( myLicense, myDbConnectionData );
442  PrintDbConnectionInfo( myDbConnection );
443 
444  // initialize the URL Classification module and create a URL db classifier
445  // (embedded URL detection enabled, statistics and unknown url upload disabled)
446  UrlClassification myUrlClassification = UrlClassification::create( myDca, myLicense );
447 
448  UrlDbClassifierOptions myUrlDbClassifierOptions;
449  myUrlDbClassifierOptions.enable_EmbeddedUrlDetection = true;
450  myUrlDbClassifierOptions.detect_EmbeddedUrlsInUrlPath = true;
451  myUrlDbClassifierOptions.enable_Feedback = false;
452 
453  const UrlDbClassifier myUrlDbClassifier = myUrlClassification.createDbClassifier( myDbConnection, myUrlDbClassifierOptions );
454 
455  // create a categories info for printing out the category names together with the results per URL
456  const CategoriesInfo myCategoriesInfo = myDca.getCategoriesInfo( DCA_CAT_INFO_TYPE_URL );
457 
458  // call URL Classification routine
459  TestUrlClassification( myUrlList, myDca, myUrlDbClassifier, myCategoriesInfo );
460 
461  rc = 0;
462  }
463  }
464  }
465  catch( const ExDca& ex ) {
466  std::cerr << "DCA Exception occured. Details: " << ex.getDescription() <<
467  " (" << ex.getReturnCode() << ")." << std::endl;
468  rc = 10;
469  }
470  catch( const std::exception& s ) {
471  std::cerr << "std::exception occured. Details: " << s.what() << "." << std::endl;
472  rc = 10;
473  }
474  catch(...) {
475  std::cerr << "Unknown exception caught." << std::endl;
476  rc = 10;
477  }
478 
479  // deinit the 3rd party libraries
480  ShutdownLibraries();
481 
482  return rc;
483 }
484 
485 
486 
static DCA_MODULE_ID_TYPE ID
The unique ID of the URL classification module.
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Definition: base_classes.h:547
Exception class used in the DCA.
Definition: base_classes.h:237
bool isCategorized() const
Returns whether or not the URL matched one or more categories.
static Url create(const DcaInstance &aDcaInstance, const std::string &urlString)
Standard Url creation function.
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_URL
Refers to the internal categories info for URL classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
Definition: base_classes.h:266
int getMaxSessions() const
Returns the maximum allowed sessions associated with your ticket/license.
time_t getExpirationDate() const
Returns the expiration date of the license in UTC.
std::string getDescription() const
Returns a description of the error.
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
DatabaseInformation getDatabaseInformation() const
Returns information about the underlying database.
Contains information about underlying database.
Definition: base_classes.h:834
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
FunctionResult classify(const Url &aUrl, UrlClassificationResults &urlResults) const
Performs the URL classification and returns the results.
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by combining the given redist_folder with the DCA subdirectories.
std::string creationDateUTC
Definition: base_classes.h:842
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
Definition: base_classes.h:265
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
A container class that allows access to the contained Categories, Groups and Locales.
Sets up options for embedded URL detection and the provided Feedback mechanism.
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
Stores the connection data for a database.
Definition: base_classes.h:815
void UnsetOpenSslCallbacks()
Unsets the openssl callbacks. Do not call any DCA function after you have called this function.
DCA_CATEGORY_ID_TYPE id() const
The category id.
int getMaxUsers() const
Returns the maximum allowed users associated with your ticket/license.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
bool isUnknownUrl() const
Returns whether a URL is known or unknown. A URL is unknown if it is not contained in the database.
Database connection class for a local or remote database.
Definition: base_classes.h:859
This header includes all header files of the URL Classification Package.
#define DCA_LOGDIR
Relative directory for logfile(s).
Main class for the URL classification.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
bool enable_Feedback
This switches the Feedback feature on or off. This is switched off by default.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
DCA_CATEGORY_ID_TYPE UrlClassificationResult
An item of a URL classification result is typedef'd as DCA_CATEGORY_ID_TYPE.
const int ERR_URL_INVALID
Error code: The URL is invalid.
Results of a URL classification.
bool detect_EmbeddedUrlsInUrlPath
If set to true, you can specify that embedded URL detection is additionally performed in the path par...
URL database classifier class.
static void PrintToolHeader()
Prints out the name and the version of this sample.
Use a License to initialize a classification package or a toolbox package.
Definition: base_classes.h:560
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
static void PrintLicenseInfo(const License &aLicense)
Prints out the information about the provided License.
std::string ticket
The ticket as provided in the license.
Definition: base_classes.h:548
This header includes all header files of the DCA Base Package.
std::string versionString
Definition: base_classes.h:840
bool useLocalDatabase
Set to true to connect to a local or custom database, set to false to use a remote database.
Definition: base_classes.h:821
Encapsulates the init and deinit of the DCA API.
Definition: base_classes.h:315
Category NullCategory
Defines a constant unassigned Category you can use for checks. if( myCat == NullCategory ) --> myCat ...
std::string product
The product code used with the license.
Definition: base_classes.h:549
const DbType DBT_Url
Used for DbConnection classes for URL classification.
static UrlClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Creates the URL classification module by using the given DcaInstance and License.
std::string logDir
the directory in which the DCA log file should be created
Definition: base_classes.h:267
const int ERR_URL_UNSUPPORTED_PROTOCOL
Error code: The protocol of the URL is unsupported.
static void PrintUsage(const char *name)
Prints out the syntax of the sample.
std::string getLastMessage() const
Returns the last message received from our license server or if none available the last available mes...
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Definition: base_types.h:66
DbType dbType
The type of the database.
Definition: base_classes.h:820
Categories getCategories() const
Returns the contained Categories.
std::string getDescription() const
Returns the description for the error or warning.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
Definition: base_types.h:72
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
std::string getTicket() const
Returns the ticket of the license as string.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
bool enable_EmbeddedUrlDetection
If set to true, embedded URL detection will be used in URL classification. This is switched on by defa...
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition: base_classes.h:148
std::string getSession() const
Returns the session of the license as string.
Encapsulates a URL object.
Definition: base_url.h:44
UrlDbClassifier createDbClassifier(const DbConnection &aDbConnection, const UrlDbClassifierOptions &options=UrlDbClassifierOptions()) const
Create a URL database classifier. The classifier is created by using the provided database connection...
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
DbConnection createDbConnection(const License &aLicense, const DbConnectionData &dbcData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a DbConnection object using the given DbConnectionData.
This structure is used to initialize the DcaInstance.
Definition: base_classes.h:264
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Definition: base_category.h:26
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
int main(int argc, char *argv[])
The main routine.