dca_interface  6.3.4
url_samples/urldbsample_extended/main.cpp
1 /* IBM Source Code */
2 /* (C) Copyright IBM Corp. 2009, 2012 */
3 /* Licensed Materials - Property of IBM */
4 /* US Government Users Restricted Rights - Use duplication or disclosure restricted by GSA Schedule Contract with IBM Corp. */
5 
42 #include <string>
43 #include <vector>
44 #include <iostream>
45 #include <fstream>
46 
47 #ifdef WIN32
48 # include <winsock2.h>
49 #endif
50 
51 #include "dca/dca_base.h"
53 #include "dca/dca_callbacks.h"
54 
55 #include "mythreads.h"
57 
58 using namespace dca;
59 
/** Name of this sample, printed in the tool header. */
const std::string S_ToolName( "urldbsample_extended" );

/** Version of this sample, printed in the tool header. */
const std::string S_ToolVersion( "1.3" );
62 
/**
 * Usage string, displayed if a parameter is missing.
 * It is printed directly after the program name, hence the leading blank.
 */
const std::string S_UsageString(
    " <redist-folder> <ticket> <product> <url-list-file>\n"
    " redist-folder - the folder where the DCA is installed to\n"
    " ticket - a valid ticket\n"
    " product - the product associated with your ticket\n"
    " url-list-file - file that includes the URLs to classify\n"
    "\n" );
74 
/** DCA subdirectory of the DCA binaries; differs between Windows and other platforms. */
#if defined( WIN32 )
# define DCA_BINDIR "bin/Win32"
#else
# define DCA_BINDIR "bin/linux"
#endif

/** DCA subdirectory of the DCA initialization data. */
#define DCA_INITDIR "init"

/** Relative directory for logfile(s). */
#define DCA_LOGDIR "./logs"
95 
104 static void SetupInitData( const std::string& redist_folder, InitData& initData )
105 {
106  initData.binDir = redist_folder + DCA_BINDIR;
107  initData.initDir = redist_folder + DCA_INITDIR;
108  initData.logDir = DCA_LOGDIR;
109 }
110 
122 static bool StartupLibraries()
123 {
124 #ifdef WIN32
125  // Windows needs an extra socket-startup for this process to work
126  // correctly with e.g. IP(v6) input IP addresses
127  WORD wVersionRequested = MAKEWORD( 2, 2 );
128  WSADATA wsaData;
129  int err = WSAStartup( wVersionRequested, &wsaData );
130  if ( err != 0 ) {
131  std::cout << "Error on WSAStartup (" << err << ") occured, aborting" <<
132  std::endl;
133  return false;
134  }
135 #endif
136 
137  // init the 3rd party libraries
138  InitCUrl();
140  return true;
141 }
142 
149 static void ShutdownLibraries()
150 {
151  // deinit the 3rd party libraries
153  DeinitCUrl();
154 
155 #ifdef WIN32
156  // Cleanup Windows sockets for this process
157  WSACleanup();
158 #endif
159 }
160 
168 static void SetupLicense( const std::string& ticket, const std::string& product,
169  LicenseData& licenseData )
170 {
171  licenseData.ticket = ticket;
172  licenseData.product = product;
173 }
174 
182 static void SetupConnectionData( DbConnectionData& cData )
183 {
184  cData.useLocalDatabase = true;
185  cData.dbType = DBT_Url;
186 }
187 
195 static void PrintResults( const CategoriesInfo& catinfos, const UrlClassificationResults& cats )
196 {
197  const DCA_SIZE_TYPE numOfCats = cats.size();
198 
199  if( numOfCats == 0 ) {
200  std::cout << "No categories found." << std::endl;
201  return;
202  }
203 
204  const Categories myCategories = catinfos.getCategories();
205 
206  for( DCA_INDEX_TYPE i = 0; i < numOfCats; ++i ) {
207 
208  const UrlClassificationResult result = cats[ i ];
209  const Category myCategory = myCategories.byId( result );
210  const std::string catname = myCategory.name( "en_US" );
211 
212  if( myCategory != NullCategory ) {
213  std::cout << (i+1) << ".\t Category '" << catname <<
214  "' (id=" << myCategory.id() <<
215  ", groupid=" << myCategory.groupId() <<
216  ")" << std::endl;
217  }
218  }
219 }
220 
225 static void PrintToolHeader()
226 {
227  std::cout << "IBM DCA Sample: " << S_ToolName << " (" << S_ToolVersion << ")" << std::endl;
228 }
229 
235 static void PrintUsage( const char *name )
236 {
237  std::cout << "usage: " << name << S_UsageString << std::endl;
238 }
239 
/**
 * Loads the URLs from a text file, one URL per line.
 *
 * Empty lines are skipped, a trailing carriage return (Windows line ending)
 * is removed and a UTF-8 byte order mark at the start of the file is dropped.
 * The URLs are appended to urlList; existing entries are kept.
 *
 * If the file cannot be opened an error is written to std::cerr and urlList
 * is left unchanged.
 *
 * @param fileName  Path of the URL list file.
 * @param urlList   Receives the URLs in file order.
 */
static void LoadUrlFile( const std::string& fileName, std::vector<std::string>& urlList )
{
    std::ifstream input( fileName.c_str(), std::ios::in );
    if( !input.is_open() ) {
        // Tell the user instead of silently classifying nothing.
        std::cerr << "Could not open URL list file '" << fileName << "'." << std::endl;
        return;
    }

    // Editors on Windows may put a UTF-8 byte order mark in front of the first
    // line; it is not part of the URL.
    static const char utf8Bom[] = "\xEF\xBB\xBF";
    bool isFirstLine = true;

    std::string line;
    while( std::getline( input, line ) ) {

        if( isFirstLine ) {
            isFirstLine = false;
            if( line.compare( 0, 3, utf8Bom ) == 0 )
                line.erase( 0, 3 );
        }

        // std::getline only removes '\n'; drop the '\r' of a CRLF line ending
        if( !line.empty() && line[ line.length() - 1 ] == '\r' )
            line.erase( line.length() - 1 );

        if( !line.empty() )
            urlList.push_back( line );
    }
}
262 
281 void TestUrlClassification( const std::string& aUrlListFile, const DcaInstance& myDca,
282  const UrlDbClassifier& myUrlDbClassifier, const CategoriesInfo& myCategoriesInfo )
283 {
284  std::cout << "Starting URL db classification routine..." << std::endl;
285 
286  UrlClassificationResults myUrlClassificationResults;
287 
288  std::vector<std::string> myUrlList;
289  LoadUrlFile( aUrlListFile, myUrlList );
290 
291  for( std::vector<std::string>::const_iterator U = myUrlList.begin(),
292  UEnd = myUrlList.end(); U != UEnd; ++U ) {
293 
294  const std::string& myUrlString = *U;
295 
296  std::cout << " Starting URL db classification for URL '" <<
297  myUrlString << "'" << std::endl;
298 
299  // setup a URL for given string
300  const Url myUrl = Url::create( myDca, myUrlString );
301 
302  // start the classification
303  FunctionResult myFR = myUrlDbClassifier.classify( myUrl, myUrlClassificationResults );
304 
305  if( !myFR ) {
306  // error occured.
307  std::cerr << "Error from uURLrl db classification. Details: " << myFR.getDescription() <<
308  " (" << myFR.getReturnCode() << "). Continuing with next URL." << std::endl;
309  continue;
310  }
311 
312  if( myUrlClassificationResults.isUnknownUrl() ) {
313  // the URL is not known in the database
314  std::cout << "Results: URL '" << myUrlString << "' is not known in the database. " <<
315  "Continuing with next URL." << std::endl;
316  continue;
317  }
318 
319  if( !myUrlClassificationResults.isCategorized() ) {
320  // the URL is known but does not contains any category. This is either a white-host or
321  // includes only categorized sub-folders.
322  std::cout << "Results: URL '" << myUrlString << "' is not categorized (but known in database). "
323  << "Continuing with next URL" << std::endl;
324  continue;
325  }
326 
327  // given URL is known and returned matched categories
328  std::cout << "Classification Results for URL '" << myUrlString << "'" << std::endl;
329  PrintResults( myCategoriesInfo, myUrlClassificationResults );
330  }
331 
332  std::cout << "Leaving URL db classification routine." << std::endl;
333 }
334 
342 int main( int argc, char *argv[] )
343 {
344  PrintToolHeader();
345 
346  int rc = 0;
347 
348  try {
349 
350  if( argc < 5 ) {
351  PrintUsage( argv[0] );
352  return 5;
353  }
354 
355  std::string myRedistFolder = argv[ 1 ];
356  const std::string myTicket = argv[ 2 ];
357  const std::string myProduct = argv[ 3 ];
358  const std::string myUrlList = argv[ 4 ];
359 
360  if( myRedistFolder.empty() || myTicket.empty() ||
361  myProduct.empty() || myUrlList.empty() ) {
362  PrintUsage( argv[0] );
363  return 5;
364  }
365 
366  // check for traling fileslash - and add if necessary
367  const char c = myRedistFolder[ myRedistFolder.length() - 1 ];
368  if( c != '/' && c != '\\' )
369  myRedistFolder += "/";
370 
371  // Initialize socket on Windows and 3rd party libraries
372  if( !StartupLibraries() )
373  return 5;
374 
375  {
376  // setup DCA directories
377  InitData myInitData;
378  SetupInitData( myRedistFolder, myInitData );
379 
380  // instantiate DCA API
381  DcaInstance myDca;
382  myDca = DcaInstance::create( myInitData );
383 
384  // setup license data
385  LicenseData myLicenseData;
386  SetupLicense( myTicket, myProduct, myLicenseData );
387  const License myLicense = myDca.createLicense( myLicenseData );
388 
389  if( !myLicense.isLicensed( UrlClassification::ID ) ) {
390  std::cout << "DCA is not licensed!" << std::endl;
391  rc = 5;
392  } else {
393 
394  // setup a signature database connection
395  DbConnectionData myDbConnectionData;
396  SetupConnectionData( myDbConnectionData );
397  const DbConnection myDbConnection = myDca.createDbConnection( myLicense, myDbConnectionData );
398 
399  // initialize the URL classification module and create a URL db classifier
400  // (embedded URL detection enabled, statistics and unknown url upload enabled)
401  const UrlClassification myUrlClassification = UrlClassification::create( myDca, myLicense );
402 
403  UrlDbClassifierOptions myUrlDbClassifierOptions;
404  myUrlDbClassifierOptions.enable_EmbeddedUrlDetection = true;
405  myUrlDbClassifierOptions.detect_EmbeddedUrlsInUrlPath = true;
406  myUrlDbClassifierOptions.enable_Feedback = true;
407 
408  const UrlDbClassifier myUrlDbClassifier = myUrlClassification.createDbClassifier( myDbConnection, myUrlDbClassifierOptions );
409 
410  // create a categories info for printing out the category names together with the results per URL
411  const CategoriesInfo myCategoriesInfo = myDca.getCategoriesInfo( DCA_CAT_INFO_TYPE_URL );
412 
413  // create the update module
414  const UpdateModule myUpdateModule = UpdateModule::create( myDca, myLicense );
415 
416  // set up an instance of MyScheduleEventSubscriber to capture the schedule event information
417  MyScheduleEventSubscriber mySubscriber;
418 
419  StartupThreads(myDca, myUpdateModule, &mySubscriber);
420 
421  // call URL db classification routine
422  TestUrlClassification( myUrlList, myDca, myUrlDbClassifier, myCategoriesInfo );
423 
424  myUpdateModule.cancelUpdate();
425  myDca.signal( DCA_SIG_ABORT );
426  ShutdownThreads();
427 
428  mySubscriber.DumpEventMessages();
429  }
430  }
431  }
432  catch( const ExDca& ex ) {
433  std::cerr << "DCA Exception occured. Details: " << ex.getDescription() <<
434  " (" << ex.getReturnCode() << ")." << std::endl;
435  rc = 10;
436  }
437  catch( const std::exception& s ) {
438  std::cerr << "std::exception occured. Details: " << s.what() << "." << std::endl;
439  rc = 10;
440  }
441  catch(...) {
442  std::cerr << "Unknown exception caught." << std::endl;
443  rc = 10;
444  }
445 
446  // deinit the 3rd party libraries
447  ShutdownLibraries();
448 
449  return rc;
450 }
451 
452 
453 
static DCA_MODULE_ID_TYPE ID
The unique ID of the URL classification module.
Example implementation of a schedule event subscriber.
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Definition: base_classes.h:547
Exception class used in the DCA.
Definition: base_classes.h:237
bool isCategorized() const
Returns whether or not the URL matched one or more categories.
static Url create(const DcaInstance &aDcaInstance, const std::string &urlString)
Standard Url creation function.
bool cancelUpdate() const
Cancels a currently running update process. If there is currently no update running,...
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_URL
Refers to the internal categories info for URL classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
Definition: base_classes.h:266
bool signal(unsigned int signal) const
Cancels functions which could potentially take a long time to return, such as DcaInstance::schedule()...
static UpdateModule create(const DcaInstance &aDcaInstance, const License &aLicense, const ProxySettings &proxySettings=ProxySettings())
std::string getDescription() const
Returns a description of the error.
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
FunctionResult classify(const Url &aUrl, UrlClassificationResults &urlResults) const
Performs the URL classification and returns the results.
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by substituting the given redist_folder with DCA subdirectories.
The update module is used to download and install DCA content and engine updates.
Definition: base_classes.h:917
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
Definition: base_classes.h:265
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
A container class that allows access to the contained Categories, Groups and Locales.
Sets up options for embedded URL detection and provided Feedback mechanism.
Header file for the schedule event subscriber.
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
const unsigned int DCA_SIG_ABORT
Currently the only supported signal parameter for the DcaInstance::signal() function.
Stores the connection data for a database.
Definition: base_classes.h:815
void UnsetOpenSslCallbacks()
Unsets the openssl callbacks. Do not call any DCA function after you have called this function.
void StartupThreads(const dca::DcaInstance &aDcaInstance, const dca::UpdateModule &aUpdateModule, volatile bool *schedulerErrorSignal, volatile bool *performUpdateErrorSignal)
Starts up the update and schedule threads and supplies the given DcaInstance and UpdateModule.
DCA_CATEGORY_ID_TYPE id() const
The category id.
void ShutdownThreads()
Shuts down the previously started update and schedule threads.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
bool isUnknownUrl() const
Returns whether a URL is known or unknown. A URL is unknown if it is not contained in the database.
Database connection class for a local or remote database.
Definition: base_classes.h:859
This header includes all header files of the URL Classification Package.
#define DCA_LOGDIR
Relative directory for logfile(s).
Main class for the URL classification.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
bool enable_Feedback
This switches the Feedback feature on or off. This is switched off by default.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
DCA_CATEGORY_ID_TYPE UrlClassificationResult
The item of an URL classification result is typedef'd as DCA_CATEGORY_ID_TYPE.
void DumpEventMessages() const
Dumps all collected messages to screen.
Results of an URL classification.
bool detect_EmbeddedUrlsInUrlPath
If set to true, you can specify that embedded URL detection is additionally performed in the path par...
URL database classifier class.
static void PrintToolHeader()
Prints out the name and the version of this sample.
Use a License to initialize a classification package or a toolbox package.
Definition: base_classes.h:560
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
std::string ticket
The ticket as provided in the license.
Definition: base_classes.h:548
This header includes all header files of the DCA Base Package.
bool useLocalDatabase
Set to true to connect to a local or custom database, set to false to use a remote database.
Definition: base_classes.h:821
Encapsulates the init and deinit of the DCA API.
Definition: base_classes.h:315
Category NullCategory
Defines a constant unassigned Category you can use for checks. if( myCat == NullCategory ) --> myCat ...
std::string product
The product code used with the license.
Definition: base_classes.h:549
const DbType DBT_Url
Used for DbConnection classes for URL classification.
static UrlClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Creates the URL classification module by using the given DcaInstance and License.
std::string logDir
the directory in which the DCA log file should be created
Definition: base_classes.h:267
static void PrintUsage(const char *name)
Prints out the syntax of the sample.
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Definition: base_types.h:66
DbType dbType
The type of the database.
Definition: base_classes.h:820
Categories getCategories() const
Returns the contained Categories.
Header file for functions related to start and stop the update and schedule threads.
std::string getDescription() const
Returns the description for the error or warning.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
Definition: base_types.h:72
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
bool enable_EmbeddedUrlDetection
If set to true embedded URL detection will be used in URL classification. This is switched on by defa...
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition: base_classes.h:148
Encapsulates a URL object.
Definition: base_url.h:44
UrlDbClassifier createDbClassifier(const DbConnection &aDbConnection, const UrlDbClassifierOptions &options=UrlDbClassifierOptions()) const
Create a URL database classifier. The classifier is created by using the provided database connection...
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
DbConnection createDbConnection(const License &aLicense, const DbConnectionData &dbcData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a DbConnection object using the given DbConnectionData.
This structure is used to initialize the DcaInstance.
Definition: base_classes.h:264
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Definition: base_category.h:26
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
int main(int argc, char *argv[])
The main routine.