dca_interface  6.3.4
customdb_samples/customdbsample_extended/main.cpp
1 /* IBM Source Code */
2 /* (C) Copyright IBM Corp. 2009, 2012 */
3 /* Licensed Materials - Property of IBM */
4 /* US Government Users Restricted Rights - Use duplication or disclosure restricted by GSA Schedule Contract with IBM Corp. */
5 
46 #include <string>
47 #include <vector>
48 #include <iostream>
49 #include <fstream>
50 
51 #include "dca/dca_base.h"
53 #include "dca/dca_callbacks.h"
54 
55 #include "mythreads.h"
56 
57 using namespace dca;
58 
/// Name of this sample; printed in the start-up banner and nowhere else in this file.
const std::string S_ToolName( "customdbsample_extended" );
/// Version of this sample; printed next to the name in the start-up banner.
const std::string S_ToolVersion( "1.2" );

/// Usage string, displayed if a parameter is missing. PrintUsage() appends it
/// directly to the program name, which is why it starts with a blank.
const std::string S_UsageString(
    " <redist-folder> <ticket> <product> <url-list-file> <custom-db-folder>\n"
    " redist-folder - the folder where the DCA is installed to\n"
    " ticket - a valid ticket\n"
    " product - the product associated with your ticket\n"
    " url-list-file - file that includes the URLs to classify\n"
    " custom-db-folder - folder where the Custom Database is located\n\n"
);

/// Error flags: their addresses are handed to StartupThreads() in main() and
/// they are polled once per URL by TestUrlClassification().
volatile bool S_schedulerThreadError = false;
volatile bool S_updateThreadError = false;

/// DCA subdirectory of the DCA binaries (appended to the redist folder).
#ifdef WIN32
#  define DCA_BINDIR "bin/Win32"
#else
#  define DCA_BINDIR "bin/linux"
#endif

/// DCA subdirectory of the DCA initialization data (appended to the redist folder).
#define DCA_INITDIR "init"

/// Relative directory for logfile(s).
#define DCA_LOGDIR "./logs"
98 
107 static void SetupInitData( const std::string& redist_folder, InitData& initData )
108 {
109  initData.binDir = redist_folder + DCA_BINDIR;
110  initData.initDir = redist_folder + DCA_INITDIR;
111  initData.logDir = DCA_LOGDIR;
112 }
113 
121 static void SetupLicense( const std::string& ticket, const std::string& product,
122  LicenseData& licenseData )
123 {
124  licenseData.ticket = ticket;
125  licenseData.product = product;
126 }
127 
135 static void SetupConnectionData( DbConnectionData& cData )
136 {
137  cData.useLocalDatabase = true;
138  cData.dbType = DBT_Url;
139 }
140 
148 static void SetupCustomConnectionData( const std::string& customdb_folder, DbConnectionData& cData )
149 {
150  cData.useLocalDatabase = true;
151  cData.dbType = DBT_Custom;
152  cData.customData.configDir = customdb_folder;
153 }
154 
161 static void PrintResults( const CategoriesInfo& catinfos, const UrlClassificationResults& cats )
162 {
163  const DCA_SIZE_TYPE numOfCats = cats.size();
164 
165  if( numOfCats == 0 ) {
166  std::cout << "No categories found." << std::endl;
167  return;
168  }
169 
170  const Categories myCategories = catinfos.getCategories();
171 
172  for( DCA_INDEX_TYPE i = 0; i < numOfCats; i++ ) {
173 
174  const UrlClassificationResult result = cats[ i ];
175  const Category myCategory = myCategories.byId( result );
176  const std::string catname = myCategory.name( "en_US" );
177 
178  std::cout << (i+1) << ".\t Category '" << catname <<
179  "' (id=" << myCategory.id() <<
180  ", groupid=" << myCategory.groupId() <<
181  ")" << std::endl;
182  }
183 }
184 
189 static void PrintToolHeader()
190 {
191  std::cout << "IBM DCA Sample: " << S_ToolName << " (" << S_ToolVersion << ")" << std::endl;
192 }
193 
199 static void PrintUsage( const char *name )
200 {
201  std::cout << "usage: " << name << S_UsageString << std::endl;
202 }
203 
/**
 * Reads the URL list file line by line and appends every non-empty line to
 * @p urlList.
 *
 * A single trailing '\r' is removed from each line so that files with CRLF
 * line endings work as well. Lines that are empty (after that removal) are
 * skipped. Existing entries of @p urlList are kept.
 *
 * If the file cannot be opened, a warning is written to std::cerr and
 * @p urlList is left unchanged, so a wrong path is not mistaken for an empty
 * list.
 *
 * @param fileName  path of the text file to read
 * @param urlList   receives the URLs in file order
 */
static void LoadUrlFile( const std::string& fileName, std::vector<std::string>& urlList )
{
    std::ifstream input( fileName.c_str(), std::ios::in );
    if( !input.is_open() ) {
        std::cerr << "Warning: could not open URL list file '" << fileName
                  << "'. No URLs loaded." << std::endl;
        return;
    }

    std::string line;
    while( std::getline( input, line ) ) {
        // strip the CR left over from a CRLF line ending
        if( !line.empty() && line[ line.length() - 1 ] == '\r' )
            line.erase( line.length() - 1 );

        if( !line.empty() )
            urlList.push_back( line );
    }
}
226 
244 void TestUrlClassification( const std::string& aUrlListFile, const DcaInstance& myDca,
245  const UrlDbClassifier& myUrlDbClassifier, const UrlDbClassifier& myUrlCustomDbClassifier,
246  const CategoriesInfo& myCategoriesInfo )
247 {
248  try {
249  std::cout << "Starting custom & URL db classification routine..." << std::endl;
250 
251  UrlClassificationResults myUrlClassificationResults;
252 
253  std::vector<std::string> myUrlList;
254  LoadUrlFile( aUrlListFile, myUrlList );
255 
256  // for each found URL first check for a match in the Custom Database
257  // -- if matched - done
258  // -- else try to match given URL in standard URL database
259  for( std::vector<std::string>::const_iterator U = myUrlList.begin(),
260  UEnd = myUrlList.end(); U != UEnd; ++U ) {
261 
262  // check for errors from update and scheduler thread....
263  if( S_schedulerThreadError ) {
264  std::cerr << "Scheduler Thread got an error from DcaInstance::schedule() call. aborting." <<
265  std::endl;
266  return;
267  }
268 
269  if( S_updateThreadError ) {
270  std::cerr << "Updater Thread got an error from UpdateModule::performUpdate() call. aborting." <<
271  std::endl;
272  return;
273  }
274 
275  // start over with next URL
276  const std::string& myUrlString = *U;
277 
278  std::cout << " Starting URL db classification for URL '" <<
279  myUrlString << "'" << std::endl;
280 
281  // setup a URL for given string
282  const Url myUrl = Url::create( myDca, myUrlString );
283 
284  // start the classification
285  // match with Custom Database
286  FunctionResult myFR = myUrlCustomDbClassifier.classify( myUrl, myUrlClassificationResults );
287  if( !myFR ) {
288  std::cerr << "Error from custom URL db classification. Details: " << myFR.getDescription() <<
289  " (" << myFR.getReturnCode() << "). Continuing with next URL." << std::endl;
290  continue;
291  }
292 
293  if( !myUrlClassificationResults.isUnknownUrl() ) {
294  std::cout << "Got a match from Custom Database." << std::endl;
295 
296  if( !myUrlClassificationResults.isCategorized() ) {
297  std::cout << "Results: URL '" << myUrlString << "' is not categorized (but known in database). "
298  << "Continuing with next URL" << std::endl;
299  }
300  else {
301  std::cout << "Classification Results for URL '" << myUrlString << "'" << std::endl;
302  PrintResults( myCategoriesInfo, myUrlClassificationResults );
303  }
304  continue;
305  }
306 
307  // match with local URL database
308  myFR = myUrlDbClassifier.classify( myUrl, myUrlClassificationResults );
309 
310  if( !myFR ) {
311  std::cerr << "Error from URL db classification. Details: " << myFR.getDescription() <<
312  " (" << myFR.getReturnCode() << "). Continuing with next URL." << std::endl;
313  continue;
314  }
315 
316  if( myUrlClassificationResults.isUnknownUrl() ) {
317  // the URL is not known in the database
318  std::cout << "Results: URL '" << myUrlString << "' is not known in the database. " <<
319  "Continuing with next URL." << std::endl;
320  continue;
321  }
322 
323  if( !myUrlClassificationResults.isCategorized() ) {
324  // the URL is known but does not contains any category. This is either a white-host or
325  // includes only categorized sub-folders.
326  std::cout << "Results: URL '" << myUrlString << "' is not categorized (but known in database). "
327  << "Continuing with next URL" << std::endl;
328  continue;
329  }
330 
331  // given URL is known and returned matched categories
332  std::cout << "Classification Results for URL '" << myUrlString << "'" << std::endl;
333  PrintResults( myCategoriesInfo, myUrlClassificationResults );
334  }
335  std::cout << "Leaving URL db classification routine." << std::endl;
336  }
337  catch( const ExDca& ex ) {
338  std::cerr << "DCA Exception occured in TestUrlClassification(). Details: " <<
339  ex.getDescription() << " (" << ex.getReturnCode() << ")." << std::endl;
340  }
341  catch( const std::exception& s ) {
342  std::cerr << "std::exception occured in TestUrlClassification(). Details: " <<
343  s.what() << "." << std::endl;
344  }
345  catch(...) {
346  std::cerr << "Unknown exception caught in TestUrlClassification()." << std::endl;
347  }
348 }
349 
358 int main( int argc, char *argv[] )
359 {
360  PrintToolHeader();
361 
362  int rc = 0;
363 
364  try {
365 
366  if( argc != 6 ) {
367  PrintUsage( argv[0] );
368  return 5;
369  }
370 
371  std::string myRedistFolder = argv[ 1 ];
372  const std::string myTicket = argv[ 2 ];
373  const std::string myProduct = argv[ 3 ];
374  const std::string myUrlList = argv[ 4 ];
375  std::string myDbFolder = argv[ 5 ];
376 
377  if( myRedistFolder.empty() || myTicket.empty() ||
378  myProduct.empty() || myUrlList.empty()||
379  myDbFolder.empty() ) {
380  PrintUsage( argv[0] );
381  return 5;
382  }
383 
384  // check for trailing fileslash - and add if necessary
385  char c = myRedistFolder[ myRedistFolder.length() - 1 ];
386  if( c != '/' && c != '\\' )
387  myRedistFolder += "/";
388 
389  c = myDbFolder[ myDbFolder.length() - 1 ];
390  if( c != '/' && c != '\\' )
391  myDbFolder += "/";
392 
393  // init the 3rd party libraries
394  InitCUrl();
396 
397  {
398  // setup DCA directories
399  InitData myInitData;
400  SetupInitData( myRedistFolder, myInitData );
401 
402  // instantiate DCA API
403  DcaInstance myDca;
404  myDca = DcaInstance::create( myInitData );
405 
406  // setup license data
407  LicenseData myLicenseData;
408  SetupLicense( myTicket, myProduct, myLicenseData );
409  const License myLicense = myDca.createLicense( myLicenseData );
410 
411  if( !myLicense.isLicensed( UrlClassification::ID ) ) {
412  std::cout << "DCA is not licensed!" << std::endl;
413  rc = 5;
414  } else {
415  // setup a signature database connection
416  DbConnectionData myDbConnectionData;
417  SetupConnectionData( myDbConnectionData );
418  const DbConnection myDbConnection = myDca.createDbConnection( myLicense, myDbConnectionData );
419 
420  // setup a custom db signature database connection
421  DbConnectionData myCustomDbConnectionData;
422  SetupCustomConnectionData( myDbFolder, myCustomDbConnectionData );
423  const DbConnection myCustomDbConnection = myDca.createDbConnection( myLicense, myCustomDbConnectionData );
424 
425  // initialize the URL classification module and create a URL db classifier
426  UrlClassification myUrlClassification = UrlClassification::create( myDca, myLicense );
427  const UrlDbClassifier myUrlDbClassifier = myUrlClassification.createDbClassifier( myDbConnection );
428 
429  // initialize the URL classifier connected to the custom DbConnection
430  const UrlDbClassifier myUrlCustomDbClassifier = myUrlClassification.createDbClassifier( myCustomDbConnection );
431 
432  // create a categories info for printing out the cotegory names together with the results per URL
433  const CategoriesInfo myCategoriesInfo = myDca.getCategoriesInfo( DCA_CAT_INFO_TYPE_URL );
434 
435  // create the update module
436  const UpdateModule myUpdateModule = UpdateModule::create( myDca, myLicense );
437 
438  StartupThreads(myDca, myUpdateModule, &S_schedulerThreadError, &S_updateThreadError);
439 
440  // call URL Classification with a Custom Database
441  TestUrlClassification( myUrlList, myDca, myUrlDbClassifier, myUrlCustomDbClassifier, myCategoriesInfo );
442 
443  myUpdateModule.cancelUpdate();
444  myDca.signal( DCA_SIG_ABORT );
445  ShutdownThreads();
446  }
447  }
448  }
449  catch( const ExDca& ex ) {
450  std::cerr << "DCA Exception occured. Details: " << ex.getDescription() << " (" << ex.getReturnCode() << ")." << std::endl;
451  rc = 10;
452  }
453  catch( const std::exception& s ) {
454  std::cerr << "std::exception occured. Details: " << s.what() << "." << std::endl;
455  rc = 10;
456  }
457  catch(...) {
458  std::cerr << "Unknown exception caught." << std::endl;
459  rc = 10;
460  }
461 
462  // deinit the 3rd party libraries
464  DeinitCUrl();
465 
466  return rc;
467 }
468 
469 
470 
static DCA_MODULE_ID_TYPE ID
The unique ID of the URL classification module.
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Definition: base_classes.h:547
Exception class used in the DCA.
Definition: base_classes.h:237
bool isCategorized() const
Returns whether or not the URL matched one or more categories.
static Url create(const DcaInstance &aDcaInstance, const std::string &urlString)
Standard Url creation function.
bool cancelUpdate() const
Cancels a currently running update process. If there is currently no update running,...
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_URL
Refers to the internal categories info for URL classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
Definition: base_classes.h:266
bool signal(unsigned int signal) const
Cancels functions which could potentially take a long time to return, such as DcaInstance::schedule()...
static UpdateModule create(const DcaInstance &aDcaInstance, const License &aLicense, const ProxySettings &proxySettings=ProxySettings())
std::string getDescription() const
Returns a description of the error.
std::string configDir
Specifies the complete folder path where the custom database is located, or the folder in which it sh...
Definition: base_classes.h:754
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
DbConnectionCustomData customData
Fill out this structure only if you are using a custom database.
Definition: base_classes.h:823
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
FunctionResult classify(const Url &aUrl, UrlClassificationResults &urlResults) const
Performs the URL classification and returns the results.
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by substituting the given redist_folder with DCA subdirectories.
The update module is used to download and install DCA content and engine updates.
Definition: base_classes.h:917
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
Definition: base_classes.h:265
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
A container class that allows access to the contained Categories, Groups and Locales.
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
const unsigned int DCA_SIG_ABORT
Currently the only supported signal parameter for the DcaInstance::signal() function.
Stores the connection data for a database.
Definition: base_classes.h:815
void UnsetOpenSslCallbacks()
Unsets the openssl callbacks. Do not call any DCA function after you have called this function.
void StartupThreads(const dca::DcaInstance &aDcaInstance, const dca::UpdateModule &aUpdateModule, volatile bool *schedulerErrorSignal, volatile bool *performUpdateErrorSignal)
Starts up the update and schedule threads and supplies the given DcaInstance and UpdateModule.
DCA_CATEGORY_ID_TYPE id() const
The category id.
void ShutdownThreads()
Shuts down the previously started update and schedule threads.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
bool isUnknownUrl() const
Returns whether a URL is known or unknown. A URL is unknown if it is not contained in the database.
Database connection class for a local or remote database.
Definition: base_classes.h:859
This header includes all header files of the URL Classification Package.
#define DCA_LOGDIR
Relative directory for logfile(s).
Main class for the URL classification.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
DCA_CATEGORY_ID_TYPE UrlClassificationResult
The item of a URL classification result is typedef'd as DCA_CATEGORY_ID_TYPE.
Results of a URL classification.
URL database classifier class.
static void PrintToolHeader()
Prints out the name and the version of this sample.
Use a License to initialize a classification package or a toolbox package.
Definition: base_classes.h:560
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
const DbType DBT_Custom
Used for DbConnection classes of custom databases.
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
std::string ticket
The ticket as provided in the license.
Definition: base_classes.h:548
This header includes all header files of the DCA Base Package.
bool useLocalDatabase
Set to true to connect to a local or custom database, set to false to use a remote database.
Definition: base_classes.h:821
Encapsulates the init and deinit of the DCA API.
Definition: base_classes.h:315
std::string product
The product code used with the license.
Definition: base_classes.h:549
const DbType DBT_Url
Used for DbConnection classes for URL classification.
static UrlClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Creates the URL classification module by using the given DcaInstance and License.
std::string logDir
the directory in which the DCA log file should be created
Definition: base_classes.h:267
static void PrintUsage(const char *name)
Prints out the syntax of the sample.
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Definition: base_types.h:66
DbType dbType
The type of the database.
Definition: base_classes.h:820
Categories getCategories() const
Returns the contained Categories.
Header file for functions related to start and stop the update and schedule threads.
std::string getDescription() const
Returns the description for the error or warning.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
Definition: base_types.h:72
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition: base_classes.h:148
Encapsulates a URL object.
Definition: base_url.h:44
UrlDbClassifier createDbClassifier(const DbConnection &aDbConnection, const UrlDbClassifierOptions &options=UrlDbClassifierOptions()) const
Create a URL database classifier. The classifier is created by using the provided database connection...
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
DbConnection createDbConnection(const License &aLicense, const DbConnectionData &dbcData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a DbConnection object using the given DbConnectionData.
This structure is used to initialize the DcaInstance.
Definition: base_classes.h:264
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Definition: base_category.h:26
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
int main(int argc, char *argv[])
The main routine.