dca_interface  6.3.4
text_samples/htmltextsample/main.cpp
1 /* IBM Source Code */
2 /* (C) Copyright IBM Corp. 2009, 2012 */
3 /* Licensed Materials - Property of IBM */
4 /* US Government Users Restricted Rights - Use duplication or disclosure restricted by GSA Schedule Contract with IBM Corp. */
5 
47 #include <string>
48 #include <vector>
49 #include <iostream>
50 #include <fstream>
51 
52 #include "dca/dca_base.h"
54 #include "dca/dca_callbacks.h"
55 
56 using namespace dca;
57 
/// Name of this sample; printed by PrintToolHeader().
const std::string S_ToolName( "htmltextsample" );
/// Version of this sample; printed by PrintToolHeader().
const std::string S_ToolVersion( "1.2" );
60 
/// Usage string, displayed if a parameter is missing.
/// The text is assembled from adjacent string literals, one per output line.
const std::string S_UsageString(
    "<redist-folder> <ticket> <product> <html-list-file> [locale]\n"
    " redist-folder - the folder where the DCA is installed to\n"
    " ticket - a valid ticket\n"
    " product - the product associated with your ticket\n"
    " html-list-file - file that includes the HTML files to classify\n"
    " locale - optional locale used for printing out category names, default = en_US\n\n"
);
73 
// DCA subdirectory of the DCA binaries (appended to the redist folder).
// _WIN32 is the macro Windows compilers predefine themselves; WIN32 is
// additionally accepted for builds that only set it through project settings.
#if defined(WIN32) || defined(_WIN32)
# define DCA_BINDIR "bin/Win32"
#else
# define DCA_BINDIR "bin/linux"
#endif

// DCA subdirectory of the DCA initialization data (appended to the redist folder).
#define DCA_INITDIR "init"

// Relative directory for logfile(s).
#define DCA_LOGDIR "./logs"
95 
104 static void SetupInitData( const std::string& redist_folder, InitData& initData )
105 {
106  initData.binDir = redist_folder + DCA_BINDIR;
107  initData.initDir = redist_folder + DCA_INITDIR;
108  initData.logDir = DCA_LOGDIR;
109 }
110 
118 static void SetupLicense( const std::string& ticket, const std::string& product,
119  LicenseData& licenseData )
120 {
121  licenseData.ticket = ticket;
122  licenseData.product = product;
123 }
124 
/**
 * Reads a list file and appends every non-empty line to htmlList.
 *
 * A trailing carriage return is removed from each line, so list files with
 * CRLF line endings work on platforms that only strip the LF. Lines that are
 * empty after that are skipped.
 *
 * If the file cannot be opened, an error is written to std::cerr and
 * htmlList is left unchanged.
 *
 * @param fileName  path of the list file (one HTML file path per line)
 * @param htmlList  receives the entries; existing content is kept
 */
static void LoadHtmlFileList( const std::string& fileName, std::vector<std::string>& htmlList )
{
    std::ifstream listFile( fileName.c_str(), std::ios::in );

    if( !listFile.is_open() ) {
        // Nothing to read: report like the other errors in this file and stop
        // instead of running the read loop on an unopened stream.
        std::cerr << "Opening HTML list file '" << fileName << "' failed!" << std::endl;
        return;
    }

    std::string line;

    while( std::getline( listFile, line ) )
    {
        // std::getline only consumes '\n'; drop the '\r' of a CRLF ending.
        if( !line.empty() && line[ line.length() - 1 ] == '\r' )
            line.erase( line.length() - 1 );

        if( !line.empty() )
            htmlList.push_back( line );
    }
}
150 
/**
 * Loads the complete contents of a file in binary mode.
 *
 * @param fileName  path of the file to read
 * @return the bytes that were read; an empty string if the file cannot be
 *         opened, its size cannot be determined, or it is empty
 */
static std::string LoadBinaryFile( const std::string& fileName )
{
    // Opened at the end (std::ios::ate) so that tellg() yields the file size.
    std::ifstream input( fileName.c_str(), std::ios::in | std::ios::binary | std::ios::ate );
    if( !input.is_open() ) return std::string();

    // tellg() reports -1 on failure; check it while it is still signed rather
    // than letting it wrap into a huge unsigned allocation size.
    const std::streamoff fileSize = input.tellg();
    if( fileSize <= 0 ) return std::string();

    input.seekg( 0, std::ios::beg );

    // std::vector owns the buffer, so it is released on every exit path.
    std::vector<char> buffer( static_cast<size_t>( fileSize ) );
    input.read( &buffer[0], static_cast<std::streamsize>( buffer.size() ) );

    // Use only the bytes actually delivered in case the read came up short.
    const std::streamsize bytesRead = input.gcount();
    if( bytesRead <= 0 ) return std::string();

    return std::string( &buffer[0], static_cast<size_t>( bytesRead ) );
}
174 
191 void TestTextClassification( const std::string& htmlFile, const DcaInstance& myDca,
192  const HtmlTextClassifier& myTextClassifier, const dca::CategoriesInfo& myCategoriesInfo,
193  const std::string& locale )
194 {
195  std::cout << "\nStarting HTML Text Classification routine..." << std::endl;
196 
197  std::vector<std::string> myFiles;
198  LoadHtmlFileList( htmlFile, myFiles );
199 
200  const dca::Categories myCategories = myCategoriesInfo.getCategories();
201 
202  // perform an HTML Text Classification with every of the HTML files...
203  for( std::vector<std::string>::const_iterator F = myFiles.begin(),
204  FEnd = myFiles.end(); F != FEnd; ++F ) {
205 
206  const std::string myFile = *F;
207 
208  std::cout << "Starting HTML Text Classification with file '" <<
209  myFile << "'" << std::endl;
210 
211  const std::string fileContents = LoadBinaryFile( myFile );
212  if( fileContents.empty() ) {
213  std::cerr << "Got empty file '" << myFile << "', continuing with next file." << std::endl;
214  continue;
215  }
216 
217  // create an HTML text object with given file contents
218  const HtmlText myText = HtmlText::create( myDca, fileContents );
219 
220  // perform the classification
221  TextClassificationResults myTextClassificationResults;
222  FunctionResult myFR = myTextClassifier.classify( myText, myTextClassificationResults );
223 
224  if( !myFR ) {
225  // we encounter an error... print detailed error code and continue with next file
226  std::cerr << "Got error from HTML Text Classification. Details: '" <<
227  myFR.getDescription() << "' (" << myFR.getReturnCode() <<
228  "). Continuing with next file" << std::endl;
229  continue;
230  }
231 
232  if( !myTextClassificationResults.isCategorized() ) {
233  // no error, but probably not enough data to classify or HTML page does not match to
234  // classifier categories.
235  std::cout << "Results: No matching categories found." << std::endl;
236  }
237  else {
238  // we got at least one matching category.
239  const DCA_SIZE_TYPE NumOfTCs = myTextClassificationResults.size();
240  // iterate through all matched categories and print out the detailed scores and results
241  for( DCA_INDEX_TYPE i = 0; i < NumOfTCs; ++i ) {
242  const TextClassificationResult myTextResult = myTextClassificationResults[ i ];
243  const double score = myTextResult.score();
244  const DCA_CATEGORY_ID_TYPE catid = myTextResult.id();
245  const dca::Category myCategory = myCategories.byId(catid);
246  const std::string catname = myCategory.name( locale );
247 
248  if( catid == CATEGORY_ID_TEXT_PORNOGRAPHY ) {
249  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
250  "', Score: " << score << std::endl;
251  }
252  else if( catid == CATEGORY_ID_TEXT_WAREZ ) {
253  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
254  "', Score: " << score << std::endl;
255  }
256  else if( catid == CATEGORY_ID_TEXT_GAMBLING ) {
257  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
258  "', Score: " << score << std::endl;
259  }
260  else if( catid == CATEGORY_ID_TEXT_ANONYMOUS_PROXIES ) {
261  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
262  "', Score: " << score << std::endl;
263  }
264  else if( catid == CATEGORY_ID_TEXT_ILLEGAL_DRUGS ) {
265  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
266  "', Score: " << score << std::endl;
267  }
268  else if( catid == CATEGORY_ID_TEXT_WEAPONS ) {
269  std::cout << "Results: " << (i+1) << ".\t Category: '" << catname <<
270  "', Score: " << score << std::endl;
271  }
272  else { // we got a new category not known at the time this sample has been created!
273  std::cout << "Results: " << (i+1) << ".\t New Category: '" << catname <<
274  "', Score: " << score << std::endl;
275  }
276  }
277  }
278 
279  } // for
280  std::cout << "Leaving HTML Text Classification routine." << std::endl;
281 }
282 
290 static void PrintCategoriesInfo( const dca::CategoriesInfo& myCategoriesInfo, const std::string& locale )
291 {
292  const dca::Categories myCategories = myCategoriesInfo.getCategories();
293  const DCA_SIZE_TYPE numOfCategories = myCategories.size();
294  std::cout << "\nAvailable are the following " << numOfCategories << " categories:\n";
295 
296  for( DCA_INDEX_TYPE i = 0; i < numOfCategories; ++i ) {
297  const dca::Category myCategory = myCategories[ i ];
298  std::cout << "(" << (i+1)
299  << ") Category name=" << myCategory.name( locale )
300  << ", id=" << myCategory.id()
301  << ", groupId=" << myCategory.groupId() << std::endl;
302  }
303 
304  {
305  const dca::Groups myGroups = myCategoriesInfo.getGroups();
306  const DCA_SIZE_TYPE numOfGroups = myGroups.size();
307  std::cout << "\nAvailable are the following " << numOfGroups << " groups:\n";
308 
309  for( DCA_INDEX_TYPE i = 0; i < numOfGroups; ++i ) {
310  const dca::Group myGroup = myGroups[ i ];
311  std::cout << "(" << (i+1)
312  << ") Group name=" << myGroup.name( locale )
313  << ", id=" << myGroup.id() << std::endl;
314  }
315  }
316 
317  {
318  const dca::Locales myLocales = myCategoriesInfo.getLocales();
319  const DCA_SIZE_TYPE numOfLocales = myLocales.size();
320  std::cout << "\nAvailable are the following " << numOfLocales << " locales:\n";
321 
322  for( DCA_INDEX_TYPE i = 0; i < numOfLocales; ++i ) {
323  const dca::Locale myLocale = myLocales[ i ];
324  std::cout << "(" << (i+1)
325  << ") Locale displayName='" << myLocale.displayName()
326  << "', languageId='" << myLocale.languageId() << "'"
327  << std::endl;
328  }
329  }
330 }
331 
336 static void PrintToolHeader()
337 {
338  std::cout << "IBM DCA Sample: " << S_ToolName << " (" << S_ToolVersion << ")" << std::endl;
339 }
340 
346 static void PrintUsage( const char *name )
347 {
348  std::cout << name << std::endl << "Usage:" << std::endl;
349  std::cout << S_UsageString << std::endl;
350 }
351 
359 int main( int argc, char *argv[] )
360 {
361  PrintToolHeader();
362 
363  int rc = 0;
364 
365  try {
366 
367  if( argc < 5 ) {
368  PrintUsage( argv[0] );
369  return 5;
370  }
371 
372  std::string redist_folder = argv[1];
373  const std::string ticket = argv[2];
374  const std::string product = argv[3];
375  const std::string htmlFile = argv[4];
376  std::string locale = "en_US";
377 
378  if( redist_folder.empty() || ticket.empty() ||
379  product.empty() || htmlFile.empty() ) {
380  PrintUsage( argv[0] );
381  return 5;
382  }
383 
384  if (argc > 5)
385  locale = argv[5];
386 
387  // check for trailing fileslash - and add if necessary
388  const char c = redist_folder[ redist_folder.length() - 1 ];
389  if( c != '/' && c != '\\' )
390  redist_folder += "/";
391 
392  // init the 3rd party libraries
393  InitCUrl();
395 
396  {
397  // setup DCA directories
398  InitData myInitData;
399  SetupInitData( redist_folder, myInitData );
400 
401  // instantiate the DCA API
402  DcaInstance myDca;
403  myDca = DcaInstance::create( myInitData );
404 
405  // setup license data
406  LicenseData myLicenseData;
407  SetupLicense( ticket, product, myLicenseData );
408  const License myLicense = myDca.createLicense( myLicenseData );
409 
410  if( !myLicense.isLicensed( TextClassification::ID ) ) {
411  std::cout << "DCA is not licensed!" << std::endl;
412  }
413  // get internal DCA categories info related to Text Classification
414  const dca::CategoriesInfo myCategoriesInfo = myDca.getCategoriesInfo( DCA_CAT_INFO_TYPE_TEXT );
415  // print the available categories, groups and locales
416  PrintCategoriesInfo( myCategoriesInfo, locale );
417 
418  // create the classification module and an HTML Text Classifier
419  TextClassification myTextClassification = TextClassification::create( myDca, myLicense );
420  const HtmlTextClassifier myTextClassifier = myTextClassification.createHtmlClassifier();
421 
422  // call Text Classification routine
423  TestTextClassification( htmlFile, myDca, myTextClassifier, myCategoriesInfo, locale );
424  }
425  }
426  catch( const ExDca& ex ) {
427  std::cerr << "DCA Exception occured. Details: " << ex.getDescription() <<
428  " (" << ex.getReturnCode() << ")." << std::endl;
429  rc = 10;
430  }
431  catch( const std::exception& s ) {
432  std::cerr << "std::exception occured. Details: " << s.what() << "." << std::endl;
433  rc = 10;
434  }
435  catch(...) {
436  std::cerr << "Unknown exception caught." << std::endl;
437  rc = 10;
438  }
439 
440  // deinit the 3rd party libraries
442  DeinitCUrl();
443 
444  return rc;
445 }
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Definition: base_classes.h:547
Exception class used in the DCA.
Definition: base_classes.h:237
Single result of a text classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
Definition: base_classes.h:266
HtmlTextClassifier createHtmlClassifier() const
Creates a HtmlTextClassifier that is used to classify HtmlText objects.
DCA_SIZE_TYPE size() const
Returns the number of categories in the container.
std::string getDescription() const
Returns a description of the error.
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
Encapsulation of a locale class, which allows access to the language id and the display name of local...
Definition: base_locale.h:28
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_ILLEGAL_DRUGS
Text category.
DCA_GROUP_ID_TYPE id() const
Returns the id of the group as defined in the related categories XML schema.
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by substituting the given redist_folder with DCA subdirectories.
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
Definition: base_classes.h:265
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
A container class that allows access to the contained Categories, Groups and Locales.
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
void UnsetOpenSslCallbacks()
Unsets the openssl callbacks. Do not call any DCA function after you have called this function.
DCA_CATEGORY_ID_TYPE id() const
The category id.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
DCA_SIZE_TYPE size() const
Returns the number of categories in the container.
#define DCA_LOGDIR
Relative directory for logfile(s).
This header includes all header files of the Text Classification Package.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
Definition of a container class for Group objects.
Definition: base_groups.h:34
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_WAREZ
Text category.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the group.
static DCA_MODULE_ID_TYPE ID
The unique ID of the text classification module.
Locales getLocales() const
Returns the contained Locales.
static void PrintToolHeader()
Prints out the name and the version of this sample.
double score() const
Returns the score of the classification (if any), range is from 0.0 to 1.0.
Use a License to initialize a classification package or a toolbox package.
Definition: base_classes.h:560
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
bool isCategorized() const
Returns whether there are any results for the text classification.
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
std::string ticket
The ticket as provided in the license.
Definition: base_classes.h:548
This header includes all header files of the DCA Base Package.
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_GAMBLING
Text category.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
static HtmlText create(const DcaInstance &aDcaInstance, const std::string &htmlContent)
Creates an HTML text object, used as an input parameter for text classification.
Encapsulates the init and deinit of the DCA API.
Definition: base_classes.h:315
HTML text classifier object for text classification.
FunctionResult classify(const HtmlText &aText, TextClassificationResults &aTextResults) const
The HTML Text Classification method. The method takes an initialized HtmlText object and returns the ...
DCA_CATEGORY_ID_TYPE id() const
Returns the category id of the classification (if any).
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_PORNOGRAPHY
Text category.
DCA_SIZE_TYPE size() const
The number of items in the container.
std::string product
The product code used with the license.
Definition: base_classes.h:549
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_WEAPONS
Text category.
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_TEXT
Refers to the internal categories info for text classification.
int DCA_CATEGORY_ID_TYPE
Type for category ids.
Definition: base_types.h:31
std::string logDir
the directory in which the DCA log file should be created
Definition: base_classes.h:267
static void PrintUsage(const char *name)
Prints out the syntax of the sample.
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Definition: base_types.h:66
Categories getCategories() const
Returns the contained Categories.
std::string getDescription() const
Returns the description for the error or warning.
Encapsulates an HTML text object.
Definition: base_htmltext.h:24
Overall results of a text classification.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
Definition: base_types.h:72
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
std::string displayName() const
Returns the display name of the locale.
static TextClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Initializes the TextClassification module.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition: base_classes.h:148
Definition of a container class for Locale objects.
Definition: base_locales.h:34
std::string languageId() const
Returns the language id as defined in the related categories.xml.
The HTML Text Classification module class.
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
Encapsulates a group as defined in the related categories XML schema (see Categories XML: Groups).
Definition: base_group.h:26
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_ANONYMOUS_PROXIES
Text category.
This structure is used to initialize the DcaInstance.
Definition: base_classes.h:264
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Definition: base_category.h:26
Groups getGroups() const
Returns the contained Groups.
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
int main(int argc, char *argv[])
The main routine.