58 const std::string S_ToolName =
"htmltextsample";
59 const std::string S_ToolVersion =
"1.2";
66 "<redist-folder> <ticket> <product> <html-list-file> [locale]\n"
67 " redist-folder - the folder where the DCA is installed to\n"
68 " ticket - a valid ticket\n"
69 " product - the product associated with your ticket\n"
70 " html-list-file - file that includes the HTML files to classify\n"
71 " locale - optional locale used for printing out category names, default = en_US\n\n"
79 # define DCA_BINDIR "bin/Win32"
81 # define DCA_BINDIR "bin/linux"
88 #define DCA_INITDIR "init"
94 #define DCA_LOGDIR "./logs"
118 static void SetupLicense(
const std::string& ticket,
const std::string& product,
121 licenseData.
ticket = ticket;
131 static void LoadHtmlFileList(
const std::string& fileName, std::vector<std::string>& htmlList )
133 std::ifstream fstream( fileName.c_str(), std::ios::in );
135 if (!fstream.is_open()) {
136 std::cout <<
"Opening HTML list file '" << fileName <<
"' failed!" << std::endl;
141 while ( std::getline(fstream, line) )
143 if( !line.empty() && line[line.length()-1] ==
'\r')
144 line.erase( line.length() - 1 );
147 htmlList.push_back( line );
/**
 * Reads the complete contents of a file into a string, byte for byte.
 *
 * The file is opened in binary mode, so no newline translation takes place
 * and embedded NUL bytes are preserved.
 *
 * @param fileName path of the file to read
 * @return the file contents; an empty string if the file cannot be opened,
 *         its size cannot be determined, or the file is empty
 */
static std::string LoadBinaryFile( const std::string& fileName )
{
    // Open positioned at the end so that tellg() reports the file size.
    std::ifstream fstream( fileName.c_str(), std::ios::in | std::ios::binary | std::ios::ate );
    if (!fstream.is_open())
        return std::string();

    // tellg() yields -1 on failure; converting that directly to an unsigned
    // size would request an enormous buffer, so validate it first.
    const std::streamoff size = fstream.tellg();
    if (size <= 0)
        return std::string();
    fstream.seekg( 0, std::ios::beg );

    // Read straight into the string's own storage: no raw new[]/delete[]
    // pair that could leak if an exception is thrown in between.
    std::string file( static_cast<std::string::size_type>( size ), '\0' );
    fstream.read( &file[0], static_cast<std::streamsize>( size ) );

    // Keep only the bytes that were actually read, so a short read never
    // hands padding bytes back to the caller.
    file.resize( static_cast<std::string::size_type>( fstream.gcount() ) );
    return file;
}
191 void TestTextClassification(
const std::string& htmlFile,
const DcaInstance& myDca,
193 const std::string& locale )
195 std::cout <<
"\nStarting HTML Text Classification routine..." << std::endl;
197 std::vector<std::string> myFiles;
198 LoadHtmlFileList( htmlFile, myFiles );
203 for( std::vector<std::string>::const_iterator F = myFiles.begin(),
204 FEnd = myFiles.end(); F != FEnd; ++F ) {
206 const std::string myFile = *F;
208 std::cout <<
"Starting HTML Text Classification with file '" <<
209 myFile <<
"'" << std::endl;
211 const std::string fileContents = LoadBinaryFile( myFile );
212 if( fileContents.empty() ) {
213 std::cerr <<
"Got empty file '" << myFile <<
"', continuing with next file." << std::endl;
226 std::cerr <<
"Got error from HTML Text Classification. Details: '" <<
228 "). Continuing with next file" << std::endl;
235 std::cout <<
"Results: No matching categories found." << std::endl;
243 const double score = myTextResult.
score();
246 const std::string catname = myCategory.
name( locale );
249 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
250 "', Score: " << score << std::endl;
253 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
254 "', Score: " << score << std::endl;
257 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
258 "', Score: " << score << std::endl;
261 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
262 "', Score: " << score << std::endl;
265 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
266 "', Score: " << score << std::endl;
269 std::cout <<
"Results: " << (i+1) <<
".\t Category: '" << catname <<
270 "', Score: " << score << std::endl;
273 std::cout <<
"Results: " << (i+1) <<
".\t New Category: '" << catname <<
274 "', Score: " << score << std::endl;
280 std::cout <<
"Leaving HTML Text Classification routine." << std::endl;
290 static void PrintCategoriesInfo(
const dca::CategoriesInfo& myCategoriesInfo,
const std::string& locale )
294 std::cout <<
"\nAvailable are the following " << numOfCategories <<
" categories:\n";
298 std::cout <<
"(" << (i+1)
299 <<
") Category name=" << myCategory.
name( locale )
300 <<
", id=" << myCategory.
id()
301 <<
", groupId=" << myCategory.
groupId() << std::endl;
307 std::cout <<
"\nAvailable are the following " << numOfGroups <<
" groups:\n";
311 std::cout <<
"(" << (i+1)
312 <<
") Group name=" << myGroup.
name( locale )
313 <<
", id=" << myGroup.
id() << std::endl;
320 std::cout <<
"\nAvailable are the following " << numOfLocales <<
" locales:\n";
324 std::cout <<
"(" << (i+1)
325 <<
") Locale displayName='" << myLocale.
displayName()
326 <<
"', languageId='" << myLocale.
languageId() <<
"'"
338 std::cout <<
"IBM DCA Sample: " << S_ToolName <<
" (" << S_ToolVersion <<
")" << std::endl;
348 std::cout << name << std::endl <<
"Usage:" << std::endl;
359 int main(
int argc,
char *argv[] )
372 std::string redist_folder = argv[1];
373 const std::string ticket = argv[2];
374 const std::string product = argv[3];
375 const std::string htmlFile = argv[4];
376 std::string locale =
"en_US";
378 if( redist_folder.empty() || ticket.empty() ||
379 product.empty() || htmlFile.empty() ) {
388 const char c = redist_folder[ redist_folder.length() - 1 ];
389 if( c !=
'/' && c !=
'\\' )
390 redist_folder +=
"/";
411 std::cout <<
"DCA is not licensed!" << std::endl;
416 PrintCategoriesInfo( myCategoriesInfo, locale );
423 TestTextClassification( htmlFile, myDca, myTextClassifier, myCategoriesInfo, locale );
426 catch(
const ExDca& ex ) {
427 std::cerr <<
"DCA Exception occured. Details: " << ex.
getDescription() <<
431 catch(
const std::exception& s ) {
432 std::cerr <<
"std::exception occured. Details: " << s.what() <<
"." << std::endl;
436 std::cerr <<
"Unknown exception caught." << std::endl;
Is used to create a License object. A license first must be created with DcaInstance::createLicense t...
Exception class used in the DCA.
Single result of a text classification.
void InitCUrl()
Initializes libcurl. Do not use any DCA function before initializing libcurl.
std::string initDir
the directory in which the DCA init files are stored
HtmlTextClassifier createHtmlClassifier() const
Creates an HtmlTextClassifier that is used to classify HtmlText objects.
DCA_SIZE_TYPE size() const
Returns the number of categories in the container.
std::string getDescription() const
Returns a description of the error.
Definition of a container class for Category objects.
CategoriesInfo getCategoriesInfo(DCA_CATEGORIES_INFO_TYPE categoryType) const
Returns the DCA internal categories, groups and locales.
Encapsulation of a locale class, which allows access to the language id and the display name of local...
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_ILLEGAL_DRUGS
Text category.
DCA_GROUP_ID_TYPE id() const
Returns the id of the group as defined in the related categories XML schema.
DCA_RESULT_TYPE getReturnCode() const
Returns the last error code (if any).
static void SetupInitData(const std::string &redist_folder, InitData &initData)
Sets up the given initData by substituting the given redist_folder with DCA subdirectories.
std::string binDir
the directory in which the DCA binary (*.dca) files are stored
This header includes initialization/deinitialization support functions for the 3rd party libraries us...
A container class that allows access to the contained Categories, Groups and Locales.
void SetOpenSslCallbacks()
Initializes the required callbacks for OpenSSL when using HTTPS or SSL connections in a multi-threade...
void UnsetOpenSslCallbacks()
Unsets the OpenSSL callbacks. Do not call any DCA function after you have called this function.
DCA_CATEGORY_ID_TYPE id() const
The category id.
void DeinitCUrl()
Deinitializes libcurl. Do not call any DCA function after you have called this function.
DCA_SIZE_TYPE size() const
Returns the number of categories in the container.
#define DCA_LOGDIR
Relative directory for logfile(s).
This header includes all header files of the Text Classification Package.
#define DCA_INITDIR
DCA subdirectory of the DCA initialization data.
Definition of a container class for Group objects.
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_WAREZ
Text category.
DCA_RESULT_TYPE getReturnCode() const
Gets the code of the error.
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the group.
static DCA_MODULE_ID_TYPE ID
The unique ID of the text classification module.
Locales getLocales() const
Returns the contained Locales.
static void PrintToolHeader()
Prints out the name and the version of this sample.
double score() const
Returns the score of the classification (if any), range is from 0.0 to 1.0.
Use a License to initialize a classification package or a toolbox package.
Category byId(DCA_CATEGORY_ID_TYPE id) const
Returns the category with the given category id.
bool isCategorized() const
Returns whether there are any results for the text classification.
bool isLicensed(DCA_MODULE_ID_TYPE id=0, bool force=false) const
Checks whether the given License is valid for the given module id.
std::string ticket
The ticket as provided in the license.
This header includes all header files of the DCA Base Package.
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_GAMBLING
Text category.
DCA_SIZE_TYPE size() const
Returns the number of results in the container.
static HtmlText create(const DcaInstance &aDcaInstance, const std::string &htmlContent)
Creates an HTML text object, used as an input parameter for text classification.
Encapsulates the init and deinit of the DCA API.
HTML text classifier object for text classification.
FunctionResult classify(const HtmlText &aText, TextClassificationResults &aTextResults) const
The HTML Text Classification method. The method takes an initialized HtmlText object and returns the ...
DCA_CATEGORY_ID_TYPE id() const
Returns the category id of the classification (if any).
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_PORNOGRAPHY
Text category.
DCA_SIZE_TYPE size() const
The number of items in the container.
std::string product
The product code used with the license.
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_WEAPONS
Text category.
const DCA_CATEGORIES_INFO_TYPE DCA_CAT_INFO_TYPE_TEXT
Refers to the internal categories info for text classification.
int DCA_CATEGORY_ID_TYPE
Type for category ids.
std::string logDir
the directory in which the DCA log file should be created
static void PrintUsage(const char *name)
Prints out the syntax of the sample.
size_t DCA_INDEX_TYPE
Type for index access (used for arrays and collections).
Categories getCategories() const
Returns the contained Categories.
std::string getDescription() const
Returns the description for the error or warning.
Encapsulates an HTML text object.
Overall results of a text classification.
size_t DCA_SIZE_TYPE
Type for size (used for size of array and collections).
std::string name(const std::string &localeString=std::string()) const
Returns the localized (display) name of the category.
std::string displayName() const
Returns the display name of the locale.
static TextClassification create(const DcaInstance &aDcaInstance, const License &aLicense)
Initializes the TextClassification module.
DCA_GROUP_ID_TYPE groupId() const
If the category is associated with a group, this is the group id.
#define DCA_BINDIR
DCA subdirectory of the DCA binaries.
Standard function result.
Definition of a container class for Locale objects.
std::string languageId() const
Returns the language id as defined in the related categories.xml.
The HTML Text Classification module class.
const std::string S_UsageString
Usage string, displayed if a parameter is missing.
Encapsulates a group as defined in the related categories XML schema (see Categories XML: Groups).
const DCA_CATEGORY_ID_TYPE CATEGORY_ID_TEXT_ANONYMOUS_PROXIES
Text category.
This structure is used to initialize the DcaInstance.
Encapsulates a category as defined in the categories XML schema (see Categories XML: Categories).
Groups getGroups() const
Returns the contained Groups.
static DcaInstance create(const InitData &initData)
Creates a DcaInstance, starts up the DCA API and initializes the required main module.
static void SetupLicense(const std::string &ticket, const std::string &product, LicenseData &licenseData)
Sets up the given licenseData by copying the given ticket and product strings.
License createLicense(const LicenseData &licData, const ProxySettings &proxySettings=ProxySettings(), LogLevel aLogLevel=LOG_Initial) const
Creates a License object using the given LicenseData.
int main(int argc, char *argv[])
The main routine.