1
0
mirror of https://github.com/stefanocasazza/ULib.git synced 2025-09-28 19:05:55 +08:00
ULib/examples/IR/IR.cpp
stefanocasazza 1e58dc49d0 fix+sync
2018-04-27 19:27:14 +02:00

223 lines
6.0 KiB
C++

// IR.cpp
#include <ulib/utility/dir_walk.h>
#include <ulib/utility/string_ext.h>
#include "IR.h"
/**
* inverted index: (data structure)
*
* Definition: An index into a set of texts of the words in the texts. The index is accessed by some search method.
* Each index entry gives the word and a list of texts, possibly with locations within the text, where the word occurs.
* See also full inverted index, inverted file index, block addressing index, index file, external index, forward index.
*
* Note: Suppose we want to search the texts "i love you," "god is love," "love is blind," and "blind justice."
* (The words of the text are all lower case for simplicity) If we index by (text, character within the text),
* the index with location in text is:
*
* blind (3,8);(4,0)
* god (2,0)
* i (1,0)
* is (2,4);(3,5)
* justice (4,6)
* love (1,2);(2,7);(3,0)
* you (1,7)
*
* The word "blind" is in document 3 ("love is blind") starting at character 8, so has an entry (3,8).
* To find, for instance, documents with both "is" and "love," first look up the words in the index,
* then find the intersection of the texts in each list. In this case, documents 2 and 3 have both words.
* We can quickly find documents where the words appear close to each other by comparing the character positions within the text.
*/
// File-level constant-database handles; they are not referenced by the functions below.
// NOTE(review): presumably the document-name table and the word table - confirm against their users.
UCDB* cdb_names;
UCDB* cdb_words;
// Storage for the static data members of IR.
int32_t IR::operation; // 0 -> add, 1 -> sub, 2 -> del, 3 -> check
// Released in ~IR(); not otherwise used by the functions below.
UPosting* IR::posting;
// Tokenizer created on demand by setBadWords() and driven by parse().
UTokenizer* IR::t;
// Value of the BAD_WORDS config key; parse() skips words that match it.
UString* IR::bad_words;
// List built from the FILTER_EXT config key; a matching file suffix selects a filter command in parse().
UVector<UString>* IR::filter_ext;
// List built from the BAD_WORDS_EXT config key; when set, bad_words is applied only to files with these suffixes.
UVector<UString>* IR::suffix_bad_words;
// List built from the SKIP_TAG_XML config key; suffixes for which the tokenizer is told to skip XML tags.
UVector<UString>* IR::suffix_skip_tag_xml;
// Commands built from the FILTER_CMD config key; indexed with the position found in filter_ext.
UVector<UCommand*>* IR::filter_cmd;
/**
 * Destructor.
 *
 * Releases every helper object reachable through the static pointers of IR
 * that is currently set. The filter command vector is released together with
 * the filter extension vector: the presence of filter_ext is the only
 * condition tested for both (loadFilters() allocates them as a pair).
 */
IR::~IR()
{
   U_TRACE(5, "IR::~IR()")

   // tokenizer
   if (t)
      {
      U_DELETE(t)
      }

   // external filters: commands first, then the extensions that select them
   if (filter_ext)
      {
      U_DELETE(filter_cmd)
      U_DELETE(filter_ext)
      }

   // posting handler
   if (posting)
      {
      U_DELETE(posting)
      }

   // bad words pattern and the suffix lists that drive the tokenizer
   if (bad_words)
      {
      U_DELETE(bad_words)
      }

   if (suffix_bad_words)
      {
      U_DELETE(suffix_bad_words)
      }

   if (suffix_skip_tag_xml)
      {
      U_DELETE(suffix_skip_tag_xml)
      }
}
/**
 * Prepares the tokenizer and the word-filtering settings from the configuration.
 *
 * - creates the tokenizer if it does not exist yet
 * - SKIP_TAG_XML : when set, builds the list suffix_skip_tag_xml
 * - BAD_WORDS    : when set, stores a copy of its value in bad_words
 * - BAD_WORDS_EXT: read only when BAD_WORDS is set; when set, builds the list suffix_bad_words
 */
void IR::setBadWords()
{
   U_TRACE(5, "IR::setBadWords()")

   if (!t)
      {
      t = new UTokenizer;
      }

   // suffixes for which XML tags have to be skipped

   cfg_skip_tag_xml = cfg[U_STRING_FROM_CONSTANT("SKIP_TAG_XML")];

   if (cfg_skip_tag_xml)
      {
      suffix_skip_tag_xml = new UVector<UString>(cfg_skip_tag_xml);
      }

   // words to be ignored, optionally restricted to some suffixes

   cfg_bad_words = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS")];

   if (cfg_bad_words)
      {
      U_NEW_STRING(bad_words, UString(cfg_bad_words));

      cfg_bad_words_ext = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS_EXT")];

      if (cfg_bad_words_ext)
         {
         suffix_bad_words = new UVector<UString>(cfg_bad_words_ext);
         }
      }
}
void IR::parse()
{
U_TRACE(5, "IR::parse()")
U_INTERNAL_ASSERT_POINTER(t)
UPosting::file->setPath(*UPosting::filename);
uint32_t i;
UString suffix = UPosting::file->getSuffix();
if (filter_ext &&
(i = filter_ext->find(suffix), i != U_NOT_FOUND))
{
# ifdef _MSWINDOWS_
(void) UPosting::file->open();
(void) UPosting::file->size(true);
(void) UPosting::file->close();
# else
(void) UPosting::file->stat();
# endif
UPosting::content->clear();
UPosting::content->reserve(UPosting::file->getSize());
(void) ((*filter_cmd)[i])->executeWithFileArgument(UPosting::content, UPosting::file);
}
else
{
*UPosting::content = UPosting::file->getContent(true, true);
}
UPosting::setDocID(operation); // insert/fetch/remove into table of docs name
// loop for all words in document
UPosting::word->clear(); // depend on content...
t->setData(*UPosting::content);
t->setAvoidPunctuation(true);
bool bad_words_active = bad_words &&
(suffix_bad_words == U_NULLPTR ||
suffix_bad_words->find(suffix) != U_NOT_FOUND);
if (suffix_skip_tag_xml) t->setSkipTagXML(suffix_skip_tag_xml->find(suffix) != U_NOT_FOUND);
while (t->next(*UPosting::word, (bool*)U_NULLPTR))
{
if (bad_words_active &&
UServices::dosMatchWithOR(*UPosting::word, *bad_words, UPosting::ignore_case))
{
continue;
}
UPosting::processWord(operation);
}
if (operation == 2) UPosting::file->_unlink(); // del
}
void IR::processFile()
{
U_TRACE(5, "IR::processFile()")
U_INTERNAL_ASSERT_EQUALS(UDirWalk::isDirectory(), false)
UDirWalk::setFoundFile(*UPosting::filename);
IR::parse();
if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM(".")); // check
// adjust virtual position if context 'directory as document'...
if (UPosting::dir_content_as_doc) UPosting::pos_start += UPosting::content->size();
}
/**
 * Directory callback (installed by loadFiles() as dirwalk.call_if_up when
 * UPosting::dir_content_as_doc is set).
 *
 * Flags the directory change and restarts the virtual position from 0.
 */
void IR::processDirectory()
{
   U_TRACE(5, "IR::processDirectory()")

   UPosting::change_dir = true;
   UPosting::pos_start  = 0;
}
/**
 * Walks the directory tree and runs processFile() on every file found.
 *
 * The walk is configured for inode sorting and for recursion into the
 * sub-directories. When UPosting::dir_content_as_doc is set, processDirectory()
 * is installed as the call_if_up callback. When the operation is 3 (check),
 * "CHECK_1" is written on file descriptor 1 before the walk and "OK" after it.
 * At the end UFile::chdir(U_NULLPTR, true) is called.
 */
void IR::loadFiles()
{
   U_TRACE(5, "IR::loadFiles()")

   UDirWalk dirwalk;

   UDirWalk::setSortingForInode();
   UDirWalk::setRecurseSubDirs(true, false);

   dirwalk.call_internal = IR::processFile;

   if (UPosting::dir_content_as_doc)
      {
      dirwalk.call_if_up = IR::processDirectory;
      }

   if (operation == 3) // check
      {
      (void) write(1, U_CONSTANT_TO_PARAM("CHECK_1"));
      }

   dirwalk.walk();

   if (operation == 3) // check
      {
      (void) write(1, U_CONSTANT_TO_PARAM("OK"));
      }

   (void) UFile::chdir(U_NULLPTR, true);
}
void IR::loadFilters()
{
U_TRACE(5, "IR::loadFilters()")
cfg_filter_ext = cfg[U_STRING_FROM_CONSTANT("FILTER_EXT")];
if (cfg_filter_ext)
{
cfg_filter_cmd = cfg[U_STRING_FROM_CONSTANT("FILTER_CMD")];
if (cfg_filter_cmd)
{
filter_cmd = new UVector<UCommand*>;
filter_ext = new UVector<UString>(cfg_filter_ext);
UVector<UString> filter_str(cfg_filter_cmd);
U_ASSERT_EQUALS(filter_ext->size(),filter_str.size())
for (uint32_t i = 0; i < filter_str.size(); ++i)
{
UCommand* cmd = new UCommand(filter_str[i]);
cmd->setFileArgument();
filter_cmd->push(cmd);
}
}
}
}