mirror of
https://github.com/stefanocasazza/ULib.git
synced 2025-09-28 19:05:55 +08:00
223 lines
6.0 KiB
C++
223 lines
6.0 KiB
C++
// IR.cpp
|
|
|
|
#include <ulib/utility/dir_walk.h>
|
|
#include <ulib/utility/string_ext.h>
|
|
|
|
#include "IR.h"
|
|
|
|
/**
 * inverted index: (data structure)
 *
 * Definition: An index into a set of texts of the words in the texts. The index is accessed by some search method.
 * Each index entry gives the word and a list of texts, possibly with locations within the text, where the word occurs.
 * See also full inverted index, inverted file index, block addressing index, index file, external index, forward index.
 *
 * Note: Suppose we want to search the texts "i love you," "god is love," "love is blind," and "blind justice."
 * (The words of the text are all lower case for simplicity) If we index by (text, character within the text),
 * the index with location in text is:
 *
 * blind   (3,8);(4,0)
 * god     (2,0)
 * i       (1,0)
 * is      (2,4);(3,5)
 * justice (4,6)
 * love    (1,2);(2,7);(3,0)
 * you     (1,7)
 *
 * The word "blind" is in document 3 ("love is blind") starting at character 8, so has an entry (3,8).
 * To find, for instance, documents with both "is" and "love," first look up the words in the index,
 * then find the intersection of the texts in each list. In this case, documents 2 and 3 have both words.
 * We can quickly find documents where the words appear close to each other by comparing the character within the text.
 */
|
|
|
|
UCDB* cdb_names;
|
|
UCDB* cdb_words;
|
|
|
|
int32_t IR::operation; // 0 -> add, 1 -> sub, 2 -> del, 3 -> check
|
|
UPosting* IR::posting;
|
|
UTokenizer* IR::t;
|
|
UString* IR::bad_words;
|
|
UVector<UString>* IR::filter_ext;
|
|
UVector<UString>* IR::suffix_bad_words;
|
|
UVector<UString>* IR::suffix_skip_tag_xml;
|
|
UVector<UCommand*>* IR::filter_cmd;
|
|
|
|
/**
 * Release every heap object referenced by the static members of IR.
 *
 * The members are static, so they outlive the instance being destroyed. Each
 * pointer is therefore reset to U_NULLPTR right after it is deleted: otherwise
 * a later destructor run (or any later access) would operate on a dangling
 * pointer and delete the same object twice.
 */
IR::~IR()
{
   U_TRACE(5, "IR::~IR()")

   if (t)
      {
      U_DELETE(t)

      t = U_NULLPTR;
      }

   // filter_cmd and filter_ext are allocated together in loadFilters(), so the
   // presence of filter_ext implies the presence of filter_cmd.
   // NOTE(review): the UCommand objects pushed into filter_cmd are presumably
   // released by the vector itself - confirm against UVector<T*>.

   if (filter_ext)
      {
      U_DELETE(filter_cmd)
      U_DELETE(filter_ext)

      filter_cmd = U_NULLPTR;
      filter_ext = U_NULLPTR;
      }

   if (posting)
      {
      U_DELETE(posting)

      posting = U_NULLPTR;
      }

   if (bad_words)
      {
      U_DELETE(bad_words)

      bad_words = U_NULLPTR;
      }

   if (suffix_bad_words)
      {
      U_DELETE(suffix_bad_words)

      suffix_bad_words = U_NULLPTR;
      }

   if (suffix_skip_tag_xml)
      {
      U_DELETE(suffix_skip_tag_xml)

      suffix_skip_tag_xml = U_NULLPTR;
      }
}
|
|
|
|
void IR::setBadWords()
|
|
{
|
|
U_TRACE(5, "IR::setBadWords()")
|
|
|
|
if (t == U_NULLPTR) t = new UTokenizer;
|
|
|
|
cfg_skip_tag_xml = cfg[U_STRING_FROM_CONSTANT("SKIP_TAG_XML")];
|
|
|
|
if (cfg_skip_tag_xml) suffix_skip_tag_xml = new UVector<UString>(cfg_skip_tag_xml);
|
|
|
|
cfg_bad_words = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS")];
|
|
|
|
if (cfg_bad_words)
|
|
{
|
|
U_NEW_STRING(bad_words, UString(cfg_bad_words));
|
|
|
|
cfg_bad_words_ext = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS_EXT")];
|
|
|
|
if (cfg_bad_words_ext) suffix_bad_words = new UVector<UString>(cfg_bad_words_ext);
|
|
}
|
|
}
|
|
|
|
void IR::parse()
|
|
{
|
|
U_TRACE(5, "IR::parse()")
|
|
|
|
U_INTERNAL_ASSERT_POINTER(t)
|
|
|
|
UPosting::file->setPath(*UPosting::filename);
|
|
|
|
uint32_t i;
|
|
UString suffix = UPosting::file->getSuffix();
|
|
|
|
if (filter_ext &&
|
|
(i = filter_ext->find(suffix), i != U_NOT_FOUND))
|
|
{
|
|
# ifdef _MSWINDOWS_
|
|
(void) UPosting::file->open();
|
|
(void) UPosting::file->size(true);
|
|
(void) UPosting::file->close();
|
|
# else
|
|
(void) UPosting::file->stat();
|
|
# endif
|
|
|
|
UPosting::content->clear();
|
|
UPosting::content->reserve(UPosting::file->getSize());
|
|
|
|
(void) ((*filter_cmd)[i])->executeWithFileArgument(UPosting::content, UPosting::file);
|
|
}
|
|
else
|
|
{
|
|
*UPosting::content = UPosting::file->getContent(true, true);
|
|
}
|
|
|
|
UPosting::setDocID(operation); // insert/fetch/remove into table of docs name
|
|
|
|
// loop for all words in document
|
|
|
|
UPosting::word->clear(); // depend on content...
|
|
|
|
t->setData(*UPosting::content);
|
|
|
|
t->setAvoidPunctuation(true);
|
|
|
|
bool bad_words_active = bad_words &&
|
|
(suffix_bad_words == U_NULLPTR ||
|
|
suffix_bad_words->find(suffix) != U_NOT_FOUND);
|
|
|
|
if (suffix_skip_tag_xml) t->setSkipTagXML(suffix_skip_tag_xml->find(suffix) != U_NOT_FOUND);
|
|
|
|
while (t->next(*UPosting::word, (bool*)U_NULLPTR))
|
|
{
|
|
if (bad_words_active &&
|
|
UServices::dosMatchWithOR(*UPosting::word, *bad_words, UPosting::ignore_case))
|
|
{
|
|
continue;
|
|
}
|
|
|
|
UPosting::processWord(operation);
|
|
}
|
|
|
|
if (operation == 2) UPosting::file->_unlink(); // del
|
|
}
|
|
|
|
void IR::processFile()
|
|
{
|
|
U_TRACE(5, "IR::processFile()")
|
|
|
|
U_INTERNAL_ASSERT_EQUALS(UDirWalk::isDirectory(), false)
|
|
|
|
UDirWalk::setFoundFile(*UPosting::filename);
|
|
|
|
IR::parse();
|
|
|
|
if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM(".")); // check
|
|
|
|
// adjust virtual position if context 'directory as document'...
|
|
|
|
if (UPosting::dir_content_as_doc) UPosting::pos_start += UPosting::content->size();
|
|
}
|
|
|
|
void IR::processDirectory()
|
|
{
|
|
U_TRACE(5, "IR::processDirectory()")
|
|
|
|
UPosting::pos_start = 0;
|
|
UPosting::change_dir = true;
|
|
}
|
|
|
|
void IR::loadFiles()
|
|
{
|
|
U_TRACE(5, "IR::loadFiles()")
|
|
|
|
UDirWalk dirwalk;
|
|
|
|
UDirWalk::setSortingForInode();
|
|
UDirWalk::setRecurseSubDirs(true, false);
|
|
|
|
dirwalk.call_internal = IR::processFile;
|
|
if (UPosting::dir_content_as_doc) dirwalk.call_if_up = IR::processDirectory;
|
|
|
|
if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM("CHECK_1")); // check
|
|
|
|
dirwalk.walk();
|
|
|
|
if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM("OK")); // check
|
|
|
|
(void) UFile::chdir(U_NULLPTR, true);
|
|
}
|
|
|
|
void IR::loadFilters()
|
|
{
|
|
U_TRACE(5, "IR::loadFilters()")
|
|
|
|
cfg_filter_ext = cfg[U_STRING_FROM_CONSTANT("FILTER_EXT")];
|
|
|
|
if (cfg_filter_ext)
|
|
{
|
|
cfg_filter_cmd = cfg[U_STRING_FROM_CONSTANT("FILTER_CMD")];
|
|
|
|
if (cfg_filter_cmd)
|
|
{
|
|
filter_cmd = new UVector<UCommand*>;
|
|
filter_ext = new UVector<UString>(cfg_filter_ext);
|
|
|
|
UVector<UString> filter_str(cfg_filter_cmd);
|
|
|
|
U_ASSERT_EQUALS(filter_ext->size(),filter_str.size())
|
|
|
|
for (uint32_t i = 0; i < filter_str.size(); ++i)
|
|
{
|
|
UCommand* cmd = new UCommand(filter_str[i]);
|
|
|
|
cmd->setFileArgument();
|
|
|
|
filter_cmd->push(cmd);
|
|
}
|
|
}
|
|
}
|
|
}
|