// IR.cpp #include #include #include "IR.h" /** * inverted index: (data structure) * * Definition: An index into a set of texts of the words in the texts. The index is accessed by some search method. * Each index entry gives the word and a list of texts, possibly with locations within the text, where the word occurs. * See also full inverted index, inverted file index, block addressing index, index file, external index, forward index. * * Note: Suppose we want to search the texts "i love you," "god is love," "love is blind," and "blind justice." * (The words of the text are all lower case for simplicity) If we index by (text, character within the text), * the index with location in text is: * * blind (3,8);(4,0) * god (2,0) * i (1,0) * is (2,4);(3,5) * justice (4,6) * love (1,2);(2,7);(3,0) * you (1,7) * * The word "blind" is in document 3 ("love is blind") starting at character 8, so has an entry (3,8). * To find, for instance, documents with both "is" and "love," first look up the words in the index, * then find the intersection of the texts in each list. In this case, documents 2 and 3 have both words. * We can quickly find documents where the words appear close to each other by comparing the character within the text */ UCDB* cdb_names; UCDB* cdb_words; int32_t IR::operation; // 0 -> add, 1 -> sub, 2 -> del, 3 -> check UPosting* IR::posting; UTokenizer* IR::t; UString* IR::bad_words; UVector* IR::filter_ext; UVector* IR::suffix_bad_words; UVector* IR::suffix_skip_tag_xml; UVector* IR::filter_cmd; IR::~IR() { U_TRACE(5, "IR::~IR()") if (t) U_DELETE(t) if (filter_ext) { U_DELETE(filter_cmd) U_DELETE(filter_ext) } if (posting) U_DELETE(posting) if (bad_words) U_DELETE(bad_words) if (suffix_bad_words) U_DELETE(suffix_bad_words) if (suffix_skip_tag_xml) U_DELETE(suffix_skip_tag_xml) } void IR::setBadWords() { U_TRACE(5, "IR::setBadWords()") if (t == U_NULLPTR) t = new UTokenizer; cfg_skip_tag_xml = cfg[U_STRING_FROM_CONSTANT("SKIP_TAG_XML")]; if (cfg_skip_tag_xml) suffix_skip_tag_xml = new UVector(cfg_skip_tag_xml); cfg_bad_words = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS")]; if (cfg_bad_words) { U_NEW_STRING(bad_words, UString(cfg_bad_words)); cfg_bad_words_ext = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS_EXT")]; if (cfg_bad_words_ext) suffix_bad_words = new UVector(cfg_bad_words_ext); } } void IR::parse() { U_TRACE(5, "IR::parse()") U_INTERNAL_ASSERT_POINTER(t) UPosting::file->setPath(*UPosting::filename); uint32_t i; UString suffix = UPosting::file->getSuffix(); if (filter_ext && (i = filter_ext->find(suffix), i != U_NOT_FOUND)) { # ifdef _MSWINDOWS_ (void) UPosting::file->open(); (void) UPosting::file->size(true); (void) UPosting::file->close(); # else (void) UPosting::file->stat(); # endif UPosting::content->clear(); UPosting::content->reserve(UPosting::file->getSize()); (void) ((*filter_cmd)[i])->executeWithFileArgument(UPosting::content, UPosting::file); } else { *UPosting::content = UPosting::file->getContent(true, true); } UPosting::setDocID(operation); // insert/fetch/remove into table of docs name // loop for all words in document UPosting::word->clear(); // depend on content... t->setData(*UPosting::content); t->setAvoidPunctuation(true); bool bad_words_active = bad_words && (suffix_bad_words == U_NULLPTR || suffix_bad_words->find(suffix) != U_NOT_FOUND); if (suffix_skip_tag_xml) t->setSkipTagXML(suffix_skip_tag_xml->find(suffix) != U_NOT_FOUND); while (t->next(*UPosting::word, (bool*)U_NULLPTR)) { if (bad_words_active && UServices::dosMatchWithOR(*UPosting::word, *bad_words, UPosting::ignore_case)) { continue; } UPosting::processWord(operation); } if (operation == 2) UPosting::file->_unlink(); // del } void IR::processFile() { U_TRACE(5, "IR::processFile()") U_INTERNAL_ASSERT_EQUALS(UDirWalk::isDirectory(), false) UDirWalk::setFoundFile(*UPosting::filename); IR::parse(); if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM(".")); // check // adjust virtual position if context 'directory as document'... if (UPosting::dir_content_as_doc) UPosting::pos_start += UPosting::content->size(); } void IR::processDirectory() { U_TRACE(5, "IR::processDirectory()") UPosting::pos_start = 0; UPosting::change_dir = true; } void IR::loadFiles() { U_TRACE(5, "IR::loadFiles()") UDirWalk dirwalk; UDirWalk::setSortingForInode(); UDirWalk::setRecurseSubDirs(true, false); dirwalk.call_internal = IR::processFile; if (UPosting::dir_content_as_doc) dirwalk.call_if_up = IR::processDirectory; if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM("CHECK_1")); // check dirwalk.walk(); if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM("OK")); // check (void) UFile::chdir(U_NULLPTR, true); } void IR::loadFilters() { U_TRACE(5, "IR::loadFilters()") cfg_filter_ext = cfg[U_STRING_FROM_CONSTANT("FILTER_EXT")]; if (cfg_filter_ext) { cfg_filter_cmd = cfg[U_STRING_FROM_CONSTANT("FILTER_CMD")]; if (cfg_filter_cmd) { filter_cmd = new UVector; filter_ext = new UVector(cfg_filter_ext); UVector filter_str(cfg_filter_cmd); U_ASSERT_EQUALS(filter_ext->size(),filter_str.size()) for (uint32_t i = 0; i < filter_str.size(); ++i) { UCommand* cmd = new UCommand(filter_str[i]); cmd->setFileArgument(); filter_cmd->push(cmd); } } } }