ULib/examples/IR/IR.cpp

// IR.cpp

#include <ulib/utility/dir_walk.h>
#include <ulib/utility/string_ext.h>

#include "IR.h"

/**
 * inverted index: (data structure)
 *
 * Definition: An index into a set of texts of the words in the texts. The index is accessed by some search method.
 * Each index entry gives the word and a list of texts, possibly with locations within the text, where the word occurs.
 * See also full inverted index, inverted file index, block addressing index, index file, external index, forward index.
 *
 * Note: Suppose we want to search the texts "i love you," "god is love," "love is blind," and "blind justice."
 * (The words of the text are all lower case for simplicity) If we index by (text, character within the text),
 * the index with location in text is:
 *
 * blind   (3,8);(4,0)
 * god     (2,0)
 * i       (1,0)
 * is      (2,4);(3,5)
 * justice (4,6)
 * love    (1,2);(2,7);(3,0)
 * you     (1,7)
 *
 * The word "blind" is in document 3 ("love is blind") starting at character 8, so has an entry (3,8).
 * To find, for instance, documents with both "is" and "love," first look up the words in the index,
 * then find the intersection of the texts in each list. In this case, documents 2 and 3 have both words.
 * We can quickly find documents where the words appear close to each other by comparing the character positions within the text.
 */

UCDB* cdb_names;
UCDB* cdb_words;

int32_t              IR::operation; // 0 -> add, 1 -> sub, 2 -> del, 3 -> check
UPosting*            IR::posting;
UTokenizer*          IR::t;
UString*             IR::bad_words;
UVector<UString>*    IR::filter_ext;
UVector<UString>*    IR::suffix_bad_words;
UVector<UString>*    IR::suffix_skip_tag_xml;
UVector<UCommand*>*  IR::filter_cmd;

/**
 * Release the helper objects held by the static members of IR.
 *
 * Every release is guarded by a null test, so members that were never set up are skipped.
 */
IR::~IR()
{
   U_TRACE(5, "IR::~IR()")

   if (t) U_DELETE(t)

   // filter_cmd is released under the filter_ext test: loadFilters() allocates the two vectors together

   if (filter_ext)
      {
      U_DELETE(filter_cmd)
      U_DELETE(filter_ext)
      }

   if (posting)             U_DELETE(posting)
   if (bad_words)           U_DELETE(bad_words)
   if (suffix_bad_words)    U_DELETE(suffix_bad_words)
   if (suffix_skip_tag_xml) U_DELETE(suffix_skip_tag_xml)
}

/**
 * Read the word-exclusion settings from the configuration:
 *
 *  SKIP_TAG_XML  -> suffix_skip_tag_xml, a vector built from the configured value
 *  BAD_WORDS     -> bad_words, a copy of the configured value
 *  BAD_WORDS_EXT -> suffix_bad_words, looked up only when BAD_WORDS is set
 *
 * The tokenizer used by parse() is also created here if it does not exist yet.
 */
void IR::setBadWords()
{
   U_TRACE(5, "IR::setBadWords()")

   if (t == U_NULLPTR)
      {
      t = new UTokenizer;
      }

   cfg_skip_tag_xml = cfg[U_STRING_FROM_CONSTANT("SKIP_TAG_XML")];

   if (cfg_skip_tag_xml)
      {
      suffix_skip_tag_xml = new UVector<UString>(cfg_skip_tag_xml);
      }

   cfg_bad_words = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS")];

   if (!cfg_bad_words) return; // without BAD_WORDS the suffix restriction is not even looked up

   U_NEW_STRING(bad_words, UString(cfg_bad_words));

   cfg_bad_words_ext = cfg[U_STRING_FROM_CONSTANT("BAD_WORDS_EXT")];

   if (cfg_bad_words_ext)
      {
      suffix_bad_words = new UVector<UString>(cfg_bad_words_ext);
      }
}

/**
 * Tokenize the document named by UPosting::filename and pass every accepted word to
 * UPosting::processWord() for the current IR::operation.
 *
 * The content is produced by the external command registered for the file suffix when
 * filter_ext lists that suffix; otherwise it is read from the file itself. Words matching
 * bad_words are skipped when that list applies to the suffix. When the operation is 2 (del)
 * the file is unlinked after its words have been processed.
 *
 * The tokenizer 't' must already exist (see the assertion at the top of the body).
 */
void IR::parse()
{
   U_TRACE(5, "IR::parse()")

   U_INTERNAL_ASSERT_POINTER(t)

   UPosting::file->setPath(*UPosting::filename);

   uint32_t i;
   UString suffix = UPosting::file->getSuffix();

   // filter_ext and filter_cmd are parallel vectors: the index found in the first addresses the
   // second. loadFilters() only asserts that their sizes agree, so the bound is checked here too;
   // a suffix without a matching command falls back to the plain file content instead of
   // indexing filter_cmd out of range

   if (filter_ext &&
       (i = filter_ext->find(suffix), i != U_NOT_FOUND) &&
       i < filter_cmd->size())
      {
      // NOTE(review): presumably these calls refresh the size reported by getSize() below - confirm

#  ifdef _MSWINDOWS_
      (void) UPosting::file->open();
      (void) UPosting::file->size(true);
      (void) UPosting::file->close();
#  else
      (void) UPosting::file->stat();
#  endif

      UPosting::content->clear();
      UPosting::content->reserve(UPosting::file->getSize());

      // the result of the command is deliberately ignored, as in the original code

      (void) ((*filter_cmd)[i])->executeWithFileArgument(UPosting::content, UPosting::file);
      }
   else
      {
      *UPosting::content = UPosting::file->getContent(true, true);
      }

   UPosting::setDocID(operation); // insert/fetch/remove into table of docs name

   // loop for all words in document

   UPosting::word->clear(); // depend on content...

   t->setData(*UPosting::content);

   t->setAvoidPunctuation(true);

   // bad_words is applied to every file unless suffix_bad_words restricts it to the listed suffixes

   bool bad_words_active = bad_words                      &&
                           (suffix_bad_words == U_NULLPTR ||
                            suffix_bad_words->find(suffix) != U_NOT_FOUND);

   if (suffix_skip_tag_xml) t->setSkipTagXML(suffix_skip_tag_xml->find(suffix) != U_NOT_FOUND);

   while (t->next(*UPosting::word, (bool*)U_NULLPTR))
      {
      if (bad_words_active &&
          UServices::dosMatchWithOR(*UPosting::word, *bad_words, UPosting::ignore_case))
         {
         continue;
         }

      UPosting::processWord(operation);
      }

   if (operation == 2) UPosting::file->_unlink(); // del
}

void IR::processFile()
{
   U_TRACE(5, "IR::processFile()")

   U_INTERNAL_ASSERT_EQUALS(UDirWalk::isDirectory(), false)

   UDirWalk::setFoundFile(*UPosting::filename);

   IR::parse();

   if (operation == 3) (void) write(1, U_CONSTANT_TO_PARAM(".")); // check

   // adjust virtual position if context 'directory as document'...

   if (UPosting::dir_content_as_doc) UPosting::pos_start += UPosting::content->size();
}

/**
 * Directory callback (registered as UDirWalk::call_if_up in loadFiles() when a directory is
 * treated as a single document): flags the change of directory and restarts the virtual
 * position from zero.
 */
void IR::processDirectory()
{
   U_TRACE(5, "IR::processDirectory()")

   UPosting::change_dir = true;
   UPosting::pos_start  = 0;
}

/**
 * Walk the directory tree and run processFile() on what the walk finds.
 *
 * The walk sorts by inode and recurses into sub directories. processDirectory() is
 * registered as well when a directory is treated as a single document. When the operation
 * is 3 (check), "CHECK_1" is written to file descriptor 1 before the walk and "OK" after it.
 */
void IR::loadFiles()
{
   U_TRACE(5, "IR::loadFiles()")

   UDirWalk dirwalk;

   UDirWalk::setSortingForInode();
   UDirWalk::setRecurseSubDirs(true, false);

   dirwalk.call_internal = IR::processFile;

   if (UPosting::dir_content_as_doc)
      {
      dirwalk.call_if_up = IR::processDirectory;
      }

   if (operation == 3) // check
      {
      (void) write(1, U_CONSTANT_TO_PARAM("CHECK_1"));
      }

   dirwalk.walk();

   if (operation == 3) // check
      {
      (void) write(1, U_CONSTANT_TO_PARAM("OK"));
      }

   // NOTE(review): presumably goes back to the directory that was current before the walk - confirm

   (void) UFile::chdir(U_NULLPTR, true);
}

void IR::loadFilters()
{
   U_TRACE(5, "IR::loadFilters()")

   cfg_filter_ext = cfg[U_STRING_FROM_CONSTANT("FILTER_EXT")];

   if (cfg_filter_ext)
      {
      cfg_filter_cmd = cfg[U_STRING_FROM_CONSTANT("FILTER_CMD")];

      if (cfg_filter_cmd)
         {
         filter_cmd = new UVector<UCommand*>;
         filter_ext = new UVector<UString>(cfg_filter_ext);

         UVector<UString> filter_str(cfg_filter_cmd);

         U_ASSERT_EQUALS(filter_ext->size(),filter_str.size())

         for (uint32_t i = 0; i < filter_str.size(); ++i)
            {
            UCommand* cmd = new UCommand(filter_str[i]);

            cmd->setFileArgument();

            filter_cmd->push(cmd);
            }
         }
      }
}