ULib/examples/IR/posting.cpp

// posting.cpp - class support for inverted index (data structure)

#include <ulib/db/rdb.h>
#include <ulib/base/utility.h>
#include <ulib/container/vector.h>
#include <ulib/container/hash_map.h>
#include <ulib/utility/string_ext.h>

#include "posting.h"

extern UCDB* cdb_names;
extern UCDB* cdb_words;

// public

bool               UPosting::change_dir;
bool               UPosting::ignore_case;
bool               UPosting::dir_content_as_doc;
UFile*             UPosting::file;
UString*           UPosting::word;
UString*           UPosting::content;
UString*           UPosting::posting;
UString*           UPosting::filename;
UString*           UPosting::str_cur_doc_id;
uint32_t           UPosting::word_freq;
uint32_t           UPosting::min_word_size;
uint32_t           UPosting::tbl_name_space;
uint32_t           UPosting::tbl_words_space;
uint32_t           UPosting::max_distance = 2;
UHashMap<UString>* UPosting::tbl_name;
UHashMap<UString>* UPosting::tbl_words;

// private

vPF                UPosting::pfunction;
char*              UPosting::ptr;
char*              UPosting::data;
char*              UPosting::ptr_cur_doc_id;
uint64_t           UPosting::cur_doc_id;
uint32_t           UPosting::pos;
uint32_t           UPosting::pos_start;
uint32_t           UPosting::space;
uint32_t           UPosting::distance;
uint32_t           UPosting::size_entry;
uint32_t           UPosting::sub_word_size;
uint32_t           UPosting::off_last_doc_id;
uint32_t           UPosting::vec_sub_word_size;
uint32_t           UPosting::sub_word_pos_prev;
uint32_t           UPosting::approximate_num_words;
UString*           UPosting::sub_word;
UVector<UString>*  UPosting::vec_word;
UVector<UString>*  UPosting::vec_entry;
UVector<UString>*  UPosting::vec_posting;
UVector<UString>*  UPosting::vec_sub_word;
UVector<UString>*  UPosting::vec_sub_word_posting;

typedef struct u_property {
   uint32_t is_meta;
   uint32_t is_quoted;
} u_property;

static u_property property[32];

/**
 * uint32_t off_last_doc_id;
 *
 * typedef struct u_posting {
 *    uint64_t doc_id;
 *    uint32_t word_freq;
 *    // -----> array of unsigned with word_freq elements
 * } u_posting;
 *
 * +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
 * | WORD | offset last DOC id | DOC id | frequency | pos 1 | ... | pos n | ... | DOC id | frequency | pos 1 | ... | pos n |
 * +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
 */

#define POSTING32(x,attr) u_get_unaligned32(((u_posting*)(x))->attr)
#define POSTING64(x,attr) u_get_unaligned64(((u_posting*)(x))->attr)

#define POSTING_SIZE(x) (sizeof(u_posting)+((POSTING32(x,word_freq))*sizeof(uint32_t)))

#define POSTING_POS(n)             u_get_unalignedp32(ptr+sizeof(u_posting)+(n*sizeof(uint32_t)))
#define POSTING_OFFSET_LAST_DOC_ID u_get_unalignedp32(data)

UPosting::UPosting(uint32_t dimension, bool parsing, bool index)
{
   U_TRACE_REGISTER_OBJECT(5, UPosting, "%u,%b,%b", dimension, parsing, index)

   U_INTERNAL_ASSERT_EQUALS(word,0)
   U_INTERNAL_ASSERT_EQUALS(posting,0)
   U_INTERNAL_ASSERT_EQUALS(filename,0)
   U_INTERNAL_ASSERT_EQUALS(str_cur_doc_id,0)

   word           = U_NEW(UString);
   posting        = U_NEW(UString);
   filename       = U_NEW(UString);
   str_cur_doc_id = U_NEW(UString(sizeof(cur_doc_id)));

   approximate_num_words = 2000 + (dimension * 8);

   if (index)
      {
      U_INTERNAL_ASSERT_EQUALS(tbl_name, 0)
      U_INTERNAL_ASSERT_EQUALS(tbl_words, 0)

      dimension += dimension / 4;

      tbl_name  = U_NEW(UHashMap<UString>(U_GET_NEXT_PRIME_NUMBER(dimension)));
      tbl_words = U_NEW(UHashMap<UString>(U_GET_NEXT_PRIME_NUMBER(approximate_num_words), ignore_case));
      }

   if (parsing)
      {
      U_INTERNAL_ASSERT_EQUALS(file,0)
      U_INTERNAL_ASSERT_EQUALS(content,0)

      file    = U_NEW(UFile);
      content = U_NEW(UString);
      }
}

void UPosting::resetVectorCompositeWord()
{
   U_TRACE(5, "UPosting::resetVectorCompositeWord()")

   delete sub_word;
   delete vec_sub_word;
   delete vec_sub_word_posting;

   sub_word             = 0;
   vec_sub_word         = 0;
   vec_sub_word_posting = 0;
   vec_sub_word_size    = 0;
}

void UPosting::reset()
{
   U_TRACE(5, "UPosting::reset()")

   max_distance      = 2;
   sub_word_pos_prev = 0;

   if (vec_sub_word) resetVectorCompositeWord();

   if (vec_word)
      {
      delete vec_word;
      delete vec_entry;
      delete vec_posting;

      vec_word    = 0;
      vec_entry   = 0;
      vec_posting = 0;
      }
}

UPosting::~UPosting()
{
   U_TRACE_UNREGISTER_OBJECT(5, UPosting)

   delete posting;
   delete filename;
   delete str_cur_doc_id;

   if (tbl_name == 0) reset();
   else
      {
      delete tbl_name;
      delete tbl_words;
      }

   delete word;

   if (file)
      {
      delete file;
      delete content;
      }
}

// #define U_OPTIMIZE
// #define U_COMPRESS_ENTRY

// MANAGE POSTING VALUE ON DATABASE

inline bool UPosting::decompress()
{
   U_TRACE(5, "UPosting::decompress()")

#ifdef U_COMPRESS_ENTRY
   if (posting->size() > (sizeof(uint32_t) * 4) &&
       UStringExt::isCompress(*posting))
      {
      posting->decompress();

      U_RETURN(true);
      }
#endif

   U_RETURN(false);
}

U_NO_EXPORT void UPosting::readPosting(UStringRep* word_rep, bool flag)
{
   U_TRACE(5, "UPosting::readPosting(%p,%b)", word_rep, flag)

   U_INTERNAL_ASSERT_POINTER(cdb_words)

   *posting = (word_rep ? (flag ? (*((URDB*)cdb_words))[word_rep]
                                : (*        cdb_words) [word_rep])
                        : ((URDB*)cdb_words)->UCDB::elem());

#ifdef U_COMPRESS_ENTRY
   if (decompress() == false)
#endif
      {
      // check if add operation...

      if (word_rep == 0) posting->duplicate(); // NB: need duplicate string because we need space on string constant..
      }

   U_INTERNAL_ASSERT_EQUALS(UStringExt::isCompress(posting->data()), false)
}

U_NO_EXPORT int UPosting::writePosting(int flag)
{
   U_TRACE(5, "UPosting::writePosting(%d)", flag)

   U_INTERNAL_ASSERT_POINTER(cdb_words)
   U_INTERNAL_ASSERT_EQUALS(UStringExt::isCompress(posting->data()), false)

#ifdef U_COMPRESS_ENTRY
   if (posting->size() > U_CAPACITY) posting->compress();
#endif

   int result = ((URDB*)cdb_words)->store(*word, *posting, flag);

   U_RETURN(result);
}

// FIND CURRENT DOC ID ON POSTING

inline bool UPosting::checkEntry(char* str, char* s, uint32_t n)
{
   U_TRACE(5, "UPosting::checkEntry(%p,%p,%u)", str, n)

   U_INTERNAL_ASSERT_POINTER(s)
   U_INTERNAL_ASSERT_POINTER(str)

   uint32_t offset = (str - s);

   U_INTERNAL_DUMP("offset     = %u", offset)
   U_INTERNAL_DUMP("size_entry = %u", POSTING_SIZE(str))

   if ((offset % sizeof(uint32_t)) == 0 &&
       POSTING_SIZE(str) <= (n - offset))
      {
      U_RETURN(true);
      }

   U_RETURN(false);
}

U_NO_EXPORT __pure char* UPosting::find(char* s, uint32_t n, bool boptmize)
{
   U_TRACE(5, "UPosting::find(%p,%u,%b)", s, n, boptmize)

   U_INTERNAL_DUMP("cur_doc_id = %llu", cur_doc_id)

   if (boptmize)
      {
      char* str;

      do {
         str = (char*) u_find(s, n, ptr_cur_doc_id, sizeof(cur_doc_id));

         if (str)
            {
            if (checkEntry(str, s, n)) U_RETURN(str);

            ++str;

            n = (s + n) - str;
            s =           str;
            }
         }
      while (str);
      }
   else
      {
      char* str = s;
      char* end = s + n;

      do {
         if (POSTING64(str,doc_id) == cur_doc_id)
            {
            U_INTERNAL_DUMP("offset     = %u",   (str - s))
            U_INTERNAL_DUMP("doc_id     = %llu", POSTING64(str,doc_id))
            U_INTERNAL_DUMP("word_freq  = %u",   POSTING32(str,word_freq))
            U_INTERNAL_DUMP("size_entry = %u",   POSTING_SIZE(str))

            U_RETURN(str);
            }

         str += POSTING_SIZE(str);
         }
      while (str < end);

      U_INTERNAL_ASSERT_EQUALS((str - s), (ptrdiff_t)n)
      }

   U_RETURN((char*)0);
}

U_NO_EXPORT bool UPosting::setPosting(bool bcache)
{
   U_TRACE(5, "UPosting::setPosting(%b)", bcache)

   U_INTERNAL_ASSERT_POINTER(ptr_cur_doc_id)

   if (bcache)
      {
      static char* last_posting_ptr;
      static uint32_t last_posting_sz;
      static UStringRep* last_word_rep;

      if (word->rep != last_word_rep)
         {
         readPosting(last_word_rep = word->rep, false);

         if (posting->empty()) last_posting_sz  = 0;
         else
            {
            last_posting_sz  = posting->size() - sizeof(uint32_t); // offset last DOC id
            last_posting_ptr = posting->data() + sizeof(uint32_t);
            }
         }

      char* posting_ptr = (last_posting_sz ? find(last_posting_ptr, last_posting_sz, true) : 0);

      if (posting_ptr) U_RETURN(true);

      U_RETURN(false);
      }

   U_INTERNAL_ASSERT(*posting)

// if (posting->empty()) U_RETURN(false);

   ptr = find(posting->data() + sizeof(uint32_t), posting->size() - sizeof(uint32_t), false);

   U_INTERNAL_DUMP("ptr = %p", ptr)

   if (ptr) U_RETURN(true);

   U_RETURN(false);
}

U_NO_EXPORT bool UPosting::findCurrentDocIdOnPosting(UStringRep* value)
{
   U_TRACE(5, "UPosting::findCurrentDocIdOnPosting(%.*S)", U_STRING_TO_TRACE(*value))

   posting->_assign(value);

#ifdef U_COMPRESS_ENTRY
   (void)decompress();
#endif

   ptr = (char*) u_find(U_STRING_TO_PARAM(*posting), ptr_cur_doc_id, sizeof(cur_doc_id));

   // check for collision...

   if (ptr &&
       checkEntry(ptr, U_STRING_TO_PARAM(*posting)))
      {
      U_RETURN(true);
      }

   U_RETURN(false);
}

// DELETE WORD

inline bool UPosting::isOneEntry()
{
   U_TRACE(5, "UPosting::isOneEntry()")

   if (posting->size() == (sizeof(uint32_t) + POSTING_SIZE(ptr))) U_RETURN(true);

   U_RETURN(false);
}

U_NO_EXPORT void UPosting::del()
{
   U_TRACE(5, "UPosting::del()")

   U_INTERNAL_ASSERT(*word)
   U_INTERNAL_ASSERT(*posting)
   U_INTERNAL_ASSERT_POINTER(ptr)

   data = posting->data();

   U_INTERNAL_DUMP("word_freq       = %u",   POSTING32(ptr,word_freq))
   U_INTERNAL_DUMP("size_entry      = %u",   POSTING_SIZE(ptr))
   U_INTERNAL_DUMP("cur_doc_id      = %llu", cur_doc_id)
   U_INTERNAL_DUMP("posting->size() = %u",   posting->size())
   U_INTERNAL_DUMP("off_last_doc_id = %u",   POSTING_OFFSET_LAST_DOC_ID)

   U_INTERNAL_ASSERT_MAJOR(POSTING32(ptr,word_freq), 0)

   // check if other reference...

   int result;

   if (isOneEntry())
      {
      result = ((URDB*)cdb_words)->remove(*word);

      if (result != 0)
         {
         U_ERROR("del(): error<%d> on operation remove word<%.*s> from index database", result, U_STRING_TO_TRACE(*word));
         }

      return;
      }

#if defined(DEBUG) || defined(U_TEST)
   uint32_t last_offset = POSTING_OFFSET_LAST_DOC_ID;
#endif

   char* end = posting->rep->end();

   do {
      U_INTERNAL_DUMP("doc_id          = %lu", POSTING64(ptr,doc_id))

      size_entry = POSTING_SIZE(ptr);

      if (POSTING64(ptr,doc_id) == cur_doc_id) break;

      ptr += size_entry;
      }
   while (ptr < end);

   if (ptr >= end)
      {
      U_ERROR("del(): cannot find DocID<%llu> reference for word<%.*s> on index database", cur_doc_id, U_STRING_TO_TRACE(*word));
      }

   (void) posting->replace(ptr - data, size_entry, 0, '\0');

   // reassign value

   data = posting->data();
   ptr  = data + sizeof(uint32_t);
   end  = posting->rep->end();

   U_INTERNAL_DUMP("posting->size() = %u", posting->size())

   U_INTERNAL_ASSERT_EQUALS(POSTING_OFFSET_LAST_DOC_ID, last_offset)

   do {
      size_entry = POSTING_SIZE(ptr);

      ptr += size_entry;
      }
   while (ptr < end);

   U_INTERNAL_ASSERT_EQUALS(ptr - data, (ptrdiff_t)posting->size())

   u_put_unalignedp32(data, (ptr - data) - size_entry);

   U_INTERNAL_DUMP("off_last_doc_id = %u", POSTING_OFFSET_LAST_DOC_ID)

   U_INTERNAL_ASSERT_MINOR(POSTING_OFFSET_LAST_DOC_ID, last_offset)

   result = writePosting(RDB_REPLACE);

   if (result != 0)
      {
      U_ERROR("del(): error<%d> on operation replace reference for word<%.*s> on index database", result, U_STRING_TO_TRACE(*word));
      }
}

// ADD WORD

inline void UPosting::init()
{
   U_TRACE(5, "UPosting::init()")

   u_put_unalignedp64(&(((u_posting*)(ptr))->doc_id), cur_doc_id);
   u_put_unalignedp32(&(((u_posting*)(ptr))->word_freq), 0);
}

inline void UPosting::checkCapacity()
{
   U_TRACE(5, "UPosting::checkCapacity()")

   space = off_last_doc_id + size_entry + sizeof(off_last_doc_id);

   U_INTERNAL_DUMP("space = %u", space)

   if (space >= posting->capacity())
      {
      if (tbl_words) tbl_words_space -= posting->capacity();

   // if (posting->uniq() == false) posting->duplicate();

      posting->reserve(space * 2);

      if (tbl_words)
         {
         tbl_words_space += posting->capacity();

         tbl_words->replaceAfterFind(*posting);
         }
      }

   data = posting->data();
   ptr  = data + off_last_doc_id;
}

U_NO_EXPORT void UPosting::add()
{
   U_TRACE(5, "UPosting::add()")

   U_INTERNAL_ASSERT(*word)

   bool present = (tbl_words ?          tbl_words->find(*word)
                             : ((URDB*)cdb_words)->find(*word));

   if (present == false)
      {
      // first document for this word...

      *posting = UString(U_CAPACITY);
      data     = posting->data();
      ptr      = data + sizeof(off_last_doc_id);

      // setting off_last_doc_id for first document...

      size_entry      = sizeof(u_posting);
      off_last_doc_id = sizeof(off_last_doc_id);

      u_put_unalignedp32(data, sizeof(off_last_doc_id));

      init();

      if (tbl_words)
         {
         word->duplicate(); // NB: need duplicate string because depends on mmap()'s content of document...

         tbl_words_space += word->size() + posting->capacity();

         tbl_words->insertAfterFind(*word, *posting);
         }
      }
   else
      {
      if (tbl_words) posting->_assign(tbl_words->elem());
      else           readPosting(0, false);

   // U_ASSERT_EQUALS(0,find(posting->data() + sizeof(uint32_t), posting->size() - sizeof(uint32_t)))

      data            = posting->data();
      off_last_doc_id = POSTING_OFFSET_LAST_DOC_ID;
      ptr             = data + off_last_doc_id;
      size_entry      = POSTING_SIZE(ptr);

      U_INTERNAL_DUMP("word_freq       = %u", POSTING32(ptr,word_freq))
      U_INTERNAL_DUMP("size_entry      = %u", size_entry)
      U_INTERNAL_DUMP("off_last_doc_id = %u", off_last_doc_id)

      U_INTERNAL_ASSERT_MAJOR(POSTING32(ptr,word_freq), 0)

      U_ASSERT((off_last_doc_id + size_entry) <= posting->capacity())

      // check for new document...

      if (POSTING64(ptr,doc_id) != cur_doc_id)
         {
         off_last_doc_id += size_entry;
               size_entry = sizeof(u_posting);

         u_put_unalignedp32(data, off_last_doc_id);

         // check string capacity...

         checkCapacity();

         init();
         }
      }

   U_INTERNAL_ASSERT_EQUALS(cur_doc_id,      POSTING64(ptr,doc_id))
   U_INTERNAL_ASSERT_EQUALS(off_last_doc_id, POSTING_OFFSET_LAST_DOC_ID)

   // check string capacity...

   checkCapacity();

   U_INTERNAL_ASSERT_EQUALS(cur_doc_id,      POSTING64(ptr,doc_id))
   U_INTERNAL_ASSERT_EQUALS(off_last_doc_id, POSTING_OFFSET_LAST_DOC_ID)

   // save position of word in document...

   word_freq = POSTING32(ptr,word_freq);

   u_put_unalignedp32(ptr+sizeof(u_posting)+(word_freq*sizeof(uint32_t)), pos);

   u_put_unalignedp32(&(((u_posting*)(ptr))->word_freq), word_freq+1);

   U_INTERNAL_DUMP("pos       = %u", pos)
   U_INTERNAL_DUMP("pos[0]    = %u", POSTING_POS(0))
   U_INTERNAL_DUMP("pos[1]    = %u", POSTING_POS(1))
   U_INTERNAL_DUMP("word_freq = %u", POSTING32(ptr,word_freq))

   posting->size_adjust_force(space);

   if (tbl_words == 0)
      {
      int result = writePosting(present ? RDB_REPLACE : RDB_INSERT);

      if (result != 0)
         {
         U_ERROR("add(): error<%d> on operation store reference for word<%.*s> on index database", result, U_STRING_TO_TRACE(*word));
         }
      }
}

// CHECK POSTING AND WORD

U_NO_EXPORT void UPosting::checkPosting()
{
   U_TRACE(5, "UPosting::checkPosting()")

   U_INTERNAL_ASSERT_POINTER(ptr)

   U_INTERNAL_DUMP("cur_doc_id      = %llu", cur_doc_id)

   U_INTERNAL_ASSERT_EQUALS(cur_doc_id, POSTING64(ptr,doc_id))

   word_freq = POSTING32(ptr,word_freq);

   U_INTERNAL_DUMP("word_freq       = %u", word_freq)
   U_INTERNAL_DUMP("size_entry      = %u", POSTING_SIZE(ptr))
   U_INTERNAL_DUMP("offset          = %u", posting->distance(ptr))
   U_INTERNAL_DUMP("posting->size() = %u", posting->size())

#if defined(U_OPTIMIZE) && defined(DEBUG)
   if (cdb_words->ignoreCase() == false)
      {
      U_INTERNAL_ASSERT_DIFFERS(u_find(content->data(), content->size(), word->data(), word->size()),0)
      }
#endif

   uint32_t end = content->size(),
             sz =    word->size();

   uint32_t* vpos = (uint32_t*)(ptr+sizeof(u_posting));

   char* ptr_word = word->data();
   char* ptr_data = content->data();

   for (uint32_t i = 0; i < word_freq; ++i)
      {
      uint32_t start = u_get_unalignedp32(vpos+i);

      U_INTERNAL_DUMP("start           = %u", start)

      if (start >= end ||
          u_equal(ptr_word, ptr_data+start, sz, cdb_words->ignoreCase()))
         {
         U_ERROR("checkPosting(): word<%v> reference at position<%u> for DocID<%llu %v> lost", word->rep, start, cur_doc_id, filename->rep);
         }
      }
}

// PROCESS WORD

void UPosting::processWord(int32_t op)
{
   U_TRACE(5, "UPosting::processWord(%d)", op)

   U_INTERNAL_ASSERT_POINTER(content)

   if (word->size() < min_word_size) return;

   pos = pos_start + (word->data() - content->data());

   U_INTERNAL_DUMP("word = %V pos = %u", word->rep, pos)

   if (op == 0 ||
       op == 1)
      {
      add(); // add/sub
      }
   else
      {
      readPosting(word->rep, (op == 2)); // del

      if (posting->empty() ||
          setPosting(false) == false)
         {
         U_ERROR("processWord(%d): word<%v> reference at position<%u> for DocID<%llu %v> lost", op, word->rep, pos, cur_doc_id, filename->rep);
         }

      if      (op == 2) del();          // del
      else if (op == 3) checkPosting(); // check
      }
}

// CHECK DOCUMENT

U_NO_EXPORT int UPosting::checkDocument(UStringRep* word_rep, UStringRep* value)
{
   U_TRACE(5, "UPosting::checkDocument(%.*S,%p)", U_STRING_TO_TRACE(*word_rep), value)

   U_INTERNAL_ASSERT(*content)

   if (findCurrentDocIdOnPosting(value))
      {
      word->_assign(word_rep);

      U_INTERNAL_DUMP("cdb_words->ignoreCase() = %b", cdb_words->ignoreCase())

#  ifndef U_OPTIMIZE
      if (cdb_words->ignoreCase() == false &&
          u_find(content->data(), content->size(), word->data(), word->size()) == 0)
         {
         U_RETURN(1);
         }
#  endif

      checkPosting();
      }

   U_RETURN(1);
}

U_NO_EXPORT int UPosting::checkAllEntry(UStringRep* word_rep, UStringRep* value)
{
   U_TRACE(5, "UPosting::checkAllEntry(%.*S,%p)", U_STRING_TO_TRACE(*word_rep), value)

   word->_assign(word_rep);

   callForPostingAndSetFilename(0);

   (void) write(1, U_CONSTANT_TO_PARAM(".")); // CHECK_2

   U_RETURN(1);
}

void UPosting::checkAllEntry()
{
   U_TRACE(5, "UPosting::checkAllEntry()")

   cdb_words->callForAllEntryWithPattern(checkAllEntry, 0);
}

// SUBSTITUTE DOCUMENT

U_NO_EXPORT int UPosting::substitute(UStringRep* word_rep, UStringRep* value)
{
   U_TRACE(5, "UPosting::substitute(%.*S,%p)", U_STRING_TO_TRACE(*word_rep), value)

   posting->_assign(value);

#ifdef U_COMPRESS_ENTRY
   (void) decompress();
#endif

   word->_assign(word_rep);

   if (setPosting(false)) del();

   U_RETURN(1);
}

// PROCESS DOCUMENT

inline void UPosting::setDocID(bool from_inode)
{
   U_TRACE(5, "UPosting::setDocID(%b)", from_inode)

   if (from_inode) cur_doc_id = - file->inode();

   U_INTERNAL_DUMP("cur_doc_id = %llu", cur_doc_id)

   str_cur_doc_id->setFromInode(&cur_doc_id);

   ptr_cur_doc_id = str_cur_doc_id->data();
}

void UPosting::setDocID(int32_t op)
{
   U_TRACE(5, "UPosting::setDocID(%d)", op)

   U_DUMP("filename = %.*S", U_STRING_TO_TRACE(*filename))

   if (dir_content_as_doc)
      {
      if (op != 0) // add
         {
         U_ERROR("setDocID(%d): sorry, not implemented", op);
         }

      if (change_dir == false) return;

      change_dir = false;

      *filename = UStringExt::dirname(*filename);

      filename->duplicate(); // NB: need duplicate string because depends on volatile buffer of filename...

      U_INTERNAL_DUMP("dirname = %V", filename->rep)
      }

   // insert/fetch/remove into table of docs name

   setDocID(true);

   if (tbl_name)
      {
      tbl_name->insert(*str_cur_doc_id, *filename);

      tbl_name_space += str_cur_doc_id->size() + filename->size();
      }
   else
      {
      int result = 0;

      if (op == 0) result = ((URDB*)cdb_names)->store( *str_cur_doc_id, *filename, RDB_INSERT); // add
      if (op == 2) result = ((URDB*)cdb_names)->remove(*str_cur_doc_id);                        // del

      if (result != 0)
         {
         U_ERROR("setDocID(%d): error<%d> for operation<%d> on DocID database", op, result, op);
         }

      if (op == 1) // sub
         {
         if (((URDB*)cdb_names)->find(*str_cur_doc_id) == false)
            {
            U_ERROR("setDocID(%d): cannot find DocID<%llu> on names database", op, cur_doc_id);
            }

         // SUBSTITUTE: we find all reference for this document in the words database and erase it...

         U_INTERNAL_ASSERT_POINTER(cdb_words)

         ((URDB*)cdb_words)->callForAllEntryWithPattern(substitute, 0);
         }
      else if (op == 3) // check
         {
         // CHECK: we find all reference for this document in the words database and check it...

         U_INTERNAL_ASSERT_POINTER(cdb_words)

         cdb_words->callForAllEntryWithPattern(checkDocument, 0);
         }
      }
}

// PROCESSING MISC (QUERY)

void UPosting::callForPosting(vPF function)
{
   U_TRACE(5, "UPosting::callForPosting(%p)", function)

   U_INTERNAL_ASSERT(*posting)
   U_INTERNAL_ASSERT_POINTER(function)

   data      = posting->data();
   ptr       = data + sizeof(uint32_t);
   char* end = posting->rep->end();

   do {
      word_freq  = POSTING32(ptr,word_freq);
      size_entry = POSTING_SIZE(ptr);
      cur_doc_id = POSTING64(ptr,doc_id);

      U_INTERNAL_DUMP("word_freq  = %u",   word_freq)
      U_INTERNAL_DUMP("size_entry = %u",   size_entry)
      U_INTERNAL_DUMP("cur_doc_id = %llu", cur_doc_id)

      function();

      ptr += size_entry;
      }
   while (ptr < end);

   U_INTERNAL_ASSERT_EQUALS(ptr - data, (ptrdiff_t)posting->size())
}

U_NO_EXPORT void UPosting::setFilename()
{
   U_TRACE(5, "UPosting::setFilename()")

   U_INTERNAL_ASSERT(*str_cur_doc_id)
   U_INTERNAL_ASSERT_POINTER(cdb_names)

   *filename = (*cdb_names)[str_cur_doc_id->rep];

   if (filename->empty())
      {
      U_ERROR("setFilename(): cannot find document name from DocID<%llu> reference for word<%v> on index database", cur_doc_id, word->rep);
      }
}

U_NO_EXPORT void UPosting::callForPostingAndSetFilename()
{
   U_TRACE(5, "UPosting::callForPostingAndSetFilename()")

   str_cur_doc_id->setFromInode(&cur_doc_id);

   setFilename();

   if (pfunction) pfunction();
}

void UPosting::callForPostingAndSetFilename(vPF function)
{
   U_TRACE(5, "UPosting::callForPostingAndSetFilename(%p)", function)

   pfunction = function;

   callForPosting(callForPostingAndSetFilename);
}

U_NO_EXPORT bool UPosting::setVectorCompositeWord()
{
   U_TRACE(5, "UPosting::setVectorCompositeWord()")

   U_INTERNAL_ASSERT(*word)
   U_INTERNAL_ASSERT_POINTER(cdb_words)
   U_INTERNAL_ASSERT_EQUALS(vec_sub_word,0)
   U_INTERNAL_ASSERT_EQUALS(vec_sub_word_posting,0)

   sub_word             = U_NEW(UString);
   vec_sub_word         = U_NEW(UVector<UString>(*word));
   vec_sub_word_posting = U_NEW(UVector<UString>);
   vec_sub_word_size    = vec_sub_word->size();

   for (uint32_t i = 0; i < vec_sub_word_size; ++i)
      {
      *sub_word = (*vec_sub_word)[i];

      U_INTERNAL_DUMP("sub_word = %V", sub_word->rep)

      if (sub_word->size() < min_word_size)
         {
         vec_sub_word_posting->push(UString::getStringNull());

         continue;
         }

      readPosting(sub_word->rep, false);

      if (posting->empty())
         {
         resetVectorCompositeWord();

         U_RETURN(false);
         }

      vec_sub_word_posting->push(*posting);
      }

   U_RETURN(true);
}

inline bool UPosting::setSubWord(uint32_t i)
{
   U_TRACE(5, "UPosting::setSubWord(%u)", i)

   U_INTERNAL_ASSERT_POINTER(sub_word)
   U_INTERNAL_ASSERT_POINTER(vec_sub_word)

   *sub_word      = (*vec_sub_word)[i];
    sub_word_size = sub_word->size();

   U_INTERNAL_DUMP("sub_word = %V", sub_word->rep)

   if (sub_word_size < min_word_size)
      {
      sub_word_pos_prev += 1 + sub_word_size;

      U_RETURN(false);
      }

   U_INTERNAL_DUMP("sub_word_pos_prev = %u", sub_word_pos_prev)

   *posting = (*vec_sub_word_posting)[i];

   U_INTERNAL_ASSERT(*posting)

   U_RETURN(true);
}

inline UString UPosting::extractDocID()
{
   U_TRACE(5, "UPosting::extractDocID()")

   /**
    * +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
    * | WORD | offset last DOC id | DOC id | frequency | pos 1 | ... | pos n | ... | DOC id | frequency | pos 1 | ... | pos n |
    * +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
    */

   UStringRep* r;
   UString s(size_entry);

   char* sdata = s.data();
   char* sptr  = sdata;

   for (uint32_t i = 0, n = vec_entry->size(); i < n; ++i)
      {
      r = vec_entry->UVector<UStringRep*>::at(i);

#  ifdef U_COMPRESS_ENTRY
      posting->_assign(r);

      if (decompress()) r = posting->rep;
#  endif

      data      = r->data();
      ptr       = data + sizeof(uint32_t);
      char* end = r->end();

      do {
         size_entry = POSTING_SIZE(ptr);
         cur_doc_id = POSTING64(ptr,doc_id);

         U_INTERNAL_DUMP("word_freq  = %u",   POSTING32(ptr,word_freq))
         U_INTERNAL_DUMP("size_entry = %u",   size_entry)
         U_INTERNAL_DUMP("cur_doc_id = %llu", cur_doc_id)

         U_MEMCPY((void*)sptr, &cur_doc_id, sizeof(cur_doc_id));

          ptr += size_entry;
         sptr += sizeof(uint64_t);
         }
      while (ptr < end);

      U_INTERNAL_ASSERT_EQUALS(ptr - data, (ptrdiff_t)r->size())
      }

   s.size_adjust_force(sptr - sdata);

   U_RETURN_STRING(s);
}

/**
 * ------------------------------------------------
 * findDocID() is be called from:
 * ------------------------------------------------
 * UQueryParser::evaluate() (loop for all doc name)
 * ------------------------------------------------
 */

bool UPosting::findDocID(UStringRep* word_rep)
{
   U_TRACE(5, "UPosting::findDocID(%V)", word_rep)

   U_INTERNAL_ASSERT(*str_cur_doc_id)
   U_INTERNAL_ASSERT_POINTER(cdb_words)

   // manage meta expr, composite word, etc. for all doc name... (save info with property)

   uint32_t i;

   word->_assign(word_rep);

   if (vec_word) i = vec_word->find(*word);
   else
      {
      // allocation property...

      i           = U_NOT_FOUND;
      vec_word    = U_NEW(UVector<UString>(32));
      vec_entry   = U_NEW(UVector<UString>(approximate_num_words));
      vec_posting = U_NEW(UVector<UString>(32));
      }

   // check if exist property for this word...

   if (i == U_NOT_FOUND)
      {
      // set property for this word...

      i = vec_word->size();

      vec_word->push(*word);

      if (word_rep->isQuoted('"'))
         {
         property[i].is_meta   = false;
         property[i].is_quoted = true;
         }
      else if (word->find_first_of("?*", 0, 2) != U_NOT_FOUND)
         {
         property[i].is_meta   = true;
         property[i].is_quoted = false;
         }
      else
         {
         property[i].is_meta = property[i].is_quoted = false;
         }
      }

   // find context with property...

   if (property[i].is_quoted)
      {
      // context composite word...

      static UStringRep* last_word_rep;

      if (word_rep != last_word_rep)
         {
         last_word_rep = word_rep;

         if (vec_sub_word) resetVectorCompositeWord();
         }

      if (callForCompositeWord(0)) U_RETURN(true);

      U_RETURN(false);
      }

   if (property[i].is_meta)
      {
      // context meta word...

      UString entry;

      if (i < vec_posting->size()) entry = (*vec_posting)[i];
      else
         {
         vec_entry->clear();

         if (cdb_words->getValuesWithKeyNask(*vec_entry, *word, &size_entry)) entry = extractDocID();

         vec_posting->push(entry);
         }

      if (entry &&
          u_find(entry.data(), entry.size(), ptr_cur_doc_id, sizeof(cur_doc_id)))
         {
         U_RETURN(true);
         }

      U_RETURN(false);
      }

   // context single word...

   if (setPosting(true)) U_RETURN(true);

   U_RETURN(false);
}

void UPosting::callForPosting(vPF function, bool is_space)
{
   U_TRACE(5, "UPosting::callForPosting(%p,%b)", function, is_space)

   U_INTERNAL_ASSERT(*word)
   U_INTERNAL_ASSERT_POINTER(cdb_words)

   if (is_space) // composite word
      {
      // check for NEAR operation (string not quoted)...

      bool is_quoted = word->isQuoted();

      U_INTERNAL_DUMP("is_quoted = %b", is_quoted)

      if (is_quoted == false) max_distance = ((uint32_t)-1);

      (void) callForCompositeWord(function);
      }
   else // single word
      {
      readPosting(word->rep, false);

      if (*posting) callForPostingAndSetFilename(function);
      }
}

/**
 * ------------------------------------------------
 * CONTEXT COMPOSITE WORD...
 * ------------------------------------------------
 * checkCompositeWord() may be called from:
 * ------------------------------------------------
 * UQueryParser::evaluate() (loop for all doc name)
 * callForPosting()         (for composite word)
 * ------------------------------------------------
 */

U_NO_EXPORT bool UPosting::callForCompositeWord(vPF function)
{
   U_TRACE(5, "UPosting::callForCompositeWord(%p)", function)

   U_INTERNAL_ASSERT_POINTER(cdb_names)

   if (vec_sub_word == 0 &&
       setVectorCompositeWord() == false)
      {
      U_RETURN(false);
      }

   // find the first sub-word util

   uint32_t first_subword_index;

   for (first_subword_index = 0; first_subword_index < vec_sub_word_size; ++first_subword_index)
      {
      if (setSubWord(first_subword_index) == false) continue;

      break;
      }

   U_INTERNAL_DUMP("FIRST subword util  = %.*S", U_STRING_TO_TRACE(*sub_word))
   U_INTERNAL_DUMP("first_subword_index = %u", first_subword_index)

   U_INTERNAL_ASSERT(*posting)

#if defined(DEBUG) || defined(U_TEST)
   uint32_t posting_size = posting->size();
#endif

   bool match;
   char* ptr1;
   char* end1 = 0;
   int32_t i, first_subword_freq;
   uint32_t j, k, sz = sub_word_size;

   // loop for all posting entry of the first sub-word util
   // ---------------------------------------------------------------------------------------------------------------
   // immatricolazione (-261845,1,1148)(-261811,1,2024)(-261819,4,2037,2624,4075,4567)(-261832,4,2037,2624,4479,4971)
   // ---------------------------------------------------------------------------------------------------------------

   if (function == 0)
      {
      if (setPosting(false) == false) U_RETURN(false);

      ptr1 = ptr;

      goto start;
      }

   data = posting->data();
   ptr1 = data + sizeof(uint32_t);
   end1 = posting->rep->end();

   do {
      ptr        = ptr1;
      cur_doc_id = POSTING64(ptr,doc_id);
      size_entry = POSTING_SIZE(ptr);

      U_INTERNAL_DUMP("cur_doc_id          = %llu", cur_doc_id)
      U_INTERNAL_DUMP("size_entry          = %u",   size_entry)

      setDocID(false);

start:
      first_subword_freq = POSTING32(ptr,word_freq);

      U_INTERNAL_DUMP("first_subword_freq  = %u", first_subword_freq)

      U_INTERNAL_ASSERT_MAJOR(first_subword_freq, 0)

      i     = -1;
      match = false;

      // loop for all the position of first sub-word util for this document...

loop1:
      while (++i < first_subword_freq)
         {
         ptr               = ptr1;
         pos               = POSTING_POS(i);
         sub_word_pos_prev = pos + sz;

         U_INTERNAL_DUMP("pos[%3u]           = %u", i, pos)
         U_INTERNAL_DUMP("sub_word_pos_prev  = %u", sub_word_pos_prev)

         // check if the other sub-word match in the current doc...

         for (j = first_subword_index + 1; j < vec_sub_word_size; ++j)
            {
            if (setSubWord(j) == false) continue;

            if (setPosting(false) == false) goto loop1;

            U_INTERNAL_DUMP("subword            = %.*S", U_STRING_TO_TRACE(*sub_word))

            U_INTERNAL_ASSERT_EQUALS(cur_doc_id, POSTING64(ptr,doc_id))

            word_freq = POSTING32(ptr,word_freq);

            U_INTERNAL_DUMP("word_freq          = %u", word_freq)

            U_INTERNAL_ASSERT_MAJOR(word_freq, 0)

            // check for all position for this sub-word in the current doc...

            for (k = 0; k < word_freq; ++k)
               {
               pos      = POSTING_POS(k);
               distance = pos - sub_word_pos_prev;

               U_INTERNAL_DUMP("pos[%3u]           = %u", k, pos)
               U_INTERNAL_DUMP("sub_word_pos_prev  = %u", sub_word_pos_prev)
               U_INTERNAL_DUMP("distance           = %u", distance)
               U_INTERNAL_DUMP("max_distance       = %u", max_distance)

               if (distance <= max_distance) break;
               }

            if (k == word_freq) goto loop1; // NOT match, go to another position for the first sub-word util...

            sub_word_pos_prev = pos + sub_word_size;
            }

         match = true;
         }

      U_INTERNAL_ASSERT_EQUALS(i, first_subword_freq)

      if (function == 0) U_RETURN(match);

      if (match)
         {
         setFilename();

         word_freq = (max_distance == ((uint32_t)-1) // check for NEAR...
                              ? 0
                              : max_distance - distance);

         function();
         }

      ptr1 += size_entry;
      }
   while (ptr1 < end1);

   U_INTERNAL_ASSERT_EQUALS(ptr1 - data, (ptrdiff_t)posting_size)

   U_RETURN(true);
}

// PRINT DATABASE

static ostream* os;
static UString* buffer;

#  define SIZE_ENTRY ( 2 + \
                      10 + \
                      10 + \
                      10 * word_freq)

U_NO_EXPORT void UPosting::printPosting()
{
   U_TRACE(5, "UPosting::printPosting()")

   static UStringRep* last = word->rep;

start:
   if (last == word->rep)
      {
      if (buffer->empty())
         {
#     ifdef U_STDCPP_ENABLE
         *os << "-------------------------------------------------------------------------------------------\n";

         last->write(*os);
#     endif
         }

      uint32_t* vpos = (uint32_t*)(ptr+sizeof(u_posting));

      if (buffer->space() < SIZE_ENTRY) buffer->reserve(buffer->capacity() * 2);

                                               buffer->snprintf_add("(%llX,%u,%u", cur_doc_id, word_freq, u_get_unalignedp32(vpos));
      for (uint32_t i = 1; i < word_freq; ++i) buffer->snprintf_add(",%u", u_get_unalignedp32(vpos+i));
                                        (void) buffer->append(U_CONSTANT_TO_PARAM(")"));
      }
   else
      {
#  ifdef U_STDCPP_ENABLE
      *os << ' ' << *buffer << '\n';
#  endif

      last = word->rep;

      buffer->setEmpty();

      goto start;
      }
}

U_NO_EXPORT int UPosting::print(UStringRep* word_rep, UStringRep* value)
{
   U_TRACE(5, "UPosting::print(%.*S,%p)", U_STRING_TO_TRACE(*word_rep), value)

   posting->_assign(value);

#ifdef U_COMPRESS_ENTRY
   (void) decompress();
#endif

   word->_assign(word_rep);

   callForPosting(printPosting);

   U_RETURN(1);
}

U_NO_EXPORT int UPosting::printDocName(UStringRep* doc_id, UStringRep* doc_name)
{
   U_TRACE(5, "UPosting::printDocName(%.*S,%.*S)", U_STRING_TO_TRACE(*doc_id), U_STRING_TO_TRACE(*doc_name))

#ifdef U_STDCPP_ENABLE
   char _buffer[20];

   os->write(_buffer, u__snprintf(_buffer, sizeof(_buffer), "%llX ", *((uint64_t*)(doc_id->data()))));

   doc_name->write(*os);

   os->put('\n');
#endif

   U_RETURN(1);
}

void UPosting::printDB(ostream& s)
{
   U_TRACE(5, "UPosting::printDB(%p)", &s)

#ifdef U_STDCPP_ENABLE
   os = &s;

   buffer = U_NEW(UString(U_CAPACITY));

   if (tbl_words) tbl_words->callForAllEntry((bPFprpv)print);
   else
      {
      U_INTERNAL_ASSERT_POINTER(cdb_names)
      U_INTERNAL_ASSERT_POINTER(cdb_words)

      cdb_names->callForAllEntryWithPattern(printDocName, 0);
      cdb_words->callForAllEntryWithPattern(print, 0);
      }

   *os << ' ' << *buffer
       << "\n-------------------------------------------------------------------------------------------\n";

   os->flush();

   delete buffer;
#endif
}

// DEBUG

#if defined(DEBUG) && defined(U_STDCPP_ENABLE)
const char* UPosting::dump(bool _reset) const
{
   *UObjectIO::os << "pos                                     " << pos                          << '\n'
                  << "ptr                                     " << (void*)ptr                   << '\n'
                  << "data                                    " << (void*)data                  << '\n'
                  << "space                                   " << space                        << '\n'
                  << "distance                                " << distance                     << '\n'
                  << "pfunction                               " << (void*)pfunction             << '\n'
                  << "pos_start                               " << pos_start                    << '\n'
                  << "size_entry                              " << size_entry                   << '\n'
                  << "word_freq                               " << word_freq                    << '\n'
                  << "cur_doc_id                              " << cur_doc_id                   << '\n'
                  << "max_distance                            " << max_distance                 << '\n'
                  << "min_word_size                           " << min_word_size                << '\n'
                  << "sub_word_size                           " << sub_word_size                << '\n'
                  << "tbl_name_space                          " << tbl_name_space               << '\n'
                  << "ptr_cur_doc_id                          " << (void*)ptr_cur_doc_id        << '\n'
                  << "tbl_words_space                         " << tbl_words_space              << '\n'
                  << "off_last_doc_id                         " << off_last_doc_id              << '\n'
                  << "vec_sub_word_size                       " << vec_sub_word_size            << '\n'
                  << "sub_word_pos_prev                       " << sub_word_pos_prev            << '\n'
                  << "file                 (UFile             " << (void*)file                  << ")\n"
                  << "word                 (UString           " << (void*)word                  << ")\n"
                  << "content              (UString           " << (void*)content               << ")\n"
                  << "posting              (UString           " << (void*)posting               << ")\n"
                  << "filename             (UString           " << (void*)filename              << ")\n"
                  << "sub_word             (UString           " << (void*)sub_word              << ")\n"
                  << "tbl_name             (UHashMap<UString> " << (void*)tbl_name              << ")\n"
                  << "tbl_words            (UHashMap<UString> " << (void*)tbl_words             << ")\n"
                  << "vec_sub_word         (UVector<UString>  " << (void*)vec_sub_word          << ")\n"
                  << "str_cur_doc_id       (UString           " << (void*)str_cur_doc_id        << ")\n"
                  << "vec_sub_word_posting (UVector<UString>  " << (void*)vec_sub_word_posting  << ')';

   if (_reset)
      {
      UObjectIO::output();

      return UObjectIO::buffer_output;
      }

   return 0;
}
#endif