1
0
mirror of https://github.com/stefanocasazza/ULib.git synced 2025-09-28 19:05:55 +08:00
ULib/examples/IR/posting.h
stefanocasazza 9ac5de60a3 bug fixing
2015-07-13 17:44:12 +02:00

142 lines
4.8 KiB
C++

// posting.h - class support for inverted index (data structure)
#ifndef POSTING_H
#define POSTING_H 1
#include <ulib/string.h>
class UCDB;
class UFile;
template <class T> class UVector;
template <class T> class UHashMap;
/**
* +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
* | WORD | offset last DOC id | DOC id | frequency | pos 1 | ... | pos n | ... | DOC id | frequency | pos 1 | ... | pos n |
* +------+--------------------+--------+-----------+-------+-----+-------+-----+--------+-----------+-------+-----+-------+
*/
class U_EXPORT UPosting {
public:
// uint32_t off_last_doc_id;
typedef struct u_posting {
uint64_t doc_id;
uint32_t word_freq;
// -----> array of unsigned with word_freq elements
} u_posting;
// Check Memory
U_MEMORY_TEST
// Allocator e Deallocator
U_MEMORY_ALLOCATOR
U_MEMORY_DEALLOCATOR
static UFile* file;
static UString* word;
static UString* content;
static UString* posting;
static UString* filename;
static UString* str_cur_doc_id;
static UHashMap<UString>* tbl_name;
static UHashMap<UString>* tbl_words;
static bool ignore_case, dir_content_as_doc, change_dir;
static uint32_t word_freq, tbl_name_space, tbl_words_space, min_word_size, max_distance, pos_start;
// COSTRUTTORE
UPosting(uint32_t dimension, bool parsing, bool index);
~UPosting();
// SERVICES
static void reset();
static void setDocID(int32_t op);
static void setDocID(UStringRep* str_inode)
{
U_TRACE(5, "UPosting::setDocID(%#.*S)", U_STRING_TO_TRACE(*str_inode))
(void) str_cur_doc_id->_assign(str_inode);
ptr_cur_doc_id = str_cur_doc_id->data();
cur_doc_id = u_get_unalignedp64(str_inode->data());
U_INTERNAL_DUMP("cur_doc_id = %llu", cur_doc_id)
U_INTERNAL_ASSERT_DIFFERS(cur_doc_id, 0)
}
static void processWord(int32_t op);
// Call function for all/one entry
static void printDB(ostream& os);
static bool findDocID(UStringRep* word_rep);
static void checkAllEntry();
static void callForPosting(vPF function);
static void callForPostingAndSetFilename(vPF function);
static void callForPosting(vPF function, bool is_space);
// DEBUG
#ifdef DEBUG
const char* dump(bool reset) const;
#endif
protected:
static char* ptr;
static char* data;
static vPF pfunction;
static UString* sub_word;
static uint64_t cur_doc_id;
static char* ptr_cur_doc_id;
static UVector<UString>* vec_word;
static UVector<UString>* vec_entry;
static UVector<UString>* vec_posting;
static UVector<UString>* vec_sub_word;
static UVector<UString>* vec_sub_word_posting;
static uint32_t pos, size_entry, space, off_last_doc_id, distance, vec_sub_word_size, sub_word_size, sub_word_pos_prev, approximate_num_words;
private:
static inline void init() U_NO_EXPORT;
static inline bool decompress() U_NO_EXPORT;
static inline bool isOneEntry() U_NO_EXPORT;
static inline UString extractDocID() U_NO_EXPORT;
static inline bool setSubWord(uint32_t i) U_NO_EXPORT;
static inline void setDocID(bool from_inode) U_NO_EXPORT;
static inline bool checkEntry(char* str, char* s, uint32_t n) U_NO_EXPORT;
static char* find(char* s, uint32_t n, bool boptmize) U_NO_EXPORT __pure;
static void add() U_NO_EXPORT; // op 0
static void del() U_NO_EXPORT; // op 2
static void checkWord() U_NO_EXPORT;
static void setFilename() U_NO_EXPORT;
static void printPosting() U_NO_EXPORT;
static void checkPosting() U_NO_EXPORT;
static void checkCapacity() U_NO_EXPORT;
static int writePosting(int flag) U_NO_EXPORT;
static bool setPosting(bool bcache) U_NO_EXPORT;
static bool setVectorCompositeWord() U_NO_EXPORT;
static void resetVectorCompositeWord() U_NO_EXPORT;
static void callForPostingAndSetFilename() U_NO_EXPORT;
static bool callForCompositeWord(vPF function) U_NO_EXPORT;
static void readPosting(UStringRep* word_rep, bool flag) U_NO_EXPORT;
static bool findCurrentDocIdOnPosting(UStringRep* value) U_NO_EXPORT;
static int print(UStringRep* word_rep, UStringRep* value) U_NO_EXPORT;
static int substitute(UStringRep* word_rep, UStringRep* value) U_NO_EXPORT;
static int checkAllEntry(UStringRep* word_rep, UStringRep* value) U_NO_EXPORT;
static int checkDocument(UStringRep* word_rep, UStringRep* value) U_NO_EXPORT;
static int printDocName(UStringRep* doc_id, UStringRep* doc_name) U_NO_EXPORT;
// Forbidden operations
UPosting(const UPosting&) {}
UPosting& operator=(const UPosting&) { return *this; }
};
#endif