1
0
mirror of https://github.com/stefanocasazza/ULib.git synced 2025-09-28 19:05:55 +08:00
ULib/tests/examples/IR/benchmark/run.sh
2015-01-23 17:24:36 +01:00

313 lines
6.4 KiB
Bash
Executable File

#!/bin/bash
#UTRACE="0 45M 0"
#UOBJDUMP="0 100k 10"
#USIMERR="error.sim"
#EXEC_ON_EXIT="/utility/stack_dump.sh"
export UTRACE UOBJDUMP USIMERR EXEC_ON_EXIT
LOG=log/benchmark.log
TIMEFORMAT=$'*** Real %3lR\tUser %3lU\tSys %3lS\t%P%% CPU'
#DIR="."
DIR="../.."
#DIR_CMD="$DIR"
DIR_CMD="$DIR/../../examples/IR"
#. $DIR/.colors
. $DIR/../.colors
mkdir -p out/ err/ log/
#export CMD=valgrind
#export LD_LIBRARY_PATH=.
print_cmd() {
# log
echo "$1" >> $LOG
# terminal
$WARNING
echo $1
$NORMAL
}
exec_cmd() {
if [ -z "$CMD" ]; then
( time $DIR_CMD/"$1" "$2" "$3" "$4" "$5" >out/$1.out 2>err/$1.err ) 2>tmp || \
{ $FAILURE; echo "*** ERROR ON $1 ***"; $NORMAL; exit 1; }
else
( $CMD $DIR_CMD/"$1" "$2" "$3" "$4" "$5" >out/$1.out 2>err/$1.err ) 2>tmp || \
{ $FAILURE; echo "*** ERROR ON $1 ***"; $NORMAL; exit 1; }
fi
# log
cat tmp >> $LOG
echo >> $LOG
# terminal
cat tmp
echo
rm -rf tmp
}
# 1 -> 20 directory, 1 directory = 1 cartella, 1 cartella = 20 files
document_generation() {
print_cmd "DOCUMENT_GENERATION:"
rm -rf $DIR/IR/doc
mkdir -p $DIR/IR/doc
exec_cmd gendoc $1 $DIR/IR/doc/
}
indexing_mem() {
# UTRACE="0 80M 0"
print_cmd "INDEXING(1):"
rm -rf $DIR/IR/db
mkdir -p $DIR/IR/db
exec_cmd index1
# unset UTRACE
}
indexing_dsk() {
# UTRACE="1 80M 0"
print_cmd "INDEXING(2):"
rm -rf $DIR/IR/db
mkdir -p $DIR/IR/db
exec_cmd index2
# unset UTRACE
}
updating() {
# UTRACE="0 10M 0"
DEL_FILE="D01/D01_studente_1.xml,D01/D01_studente_1_op_1.xml"
ADD_FILE="$DEL_FILE,D01/D01_studente_1_op_2?.xml"
SUB_FILE="D01/D01_studente_1_op_2?.xml"
print_cmd "UPDATING del:"
if [ -f $DIR/IR/doc/D01/D01_studente_1_op_20.xml ]
then
exec_cmd update -d "$ADD_FILE"
else
exec_cmd update -d "$DEL_FILE"
fi
cp $DIR/inp/D01/D01_studente_1.xml \
$DIR/inp/D01/D01_studente_1_op_1.xml \
$DIR/inp/D01/D01_studente_1_op_20.xml $DIR/IR/doc/D01/ >/dev/null 2>/dev/null
print_cmd "UPDATING add + substitute:"
exec_cmd update -a "$ADD_FILE" -s "$SUB_FILE"
# unset UTRACE
}
db_check() {
# UTRACE="0 10M 0"
print_cmd "DATABASES CHECK:"
exec_cmd db_check
# unset UTRACE
}
querying() {
# export UTRACE="0 10M 0"
print_cmd "QUERY: '$1'"
exec_cmd query "$1"
diff -q out/query.out query.exp >/dev/null 2>/dev/null || \
{ $FAILURE; echo '*** ERROR ON QUERY ***'; $NORMAL; exit 1; }
# unset UTRACE
}
profiling() {
gprof -b $DIR_CMD/$1 gmon.out >profile.out 2>/dev/null
}
sizing_doc() {
print_cmd "DOCUMENT size:"
sync
du -sh $DIR/IR/doc | tee -a $LOG
echo >> $LOG
echo
}
sizing_db() {
print_cmd "DATABASE size:"
sync
du -sh $DIR/IR/db/* | tee -a $LOG
echo >> $LOG
echo
}
print_start() {
# log
DATE=`date '+%X %D'`
echo "($DATE) NUMBER OF DOCUMENTS = $NUM" >> $LOG
echo "=====================================================================" >> $LOG
echo >> $LOG
# terminal
$NORMAL
$ECHO "($DATE) NUMBER OF DOCUMENTS = "
$SUCCESS
$ECHO $NUM
$NORMAL
echo
echo "====================================================================="
echo
}
print_end() {
# log
echo "=====================================================================" >> $LOG
echo >> $LOG
# terminal
$NORMAL
echo "====================================================================="
echo
}
chain() {
# ----------------------------------------------------------------------------------------------------------
# index.cfg - configuration data for program of Imformation Retrieval
# ----------------------------------------------------------------------------------------------------------
# DB location for index db (must be terminated by /)
# DIRECTORY location of docs to index
# DIMENSION approximate number of docs to index
# MIN_WORD_SIZE sets the mininum length of words that will be indexed
# IGNORE_CASE case sensitive or not
# SKIP_TAG_XML skip index of tag xml for files with suffix indicated
# BAD_WORDS template words to not index for files with suffix indicated in BAD_WORDS_EXT
# BAD_WORDS_EXT extension file for BAD_WORDS
# FILTER_EXT preprocessing for files with suffix indicated
# FILTER_CMD preprocessing command for files with suffix indicated in FILTER_EXT
# ----------------------------------------------------------------------------------------------------------
NUM=`expr $1 \* 400`
cat << EOF > index.cfg
INDEX_CFG
{
DB $DIR/IR/db/ # must be terminated by /
DIRECTORY $DIR/IR/doc
DIMENSION $NUM
MIN_WORD_SIZE 3
IGNORE_CASE yes
SKIP_TAG_XML "[ xml ]"
BAD_WORDS ??/??/????|??:??:??|workflow
# BAD_WORDS_EXT "[ xml ]"
# FILTER_EXT "[ pdf doc html ]"
# FILTER_CMD "[ \"pdftotext -raw -nopgbrk -q $FILE -\" \"catdoc -aw $FILE\" \"htuml2txt\"
}
EOF
print_start
document_generation $1
sizing_doc
# indexing_mem
indexing_dsk
sizing_db
updating
querying 'D01_studente_1_op_20'
querying 'D01_studente_1_op_2?'
querying 'Ritiro certificato20'
querying '"Ritiro certificato20"'
querying '(D01_studente_1_op_20 AND Ritiro AND certificato20)'
querying '(D01_studente_1_op_20 AND "Ritiro certificato20")'
querying '(D01_studente_1_op_20 AND Ritiro AND certificato20) AND NOT 01_rossi_2'
querying '(D01_STUDENTE_1_OP_20 and RITIRO and Certificato20) and NOT 01_Rossi_2'
querying '(*Studente*20 AND Certificato??) AND NOT ??_Rossi*2'
# querying '<id>D01_studente_1_op_20</id>'
# querying '<id>D01_studente_1_op_2?</id>'
# querying '<description>Ritiro certificato</description>'
# querying '"<description>Ritiro certificato</description>"'
# querying '(<id>D01_studente_1_op_20</id> AND <description>Ritiro AND certificato</description>)'
# querying '(<id>D01_studente_1_op_20</id> AND "<description>Ritiro certificato</description>")'
# querying '(<id>D01_studente_1_op_20</id> AND <description>Ritiro AND certificato</description>) AND NOT 01_rossi_2'
# profiling query
db_check
print_end
}
if [ -f $LOG ]
then
mv $LOG $LOG.pre
fi
rm -rf out/* err/* gmon.out profile.out \
trace.*.[0-9]* \
object.*.[0-9]* \
stack_dump.*.[0-9]*
#chain 1 # 400
chain 2 # 800
#chain 20 # 8000
#chain 50 # 20000
#chain 100 # 40000
#chain 150 # 60000
#chain 200 # 80000
#chain 250 # 100000
#chain 300 # 120000
#chain 600 # 240000
#chain 800 # 320000
#chain 1000 # 400000