1
0
mirror of https://github.com/stefanocasazza/ULib.git synced 2025-09-28 19:05:55 +08:00
ULib/src/ulib/tokenizer.cpp
2015-05-28 19:04:25 +02:00

644 lines
14 KiB
C++

// ============================================================================
//
// = LIBRARY
// ULib - c++ library
//
// = FILENAME
// tokenizer.cpp
//
// = AUTHOR
// Stefano Casazza
//
// ============================================================================
#include <ulib/tokenizer.h>
#include <ulib/utility/services.h>
#include <ulib/container/vector.h>
// Definitions of the class-wide (static) tokenizer settings.
// They are consulted by UTokenizer::next(UString&, bool*).

// When set, text enclosed by the group markers is stepped over instead of being returned as a token
bool UTokenizer::group_skip;
// When set, leading and trailing punctuation characters are trimmed from each token before it is returned
bool UTokenizer::avoid_punctuation;
// Length handed to u_strpend() together with `group` when looking for the end of a group
uint32_t UTokenizer::group_len;
// Number of bytes of `group` compared against the input to recognise the start of a group
// (presumably group_len / 2, as the name suggests - TODO confirm where it is assigned)
uint32_t UTokenizer::group_len_div_2;
// Group marker characters; a null pointer disables group handling
const char* UTokenizer::group;
// Bind the tokenizer to a new input string: keep a reference to it in `str`
// and reset the scan window [s, end) to cover the whole of its data.
void UTokenizer::setData(const UString& data)
{
   U_TRACE(0, "UTokenizer::setData(%V)", data.rep)

   str = data;

   s   = data.data();
   end = s + data.size();
}
// Extract the next token, using `func` as the delimiter predicate: characters for
// which func() returns true separate tokens, every other character belongs to one.
//
// token - receives the token as a substring of the tokenized string
// func  - predicate called on each scanned character
//
// Returns true when a token was stored, false when only delimiters (or nothing) remained.
bool UTokenizer::next(UString& token, bPFi func)
{
   U_TRACE(0, "UTokenizer::next(%p,%p)", &token, func)

   // step over the run of delimiter characters in front of the token

   while (s < end && func(*s)) ++s;

   if (s >= end)
      {
      U_RETURN(false);
      }

   const char* start = s;

   // consume characters up to the next delimiter or the end of the data

   while (s < end && func(*s) == false) ++s;

   token = str.substr(start, s - start);

   ++s; // resume after the delimiter that closed the token

   U_RETURN(true);
}
// Extract the next token delimited by the single character `c`.
//
// token - receives the token as a substring of the tokenized string
// c     - delimiter character; consecutive delimiters never produce empty tokens
//
// Returns true when a token was stored, false when only delimiters (or nothing) remained.
bool UTokenizer::next(UString& token, char c)
{
   U_TRACE(0, "UTokenizer::next(%p,%C)", &token, c)

   // step over the run of delimiters in front of the token

   while (s < end && *s == c) ++s;

   if (s >= end)
      {
      U_RETURN(false);
      }

   const char* start = s;
   const char* stop  = (const char*) memchr(start, c, end - start);

   if (stop == 0) stop = end; // no further delimiter: the token runs to the end of the data

   token = str.substr(start, stop - start);

   s = stop + 1; // resume after the delimiter

   U_RETURN(true);
}
// Extend the current token up to the next occurrence of the character `c`
// (see PEC_report.cpp).
//
// token - on entry a token previously obtained from this tokenizer; on success it is
//         replaced by the substring that starts where the old token started and ends
//         at the next delimiter found from the current scan position
// c     - delimiter character
//
// Returns true when the token was extended, false when only delimiters (or nothing) remained.
bool UTokenizer::extend(UString& token, char c)
{
   U_TRACE(0, "UTokenizer::extend(%p,%C)", &token, c)

   // step over the run of delimiters at the current position

   while (s < end && *s == c) ++s;

   if (s >= end)
      {
      U_RETURN(false);
      }

   const char* start = token.data(); // keep the beginning of the existing token
   const char* stop  = (const char*) memchr(s, c, end - s);

   if (stop == 0) stop = end; // no further delimiter: extend to the end of the data

   token = str.substr(start, stop - start);

   s = stop + 1; // resume after the delimiter

   U_RETURN(true);
}
// Extract the next token, honouring the instance delimiter set (`delim`) and the
// class-wide settings `group`, `group_skip` and `avoid_punctuation`.
//
// token  - receives the token as a substring of the tokenized string
// bgroup - optional; when non-null it is cleared on entry and set to true only when
//          the returned token is the text enclosed by the group markers
//
// Returns true when a token was stored, false when the input is exhausted.
bool UTokenizer::next(UString& token, bool* bgroup)
{
U_TRACE(0, "UTokenizer::next(%p,%p)", &token, bgroup)
const char* p = s;
// shift: how far s is advanced beyond the end of the token just before returning
uint32_t shift = 1, n;
if (bgroup) *bgroup = false;
while (s < end)
{
// re-entry point used by the punctuation filter below when a token is trimmed away entirely
// NOTE(review): jumping here bypasses the `s < end` test of the while condition - confirm this is safe
loop:
// explicit delimiter set: the token boundaries come straight from u_delimit_token(),
// and a null start pointer is treated as "no more tokens"
if (delim)
{
s = u_delimit_token(s, &p, end, delim, 0);
if (p) goto tok;
U_RETURN(false);
}
// presumably skips leading white space - TODO confirm against u_skip()
s = u_skip(s, end, 0, 0);
if (s == end) break;
if (group)
{
// the input at s begins with the first group_len_div_2 bytes of `group`
// NOTE(review): this memcmp is not bounded by `end` - confirm that at least
// group_len_div_2 bytes are always readable at s
if (memcmp(s, group, group_len_div_2) == 0)
{
// p ends up (after the ++p below) on the first byte following the matched marker;
// u_strpend() presumably returns the position of the matching closing marker - TODO confirm
p = s + group_len_div_2 - 1;
s = u_strpend(p, end - p, group, group_len, '\0');
++p;
// no terminator found: the group extends to the end of the data
if (s == 0) s = end;
U_INTERNAL_DUMP("p = %.*S s = %.*S", s - p, p, end - s, s)
// groups are being skipped: advance by group_len_div_2 and look for the next token
if (group_skip)
{
s += group_len_div_2;
continue;
}
// return the enclosed text [p, s) as the token and advance by group_len_div_2 afterwards
if (bgroup) *bgroup = true;
shift = group_len_div_2;
goto tok;
}
else if (group_skip)
{
// -------------------------------------------------------------------
// examples:
// -------------------------------------------------------------------
// <date>03/11/2005 10:17:46</date>
// <description>description_556adfbc-0107-5000-ede4-d208</description>
// -------------------------------------------------------------------
s = u_delimit_token(s, &p, end, 0, 0);
if (s < end)
{
// if the group marker begins inside the token just delimited, cut the token there and
// leave s on the marker (shift = 0) so that the next call sees it
// NOTE(review): only the first occurrence of group[0] is examined, and the memcmp
// at x is not bounded by s or `end`
const char* x = (char*) memchr(p, group[0], s - p);
if (x && (memcmp(x, group, group_len_div_2) == 0))
{
s = x;
shift = 0;
}
}
goto tok;
}
}
// default case: delimit the token with the default delimiters (null delimiter argument)
s = u_delimit_token(s, &p, end, 0, 0);
tok:
n = s - p;
if (avoid_punctuation)
{
// drop leading punctuation; if nothing is left, go and look for another token
while (u__ispunct(*p))
{
--n;
++p;
if (p == s) goto loop;
}
// drop trailing punctuation
while (u__ispunct(p[n-1]))
{
--n;
if (n == 0) goto loop;
}
}
token = str.substr(p, n);
s += shift;
U_RETURN(true);
}
U_RETURN(false);
}
// Check whether the input, after skipping spaces, starts with the string *x and,
// if so, consume it.
//
// x - string to look for; must not be null (it is dereferenced unconditionally)
//
// Returns true and advances the scan position past the match when the remaining
// input begins with *x; returns false and leaves the position just after the
// skipped spaces otherwise.
bool UTokenizer::tokenSeen(const UString* x)
{
   U_TRACE(0, "UTokenizer::tokenSeen(%V)", x->rep)

   U_INTERNAL_DUMP("s = %.*S", end - s, s)

   skipSpaces();

   if (s < end)
      {
      uint32_t sz = x->size();

      // compare only when the whole of *x fits in what is left of the input,
      // otherwise memcmp() would read beyond `end`

      if ((size_t)(end - s) >= sz &&
          memcmp(s, x->data(), sz) == 0)
         {
         s += sz;

         U_RETURN(true);
         }
      }

   U_RETURN(false);
}
// Consume the literal `token` if the input at the current scan position starts with it.
//
// token - bytes to match (not required to be null terminated)
// sz    - number of bytes of `token` to match
//
// Returns true and advances the scan position by sz on a match, false (position
// unchanged) otherwise.
bool UTokenizer::skipToken(const char* token, uint32_t sz)
{
   U_TRACE(0, "UTokenizer::skipToken(%.*S,%u)", sz, token, sz)

   // The comparison is bounded by the tokenizer's own `end` pointer, as in the other
   // scanning methods, so that memcmp() can never read beyond the data and the result
   // does not depend on where in the string the scan position currently is.
   // `s <= end` also rejects a position that an earlier call left beyond the end.

   if (s <= end                  &&
       (size_t)(end - s) >= sz   &&
       memcmp(s, token, sz) == 0)
      {
      s += sz;

      U_RETURN(true);
      }

   U_RETURN(false);
}
// Advance the scan position over the characters of a numeric literal.
//
// isReal - output: reset to false on entry, set to true as soon as a character accepted
//          by u__isreal() or an exponent marker ('e' / 'E') is stepped over
//
// Returns true when scanning stopped on a character that is not part of the number
// (the scan position is left on that character), false when the end of the input
// was reached first.
bool UTokenizer::skipNumber(bool& isReal)
{
   U_TRACE(0, "UTokenizer::skipNumber(%p)", &isReal)

   isReal = false;

   while (s < end)
      {
      const char ch = *s;

      if (u__isnumber(ch) == false)
         {
         // not a plain number character: it must be a real-number character or an exponent marker

         if (!u__isreal(ch) &&
             u__toupper(ch) != 'E')
            {
            U_RETURN(true);
            }

         isReal = true;
         }

      ++s;
      }

   U_RETURN(false);
}
// Return the next token of a query expression.
//
// After skipping spaces, the token is either
//  - a double-quoted phrase: everything from the opening '"' up to and including the
//    closing '"' (or up to the end of the input when the quote is not closed), or
//  - the first character followed by every character up to the next space, '(' or ')';
//    a lone '(' or ')' is therefore returned as a one-character token.
//
// Returns an empty string when no input is left.
UString UTokenizer::getTokenQueryParser()
{
   U_TRACE(0, "UTokenizer::getTokenQueryParser()")

   skipSpaces();

   // nothing left to read: do not dereference or step beyond the end of the data

   if (s >= end)
      {
      UString empty;

      U_RETURN_STRING(empty);
      }

   const char* p = s++;

   if (*p == '"')
      {
      // quoted phrase: the closing quote is consumed and is part of the token

      while (s < end && *s++ != '"') {}
      }
   else
      {
      while (s < end && // u__isname(*s)
             (u__isspace(*s) == false &&
              *s != '('               &&
              *s != ')'))
         {
         ++s;
         }
      }

   UString token = str.substr(p, s - p);

   U_RETURN_STRING(token);
}
/**
 * Scan the next token of an expression, store its text in `token` and return its
 * U_TK_* identifier. When the input is exhausted the token is empty and 0 is returned.
 *
 * Expression is tokenized as:
 *
 * precedence:                          ( )
 * logical:                             && || !
 * compare:                             = == != < <= > >=
 * Additive operators:                  +, -
 * Multiplicative operators:            *, /, %
 * separator:                           ,
 * unquoted strings:                    string
 * quoted strings:                      'string with a dollar: $FOO' (the quotes are not part of the token)
 * numbers:                             a run of decimal digits
 * booleans:                            true, false ("false" is returned as an empty value token)
 * variable substitution:               $REMOTE_ADDR ${REMOTE_ADDR} $$ (pid)
 * function call with optional params:  FN_CALL([p1,p2,...,pn])
 *
 * contains:    *=
 * ends_with:   $=
 * starts_with: ^=
 * is_present:  ~=
 *
 * A lone '&', '|', '^' or '~' yields U_TK_ERROR. A '/' is the division operator only
 * when it is followed by a digit or by white space, otherwise it starts an unquoted string.
 *
 * NOTE(review): the one-character lookaheads (`*s` after the first character of an
 * operator) are not checked against `end`.
 */
int UTokenizer::getTokenId(UString& token)
{
   U_TRACE(0, "UTokenizer::getTokenId(%p)", &token)

   // Jump table indexed by (c - '!') for the characters '!' ... '~'. Each entry is the
   // distance of the handler label from the `cvalue` label (GCC "labels as values"
   // extension); 0 therefore selects `cvalue` itself, the generic unquoted-string scanner.

   static const int dispatch_table[] = {
      (int)((char*)&&case_exclamation-(char*)&&cvalue),/* '!' */
      0,/* '"' */
      0,/* '#' */
      (int)((char*)&&case_dollar-(char*)&&cvalue),/* '$' */
      (int)((char*)&&case_percent-(char*)&&cvalue),/* '%' */
      (int)((char*)&&case_ampersand-(char*)&&cvalue),/* '&' */
      (int)((char*)&&case_quote-(char*)&&cvalue),/* '\'' */
      (int)((char*)&&case_opening_parenthesis-(char*)&&cvalue),/* '(' */
      (int)((char*)&&case_closing_parenthesis-(char*)&&cvalue),/* ')' */
      (int)((char*)&&case_asterisk-(char*)&&cvalue),/* '*' */
      (int)((char*)&&case_plus-(char*)&&cvalue),/* '+' */
      (int)((char*)&&case_comma-(char*)&&cvalue),/* ',' */
      (int)((char*)&&case_minus-(char*)&&cvalue),/* '-' */
      0,/* '.' */
      (int)((char*)&&case_slash-(char*)&&cvalue),/* '/' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '0' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '1' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '2' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '3' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '4' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '5' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '6' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '7' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '8' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '9' */
      0,/* ':' */
      0,/* ';' */
      (int)((char*)&&case_less-(char*)&&cvalue),/* '<' */
      (int)((char*)&&case_equal-(char*)&&cvalue),/* '=' */
      (int)((char*)&&case_major-(char*)&&cvalue),/* '>' */
      0,/* '?' */
      0,/* '@' */
      0,/* 'A' */
      0,/* 'B' */
      0,/* 'C' */
      0,/* 'D' */
      0,/* 'E' */
      0,/* 'F' */
      0,/* 'G' */
      0,/* 'H' */
      0,/* 'I' */
      0,/* 'J' */
      0,/* 'K' */
      0,/* 'L' */
      0,/* 'M' */
      0,/* 'N' */
      0,/* 'O' */
      0,/* 'P' */
      0,/* 'Q' */
      0,/* 'R' */
      0,/* 'S' */
      0,/* 'T' */
      0,/* 'U' */
      0,/* 'V' */
      0,/* 'W' */
      0,/* 'X' */
      0,/* 'Y' */
      0,/* 'Z' */
      0,/* '[' */
      0,/* '\' */
      0,/* ']' */
      (int)((char*)&&case_xor-(char*)&&cvalue),/* '^' */
      0,/* '_' */
      0,/* '`' */
      0,/* 'a' */
      0,/* 'b' */
      0,/* 'c' */
      0,/* 'd' */
      0,/* 'e' */
      (int)((char*)&&case_bool-(char*)&&cvalue),/* 'f' */
      0,/* 'g' */
      0,/* 'h' */
      0,/* 'i' */
      0,/* 'j' */
      0,/* 'k' */
      0,/* 'l' */
      0,/* 'm' */
      0,/* 'n' */
      0,/* 'o' */
      0,/* 'p' */
      0,/* 'q' */
      0,/* 'r' */
      0,/* 's' */
      (int)((char*)&&case_bool-(char*)&&cvalue),/* 't' */
      0,/* 'u' */
      0,/* 'v' */
      0,/* 'w' */
      0,/* 'x' */
      0,/* 'y' */
      0,/* 'z' */
      0,/* '{' */
      (int)((char*)&&case_pipe-(char*)&&cvalue),/* '|' */
      0,/* '}' */
      (int)((char*)&&case_tilde-(char*)&&cvalue)/* '~' */
   };

   char c;
   int tid = 0;
   const char* p1; // first byte of the token text
   const char* p2; // one past the last byte of the token text

   U_INTERNAL_DUMP("s = %.*S", 20, s)

loop:
   p1 = p2 = s;

   if (s >= end) goto end;

   c = *s++;

   if (u__isspace(c)) goto loop;

   // The table only covers '!' ... '~'. Any other byte (control characters, DEL, bytes
   // with the high bit set) would index outside of it and feed a garbage offset to the
   // computed goto, so it is routed to the generic scanner like every zero table entry.

   if (c < '!' || c > '~') goto cvalue;

   U_INTERNAL_DUMP("dispatch_table[%d] = %p &&cvalue = %p", c-'!', dispatch_table[c-'!'], &&cvalue)

   goto *((char*)&&cvalue + dispatch_table[c-'!']);

case_exclamation: tid = (*s == '=' ? (++s, U_TK_NE) : U_TK_NOT); p2 = s; goto end; /* '!' */

case_dollar: /* '$' */
   if (*s == '=')
      {
      p2  = ++s;
      tid = U_TK_ENDS_WITH;
      }
   else if (*s == '$')
      {
      p2  = ++s;
      tid = U_TK_PID;
      }
   else
      {
      tid = U_TK_NAME;

      if (*s == '{')
         {
         // ${name}: the token is the text between the braces; the closing brace is consumed

         p1 = ++s;

         while (s < end && *s != '}') ++s;

         p2 = s++;
         }
      else
         {
         // $name: the token is the run of name characters after the '$'

         p1 = s;

         while (s < end && u__isname(*s)) ++s;

         p2 = s;
         }
      }

   goto end;

case_percent: tid = U_TK_MOD; p2 = s; goto end; /* '%' */

case_ampersand: tid = (*s == '&' ? (++s, U_TK_AND) : U_TK_ERROR); p2 = s; goto end; /* '&' */

case_quote: /* '\'' */
   // the token is the text between the quotes; the closing quote is consumed

   tid = U_TK_VALUE;

   p1 = s;

   while (s < end && *s != '\'') ++s;

   p2 = s++;

   goto end;

case_opening_parenthesis: tid = U_TK_LPAREN; p2 = s; goto end; /* '(' */
case_closing_parenthesis: tid = U_TK_RPAREN; p2 = s; goto end; /* ')' */

case_asterisk: tid = (*s == '=' ? (++s, U_TK_CONTAINS) : U_TK_MULT); p2 = s; goto end; /* '*' */

case_plus:  tid = U_TK_PLUS;  p2 = s; goto end; /* '+' */
case_comma: tid = U_TK_COMMA; p2 = s; goto end; /* ',' */
case_minus: tid = U_TK_MINUS; p2 = s; goto end; /* '-' */

case_slash: /* '/' */
   // division operator only when followed by a digit or white space, otherwise the
   // slash is the first character of an unquoted string (e.g. a path)

   c = *s;

   if (u__isdigit(c) ||
       u__isspace(c))
      {
      p2  = s;
      tid = U_TK_DIV;

      goto end;
      }

   goto cvalue;

case_digit: /* '0' ... '9' */
   tid = U_TK_VALUE;

   while (s < end && u__isdigit(*s)) ++s;

   p2 = s;

   goto end;

   // foo  = "bar" - an element whose "foo" attribute is equal to "bar"
   // foo ~= "bar" - an element whose "foo" attribute value is a space-separated list of values, one of which is equal to "bar"
   // foo ^= "bar" - an element whose "foo" attribute value begins with "bar"
   // foo $= "bar" - an element whose "foo" attribute value ends with "bar"
   // foo *= "bar" - an element whose "foo" attribute value contains the substring "bar"

case_less:  tid = (*s == '=' ? (++s, U_TK_LE) : U_TK_LT); p2 = s; goto end; /* '<' */
case_equal: tid = (*s == '=' ? (++s, U_TK_EQ) : U_TK_EQ); p2 = s; goto end; /* '=' ('=' and '==' are both U_TK_EQ) */
case_major: tid = (*s == '=' ? (++s, U_TK_GE) : U_TK_GT); p2 = s; goto end; /* '>' */

case_xor: tid = (*s == '=' ? (++s, U_TK_STARTS_WITH) : U_TK_ERROR); p2 = s; goto end; /* '^' */

case_bool: /* 'f' 't' */
   if (c == 't' ? skipToken(U_CONSTANT_TO_PARAM("rue"))
                : skipToken(U_CONSTANT_TO_PARAM("alse")))
      {
      tid = U_TK_VALUE;

      // only "true" gets its text: for "false" p2 stays equal to p1, so the token is empty

      if (c == 't') p2 = s;

      goto end;
      }

   goto cvalue;

case_pipe: tid = (*s == '|' ? (++s, U_TK_OR) : U_TK_ERROR); p2 = s; goto end; /* '|' */

case_tilde: tid = (*s == '=' ? (++s, U_TK_IS_PRESENT) : U_TK_ERROR); p2 = s; goto end; /* '~' */

cvalue:
   // unquoted string: runs up to '(', ')', ',' or the first non-graphic character

   while (s < end)
      {
      c = *s;

      if (c == '(' ||
          c == ')' ||
          c == ',' ||
          u__isgraph(c) == false)
         {
         break;
         }

      ++s;
      }

   p2 = s;

   // a name immediately followed by '(' is a function call (c holds the last character examined)

   tid = (c == '(' ? U_TK_FN_CALL : U_TK_VALUE);

end:
   token = str.substr(p1, p2 - p1);

   U_INTERNAL_DUMP("token = %V", token.rep)

   U_RETURN(tid);
}
#if defined(U_STDCPP_ENABLE) && defined(DEBUG)
// Debug helper: write the tokenizer state (scan pointers, group and delimiter
// strings, group_skip flag, address of the string member) to UObjectIO::os.
//
// reset - when true the collected output is finalized with UObjectIO::output() and
//         UObjectIO::buffer_output is returned; when false a null pointer is returned
const char* UTokenizer::dump(bool reset) const
{
   char buffer[32]; // scratch area for the "%S" rendering of group and delim

   *UObjectIO::os << "s " << (void*)s << '\n';
   *UObjectIO::os << "end " << (void*)end << '\n';
   *UObjectIO::os << "group ";

   UObjectIO::os->write(buffer, u__snprintf(buffer, sizeof(buffer), "%S", group));

   *UObjectIO::os << '\n';
   *UObjectIO::os << "delim ";

   UObjectIO::os->write(buffer, u__snprintf(buffer, sizeof(buffer), "%S", delim));

   *UObjectIO::os << '\n';
   *UObjectIO::os << "group_skip " << group_skip << '\n';
   *UObjectIO::os << "str (UString " << (void*)&str << ')';

   if (reset == false) return 0;

   UObjectIO::output();

   return UObjectIO::buffer_output;
}
#endif