ULib/src/ulib/tokenizer.cpp

// ============================================================================
//
// = LIBRARY
//    ULib - c++ library
//
// = FILENAME
//    tokenizer.cpp
//
// = AUTHOR
//    Stefano Casazza
//
// ============================================================================

#include <ulib/tokenizer.h>
#include <ulib/utility/services.h>
#include <ulib/container/vector.h>

bool        UTokenizer::group_skip;
bool        UTokenizer::avoid_punctuation;
uint32_t    UTokenizer::group_len;
uint32_t    UTokenizer::group_len_div_2;
const char* UTokenizer::group;

void UTokenizer::setData(const UString& data)
{
   U_TRACE(0, "UTokenizer::setData(%V)", data.rep)

   str = data;
   end = (s = data.data()) + data.size();
}

bool UTokenizer::next(UString& token, bPFi func)
{
   U_TRACE(0, "UTokenizer::next(%p,%p)", &token, func)

   const char* p;

   while (s < end)
      {
      // skip char with function

      if (func(*s))
         {
         ++s;

         continue;
         }

      p = s;

      while (s < end &&
             func(*s) == false)
         {
         ++s;
         }

      token = str.substr(p, s - p);

      ++s;

      U_RETURN(true);
      }

   U_RETURN(false);
}

bool UTokenizer::next(UString& token, char c)
{
   U_TRACE(0, "UTokenizer::next(%p,%C)", &token, c)

   const char* p;

   while (s < end)
      {
      // skip char delimiter

      if (*s == c)
         {
         ++s;

         continue;
         }

      // delimit token with char delimiter

      p = s;
      s = (const char*) memchr(s, c, end - s);

      if (s == 0) s = end;

      token = str.substr(p, s - p);

      ++s;

      U_RETURN(true);
      }

   U_RETURN(false);
}

// extend the actual token to the next char 'c'... (see PEC_report.cpp)

bool UTokenizer::extend(UString& token, char c)
{
   U_TRACE(0, "UTokenizer::extend(%p,%C)", &token, c)

   const char* p;

   while (s < end)
      {
      // skip char delimiter

      if (*s == c)
         {
         ++s;

         continue;
         }

      // delimit token with char delimiter

      p = token.data();
      s = (const char*) memchr(s, c, end - s);

      if (s == 0) s = end;

      token = str.substr(p, s - p);

      ++s;

      U_RETURN(true);
      }

   U_RETURN(false);
}

bool UTokenizer::next(UString& token, bool* bgroup)
{
   U_TRACE(0, "UTokenizer::next(%p,%p)", &token, bgroup)

   const char* p  = s;
   uint32_t shift = 1, n;

   if (bgroup) *bgroup = false;

   while (s < end)
      {
loop:
      if (delim)
         {
         s = u_delimit_token(s, &p, end, delim, 0);

         if (p) goto tok;

         U_RETURN(false);
         }

      s = u_skip(s, end, 0, 0);

      if (s == end) break;

      if (group)
         {
         if (memcmp(s, group, group_len_div_2) == 0)
            {
            p = s + group_len_div_2 - 1;
            s = u_strpend(p, end - p, group, group_len, '\0');

            ++p;

            if (s == 0) s = end;

            U_INTERNAL_DUMP("p = %.*S s = %.*S", s - p, p, end - s, s)

            if (group_skip)
               {
               s += group_len_div_2;

               continue;
               }

            if (bgroup) *bgroup = true;

            shift = group_len_div_2;

            goto tok;
            }
         else if (group_skip)
            {
            // -------------------------------------------------------------------
            // examples:
            // -------------------------------------------------------------------
            // <date>03/11/2005 10:17:46</date>
            // <description>description_556adfbc-0107-5000-ede4-d208</description>
            // -------------------------------------------------------------------

            s = u_delimit_token(s, &p, end, 0, 0);

            if (s < end)
               {
               const char* x = (char*) memchr(p, group[0], s - p);

               if (x && (memcmp(x, group, group_len_div_2) == 0))
                  {
                  s     = x;
                  shift = 0;
                  }
               }

            goto tok;
            }
         }

      s = u_delimit_token(s, &p, end, 0, 0);

tok:
      n = s - p;

      if (avoid_punctuation)
         {
         while (u__ispunct(*p))
            {
            --n;
            ++p;

            if (p == s) goto loop;
            }

         while (u__ispunct(p[n-1]))
            {
            --n;

            if (n == 0) goto loop;
            }
         }

      token = str.substr(p, n);

      s += shift;

      U_RETURN(true);
      }

   U_RETURN(false);
}

bool UTokenizer::tokenSeen(const UString* x)
{
   U_TRACE(0, "UTokenizer::tokenSeen(%V)", x->rep)

   U_INTERNAL_DUMP("s = %.*S", end - s, s)

   skipSpaces();

   if (s < end)
      {
      uint32_t sz = x->size();

      if (memcmp(s, x->data(), sz) == 0)
         {
         s += sz;

         U_RETURN(true);
         }
      }

   U_RETURN(false);
}

bool UTokenizer::skipToken(const char* token, uint32_t sz)
{
   U_TRACE(0, "UTokenizer::skipToken(%.*S,%u)", sz, token, sz)

   if (str.distance(s) >= sz &&
       memcmp(s, token, sz) == 0)
      {
      s += sz;

      U_RETURN(true);
      }

   U_RETURN(false);
}

bool UTokenizer::skipNumber(bool& isReal)
{
   U_TRACE(0, "UTokenizer::skipNumber(%p)", &isReal)

   isReal = false;

   for (char c; s < end; ++s)
      {
      c = *s;

      if (u__isnumber(c)) continue;

      if (u__isreal(c) ||
          u__toupper(c) == 'E')
         {
         isReal = true;

         continue;
         }

      U_RETURN(true);
      }

   U_RETURN(false);
}

UString UTokenizer::getTokenQueryParser()
{
   U_TRACE(0, "UTokenizer::getTokenQueryParser()")

   skipSpaces();

   const char* p = s++;

   if (*p == '"')
      {
      while (s < end && *s++ != '"') {}
      }
   else
      {
      while (s < end &&  // u__isname(*s)
             (u__isspace(*s) == false &&
              *s != '('               &&
              *s != ')'))
         {
         ++s;
         }
      }

   UString token = str.substr(p, s - p);

   U_RETURN_STRING(token);
}

/**
 * Expression is tokenized as:
 *
 * precedence: ( )
 * logical: && || !
 * compare: = == != < <= > =>
 * Additive operators: +, -
 * Multiplicative operators: *, /, %
 * unquoted strings: string
 * quoted strings: 'string with a dollar: $FOO'
 * variable substitution: $REMOTE_ADDR ${REMOTE_ADDR} $$(pid)
 * function call with optional params: FN_CALL([p1,p2,...,pn])
 *
 * contains:    ^
 * ends_with:   =~
 * starts_with: ~=
 */

int UTokenizer::getTokenId(UString& token)
{
   U_TRACE(0, "UTokenizer::getTokenId(%p)", &token)

   static const int dispatch_table[] = {
      (int)((char*)&&case_exclamation-(char*)&&cvalue),/* '!' */
      0,/* '"' */
      0,/* '#' */
      (int)((char*)&&case_dollar-(char*)&&cvalue),/* '$' */
      (int)((char*)&&case_percent-(char*)&&cvalue),/* '%' */
      (int)((char*)&&case_ampersand-(char*)&&cvalue),/* '&' */
      (int)((char*)&&case_quote-(char*)&&cvalue),/* '\'' */
      (int)((char*)&&case_opening_parenthesis-(char*)&&cvalue),/* '(' */
      (int)((char*)&&case_closing_parenthesis-(char*)&&cvalue),/* ')' */
      (int)((char*)&&case_asterisk-(char*)&&cvalue),/* '*' */
      (int)((char*)&&case_plus-(char*)&&cvalue),/* '+' */
      (int)((char*)&&case_comma-(char*)&&cvalue),/* ',' */
      (int)((char*)&&case_minus-(char*)&&cvalue),/* '-' */
      0,/* '.' */
      (int)((char*)&&case_slash-(char*)&&cvalue),/* '/' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '0' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '1' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '2' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '3' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '4' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '5' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '6' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '7' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '8' */
      (int)((char*)&&case_digit-(char*)&&cvalue),/* '9' */
      0,/* ':' */
      0,/* ';' */
      (int)((char*)&&case_less-(char*)&&cvalue),/* '<' */
      (int)((char*)&&case_equal-(char*)&&cvalue),/* '=' */
      (int)((char*)&&case_major-(char*)&&cvalue),/* '>' */
      0,/* '?' */
      0,/* '@' */
      0,/* 'A' */
      0,/* 'B' */
      0,/* 'C' */
      0,/* 'D' */
      0,/* 'E' */
      0,/* 'F' */
      0,/* 'G' */
      0,/* 'H' */
      0,/* 'I' */
      0,/* 'J' */
      0,/* 'K' */
      0,/* 'L' */
      0,/* 'M' */
      0,/* 'N' */
      0,/* 'O' */
      0,/* 'P' */
      0,/* 'Q' */
      0,/* 'R' */
      0,/* 'S' */
      0,/* 'T' */
      0,/* 'U' */
      0,/* 'V' */
      0,/* 'W' */
      0,/* 'X' */
      0,/* 'Y' */
      0,/* 'Z' */
      0,/* '[' */
      0,/* '\' */
      0,/* ']' */
      (int)((char*)&&case_xor-(char*)&&cvalue),/* '^' */
      0,/* '_' */
      0,/* '`' */
      0,/* 'a' */
      0,/* 'b' */
      0,/* 'c' */
      0,/* 'd' */
      0,/* 'e' */
      (int)((char*)&&case_bool-(char*)&&cvalue),/* 'f' */
      0,/* 'g' */
      0,/* 'h' */
      0,/* 'i' */
      0,/* 'j' */
      0,/* 'k' */
      0,/* 'l' */
      0,/* 'm' */
      0,/* 'n' */
      0,/* 'o' */
      0,/* 'p' */
      0,/* 'q' */
      0,/* 'r' */
      0,/* 's' */
      (int)((char*)&&case_bool-(char*)&&cvalue),/* 't' */
      0,/* 'u' */
      0,/* 'v' */
      0,/* 'w' */
      0,/* 'x' */
      0,/* 'y' */
      0,/* 'z' */
      0,/* '{' */
      (int)((char*)&&case_pipe-(char*)&&cvalue),/* '|' */
      0,/* '}' */
      (int)((char*)&&case_tilde-(char*)&&cvalue)/* '~' */
   };

   char c;
   int tid = 0;
   const char* p1;
   const char* p2;

   U_INTERNAL_DUMP("s = %.*S", 20, s)

loop:
   p1 = p2 = s;

   if (s >= end) goto end;

   c = *s++;

   if (u__isspace(c)) goto loop;

   U_INTERNAL_DUMP("dispatch_table[%d] = %p &&cvalue = %p", c-'!', dispatch_table[c-'!'], &&cvalue)

   goto *((char*)&&cvalue + dispatch_table[c-'!']);

case_exclamation: tid = (*s == '=' ? (++s, U_TK_NE) : U_TK_NOT); p2 = s; goto end; /* '!' */

case_dollar: /* '$' */
   if (*s == '=')
      {
      p2  = ++s;
      tid = U_TK_ENDS_WITH;
      }
   else if (*s == '$')
      {
      p2  = ++s;
      tid = U_TK_PID;
      }
   else
      {
      tid = U_TK_NAME;

      if (*s == '{')
         {
         p1 = ++s;

         while (s < end && *s != '}') ++s;

         p2 = s++;
         }
      else
         {
         p1 = s;

         while (s < end && u__isname(*s)) ++s;

         p2 = s;
         }
      }

   goto end;

case_percent:   tid = U_TK_MOD;                                   p2 = s; goto end; /* '%' */
case_ampersand: tid = (*s == '&' ? (++s, U_TK_AND) : U_TK_ERROR); p2 = s; goto end; /* '&' */

case_quote: /* '\'' */
   tid = U_TK_VALUE;

   p1 = s;

   while (s < end && *s != '\'') ++s;

   p2 = s++;

   goto end;

case_opening_parenthesis: tid = U_TK_LPAREN; p2 = s; goto end; /* '(' */
case_closing_parenthesis: tid = U_TK_RPAREN; p2 = s; goto end; /* ')' */

case_asterisk: tid = (*s == '=' ? (++s, U_TK_CONTAINS) : U_TK_MULT); p2 = s; goto end; /* '*' */
case_plus:     tid = U_TK_PLUS;                                      p2 = s; goto end; /* '+' */
case_comma:    tid = U_TK_COMMA;                                     p2 = s; goto end; /* ',' */
case_minus:    tid = U_TK_MINUS;                                     p2 = s; goto end; /* '-' */

case_slash: /* '/' */
   c = *s;

   if (u__isdigit(c) ||
       u__isspace(c))
      {
      p2  = s;
      tid = U_TK_DIV;

      goto end;
      }

   goto cvalue;

case_digit: /* '0' ... '9' */
   tid = U_TK_VALUE;

   while (s < end && u__isdigit(*s)) ++s;

   p2 = s;

   goto end;

// foo  = "bar" - Un elemento il cui attributo "foo" è uguale a "bar"
// foo ~= "bar" - Un elemento il cui attributo "foo" ha per valore un elenco di valori separati da spazio, uno dei quali uguale a "bar"
// foo ^= "bar" - Un elemento il cui attributo "foo" ha un valore che inizia per "bar"
// foo $= "bar" - Un elemento il cui attributo "foo" ha un valore che finisce per "bar"
// foo *= "bar" - Un elemento il cui attributo "foo" ha un valore che contiene la sottostringa "bar"

case_less:  tid = (*s == '=' ? (++s, U_TK_LE)          : U_TK_LT);    p2 = s; goto end; /* '<' */
case_equal: tid = (*s == '=' ? (++s, U_TK_EQ)          : U_TK_EQ);    p2 = s; goto end; /* '=' */
case_major: tid = (*s == '=' ? (++s, U_TK_GE)          : U_TK_GT);    p2 = s; goto end; /* '>' */
case_xor:   tid = (*s == '=' ? (++s, U_TK_STARTS_WITH) : U_TK_ERROR); p2 = s; goto end; /* '^' */

case_bool: /* 'f' 't' */
   if (c == 't' ? skipToken(U_CONSTANT_TO_PARAM("rue"))
                : skipToken(U_CONSTANT_TO_PARAM("alse")))
      {
      tid = U_TK_VALUE;

      if (c == 't') p2 = s;

      goto end;
      }

   goto cvalue;

case_pipe:  tid = (*s == '|' ? (++s, U_TK_OR)          : U_TK_ERROR); p2 = s; goto end; /* '|' */
case_tilde: tid = (*s == '=' ? (++s, U_TK_IS_PRESENT)  : U_TK_ERROR); p2 = s; goto end; /* '~' */

cvalue:
   while (s < end)
      {
      c = *s;

      if (c == '(' ||
          c == ')' ||
          c == ',' ||
          u__isgraph(c) == false)
         {
         break;
         }

      ++s;
      }

   p2  = s;
   tid = (c == '(' ? U_TK_FN_CALL : U_TK_VALUE);

end:
   token = str.substr(p1, p2 - p1);

   U_INTERNAL_DUMP("token = %V", token.rep)

   U_RETURN(tid);
}

#if defined(U_STDCPP_ENABLE) && defined(DEBUG)
const char* UTokenizer::dump(bool reset) const
{
   *UObjectIO::os << "s                           " << (void*)s   << '\n'
                  << "end                         " << (void*)end << '\n'
                  << "group                       ";

   char buffer[32];

   UObjectIO::os->write(buffer, u__snprintf(buffer, sizeof(buffer), "%S", group));

   *UObjectIO::os << '\n'
                  << "delim                       ";

   UObjectIO::os->write(buffer, u__snprintf(buffer, sizeof(buffer), "%S", delim));

   *UObjectIO::os << '\n'
                  << "group_skip                  " << group_skip  << '\n'
                  << "str       (UString          " << (void*)&str << ')';

   if (reset)
      {
      UObjectIO::output();

      return UObjectIO::buffer_output;
      }

   return 0;
}
#endif