1
0
mirror of https://github.com/stefanocasazza/ULib.git synced 2025-09-28 19:05:55 +08:00
ULib/src/ulib/utility/string_ext.cpp
2015-06-05 15:45:24 +02:00

1916 lines
43 KiB
C++

// ============================================================================
//
// = LIBRARY
// ULib - c++ library
//
// = FILENAME
// string_ext.cpp
//
// = AUTHOR
// Stefano Casazza
//
// ============================================================================
#include <ulib/url.h>
#include <ulib/file.h>
#include <ulib/tokenizer.h>
#include <ulib/base/miniz/miniz.h>
#include <ulib/utility/string_ext.h>
#ifdef USE_LIBPCRE
# include <ulib/pcre/pcre.h>
#endif
#ifdef USE_LIBZ
# include <ulib/base/coder/gzio.h>
# include <ulib/base/zopfli/zopfli.h>
# include <ulib/base/zopfli/gzip_container.h>
#endif
#ifdef USE_LIBEXPAT
# include <ulib/xml/expat/xml2txt.h>
#endif
#ifndef _MSWINDOWS_
# include <pwd.h>
#endif
#include <errno.h>
#ifdef USE_LIBSSL
// Build a UString from the bytes currently held by a memory BIO.
//
// When the BIO holds at least one byte, the string is built from it and the BIO is
// released here (BIO_CLOSE is set first so that BIO_free() also frees the BUF_MEM).
// When the BIO is empty the null string is returned and the BIO is NOT freed by this
// function: in that case the caller still owns it.
UString UStringExt::BIOtoString(BIO* bio)
{
   U_TRACE(1, "UStringExt::BIOtoString(%p)", bio)

   char* data = 0;
   long size  = BIO_get_mem_data(bio, &data);

   if (size <= 0) return UString::getStringNull();

   UString content((void*)data, size);

   // only bio needs to be freed: with BIO_CLOSE set, BIO_free() takes the BUF_MEM with it

   (void) BIO_set_close(bio, BIO_CLOSE);
   (void) U_SYSCALL(BIO_free, "%p", bio);

   U_RETURN_STRING(content);
}
// Return the textual form of an ASN1 generalized time, as printed by ASN1_GENERALIZEDTIME_print().
//
// t may be null: in that case (and when the memory BIO cannot be created or nothing
// gets printed) the null/empty string is returned.
UString UStringExt::ASN1TimetoString(ASN1_GENERALIZEDTIME* t)
{
   U_TRACE(1, "UStringExt::ASN1TimetoString(%p)", t)

   if (t)
      {
      BIO* bio = (BIO*) U_SYSCALL(BIO_new, "%p", BIO_s_mem());

      if (bio) // BIO_new() can fail...
         {
         (void) U_SYSCALL(ASN1_GENERALIZEDTIME_print, "%p,%p", bio, t);

         UString result = BIOtoString(bio);

         // NB: BIOtoString() releases the BIO only when it held some data, so if nothing
         //     was printed the BIO is still ours and must be freed here to avoid a leak

         if (result.empty()) (void) U_SYSCALL(BIO_free, "%p", bio);

         U_RETURN_STRING(result);
         }
      }

   return UString::getStringNull();
}
#endif
// Replace parts of a string using regular expressions. This method is the counterpart of the perl s// operator.
// It replaces the substrings which matched the given regular expression with the supplied string
#ifdef USE_LIBPCRE
// Compile 'pattern' for replacement and return 'subject' with the matches
// replaced by 'replacement' (the work is delegated to UPCRE::replace()).
UString UStringExt::pregReplace(const UString& pattern, const UString& replacement, const UString& subject)
{
   U_TRACE(0, "UStringExt::pregReplace(%V,%V,%V)", pattern.rep, replacement.rep, subject.rep)

   UPCRE regex(pattern, PCRE_FOR_REPLACE);

   UString replaced = regex.replace(subject, replacement);

   U_RETURN_STRING(replaced);
}
// NB: at present this function is a pass-through, the input is returned unchanged.
//
// The filtering below (javascript, HTML tags, style tags and multi-line comments
// stripped by four pre-studied regular expressions) is disabled and kept only as reference.
UString UStringExt::sanitize(const UString& input)
{
   U_TRACE(0, "UStringExt::sanitize(%V)", input.rep)

   /*
   static UPCRE* strip1;
   static UPCRE* strip2;
   static UPCRE* strip3;
   static UPCRE* strip4;

   if (strip1 == 0)
      {
      U_NEW_ULIB_OBJECT(strip1, UPCRE(U_STRING_FROM_CONSTANT("@<script[^>]*?>.*?</script>@si"), 0)); // Strip out javascript
      U_NEW_ULIB_OBJECT(strip2, UPCRE(U_STRING_FROM_CONSTANT("@<[/!]*?[^<>]*?>@si"), 0));            // Strip out HTML tags
      U_NEW_ULIB_OBJECT(strip3, UPCRE(U_STRING_FROM_CONSTANT("@<style[^>]*?>.*?</style>@siU"), 0));  // Strip style tags properly
      U_NEW_ULIB_OBJECT(strip4, UPCRE(U_STRING_FROM_CONSTANT("@<![sS]*?--[ \t\n\r]*>@"), 0));        // Strip multi-line comments

      strip1->study();
      strip2->study();
      strip3->study();
      strip4->study();
      }

   UString result = strip1->replace(input,  UString::getStringNull());
           result = strip2->replace(result, UString::getStringNull());
           result = strip3->replace(result, UString::getStringNull());
           result = strip4->replace(result, UString::getStringNull());

   U_RETURN_STRING(result);
   */

   U_RETURN_STRING(input);
}
#endif
#ifdef USE_LIBEXPAT
// Run 'html' through UXml2Txt and return the extracted text.
//
// list_tags_allowed is optional: when given, its content is handed to the converter as
// the tag list. An empty string is returned when the converter fails to parse the input.
UString UStringExt::stripTags(const UString& html, UString* list_tags_allowed)
{
   U_TRACE(0, "UStringExt::stripTags(%V,%p)", html.rep, list_tags_allowed)

   UString allowed, text;

   if (list_tags_allowed != 0) allowed = *list_tags_allowed;

   UXml2Txt xml2txt(allowed, false, true);

   if (xml2txt.parse(html)) text = xml2txt.getText();

   U_RETURN_STRING(text);
}
#endif
// Return the decimal representation of the signed number n.
//
// The 32 or 64 bit conversion routine is selected at compile time from SIZEOF_LONG.
UString UStringExt::stringFromNumber(long n)
{
   U_TRACE(0, "UStringExt::stringFromNumber(%ld)", n) // NB: n is a long, so the conversion must be %ld (not %lld)

   UString x(20U);

#if SIZEOF_LONG == 4
   x.setFromNumber32s(n);
#else
   x.setFromNumber64s(n);
#endif

   U_RETURN_STRING(x);
}
// Return the text written by u_printSize() for the size n.
UString UStringExt::printSize(off_t n)
{
   U_TRACE(0, "UStringExt::printSize(%I)", n)

   UString text(32U);

   u_printSize(text.data(), n); // writes directly into the string buffer...

   text.size_adjust();          // ...so the length has to be adjusted afterwards

   U_RETURN_STRING(text);
}
// Return the floating point number n formatted with "%f".
UString UStringExt::numberToString(double n)
{
   U_TRACE(0, "UStringExt::numberToString(%f)", n)

   UString text(32U);

   text.snprintf("%f", n);

   U_RETURN_STRING(text);
}
// Return the decimal representation of the unsigned 32 bit number n.
UString UStringExt::numberToString(uint32_t n)
{
   U_TRACE(0, "UStringExt::numberToString(%u)", n)

   UString text(22U);

   text.setFromNumber32(n);

   U_RETURN_STRING(text);
}
// Return n as a human readable size: the plain number followed by 'b' when n < 1024,
// otherwise n scaled by the largest power of 1024 not greater than n, printed with one
// decimal digit and followed by the unit letter (K, M, G, T, P, E...).
UString UStringExt::numberToString(uint64_t n)
{
   U_TRACE(0, "UStringExt::numberToString(%llu)", n)

   static const char suffix[] = "bKMGTPEZY";

   UString x(32U);
   uint32_t i    = 0;
   uint64_t unit = 1ULL; // always 1024^i

   // NB: the divisor is grown only while the quotient is still >= 1024, so it never
   //     exceeds 2^60 and cannot wrap to zero (growing it one step further, as done
   //     before, overflowed for n >= 2^60 and ended up in a division by zero)

   while (i < (sizeof(suffix) - 2) &&
          (n / unit) >= 1024ULL)
      {
      unit *= 1024ULL;

      ++i;
      }

   if (i == 0)
      {
      x.setFromNumber64(n);

      x.push_back('b');
      }
   else
      {
      float fsize = (float)((double)n/unit);

      x.snprintf("%.1f%c", fsize, suffix[i]);
      }

   U_RETURN_STRING(x);
}
// Return a copy of the n bytes at s where every '\t' is replaced by the number of
// spaces needed to reach the next multiple of 'tab' columns (columns are counted on
// the length of the output produced so far).
//
// tab must be > 0; with a non positive value the input is returned unexpanded.
UString UStringExt::expandTab(const char* s, uint32_t n, int tab)
{
   U_TRACE(1, "UStringExt::expandTab(%.*S,%u,%d)", n, s, n, tab)

   U_INTERNAL_ASSERT_MAJOR(tab, 0)

   UString x(U_CAPACITY);

   if (tab <= 0) // NB: avoid the division by zero (or a bogus padding) below...
      {
      if (n) (void) x.append(s, n);

      U_RETURN_STRING(x);
      }

   void* p;
   uint32_t start = 0, len;
   const uint32_t width = (uint32_t) tab;

   while ((p = (void*)memchr(s + start, '\t', n - start)))
      {
      uint32_t _end = (const char*)p - s;

      len = _end - start; // text between the previous tab and this one (maybe empty)

      // NB: room must be reserved for every tab, also when no text precedes it (consecutive tabs
      //     or tab at the very beginning), otherwise the padding is written past the capacity...

      (void) x.reserve(x.size() + len + width);

      if (len)
         {
         U_MEMCPY(x.rep->end(), s + start, len);

         x.rep->_length += len;
         }

      uint32_t num = width - (x.rep->_length % width);

      U_INTERNAL_DUMP("start = %u _end = %u num = %u", start, _end, num)

      char* r = x.rep->data();

      while (num--) r[x.rep->_length++] = ' ';

      start = _end + 1;
      }

   // copy what is left after the last tab

   len = n - start;

   if (len) (void) x.append(s + start, len);

   (void) x.shrink();

   U_RETURN_STRING(x);
}
// Return a copy of the n bytes at s where every occurrence of the pattern a (n1 bytes)
// is replaced by b (n2 bytes, may be 0 to just remove the pattern).
//
// An empty pattern (n1 == 0) matches nothing: the input is returned as it is.
UString UStringExt::substitute(const char* s, uint32_t n, const char* a, uint32_t n1, const char* b, uint32_t n2)
{
   U_TRACE(1, "UStringExt::substitute(%.*S,%u,%.*S,%u,%.*S,%u)", n, s, n, n1, a, n1, n2, b, n2)

   U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")
   U_INTERNAL_ASSERT_MAJOR(n1, 0)

   if (n1 == 0) // NB: avoid the division by zero below...
      {
      UString same(n + 1U);

      if (n) (void) same.append(s, n);

      U_RETURN_STRING(same);
      }

   // Initial capacity: max number of matches times the size of the replacement. It is only
   // an hint (room is reserved again at every match), computed with 64 bit so that the
   // product cannot wrap around before being capped

   const uint64_t max_capacity = 256U * 1024U * 1024U; // worst case...

   uint64_t estimate = (n / n1);

   if (estimate == 0) estimate = 10U;
   if (n2)            estimate *= n2;

   if (estimate > max_capacity) estimate = max_capacity;

   void* p;
   uint32_t start = 0, len;
   UString x((uint32_t)estimate);

   while ((p = u_find(s + start, n - start, a, n1)))
      {
      uint32_t _end = (const char*)p - s;

      len = (_end > start ? _end - start : 0);

      U_INTERNAL_DUMP("start = %u _end = %u len = %u", start, _end, len)

      (void) x.reserve(x.size() + len + n2);

      if (len) // text before the match
         {
         U_MEMCPY(x.rep->end(), s + start, len);

         x.rep->_length += len;
         }

      if (n2) // replacement
         {
         U_MEMCPY(x.rep->end(), b, n2);

         x.rep->_length += n2;
         }

      start = _end + n1;
      }

   // copy what is left after the last match

   len = n - start;

   if (len) (void) x.append(s + start, len);

   U_RETURN_STRING(x);
}
// dos2unix '\n' converter
//
// Every '\r' of the input is dropped. When unix2dos is true a '\r' is emitted before
// every '\n', otherwise the '\n' is copied alone. All the other bytes are copied as they are.
UString UStringExt::dos2unix(const UString& s, bool unix2dos)
{
   U_TRACE(0, "UStringExt::dos2unix(%V,%b)", s.rep, unix2dos)

   UString result(s.size() * 2); // worst case: every byte is a '\n' that becomes "\r\n"

   const char* in   = s.data();
   const char* last = s.end();

   char* out        = result.data();
   char* const base = out;

   for (; in < last; ++in)
      {
      const char c = *in;

      if (c == '\r') continue;

      if (c == '\n' && unix2dos) *out++ = '\r';

      *out++ = c;
      }

   result.size_adjust(out - base);

   U_RETURN_STRING(result);
}
// Expand the leading component of a path:
//
//   ~/...      => value of the variable HOME
//   ~user/...  => home directory of 'user' (as reported by getpwnam())
//   $var/...   => value of the variable 'var'
//
// Variables are resolved by getEnvironmentVar() with the given 'environment'. A lone '$'
// is left as it is. When the expansion gives nothing (unknown user or variable, user
// name too long) the leading component is simply dropped. n must be > 0.
UString UStringExt::expandPath(const char* s, uint32_t n, const UString* environment)
{
   U_TRACE(0, "UStringExt::expandPath(%.*S,%u,%p)", n, s, n, environment)

   U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")

   char c = *s;
   UString x(n+100);

   if (c == '~' ||
       c == '$')
      {
      UString value;
      uint32_t _end = 1;

      // the component to expand ends at the first '/' (or at the end of the path)

      while (_end < n && s[_end] != '/') ++_end;

      U_INTERNAL_DUMP("_end = %u", _end)

      if (_end == 1)
         {
         if (c == '$') goto end;

         // expand ~/...

         value = getEnvironmentVar(U_CONSTANT_TO_PARAM("HOME"), environment);
         }
      else if (c == '$')
         {
         // expand $var... and $var/...

         value = getEnvironmentVar(s + 1, _end - 1, environment);
         }
      else
         {
         // expand ~user/...

         char buffer[128];
         const uint32_t name_len = _end - 1;

         U_INTERNAL_ASSERT_MINOR(_end, sizeof(buffer))

         // NB: the assertion above exists only in debug builds, so the length must be checked
         //     here too: a user name that does not fit the buffer is treated as unknown...

         if (name_len < sizeof(buffer))
            {
            U_MEMCPY(buffer, s + 1, name_len);

            buffer[name_len] = '\0';

            struct passwd* pw = (struct passwd*) U_SYSCALL(getpwnam, "%S", buffer);

            if (pw && pw->pw_dir) (void) value.assign(pw->pw_dir);
            }
         }

      s += _end;
      n -= _end;

      (void) x.append(value);
      }

end:
   if (n) (void) x.append(s, n);

   U_RETURN_STRING(x);
}
// prepare for environment variables (check if some of them need quoting...)
//
// The n bytes at s are scanned for entries of the form name=value. The entries are
// separated by '\n' if the input contains a newline, otherwise by a space. Every entry
// found is written to the result followed by '\n'; an entry whose value contains a
// space or a double quote is wrapped in single quotes. Entries starting with '#' are
// skipped up to the next separator. n must be > 0 (checked only by the assertion).
//
// NOTE(review): the result is sized n + n/4 while up to 3 bytes (two quotes and the
// newline) may be added per entry - presumably enough for the expected input, to be confirmed
UString UStringExt::prepareForEnvironmentVar(const char* s, uint32_t n)
{
U_TRACE(0, "UStringExt::prepareForEnvironmentVar(%.*S,%u)", n, s, n)
U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")
bool quoted;
const char* p;
const char* ptr;
const char* ptr1;
uint32_t len, sz = 0;
UString result(n + (n / 4));
// NB: _end points to the last byte of the input (not one past it)
const char* _end = s + n - 1;
char c = 0, delimiter = (memchr(s, '\n', n) ? '\n' : ' ');
char* str = result.data();
while (s < _end)
{
// skip the white space between the entries
if (u__isspace(*s))
{
++s;
continue;
}
U_INTERNAL_DUMP("s = %.*S", 10, s)
if (*s == '#') // skip line comment
{
s = (const char* restrict) memchr(s, delimiter, _end - s + 1);
if (s == 0) goto end;
continue;
}
// p marks the start of the entry, s is moved just after the '='
p = s;
s = (const char* restrict) memchr(s, '=', _end - s + 1);
if (s == 0) goto end;
U_INTERNAL_DUMP("name = %.*S", s - p, p)
++s;
quoted = false;
if (*p == '\'')
{
// the entry already starts with a single quote: it is copied as it is up to
// (and including) the next single quote found after the '='
s = (const char* restrict) memchr(s, '\'', _end - s + 1);
if (s == 0) goto end;
len = (++s - p);
U_INTERNAL_DUMP("copy = %.*S", len, p)
}
else
{
// the entry ends at the next delimiter (or at the last byte of the input)
s = (const char* restrict) memchr(s, delimiter, _end - s + 1);
if (s == 0) s = _end;
ptr = s;
U_INTERNAL_DUMP("*ptr = %C", *ptr)
// step back over the trailing white space: ptr is left on the last byte of the entry
for (c = *ptr; u__isspace(c) && --ptr > p; c = *ptr) {}
len = (ptr - p) + 1;
U_INTERNAL_ASSERT_MAJOR(len, 0)
// scan the name up to the '=', then look in the value for a space or a double quote:
// if there is one the opening single quote is emitted (the closing one follows the copy)
ptr1 = p;
while (++ptr1 < ptr)
{
c = *ptr1;
if (c == '=')
{
U_INTERNAL_DUMP("name = %.*S value = %.*S", ptr1 - p, p, ptr - ptr1, ptr1+1)
while (++ptr1 < ptr)
{
c = *ptr1;
if (c == ' ' ||
c == '"')
{
quoted = true;
str[sz++] = '\'';
break;
}
}
break;
}
U_INTERNAL_ASSERT(u__isname(c))
}
}
// copy the entry, close the quote if one was opened and terminate the line
U_MEMCPY(str + sz, p, len);
sz += len;
if (quoted) str[sz++] = '\'';
str[sz++] = '\n';
}
end:
result.size_adjust(sz);
U_INTERNAL_DUMP("result(%d) = %#V", sz, result.rep)
U_RETURN_STRING(result);
}
// recursively expand environment variables if needed
//
// Return a copy of the n bytes at s where every $name (name being the longest run of
// bytes accepted by u__isname() after the '$') is replaced by the value given by
// getEnvironmentVar(name, environment). A '$' not followed by a name is dropped.
// The recursion comes from getEnvironmentVar(), that calls back this function when the
// value it finds contains a '$'. n must be > 0 (checked only by the assertion).
UString UStringExt::expandEnvironmentVar(const char* s, uint32_t n, const UString* environment)
{
U_TRACE(0, "UStringExt::expandEnvironmentVar(%.*S,%u,%p)", n, s, n, environment)
U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")
char* new_ptr;
const char* p;
const char* var_ptr = 0;
UString value, result(n+500U);
// new_size is the number of bytes written so far in result
uint32_t var_size, new_size = 0;
while ((p = (const char*) memchr(s, '$', n)))
{
U_INTERNAL_DUMP("p = %.*S", 10, p)
// len is the size of the text that precedes the '$': from now on n counts the bytes from p
uint32_t len = p - s;
n -= len;
// read name=$var
// =>...
// _end is the size of "$name" (1 means that no name follows the '$')
uint32_t _end = 1;
while (_end < n &&
u__isname(p[_end]))
{
U_INTERNAL_ASSERT_DIFFERS(p[_end], '$')
U_INTERNAL_ASSERT_EQUALS(u__isspace(p[_end]), false)
++_end;
}
U_INTERNAL_DUMP("len = %u n = %u _end = %u", len, n, _end)
if (_end == 1) var_size = 0;
else
{
var_ptr = p + 1;
var_size = _end - 1;
U_INTERNAL_DUMP("var = %.*S", var_size, var_ptr)
value = getEnvironmentVar(var_ptr, var_size, environment);
// if the value still contains a '$' and something has already been produced, the
// variable is looked up again using the output built so far as environment
if (new_size &&
value.find('$', 0U) != U_NOT_FOUND)
{
value = getEnvironmentVar(var_ptr, var_size, &result);
}
// from here var_ptr/var_size describe the value (not the name any more)
var_ptr = value.data();
var_size = value.size();
}
// append the text before the '$' followed by the value of the variable
(void) result.reserve(new_size + len + var_size);
new_ptr = result.c_pointer(new_size);
if (len) U_MEMCPY(new_ptr, s, len);
if (var_size)
{
U_MEMCPY(new_ptr + len, var_ptr, var_size);
new_size += var_size;
}
new_size += len;
result.size_adjust(new_size);
// go on after "$name"
s = p + _end;
n -= _end;
}
// copy what is left after the last variable
if (n) (void) result.append(s, n);
U_RETURN_STRING(result);
}
// Return the value of the variable whose name is given by the n bytes at s.
//
// When 'environment' is given it is searched for a definition name=value: the name must
// not be preceded by a name character or by '#' (commented...) and must be followed by
// '='. If the name is preceded by a quote the value ends at the matching (not escaped)
// quote, otherwise at the first white space. A value containing a '$' is expanded by
// expandEnvironmentVar(). If no definition is found (or 'environment' is null) the
// process environment is asked by getenv().
//
// An empty string is returned when the variable is not defined or its value is empty.
UString UStringExt::getEnvironmentVar(const char* s, uint32_t n, const UString* environment)
{
   U_TRACE(1, "UStringExt::getEnvironmentVar(%.*S,%u,%p)", n, s, n, environment)

   UString value(300U);

   if (environment)
      {
      char c, c1;
      const char* end;
      uint32_t start = 0;
      bool quoted, bexpand;

      // NB: check if param 's' is a environment-var

loop:
      start = environment->find(s, start, n);

      if (start == U_NOT_FOUND) goto next;

      c = '\0';

      if (start)
         {
         c = environment->c_char(start-1);

         U_INTERNAL_DUMP("c = %C", c)

         if (u__isname(c) ||
             c == '#') // NB: check if commented...
            {
            start += n;

            goto loop;
            }
         }

      start += n;

      c1 = environment->c_char(start);

      U_INTERNAL_DUMP("c1 = %C", c1)

      if (c1 != '=') goto loop;

      // c is the byte before the name: if it is a quote the value ends at the same quote

      quoted  = u__isquote(c);
      bexpand = false;

      U_INTERNAL_DUMP("quoted = %b", quoted)

      // from here s and n describe the value (not the name any more)

      s   = environment->c_pointer(++start);
      end = environment->end();

      U_INTERNAL_DUMP("end - s = %ld", end - s)

      if (s < end)
         {
         const char* ptr = s;

         do {
            if ((c1 = *ptr) == '$') bexpand = true;

            if (quoted)
               {
               if (c1 != c || ptr[-1] == '\\') continue;
               }
            else
               {
               if (u__isspace(c1) == false) continue;
               }

            U_INTERNAL_DUMP("ptr - s = %ld", ptr - s)

            if (ptr == s) goto end; // NB: name=<empty>...

            n = ptr - s;

            goto assign;
            }
         while (++ptr < end);

         n = end - s; // the value runs up to the end of the environment

assign:
         U_INTERNAL_DUMP("n = %u", n)

         U_INTERNAL_ASSERT_MAJOR(n, 0)

         if (bexpand) value = expandEnvironmentVar(s, n, environment);
         else  (void) value.assign(s, n);
         }
      }
   else
      {
next:
      char buffer[128];

      U_INTERNAL_ASSERT_MINOR(n, sizeof(buffer))

      // NB: the assertion above exists only in debug builds, so the length must be checked here
      //     too: a name that does not fit the buffer (terminator included) is treated as not defined...

      if (n < sizeof(buffer))
         {
         U_MEMCPY(buffer, s, n);

         buffer[n] = '\0';

         const char* ptr = U_SYSCALL(getenv, "%S", buffer);

         if (ptr) (void) value.assign(ptr);
         }
      }

end:
   U_RETURN_STRING(value);
}
// Return a string holding a copy of the u_pid_str_len bytes at u_pid_str.
UString UStringExt::getPidProcess()
{
   U_TRACE(0, "UStringExt::getPidProcess()")

   UString pid(32U);

   U_MEMCPY(pid.data(), u_pid_str, u_pid_str_len);

   pid.size_adjust(u_pid_str_len);

   U_RETURN_STRING(pid);
}
extern void* expressionParserAlloc(void* (*mallocProc)(size_t));
extern void expressionParserFree(void* p, void (*freeProc)(void*));
extern void expressionParserTrace(FILE* stream, char* zPrefix);
extern void expressionParser(void* yyp, int yymajor, UString* yyminor, UString* result);
// Evaluate the expression 'expr' feeding its tokens, one at a time, to expressionParser().
//
// Before being handed to the parser a name token is replaced by the value of the
// variable with that name (resolved through 'environment') and a pid token by the
// string of getPidProcess(): both are then passed as value tokens. The scan stops
// at the first token id <= 0 or as soon as the result becomes false/empty.
//
// The result starts as *UString::str_true and is updated by the parser.
UString UStringExt::evalExpression(const UString& expr, const UString& environment)
{
   U_TRACE(0, "UStringExt::evalExpression(%V,%V)", expr.rep, environment.rep)

   int token_id;
   UTokenizer t(expr);
   UString token, result = *UString::str_true;

   void* pParser = expressionParserAlloc(malloc);

#ifdef DEBUG
   // (void) fprintf(stderr, "start parsing expr: \"%v\"\n", expr));
   // expressionParserTrace(stderr, (char*)"parser: ");
#endif

   while (result)
      {
      token_id = t.getTokenId(token);

      if (token_id <= 0) break;

      if (token_id == U_TK_NAME)
         {
         token    = UStringExt::getEnvironmentVar(token, &environment);
         token_id = U_TK_VALUE;
         }
      else if (token_id == U_TK_PID)
         {
         token    = UStringExt::getPidProcess();
         token_id = U_TK_VALUE;
         }

      // every token is passed to the parser as a newly allocated copy

      expressionParser(pParser, token_id, U_NEW(UString(token)), &result);
      }

   // token id 0 tells the parser that the input is finished

   expressionParser(pParser, 0, 0, &result);

   expressionParserFree(pParser, free);

#ifdef DEBUG
   // (void) fprintf(stderr, "ended parsing expr: \"%v\"\n", expr));
#endif

   U_RETURN_STRING(result);
}
// Returns a string that has the delimiter escaped
//
// Every occurrence of 'delimiter' in the n bytes at s is copied preceded by a backslash,
// all the other bytes are copied as they are. n must be > 0 and the input is expected
// to contain the delimiter at least once (both checked only by the assertions).
UString UStringExt::insertEscape(const char* s, uint32_t n, char delimiter)
{
   U_TRACE(0, "UStringExt::insertEscape(%.*S,%u,%C)", n, s, n, delimiter)

   U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")
   U_INTERNAL_ASSERT_POINTER(memchr(s, delimiter, n))

   UString result(n * 2); // worst case: every byte is a delimiter

   char* out              = result.data();
   char* const base       = out;
   const char* const last = s + n;

   while (s < last)
      {
      const char* hit = (const char*) memchr(s, delimiter, last - s);

      // copy the run up to the next delimiter (or up to the end when there is none)

      uint32_t chunk = (hit ? hit : last) - s;

      U_MEMCPY(out, s, chunk);

      out += chunk;

      if (hit == 0) break;

      *out++ = '\\';
      *out++ = delimiter;

      s = hit + 1;
      }

   result.size_adjust((uint32_t)(out - base));

   U_RETURN_STRING(result);
}
// manage escaping for delimiter character
//
// Returns a copy of the n bytes at s with all the backslashes removed. n must be > 0 and
// the input is expected to contain at least one backslash (both checked only by the assertions).
UString UStringExt::removeEscape(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::removeEscape(%.*S,%u)", n, s, n) // NB: no trailing %C here, there is no argument for it...

   U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")
   U_INTERNAL_ASSERT_POINTER(memchr(s, '\\', n))

   char* p;
   UString result(n);
   uint32_t sz, sz1 = 0;
   const char* _end = s + n;
   char* str = result.data();

   while (s < _end)
      {
      p = (char*) memchr(s, '\\', _end - s);

      if (p)
         {
         // copy the run before the backslash and go on just after it

         sz = p - s;

         U_MEMCPY(str, s, sz);

         s = p + 1;

         str += sz;
         sz1 += sz;
         }
      else
         {
         // no more backslashes: copy what is left

         sz = _end - s;

         U_MEMCPY(str, s, sz);

         sz1 += sz;

         break;
         }
      }

   result.size_adjust(sz1);

   U_RETURN_STRING(result);
}
// Returns a string that has whitespace removed from the start and the end
//
// The null string is returned for an empty input, an empty string when
// the input is made only of white space.
UString UStringExt::trim(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::trim(%.*S,%u)", n, s, n)

   if (n == 0) return UString::getStringNull();

   const int32_t total = (int32_t)n;

   int32_t first = 0;
   UString result(n);

   // skip white space from start

   while (first < total && u__isspace(s[first])) ++first;

   U_INTERNAL_DUMP("i = %d", first)

   if (first < total) // not only white space
      {
      // skip white space from end (it stops for sure, s[first] is not a white space)

      uint32_t last = n - 1;

      while (u__isspace(s[last])) --last;

      uint32_t len = last + 1 - first;

      U_MEMCPY(result.data(), s + first, len);

      result.size_adjust(len);
      }

   U_RETURN_STRING(result);
}
// Returns a string that has the punctuation characters (as told by u__ispunct())
// removed from the start and the end (leading and trailing)
//
// An empty string is returned when the input is empty or made only of punctuation.
UString UStringExt::trimPunctuation(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::trimPunctuation(%.*S,%u)", n, s, n)

// U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")

   const int32_t total = (int32_t)n;

   int32_t first = 0;
   UString result(n);

   // skip punctuation character from start

   while (first < total && u__ispunct(s[first])) ++first;

   U_INTERNAL_DUMP("i = %d", first)

   if (first < total) // not only punctuation character
      {
      // skip punctuation character from end (it stops for sure, s[first] is not punctuation)

      uint32_t last = n - 1;

      while (u__ispunct(s[last])) --last;

      uint32_t len = last + 1 - first;

      U_MEMCPY(result.data(), s + first, len);

      result.size_adjust(len);
      }

   U_RETURN_STRING(result);
}
// Returns a string that has whitespace removed from the start and the end,
// and which has each sequence of internal whitespace replaced with a single space.
UString UStringExt::simplifyWhiteSpace(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::simplifyWhiteSpace(%.*S,%u)", n, s, n)

// U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")

   UString result(n);

   uint32_t written       = 0;
   char* out              = result.data();
   const char* const last = s + n;

   while (s < last)
      {
      if (u__isspace(*s))
         {
         ++s;

         continue;
         }

      // s is on the first byte of a word: find where the word ends

      const char* word = s;

      do { ++s; } while (s < last && u__isspace(*s) == false);

      uint32_t word_len = (s - word);

      U_MEMCPY(out + written, word, word_len);

      written += word_len;

      // step over the white space after the word: if the input is
      // not finished a single space is written as separator

      ++s;

      if (s < last) out[written++] = ' ';
      }

   // the separator written after the last word is dropped if only white space followed

   if (written && u__isspace(out[written-1])) --written;

   result.size_adjust(written);

   U_RETURN_STRING(result);
}
// Returns a string that has suppressed all whitespace
//
// Every byte for which u__isspace() is false is copied, in order, to the result.
UString UStringExt::removeWhiteSpace(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::removeWhiteSpace(%.*S,%u)", n, s, n)

// U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")

   UString result(n);

   uint32_t sz      = 0;
   char* str        = result.data();
   const char* _end = s + n;

   // NB: no white space is ever copied, so there is no need to
   //     check for a trailing one when the scan is finished...

   for (; s < _end; ++s)
      {
      if (u__isspace(*s) == false) str[sz++] = *s;
      }

   result.size_adjust(sz);

   U_RETURN_STRING(result);
}
// Returns a string that has suppressed repeated empty lines
//
// Every run of line terminators (as told by u__islterm()) between two lines is
// replaced by a single '\n', the leading and trailing ones are removed.
UString UStringExt::removeEmptyLine(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::removeEmptyLine(%.*S,%u)", n, s, n)

// U_INTERNAL_ASSERT_MAJOR_MSG(n,0,"elaborazione su stringa vuota: inserire if empty()...")

   UString result(n);

   uint32_t written       = 0;
   char* out              = result.data();
   const char* const last = s + n;

   while (s < last)
      {
      if (u__islterm(*s))
         {
         ++s;

         continue;
         }

      // s is on the first byte of a line: find where the line ends

      const char* line = s;

      do { ++s; } while (s < last && u__islterm(*s) == false);

      uint32_t line_len = (s - line);

      U_MEMCPY(out + written, line, line_len);

      written += line_len;

      // step over the line terminator: if the input is
      // not finished a single '\n' is written as separator

      ++s;

      if (s < last) out[written++] = '\n';
      }

   // the separator written after the last line is dropped if only terminators followed

   if (written && u__islterm(out[written-1])) --written;

   result.size_adjust(written);

   U_RETURN_STRING(result);
}
// Count the occurrences of a string inside another one: the function takes the main string (s, n)
// and the string to search (a, n1) and returns the number of times the latter is found in the former.
// The occurrences are not overlapping: after a match the search goes on just after the matched bytes.
//
// NOTE(review): an empty search string (n1 == 0) is not handled here - what happens depends on u_find(), to be confirmed
__pure uint32_t UStringExt::substr_count(const char* s, uint32_t n, const char* a, uint32_t n1)
{
   U_TRACE(0, "UStringExt::substr_count(%.*S,%u,%.*S,%u)", n, s, n, n1, a, n1)

   uint32_t count         = 0;
   const char* cursor     = s;
   const char* const last = s + n;

   while ((cursor = (const char*) u_find(cursor, last - cursor, a, n1)) != 0)
      {
      ++count;

      cursor += n1;
      }

   U_RETURN(count);
}
// Return the directory part of the n bytes at path: what precedes the last '/' that
// separates the final component, with the slashes that end the directory part removed.
// If there is no directory part *UString::str_point is returned; if the directory
// part is made only of the leading slash(es) "/" (or "//", see below) is returned.
UString UStringExt::dirname(const char* path, uint32_t n)
{
U_TRACE(0, "UStringExt::dirname(%.*S,%u)", n, path, n)
const char* runp;
const char* last_slash = (const char*) memrchr(path, '/', n); // Find last '/'
// the last '/' is the last character of the path (and not the first one): the final
// component is before it, so the trailing slashes are skipped and the search is repeated
if (last_slash &&
last_slash != path &&
(n - (last_slash - path)) == 1)
{
// Determine whether all remaining characters are slashes
for (runp = last_slash; runp != path; --runp) if (runp[-1] != '/') break;
// The '/' is the last character, we have to look further
if (runp != path) last_slash = (const char*) memrchr(path, '/', runp - path);
}
if (last_slash == 0)
{
// This assignment is ill-designed but the XPG specs require to
// return a string containing "." in any case no directory part is
// found and so a static and constant string is required
U_RETURN_STRING(*UString::str_point);
}
// Determine whether all remaining characters are slashes
for (runp = last_slash; runp != path; --runp) if (runp[-1] != '/') break;
// Terminate the path
// (runp is just after the last byte of the directory part that is not a slash)
if (runp != path) last_slash = runp;
else
{
// The last slash is the first character in the string. We have to return "/".
// As a special case we have to return "//" if there are exactly two slashes at the beginning of the string.
// See XBD 4.10 Path Name Resolution for more information
if (last_slash == path + 1) ++last_slash;
else last_slash = path + 1;
}
// here last_slash is one past the last byte of the directory part
UString result(path, (uint32_t)(last_slash - path));
U_RETURN_STRING(result);
}
// Return what follows the last '/' in the n bytes at s (maybe nothing,
// if the '/' is the last byte) or the whole input when it has no '/'.
UString UStringExt::basename(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::basename(%.*S,%u)", n, s, n)

   const char* slash = (const char*) memrchr(s, '/', n); // Find last '/'

   if (slash == 0)
      {
      UString same(s, n);

      U_RETURN_STRING(same);
      }

   UString result(slash+1, n-(slash-s)-1);

   U_RETURN_STRING(result);
}
// Return the number of bytes that follow the last '/' of s (the size of s when it has no '/').
__pure uint32_t UStringExt::getBaseNameLen(const UString& s)
{
   U_TRACE(0, "UStringExt::getBaseNameLen(%V)", s.rep)

   uint32_t len   = s.size();
   uint32_t slash = s.rfind('/'); // Find last '/'

   if (slash != U_NOT_FOUND) len -= slash + 1;

   U_RETURN(len);
}
/* Sort two version numbers, comparing equivalently separated strings of digits numerically.
 *
 * The two strings are split in runs of digits and runs of non-digits that are compared
 * pairwise: the runs of digits by value (leading zeros ignored), the others byte by byte
 * (if one is a prefix of the other the shorter sorts first).
 *
 * Returns a positive number if (a > b)
 * Returns a negative number if (a < b)
 * Returns zero if (a == b)
 */
__pure int UStringExt::compareversion(const char* a, uint32_t alen, const char* b, uint32_t blen)
{
   U_TRACE(0, "UStringExt::compareversion(%.*S,%u,%.*S,%u)", alen, a, alen, blen, b, blen)

   /* NB: same buffer means same version only if the lengths are the same too... */

   if (a == b && alen == blen) U_RETURN(0);

   bool isnum;
   uint32_t apos2 = 0, bpos2 = 0;

   while (apos2 < alen &&
          bpos2 < blen)
      {
      uint32_t apos1 = apos2,
               bpos1 = bpos2;

      /* the kind of the run (digits or not) is decided by a: [apos1,apos2) and [bpos1,bpos2) are the runs to compare */

      if (u__isdigit(a[apos2]))
         {
         isnum = true;

         while (apos2 < alen && u__isdigit(a[apos2])) apos2++;
         while (bpos2 < blen && u__isdigit(b[bpos2])) bpos2++;
         }
      else
         {
         isnum = false;

         while (apos2 < alen && !u__isdigit(a[apos2])) apos2++;
         while (bpos2 < blen && !u__isdigit(b[bpos2])) bpos2++;
         }

      U_INTERNAL_ASSERT_DIFFERS(apos1,apos2)

      /* isdigit(a[0]) != isdigit(b[0])
       * arbitrarily sort the non-digit first */

      if (bpos1 == bpos2) U_RETURN(isnum ? 1 : -1);

      if (isnum)
         {
         /* skip numeric leading zeros */

         while (apos1 < alen && a[apos1] == '0') apos1++;
         while (bpos1 < blen && b[bpos1] == '0') bpos1++;

         /* if one number has more digits, it is greater */

         if (apos2-apos1 > bpos2-bpos1) U_RETURN(1);
         if (apos2-apos1 < bpos2-bpos1) U_RETURN(-1);
         }

      /* do an ordinary lexicographic string comparison */

      uint32_t n1 = apos2-apos1,
               n2 = bpos2-bpos1;

      int cval = memcmp(a+apos1, b+bpos1, U_min(n1, n2));

      if (cval) U_RETURN(cval < 1 ? -1 : 1);

      /* one run is a prefix of the other (it can happen only for the non-digit runs, the numbers
       * have the same number of digits here): the shorter one sorts first. Without this check
       * "1.0a" and "1.0alpha" would be reported as the same version */

      if (n1 != n2) U_RETURN(n1 < n2 ? -1 : 1);
      }

   /* ran out of characters in one string, without finding a difference */

   /* maybe they were the same version, but with different leading zeros */

   if (apos2 == alen && bpos2 == blen) U_RETURN(0);

   /* the version with a suffix remaining is greater */

   U_RETURN(apos2 < alen ? 1 : -1);
}
// UString flavour of compareversion(): the comparison is made on the data and the size of the two strings.
__pure int UStringExt::compareversion(const UString& s, const UString& a)
{
   return compareversion(U_STRING_TO_PARAM(s), U_STRING_TO_PARAM(a));
}
// Return true if u_validate_email_address() accepts the content of s.
__pure bool UStringExt::isEmailAddress(const UString& s)
{
   U_TRACE(0, "UStringExt::isEmailAddress(%V)", s.rep)

   bool valid = (u_validate_email_address(U_STRING_TO_PARAM(s)) ? true : false);

   U_RETURN(valid);
}
// Compress the sz bytes at s with mz_compress().
//
// Layout of the result: the 4 bytes of the magic (U_MINIZ_COMPRESS), the original
// size as a 32 bit little endian number, then the compressed data.
UString UStringExt::compress(const char* s, uint32_t sz)
{
   U_TRACE(0, "UStringExt::compress(%.*S,%u)", sz, s, sz)

   const uint32_t header_len = U_CONSTANT_SIZE(U_MINIZ_COMPRESS) + sizeof(uint32_t);

   UString out(header_len + sz + 32);

   // NB: the room for the compressed data is what is left after the WHOLE header (magic + size)
   //     The expression used before (capacity - magic + size) overstated it by 8 bytes...

   mz_ulong out_len = out.capacity() - header_len;

   unsigned char* ptr = (unsigned char*)out.data();

   // copy magic byte

   *(int32_t*)ptr = U_MULTICHAR_CONSTANT32('\x89','M','N','Z'); // U_MINIZ_COMPRESS

   ptr += 4;

   // copy original size

#if __BYTE_ORDER == __LITTLE_ENDIAN
   uint32_t size_original = sz;
#else
   uint32_t size_original = u_invert32(*(uint32_t*)&sz);
#endif

   *(int32_t*)ptr = *(int32_t*)&size_original;

#ifdef DEBUG
   int r =
#endif
   U_SYSCALL(mz_compress, "%p,%p,%p,%u", ptr + sizeof(uint32_t), &out_len, (const unsigned char*)s, sz);

   U_INTERNAL_ASSERT_EQUALS(r, Z_OK)

   out.rep->_length = header_len + out_len;

   // NB: sz may be 0, do not divide by it...

   U_INTERNAL_DUMP("compressed %u bytes into %lu bytes (%u%%)", sz, out_len, (sz ? 100 - (out_len * 100 / sz) : 0))

   U_INTERNAL_ASSERT(UStringExt::isCompress(out))

   U_RETURN_STRING(out);
}
// Decompress with mz_uncompress() the n bytes at s, that must have the layout: 4 bytes of
// magic (U_MINIZ_COMPRESS), original size as a 32 bit little endian number, compressed data.
//
// The null string is returned when the input is too short to hold the header plus some
// data or when mz_uncompress() fails.
UString UStringExt::decompress(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::decompress(%.*S,%u)", n, s, n)

   // check magic byte

   U_INTERNAL_ASSERT(UStringExt::isCompress(s))

   const uint32_t header_len = U_CONSTANT_SIZE(U_MINIZ_COMPRESS) + sizeof(uint32_t);

   // NB: without this check the size is read past the input and n - header_len wraps around...

   if (n <= header_len) return UString::getStringNull();

   // read original size

   const char* ptr = (char*)s + U_CONSTANT_SIZE(U_MINIZ_COMPRESS);

#if __BYTE_ORDER == __LITTLE_ENDIAN
   uint32_t sz = *(uint32_t*)ptr;
#else
   uint32_t sz = u_invert32(*(uint32_t*)ptr);
#endif

   U_INTERNAL_DUMP("sz = %u", sz)

   UString out(sz + 32);

   mz_ulong out_len = out.capacity();

   int r = U_SYSCALL(mz_uncompress, "%p,%p,%p,%u", (unsigned char*)out.rep->data(), &out_len, (const unsigned char*)ptr + sizeof(uint32_t), n - header_len);

   U_INTERNAL_ASSERT_EQUALS(r, Z_OK)

   // NB: the result must be checked in every kind of build: on failure out_len is
   //     not meaningful and using it as length would expose uninitialized memory...

   if (r != Z_OK) return UString::getStringNull();

   U_INTERNAL_DUMP("decompressed %u bytes back into %lu bytes", n - U_CONSTANT_SIZE(U_MINIZ_COMPRESS) - sizeof(uint32_t), out_len)

   out.rep->_length = out_len;

   U_RETURN_STRING(out);
}
// Compress the len bytes at s and return the compressed data (.gz compress).
//
// Without USE_LIBZ the null string is returned. Otherwise, if the estimated output size
// can be allocated from the UFile pool, the data is compressed directly at UFile::pfree
// (by u_gz_deflate() when type <= 1, by zopfli otherwise - with HAVE_OLD_IOSTREAM only
// u_gz_deflate() is used) and the result is built on that memory, that is then taken
// from the pool rounded up to the page size. If the pool cannot be used the data is
// compressed by u_gz_deflate() into a newly allocated string.
//
// NOTE(review): 'type' is passed as it is to u_gz_deflate() on the pool path and as
// (type != 0) on the other one; a non zero value presumably asks for the gzip
// container (the debug code reads the original size from the last 4 bytes) - to be confirmed
UString UStringExt::deflate(const char* s, uint32_t len, int type) // .gz compress
{
U_TRACE(1, "UStringExt::deflate(%.*S,%u,%d)", len, s, len, type)
#ifndef USE_LIBZ
return UString::getStringNull();
#endif
// The zlib documentation states that destination buffer size must be at least 0.1% larger than avail_in plus 12 bytes
// NB: what is reserved here is more than that: 10% (len / 10) plus 12 bytes
uint32_t sz = len + (len / 10) + 12U;
if (UFile::isAllocableFromPool(sz))
{
# ifndef HAVE_OLD_IOSTREAM
if (type <= 1)
# endif
{
# ifdef USE_LIBZ
// NB: from here len is the size of the compressed data
len = u_gz_deflate(s, len, UFile::pfree, type);
# endif
U_INTERNAL_DUMP("u_gz_deflate() = %u", len)
}
# ifndef HAVE_OLD_IOSTREAM
else
{
# ifdef USE_LIBZ // zopfli...
size_t outsize = 0;
ZopfliOptions options;
unsigned char* out = 0;
U_SYSCALL_VOID(ZopfliInitOptions, "%p", &options);
U_SYSCALL_VOID(ZopfliGzipCompress, "%p,%p,%u,%p,%p", &options, (unsigned char*)s, (size_t)len, &out, &outsize);
U_INTERNAL_DUMP("ZopfliGzipCompress(%u) = %u", len, outsize)
// the buffer allocated by zopfli is copied into the pool memory and released
len = outsize;
U_MEMCPY(UFile::pfree, out, len);
U_SYSCALL_VOID(free, "%p", out);
# endif
}
# endif
// the memory used is taken from the pool rounded up to a multiple of the page size
sz = (len + U_PAGEMASK) & ~U_PAGEMASK;
UString result(len, sz, UFile::pfree);
UFile::pfree += sz;
UFile::nfree -= sz;
U_RETURN_STRING(result);
}
// the pool cannot be used: compress into a newly allocated string
UString r(sz);
#ifdef USE_LIBZ
r.rep->_length = u_gz_deflate(s, len, r.rep->data(), (type ? true : false));
#endif
U_INTERNAL_DUMP("u_gz_deflate(%u) = %u", len, r.size())
#ifdef DEBUG
if (type)
{
// the last 4 bytes of the output are dumped as the original size
uint32_t* psize_original = (uint32_t*)r.c_pointer(r.size() - 4);
# if __BYTE_ORDER == __LITTLE_ENDIAN
U_INTERNAL_DUMP("size original = %u (LE)", *psize_original)
# else
U_INTERNAL_DUMP("size original = %u (BE)", u_invert32(*psize_original))
# endif
}
#endif
U_RETURN_STRING(r);
}
// Decompress the sz bytes at ptr (.gz uncompress).
//
// 'space' is the size of the buffer for the decompressed data. If it is 0 and the input
// has the gzip magic the size is read from the last 4 bytes of the input (stored as a
// little endian number); if it is still 0 four times the size of the input is used.
// Without USE_LIBZ the null string is returned.
//
// NOTE(review): the input is assumed to be at least 4 bytes long when it has the gzip
// magic, and the size read from it is trusted as it is - to be confirmed with the callers
UString UStringExt::gunzip(const char* ptr, uint32_t sz, uint32_t space) // .gz uncompress
{
   U_TRACE(0, "UStringExt::gunzip(%.*S,%u,%u)", sz, ptr, sz, space)

   if (space == 0)
      {
      if (isGzip(ptr)) // check magic byte
         {
         uint32_t* trailer = (uint32_t*)(ptr + sz - 4); // read original size

#     if __BYTE_ORDER == __LITTLE_ENDIAN
         space = *trailer;
#     else
         space = u_invert32(*trailer);
#     endif

         U_INTERNAL_DUMP("space = %u", space)
         }

      if (space == 0) space = sz * 4;
      }

#ifdef USE_LIBZ // decompress with zlib
   UString inflated(space);

   inflated.rep->_length = u_gz_inflate(ptr, sz, inflated.rep->data());

   U_INTERNAL_DUMP("u_gz_inflate() = %d", inflated.rep->_length)

   U_RETURN_STRING(inflated);
#else
   return UString::getStringNull();
#endif
}
// convert letter to upper or lower case
//
// Return a copy of the n bytes at s where every byte is converted by u__tolower().
// A null byte is written after the last one. n must be > 0 (checked only by the assertion).
UString UStringExt::tolower(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::tolower(%.*S,%u)", n, s, n)

   U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")

   UString r(n);

   char* dst = r.rep->data();

   r.rep->_length = n;

   for (uint32_t i = 0; i < n; ++i) dst[i] = u__tolower(s[i]);

   dst[n] = '\0';

   U_RETURN_STRING(r);
}
// Return a copy of the n bytes at s where every byte is converted by u__toupper().
// A null byte is written after the last one. n must be > 0 (checked only by the assertion).
UString UStringExt::toupper(const char* s, uint32_t n)
{
   U_TRACE(0, "UStringExt::toupper(%.*S,%u)", n, s, n)

   U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")

   UString r(n);

   char* dst = r.rep->data();

   r.rep->_length = n;

   for (uint32_t i = 0; i < n; ++i) dst[i] = u__toupper(s[i]);

   dst[n] = '\0';

   U_RETURN_STRING(r);
}
// given the name, retrieve a pointer to the value element from headers "name1:value1\nname2:value2\n"...
//
// The name is searched in the len bytes of buffer that start at pos. A match is accepted
// only if (after optional white space) it is followed by ':', otherwise the search goes
// on after it. With nocase the search is repeated ignoring the case when the exact one
// finds nothing. The name must not contain ':'.
//
// Returns a pointer to the first byte after the ':' that is not a white space,
// or 0 if the name is not found.
//
// NOTE(review): the scans for white space are not bounded by the end of the
// buffer - presumably they rely on a terminator after the data, to be confirmed
__pure const char* UStringExt::getValueFromName(const UString& buffer, uint32_t pos, uint32_t len, const char* name, uint32_t name_len, bool nocase)
{
U_TRACE(0, "UStringExt::getValueFromName(%V,%u,%u,%.*S,%u,%b)", buffer.rep, pos, len, name_len, name, name_len, nocase)
U_INTERNAL_ASSERT(buffer)
U_INTERNAL_ASSERT_MAJOR(len, 0)
U_ASSERT_EQUALS(memchr(name, ':', name_len), 0)
const char* ptr_header_value;
// end is the offset where the region to search finishes (used to shrink len at every retry)
uint32_t header_line, end = pos + len;
loop:
header_line = buffer.find(name, pos, name_len, len);
if (header_line == U_NOT_FOUND)
{
if (nocase)
{
header_line = buffer.findnocase(name, pos, name_len, len);
if (header_line != U_NOT_FOUND) goto next;
}
U_RETURN((const char*)0);
}
next:
U_INTERNAL_DUMP("header_line = %.*S", 20, buffer.c_pointer(header_line))
// skip the name and the white space that may follow it
ptr_header_value = buffer.c_pointer(header_line + name_len);
while (u__isspace(*ptr_header_value)) ++ptr_header_value;
if (*ptr_header_value != ':')
{
// not a header name (no ':' after it): search again from here
pos = buffer.distance(ptr_header_value);
len = end - pos;
goto loop;
}
// skip the ':' and the white space that follows it
do { ++ptr_header_value; } while (u__isspace(*ptr_header_value));
U_INTERNAL_DUMP("ptr_header_value = %.*S", 20, ptr_header_value)
return ptr_header_value;
}
// retrieve information on form elements as couple <name1>=<value1>&<name2>=<value2>&...
//
// The names and the values found in content are appended, alternately (name, value,
// name, value...), to name_value. The delimiters between the couples are the dlen
// characters at delim. When the only delimiter is '&' the content is handled as form
// data: names and values that are URL encoded are decoded. With other delimiters a value
// may be quoted with '"' (the quotes are removed) and runs of delimiters are skipped.
// An empty name or value is stored as the null string.
//
// Returns the number of elements appended to name_value.
uint32_t UStringExt::getNameValueFromData(const UString& content, UVector<UString>& name_value, const char* delim, uint32_t dlen)
{
U_TRACE(0, "UStringExt::getNameValueFromData(%V,%p,%.*S,%u)", content.rep, &name_value, dlen, delim, dlen)
U_INTERNAL_ASSERT(content)
U_INTERNAL_ASSERT_POINTER(delim)
// Parse the data in one fell swoop for efficiency
uint32_t n = content.size();
const char* s = content.data();
const char* p = s;
const char* _end = s + n;
// bform: the only delimiter is '&' (form data) - burl: the form data looks URL encoded
bool bform = (dlen == 1 && *delim == '&'),
burl = (bform ? u_isUrlEncoded(s, n, true) : false);
UString x;
// p and oldPos are the pointer and the offset of the start of the element being parsed
uint32_t old_size = name_value.size(), oldPos = 0, pos = 0, len, result;
U_INTERNAL_DUMP("bform = %b burl = %b", bform, burl)
while (s < _end)
{
// Find the '=' separating the name from its value
if (*s != '=')
{
++s;
continue;
}
len = s - p;
if (len)
{
U_INTERNAL_DUMP("oldPos = %u p(%u) = %.*S", oldPos, len, len, p)
U_INTERNAL_ASSERT_EQUALS(p, content.c_pointer(oldPos))
if (burl == false ||
u_isUrlEncoded(p, len, false) == false)
{
name_value.push_back(content.substr(oldPos, len));
}
else
{
// name is URL encoded...
x.setBuffer(len);
Url::decode(p, len, x);
name_value.push_back(x);
}
oldPos += len + 1;
}
else
{
// empty name
name_value.push_back(UString::getStringNull());
++oldPos;
}
// the value starts just after the '='
p = ++s;
// Find the delimiter separating subsequent name/value pairs
if (bform)
{
while ( s < _end &&
*s != '&')
{
++s;
}
len = s - p;
}
else
{
// check if string is quoted...
if (*s == '"') s = u_find_char(++s, _end, '"'); // find char '"' not quoted
pos = content.find_first_of(delim, content.distance(s), dlen);
// Even if a delimiter wasn't found the rest of the string is a value and value is already decoded...
len = (pos == U_NOT_FOUND ? n : pos) - oldPos;
}
if (len)
{
U_INTERNAL_DUMP("oldPos = %u p(%u) = %.*S", oldPos, len, len, p)
U_INTERNAL_ASSERT_EQUALS(p, content.c_pointer(oldPos))
if (bform)
{
if (burl == false ||
u_isUrlEncoded(p, len, false) == false)
{
name_value.push_back(content.substr(oldPos, len));
}
else
{
// value is URL encoded...
x.setBuffer(len);
Url::decode(p, len, x);
name_value.push_back(x);
}
}
else
{
// the value is stored without the quotes, if any
x = content.substr(oldPos, len);
if (x.isQuoted()) x.rep->unQuote();
name_value.push_back(x);
}
oldPos += len + 1;
}
else
{
// empty value
name_value.push_back(UString::getStringNull());
++oldPos;
}
// Update parse position
if (bform) p = ++s;
else
{
if (pos == U_NOT_FOUND) break;
// skip all the consecutive delimiters: the next name starts at the first byte that is not one of them
s = content.c_pointer(pos);
while (++s < _end && memchr(delim, *s, dlen)) {}
oldPos = content.distance((p = s));
}
}
result = (name_value.size() - old_size);
U_RETURN(result);
}
void UStringExt::buildTokenInt(const char* token, uint32_t value, UString& buffer)
{
U_TRACE(0, "UStringExt::buildTokenInt(%S,%u,%V)", token, value, buffer.rep)
U_INTERNAL_ASSERT_POINTER(token)
U_INTERNAL_ASSERT(u__strlen(token, __PRETTY_FUNCTION__) == U_TOKEN_NM)
uint32_t start = buffer.size();
char* ptr = buffer.c_pointer(start);
U_MEMCPY(ptr, token, U_TOKEN_NM);
u_int2hex(ptr + U_TOKEN_NM, value);
buffer.size_adjust(start + U_TOKEN_LN);
}
// Minify CSS/JS by removing comments and unnecessary whitespace
static inline bool unextendable(char c)
{
U_TRACE(0, "::unextendable(%C)", c)
// return true for any character that never needs to be separated from other characters via whitespace
switch (c)
{
case '[':
case ']':
case '{':
case '}':
case '/':
case ';':
case ':': U_RETURN(true);
default: U_RETURN(false);
}
}
// Return true for any character that must be separated from other "extendable"
// characters by whitespace on its _right_ in order to keep tokens separate:
// that is every character except the unextendable ones and the left paren

static inline bool isExtendableOnRight(char c)
{
   U_TRACE(0, "::isExtendableOnRight(%C)", c)

   // NB: left paren only here -- see http://code.google.com/p/page-speed/issues/detail?id=339

   bool result = (unextendable(c) == false &&
                  c != '(');

   U_RETURN(result);
}
// Return true for any character that must be separated from other "extendable"
// characters by whitespace on its _left_ in order to keep tokens separate:
// that is every character except the unextendable ones and the right paren

static inline bool isExtendableOnLeft(char c)
{
   U_TRACE(0, "::isExtendableOnLeft(%C)", c)

   // NB: right paren only here

   bool result = (unextendable(c) == false &&
                  c != ')');

   U_RETURN(result);
}
// Return a minified copy of the n bytes of CSS/JS source at s.
//
// The input is scanned as a sequence of tokens:
//
//  - block comments and line comments (up to, not including, the newline) are dropped
//  - a run of whitespace is dropped, or replaced by a single space when the input characters
//    on both of its sides need one to stay separate tokens
//  - single/double-quoted strings are copied verbatim (a backslash escapes the next character)
//  - any other character is copied, and a newline is added after every '}'
//
// The scan stops as soon as the output has reached the size of the input (sz >= n), so in that
// case the returned string holds only the part produced so far.
//
// NOTE(review): the whitespace test looks at the raw input characters around the run, so when
// one of them is the '/' of a comment delimiter no space is emitted ("a /* x */ b" gives "ab")

UString UStringExt::minifyCssJs(const char* s, uint32_t n)
{
   U_TRACE(0+256, "UStringExt::minifyCssJs(%.*S,%u)", n, s, n)

   U_INTERNAL_ASSERT_MAJOR_MSG(n, 0, "elaborazione su stringa vuota: inserire if empty()...")

   // NB: the output size is tested only at the end of each iteration: an iteration starts with sz < n and can
   //     add 2 bytes ('}' plus newline) or a whole quoted string (at most n bytes), so the buffer must be able
   //     to hold 2*n bytes to make every write safe...

   char quote;
   UString r(n + n);
   const char* start;
   char* str = r.data();
   uint32_t sz1, sz = 0;
   const char* begin = s;
   const char* _end = s + n;

   // we have these tokens: comment, whitespace, single/double-quoted string, and other

   while (s < _end)
      {
      // NB: the bound on (s + 1) is checked before the character is read...

      if ((s + 1) < _end &&
          *s       == '/' &&
          *(s + 1) == '*')
         {
         // block comment: scan to end of comment (or to end of input if it is not terminated)

         for (s += 2; s < _end; ++s)
            {
            if ((s + 1) < _end &&
                *s       == '*' &&
                *(s + 1) == '/')
               {
               s += 2;

               break;
               }
            }
         }
      else if ((s + 1) < _end &&
               *s       == '/' &&
               *(s + 1) == '/')
         {
         // line comment: scan to end of line, the newline is handled as whitespace by the next iteration

         for (s += 2; s < _end && *s != '\n'; ++s) {}
         }
      else if (u__isspace(*s))
         {
         // whitespace: scan to end of whitespace; put a single space into the
         // consumer if necessary to separate tokens, otherwise put nothing

         start = s;

         do { ++s; } while (s < _end && u__isspace(*s));

         if (s < _end       &&
             start > begin  &&
             isExtendableOnRight(*(start - 1)) &&
             isExtendableOnLeft(*s))
            {
            str[sz++] = ' ';
            }
         }
      else if (u__isquote(*s))
         {
         // single/double-quoted string: scan to end of string (first unescaped quote of the
         // same kind used to open the string), and put the whole string into the consumer

         start = s;
         quote = *s++;

         while (s < _end)
            {
            if (*s == quote)
               {
               ++s;

               break;
               }
            else if (*s == '\\' && (s + 1) < _end)
               {
               s += 2; // skip the escaped character
               }
            else
               {
               ++s;
               }
            }

         sz1 = (s - start);

         U_MEMCPY(str + sz, start, sz1); // result.append(start, sz1);

         sz += sz1;
         }
      else
         {
         // other: just copy the character over

         str[sz++] = *s;

         if (*s == '}')
            {
            // add a newline after each closing brace to prevent output lines from being too long

            str[sz++] = '\n';
            }

         ++s;
         }

      // stop when the output is not smaller than the input

      if (sz >= n) goto end;
      }

   U_INTERNAL_ASSERT(sz <= n)

end:
   r.size_adjust(sz);

   U_RETURN_STRING(r);
}