2008-01-28 21:37:16 +00:00
|
|
|
/* source: nestlex.c */
|
2016-01-26 18:08:18 +00:00
|
|
|
/* Copyright Gerhard Rieger */
|
2008-01-27 12:00:08 +00:00
|
|
|
/* Published under the GNU General Public License V.2, see file COPYING */
|
|
|
|
|
|
|
|
/* a function for lexical scanning of nested character patterns */
|
|
|
|
|
|
|
|
#include "config.h"
|
|
|
|
#include "mytypes.h"
|
|
|
|
|
|
|
|
#include "sysincludes.h"
|
|
|
|
|
2016-01-26 18:08:18 +00:00
|
|
|
static int _nestlex(const char **addr,
|
|
|
|
char **token,
|
|
|
|
ptrdiff_t *len,
|
|
|
|
const char *ends[],
|
|
|
|
const char *hquotes[],
|
|
|
|
const char *squotes[],
|
|
|
|
const char *nests[],
|
|
|
|
bool dropquotes,
|
|
|
|
bool c_esc,
|
|
|
|
bool html_esc
|
|
|
|
);
|
2008-01-27 12:00:08 +00:00
|
|
|
|
|
|
|
/* sub: scan a string and copy its value to output string
|
|
|
|
end scanning when an unescaped, unnested string from ends array is found
|
|
|
|
does not copy the end pattern
|
|
|
|
does not write a trailing \0 to token
|
|
|
|
allows escaping with \ and quoting (\ and quotes are removed)
|
|
|
|
allows nesting with div. parens
|
|
|
|
returns -1 if out string was too small
|
|
|
|
returns 1 if addr ended unexpectedly
|
|
|
|
returns 0 if token could be extracted successfully
|
|
|
|
*/
|
|
|
|
int nestlex(const char **addr, /* input string; aft points to end token */
|
|
|
|
char **token, /* output token; aft points to first unwritten
|
|
|
|
char (caller might want to set it to \0) */
|
|
|
|
size_t *len, /* remaining bytes in token space (incl. \0) */
|
|
|
|
const char *ends[], /* list of end strings */
|
|
|
|
const char *hquotes[],/* list of strings that quote (hard qu.) */
|
|
|
|
const char *squotes[],/* list of strings that quote softly */
|
|
|
|
const char *nests[],/* list of strings that start nesting;
|
|
|
|
every second one is matching end */
|
|
|
|
bool dropquotes, /* drop the outermost quotes */
|
|
|
|
bool c_esc, /* solve C char escapes: \n \t \0 etc */
|
|
|
|
bool html_esc /* solve HTML char escapes: %0d %08 etc */
|
|
|
|
) {
|
2016-01-26 18:08:18 +00:00
|
|
|
return
|
|
|
|
_nestlex(addr, token, (ptrdiff_t *)len, ends, hquotes, squotes, nests,
|
|
|
|
dropquotes, c_esc, html_esc);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int _nestlex(const char **addr,
|
|
|
|
char **token,
|
|
|
|
ptrdiff_t *len,
|
|
|
|
const char *ends[],
|
|
|
|
const char *hquotes[],
|
|
|
|
const char *squotes[],
|
|
|
|
const char *nests[],
|
|
|
|
bool dropquotes,
|
|
|
|
bool c_esc,
|
|
|
|
bool html_esc
|
|
|
|
) {
|
2008-01-27 12:00:08 +00:00
|
|
|
const char *in = *addr; /* pointer into input string */
|
|
|
|
const char **endx; /* loops over end patterns */
|
|
|
|
const char **quotx; /* loops over quote patterns */
|
|
|
|
const char **nestx; /* loops over nest patterns */
|
|
|
|
char *out = *token; /* pointer into output token */
|
|
|
|
char c;
|
|
|
|
int i;
|
|
|
|
int result;
|
|
|
|
|
|
|
|
while (true) {
|
|
|
|
|
|
|
|
/* is this end of input string? */
|
|
|
|
if (*in == 0) {
|
|
|
|
|
|
|
|
break; /* end of string */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* first check the end patterns (e.g. for ']') */
|
|
|
|
endx = ends; i = 0;
|
|
|
|
while (*endx) {
|
|
|
|
if (!strncmp(in, *endx, strlen(*endx))) {
|
|
|
|
/* this end pattern matches */
|
|
|
|
*addr = in;
|
|
|
|
*token = out;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
++endx;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check for hard quoting pattern */
|
|
|
|
quotx = hquotes;
|
|
|
|
while (hquotes && *quotx) {
|
|
|
|
if (!strncmp(in, *quotx, strlen(*quotx))) {
|
|
|
|
/* this quote pattern matches */
|
|
|
|
const char *endnest[2];
|
|
|
|
if (dropquotes) {
|
|
|
|
/* we strip this quote */
|
|
|
|
in += strlen(*quotx);
|
|
|
|
} else {
|
|
|
|
for (i = strlen(*quotx); i > 0; --i) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) { *addr = in; *token = out; return -1; }
|
|
|
|
}
|
|
|
|
}
|
2016-01-26 18:08:18 +00:00
|
|
|
/* we call _nestlex recursively */
|
2008-01-27 12:00:08 +00:00
|
|
|
endnest[0] = *quotx;
|
|
|
|
endnest[1] = NULL;
|
|
|
|
result =
|
2016-01-26 18:08:18 +00:00
|
|
|
_nestlex(&in, &out, len, endnest, NULL/*hquotes*/,
|
2008-01-27 12:00:08 +00:00
|
|
|
NULL/*squotes*/, NULL/*nests*/,
|
|
|
|
false, c_esc, html_esc);
|
|
|
|
if (result == 0 && dropquotes) {
|
|
|
|
/* we strip this quote */
|
|
|
|
in += strlen(*quotx);
|
2016-01-26 18:08:18 +00:00
|
|
|
} else if (result < 0) {
|
|
|
|
*addr = in; *token = out; return result;
|
2008-01-27 12:00:08 +00:00
|
|
|
} else {
|
|
|
|
/* we copy the trailing quote */
|
|
|
|
for (i = strlen(*quotx); i > 0; --i) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) { *addr = in; *token = out; return -1; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++quotx;
|
|
|
|
}
|
|
|
|
if (hquotes && *quotx != NULL) {
|
|
|
|
/* there was a quote; string might continue with hard quote */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check for soft quoting pattern */
|
|
|
|
quotx = squotes;
|
|
|
|
while (squotes && *quotx) {
|
|
|
|
if (!strncmp(in, *quotx, strlen(*quotx))) {
|
|
|
|
/* this quote pattern matches */
|
|
|
|
/* we strip this quote */
|
2016-01-26 18:08:18 +00:00
|
|
|
/* we call _nestlex recursively */
|
2008-01-27 12:00:08 +00:00
|
|
|
const char *endnest[2];
|
|
|
|
if (dropquotes) {
|
|
|
|
/* we strip this quote */
|
|
|
|
in += strlen(*quotx);
|
|
|
|
} else {
|
|
|
|
for (i = strlen(*quotx); i > 0; --i) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) { *addr = in; *token = out; return -1; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
endnest[0] = *quotx;
|
|
|
|
endnest[1] = NULL;
|
|
|
|
result =
|
2016-01-26 18:08:18 +00:00
|
|
|
_nestlex(&in, &out, len, endnest, hquotes,
|
2008-01-27 12:00:08 +00:00
|
|
|
squotes, nests,
|
|
|
|
false, c_esc, html_esc);
|
|
|
|
|
|
|
|
if (result == 0 && dropquotes) {
|
|
|
|
/* we strip the trailing quote */
|
|
|
|
in += strlen(*quotx);
|
2016-01-26 18:08:18 +00:00
|
|
|
} else if (result < 0) {
|
|
|
|
*addr = in; *token = out; return result;
|
2008-01-27 12:00:08 +00:00
|
|
|
} else {
|
|
|
|
/* we copy the trailing quote */
|
|
|
|
for (i = strlen(*quotx); i > 0; --i) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) { *addr = in; *token = out; return -1; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++quotx;
|
|
|
|
}
|
|
|
|
if (squotes && *quotx != NULL) {
|
|
|
|
/* there was a soft quote; string might continue with any quote */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* check patterns that start a nested clause */
|
|
|
|
nestx = nests; i = 0;
|
|
|
|
while (nests && *nestx) {
|
|
|
|
if (!strncmp(in, *nestx, strlen(*nestx))) {
|
|
|
|
/* this nest pattern matches */
|
|
|
|
const char *endnest[2];
|
|
|
|
endnest[0] = nestx[1];
|
|
|
|
endnest[1] = NULL;
|
|
|
|
|
|
|
|
for (i = strlen(nestx[1]); i > 0; --i) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) { *addr = in; *token = out; return -1; }
|
|
|
|
}
|
|
|
|
|
|
|
|
result =
|
2016-01-26 18:08:18 +00:00
|
|
|
_nestlex(&in, &out, len, endnest, hquotes, squotes, nests,
|
2008-01-27 12:00:08 +00:00
|
|
|
false, c_esc, html_esc);
|
|
|
|
if (result == 0) {
|
|
|
|
/* copy endnest */
|
|
|
|
i = strlen(nestx[1]); while (i > 0) {
|
|
|
|
*out++ = *in++;
|
|
|
|
if (--*len <= 0) {
|
|
|
|
*addr = in;
|
|
|
|
*token = out;
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
--i;
|
|
|
|
}
|
2016-01-26 18:08:18 +00:00
|
|
|
} else if (result < 0) {
|
|
|
|
*addr = in; *token = out; return result;
|
2008-01-27 12:00:08 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
nestx += 2; /* skip matching end pattern in table */
|
|
|
|
}
|
|
|
|
if (nests && *nestx) {
|
|
|
|
/* we handled a nested expression, continue loop */
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* "normal" data, possibly escaped */
|
|
|
|
c = *in++;
|
|
|
|
if (c == '\\') {
|
|
|
|
/* found a plain \ escaped part */
|
|
|
|
c = *in++;
|
|
|
|
if (c == 0) { /* Warn("trailing '\\'");*/ break; }
|
|
|
|
if (c_esc) { /* solve C char escapes: \n \t \0 etc */
|
|
|
|
switch (c) {
|
|
|
|
case '0': c = '\0'; break;
|
|
|
|
case 'a': c = '\a'; break;
|
|
|
|
case 'b': c = '\b'; break;
|
|
|
|
case 'f': c = '\f'; break;
|
|
|
|
case 'n': c = '\n'; break;
|
|
|
|
case 'r': c = '\r'; break;
|
|
|
|
case 't': c = '\t'; break;
|
|
|
|
case 'v': c = '\v'; break;
|
|
|
|
#if LATER
|
|
|
|
case 'x': !!! 1 to 2 hex digits; break;
|
|
|
|
case 'u': !!! 4 hex digits?; break;
|
|
|
|
case 'U': !!! 8 hex digits?; break;
|
|
|
|
#endif
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*out++ = c;
|
|
|
|
--*len;
|
2016-01-26 18:08:18 +00:00
|
|
|
if (*len <= 0) {
|
2008-01-27 12:00:08 +00:00
|
|
|
*addr = in;
|
|
|
|
*token = out;
|
|
|
|
return -1; /* output overflow */
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* just a simple char */
|
|
|
|
*out++ = c;
|
|
|
|
--*len;
|
2016-01-26 18:08:18 +00:00
|
|
|
if (*len <= 0) {
|
2008-01-27 12:00:08 +00:00
|
|
|
*addr = in;
|
|
|
|
*token = out;
|
|
|
|
return -1; /* output overflow */
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
/* never come here? */
|
|
|
|
|
|
|
|
*addr = in;
|
|
|
|
*token = out;
|
|
|
|
return 0; /* OK */
|
|
|
|
}
|