X-Git-Url: http://git.archive.openwrt.org/?p=project%2Fjsonpath.git;a=blobdiff_plain;f=lexer.c;fp=lexer.c;h=3703d56d7bc4500ec07213a2d140442c21e5ff52;hp=0000000000000000000000000000000000000000;hb=f3830138661374ca10fe6a0b6f2f4b949dea3e5c;hpb=960dafd0b61eb14032d13c1562566618be55133f diff --git a/lexer.c b/lexer.c new file mode 100644 index 0000000..3703d56 --- /dev/null +++ b/lexer.c @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2013-2014 Jo-Philipp Wich + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ast.h" +#include "lexer.h" +#include "parser.h" + + +struct token { + int type; + const char *pat; + int plen; + int (*parse)(const char *buf, struct jp_opcode *op); +}; + +#define dec(o) \ + ((o) - '0') + +#define hex(x) \ + (((x) >= 'a') ? (10 + (x) - 'a') : \ + (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) + +/* + * Stores the given codepoint as a utf8 multibyte sequence into the given + * output buffer and substracts the required amount of bytes from the given + * length pointer. + * + * Returns false if the multibyte sequence would not fit into the buffer, + * otherwise true. + */ + +static bool +utf8enc(char **out, int *rem, int code) +{ + if (code > 0 && code <= 0x7F) + { + if (*rem < 1) + return false; + + *(*out++) = code; (*rem)--; + return true; + } + else if (code > 0 && code <= 0x7FF) + { + if (*rem < 2) + return false; + + *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + return true; + } + else if (code > 0 && code <= 0xFFFF) + { + if (*rem < 3) + return false; + + *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; + *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + return true; + } + else if (code > 0 && code <= 0x10FFFF) + { + if (*rem < 4) + return false; + + *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; + *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; + *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; + return true; + } + + return true; +} + + +/* + * Parses a string literal from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -1 Unterminated string + * -2 Invalid escape sequence + * -3 String literal too long + */ + +static int +parse_string(const char *buf, struct jp_opcode *op) +{ + char q = *(buf++); + char str[128] = { 0 }; + char *out = str; + const char *in = buf; + bool esc = false; + int rem = sizeof(str) - 1; + int code; + + while (*in) + { + /* continuation of escape sequence */ + if (esc) + { + /* \uFFFF */ + if (in[0] == 'u') + { + if (isxdigit(in[1]) && isxdigit(in[2]) && + isxdigit(in[3]) && isxdigit(in[4])) + { + if (!utf8enc(&out, &rem, + hex(in[1]) * 16 * 16 * 16 + + hex(in[2]) * 16 * 16 + + hex(in[3]) * 16 + + hex(in[4]))) + return -3; + + in += 5; + } + else + { + return -2; + } + } + + /* \xFF */ + else if (in[0] == 'x') + { + if (isxdigit(in[1]) && isxdigit(in[2])) + { + if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) + return -3; + + in += 3; + } + else + { + return -2; + } + } + + /* \377, \77 or \7 */ + else if (in[0] >= '0' && in[0] <= '7') + { + /* \377 */ + if (in[1] >= '0' && in[1] <= '7' && + in[2] >= '0' && in[2] <= '7') + { + code = dec(in[0]) * 8 * 8 + + dec(in[1]) * 8 + + dec(in[2]); + + if (code > 255) + return -2; + + if (!utf8enc(&out, &rem, code)) + return -3; + + in += 3; + } + + /* \77 */ + else if (in[1] >= '0' && in[1] <= '7') + { + if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) + return -3; + + in += 2; + } + + /* \7 */ + else + { + if (!utf8enc(&out, &rem, dec(in[0]))) + return -3; + + in += 1; + } + } + + /* single character escape */ + else + { + if (rem-- < 1) + return -3; + + switch (in[0]) + { + case 'a': *out = '\a'; break; + case 'b': *out = '\b'; break; + case 'e': *out = '\e'; break; + case 'f': *out = '\f'; break; + case 'n': *out = '\n'; break; + case 'r': *out = '\r'; break; + case 't': *out = '\t'; break; + case 'v': *out = '\v'; break; + default: *out = *in; break; + } + + in++; + out++; + } + + esc = false; + } + + /* begin of escape sequence */ + else if (*in == '\\') + { + in++; + esc = true; + } + + /* terminating quote */ + else if (*in == q) + { + op->str = strdup(str); + return (in - buf) + 2; + } + + /* ordinary char */ + else + { + if (rem-- < 1) + return -3; + + *out++ = *in++; + } + } + + return -1; +} + + +/* + * Parses a label from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -3 Label too long + */ + +static int +parse_label(const char *buf, struct jp_opcode *op) +{ + char str[128] = { 0 }; + char *out = str; + const char *in = buf; + int rem = sizeof(str) - 1; + + while (*in == '_' || isalnum(*in)) + { + if (rem-- < 1) + return -3; + + *out++ = *in++; + } + + if (!strcmp(str, "true") || !strcmp(str, "false")) + { + op->num = (str[0] == 't'); + op->type = T_BOOL; + } + else + { + op->str = strdup(str); + } + + return (in - buf); +} + + +/* + * Parses a number literal from the given buffer. + * + * Returns a negative value on error, otherwise the amount of consumed + * characters from the given buffer. + * + * Error values: + * -2 Invalid number character + */ + +static int +parse_number(const char *buf, struct jp_opcode *op) +{ + char *e; + int n = strtol(buf, &e, 10); + + if (e == buf) + return -2; + + op->num = n; + + return (e - buf); +} + +static const struct token tokens[] = { + { 0, " ", 1 }, + { 0, "\t", 1 }, + { 0, "\n", 1 }, + { T_LE, "<=", 2 }, + { T_GE, ">=", 2 }, + { T_NE, "!=", 2 }, + { T_AND, "&&", 2 }, + { T_OR, "||", 2 }, + { T_DOT, ".", 1 }, + { T_BROPEN, "[", 1 }, + { T_BRCLOSE, "]", 1 }, + { T_POPEN, "(", 1 }, + { T_PCLOSE, ")", 1 }, + { T_UNION, ",", 1 }, + { T_ROOT, "$", 1 }, + { T_THIS, "@", 1 }, + { T_LT, "<", 1 }, + { T_GT, ">", 1 }, + { T_EQ, "=", 1 }, + { T_NOT, "!", 1 }, + { T_WILDCARD, "*", 1 }, + { T_STRING, "'", 1, parse_string }, + { T_STRING, "\"", 1, parse_string }, + { T_LABEL, "_", 1, parse_label }, + { T_LABEL, "az", 0, parse_label }, + { T_LABEL, "AZ", 0, parse_label }, + { T_NUMBER, "-", 1, parse_number }, + { T_NUMBER, "09", 0, parse_number }, +}; + +const char *tokennames[23] = { + [0] = "End of file", + [T_AND] = "'&&'", + [T_OR] = "'||'", + [T_UNION] = "','", + [T_EQ] = "'='", + [T_NE] = "'!='", + [T_GT] = "'>'", + [T_GE] = "'>='", + [T_LT] = "'<'", + [T_LE] = "'<='", + [T_NOT] = "'!'", + [T_LABEL] = "Label", + [T_ROOT] = "'$'", + [T_THIS] = "'@'", + [T_DOT] = "'.'", + [T_WILDCARD] = "'*'", + [T_BROPEN] = "'['", + [T_BRCLOSE] = "']'", + [T_BOOL] = "Bool", + [T_NUMBER] = "Number", + [T_STRING] = "String", + [T_POPEN] = "'('", + [T_PCLOSE] = "')'", +}; + + +static int +match_token(const char *ptr, struct jp_opcode *op) +{ + int i; + const struct token *tok; + + for (i = 0, tok = &tokens[0]; + i < sizeof(tokens) / sizeof(tokens[0]); + i++, tok = &tokens[i]) + { + if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) || + (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1])) + { + op->type = tok->type; + + if (tok->parse) + return tok->parse(ptr, op); + + return tok->plen; + } + } + + return -1; +} + +struct jp_opcode * +jp_get_token(struct jp_state *s, const char *input, int *mlen) +{ + struct jp_opcode op = { 0 }; + + *mlen = match_token(input, &op); + + if (*mlen < 0 || op.type == 0) + return NULL; + + return jp_alloc_op(s, op.type, op.num, op.str, NULL); +}