implement POSIX regexp support
[project/jsonpath.git] / lexer.c
diff --git a/lexer.c b/lexer.c
index 3703d56..c016d41 100644 (file)
--- a/lexer.c
+++ b/lexer.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2013-2014 Jo-Philipp Wich <jow@openwrt.org>
+ * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <regex.h>
 
 #include "ast.h"
 #include "lexer.h"
@@ -28,7 +29,7 @@ struct token {
        int type;
        const char *pat;
        int plen;
-       int (*parse)(const char *buf, struct jp_opcode *op);
+       int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
 };
 
 #define dec(o) \
@@ -55,7 +56,7 @@ utf8enc(char **out, int *rem, int code)
                if (*rem < 1)
                        return false;
 
-               *(*out++) = code; (*rem)--;
+               *(*out)++ = code; (*rem)--;
                return true;
        }
        else if (code > 0 && code <= 0x7FF)
@@ -106,7 +107,7 @@ utf8enc(char **out, int *rem, int code)
  */
 
 static int
-parse_string(const char *buf, struct jp_opcode *op)
+parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 {
        char q = *(buf++);
        char str[128] = { 0 };
@@ -132,12 +133,16 @@ parse_string(const char *buf, struct jp_opcode *op)
                                                     hex(in[2]) * 16 * 16 +
                                                     hex(in[3]) * 16 +
                                                     hex(in[4])))
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -3;
+                                       }
 
                                        in += 5;
                                }
                                else
                                {
+                                       s->error_pos = s->off + (in - buf);
                                        return -2;
                                }
                        }
@@ -148,12 +153,16 @@ parse_string(const char *buf, struct jp_opcode *op)
                                if (isxdigit(in[1]) && isxdigit(in[2]))
                                {
                                        if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -3;
+                                       }
 
                                        in += 3;
                                }
                                else
                                {
+                                       s->error_pos = s->off + (in - buf);
                                        return -2;
                                }
                        }
@@ -170,10 +179,16 @@ parse_string(const char *buf, struct jp_opcode *op)
                                               dec(in[2]);
 
                                        if (code > 255)
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -2;
+                                       }
 
                                        if (!utf8enc(&out, &rem, code))
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -3;
+                                       }
 
                                        in += 3;
                                }
@@ -182,7 +197,10 @@ parse_string(const char *buf, struct jp_opcode *op)
                                else if (in[1] >= '0' && in[1] <= '7')
                                {
                                        if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -3;
+                                       }
 
                                        in += 2;
                                }
@@ -191,7 +209,10 @@ parse_string(const char *buf, struct jp_opcode *op)
                                else
                                {
                                        if (!utf8enc(&out, &rem, dec(in[0])))
+                                       {
+                                               s->error_pos = s->off + (in - buf);
                                                return -3;
+                                       }
 
                                        in += 1;
                                }
@@ -201,7 +222,10 @@ parse_string(const char *buf, struct jp_opcode *op)
                        else
                        {
                                if (rem-- < 1)
+                               {
+                                       s->error_pos = s->off + (in - buf);
                                        return -3;
+                               }
 
                                switch (in[0])
                                {
@@ -213,7 +237,21 @@ parse_string(const char *buf, struct jp_opcode *op)
                                case 'r': *out = '\r'; break;
                                case 't': *out = '\t'; break;
                                case 'v': *out = '\v'; break;
-                               default:  *out = *in; break;
+                               default:
+                                       /* in regexp mode, retain backslash */
+                                       if (q == '/')
+                                       {
+                                               if (rem-- < 1)
+                                               {
+                                                       s->error_pos = s->off + (in - buf);
+                                                       return -3;
+                                               }
+
+                                               *out++ = '\\';
+                                       }
+
+                                       *out = *in;
+                                       break;
                                }
 
                                in++;
@@ -241,7 +279,10 @@ parse_string(const char *buf, struct jp_opcode *op)
                else
                {
                        if (rem-- < 1)
+                       {
+                               s->error_pos = s->off + (in - buf);
                                return -3;
+                       }
 
                        *out++ = *in++;
                }
@@ -252,6 +293,58 @@ parse_string(const char *buf, struct jp_opcode *op)
 
 
 /*
+ * Parses a regexp literal and any trailing 'e', 'i' or 's' flag characters.
+ *
+ * Returns a negative value on error, otherwise the amount of consumed
+ * characters from the given buffer.
+ *
+ * Error values:
+ *  -1 Unterminated regexp
+ *  -2 Invalid escape sequence
+ *  -3 Regexp literal too long
+ */
+
+static int
+parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
+{
+       /* the pattern itself is read by the string parser, quoted by '/' */
+       int len = parse_string(buf, op, s);
+       const char *p;
+
+       if (len >= 2)
+       {
+               /* initial flag set; flag characters after the literal adjust it */
+               op->num = REG_NOSUB | REG_NEWLINE;
+               for (p = buf + len; *p; p++)
+               {
+                       switch (*p)
+                       {
+                       case 'e':
+                               op->num |= REG_EXTENDED;
+                               len++;
+                               break;
+
+                       case 'i':
+                               op->num |= REG_ICASE;
+                               len++;
+                               break;
+
+                       case 's':
+                               op->num &= ~REG_NEWLINE;
+                               len++;
+                               break;
+
+                       default:
+                               return len;
+                       }
+               }
+       }
+
+       return len;
+}
+
+
+/*
  * Parses a label from the given buffer.
  *
  * Returns a negative value on error, otherwise the amount of consumed
@@ -262,7 +355,7 @@ parse_string(const char *buf, struct jp_opcode *op)
  */
 
 static int
-parse_label(const char *buf, struct jp_opcode *op)
+parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
 {
        char str[128] = { 0 };
        char *out = str;
@@ -272,7 +365,10 @@ parse_label(const char *buf, struct jp_opcode *op)
        while (*in == '_' || isalnum(*in))
        {
                if (rem-- < 1)
+               {
+                       s->error_pos = s->off + (in - buf);
                        return -3;
+               }
 
                *out++ = *in++;
        }
@@ -302,13 +398,16 @@ parse_label(const char *buf, struct jp_opcode *op)
  */
 
 static int
-parse_number(const char *buf, struct jp_opcode *op)
+parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
 {
        char *e;
        int n = strtol(buf, &e, 10);
 
        if (e == buf)
+       {
+               s->error_pos = s->off;
                return -2;
+       }
 
        op->num = n;
 
@@ -335,8 +434,10 @@ static const struct token tokens[] = {
        { T_LT,                 "<",     1 },
        { T_GT,                 ">",     1 },
        { T_EQ,                 "=",     1 },
+       { T_MATCH,              "~",     1 },
        { T_NOT,                "!",     1 },
        { T_WILDCARD,   "*",     1 },
+       { T_REGEXP,             "/",     1, parse_regexp },
        { T_STRING,             "'",     1, parse_string },
        { T_STRING,             "\"",    1, parse_string },
        { T_LABEL,              "_",     1, parse_label  },
@@ -346,7 +447,7 @@ static const struct token tokens[] = {
        { T_NUMBER,             "09",    0, parse_number },
 };
 
-const char *tokennames[23] = {
+const char *tokennames[25] = {
        [0]                             = "End of file",
        [T_AND]                 = "'&&'",
        [T_OR]                  = "'||'",
@@ -357,12 +458,14 @@ const char *tokennames[23] = {
        [T_GE]                  = "'>='",
        [T_LT]                  = "'<'",
        [T_LE]                  = "'<='",
+       [T_MATCH]       = "'~'",
        [T_NOT]                 = "'!'",
        [T_LABEL]               = "Label",
        [T_ROOT]                = "'$'",
        [T_THIS]                = "'@'",
        [T_DOT]                 = "'.'",
        [T_WILDCARD]    = "'*'",
+       [T_REGEXP]      = "/.../",
        [T_BROPEN]              = "'['",
        [T_BRCLOSE]             = "']'",
        [T_BOOL]                = "Bool",
@@ -374,7 +477,7 @@ const char *tokennames[23] = {
 
 
 static int
-match_token(const char *ptr, struct jp_opcode *op)
+match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
 {
        int i;
        const struct token *tok;
@@ -389,13 +492,14 @@ match_token(const char *ptr, struct jp_opcode *op)
                        op->type = tok->type;
 
                        if (tok->parse)
-                               return tok->parse(ptr, op);
+                               return tok->parse(ptr, op, s);
 
                        return tok->plen;
                }
        }
 
-       return -1;
+       s->error_pos = s->off;
+       return -4;
 }
 
 struct jp_opcode *
@@ -403,10 +507,17 @@ jp_get_token(struct jp_state *s, const char *input, int *mlen)
 {
        struct jp_opcode op = { 0 };
 
-       *mlen = match_token(input, &op);
+       *mlen = match_token(input, &op, s);
 
-       if (*mlen < 0 || op.type == 0)
+       if (*mlen < 0)
+       {
+               s->error_code = *mlen;
                return NULL;
+       }
+       else if (op.type == 0)
+       {
+               return NULL;
+       }
 
        return jp_alloc_op(s, op.type, op.num, op.str, NULL);
 }