implement POSIX regexp support master
author: Jo-Philipp Wich <jo@mein.io>
Sun, 4 Feb 2018 17:36:07 +0000 (18:36 +0100)
committer: Jo-Philipp Wich <jo@mein.io>
Sun, 4 Feb 2018 17:45:21 +0000 (18:45 +0100)
Introduce a new operator `~` and new `/.../eis` regular expression syntax.

This allows filtering by regular expression, e.g.

   jsonfilter -s '[ "foo", "bar", "baz" ]' -e '$[@ ~ /^b/]'

... would yield the values `bar` and `baz`.

Possible regular expression modifiers are:

  - `e` ... enable extended POSIX regular expressions
  - `i` ... perform case-insensitive matches
  - `s` ... let ranges and `.` match the newline character

A regular expression literal may occur on the left or the right side of
the `~` operator, but not on both.

In case neither side of the `~` operator is a regular expression, the right
side will be treated as a regular expression pattern. Non-string values are
converted to their string representation before performing matching.

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
lexer.c
lexer.h
matcher.c
matcher.h
parser.y

diff --git a/lexer.c b/lexer.c
index ca5880e..c016d41 100644 (file)
--- a/lexer.c
+++ b/lexer.c
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#include <regex.h>
 
 #include "ast.h"
 #include "lexer.h"
 
 #include "ast.h"
 #include "lexer.h"
@@ -236,7 +237,21 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
                                case 'r': *out = '\r'; break;
                                case 't': *out = '\t'; break;
                                case 'v': *out = '\v'; break;
                                case 'r': *out = '\r'; break;
                                case 't': *out = '\t'; break;
                                case 'v': *out = '\v'; break;
-                               default:  *out = *in; break;
+                               default:
+                                       /* in regexp mode, retain backslash */
+                                       if (q == '/')
+                                       {
+                                               if (rem-- < 1)
+                                               {
+                                                       s->error_pos = s->off + (in - buf);
+                                                       return -3;
+                                               }
+
+                                               *out++ = '\\';
+                                       }
+
+                                       *out = *in;
+                                       break;
                                }
 
                                in++;
                                }
 
                                in++;
@@ -278,6 +293,58 @@ parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
 
 
 /*
 
 
 /*
+ * Parses a regexp literal from the given buffer.
+ *
+ * Returns a negative value on error, otherwise the amount of consumed
+ * characters from the given buffer.
+ *
+ * Error values:
+ *  -1 Unterminated regexp
+ *  -2 Invalid escape sequence
+ *  -3 Regexp literal too long
+ */
+
+static int
+parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
+{
+       /* the pattern body itself is lexed by the string parser */
+       int len = parse_string(buf, op, s);
+       const char *p;
+
+       /* hand error codes and too-short results back to the caller as-is */
+       if (len < 2)
+               return len;
+
+       /* flags used when no modifier follows the literal */
+       op->num = REG_NOSUB | REG_NEWLINE;
+
+       /*
+        * Consume the modifier characters following the literal. The loop
+        * has to test the character, not the pointer: `p` itself is never
+        * NULL. Any non-modifier character, including the terminating NUL,
+        * ends the literal.
+        */
+       for (p = buf + len; *p; p++)
+       {
+               switch (*p)
+               {
+               case 'e':
+                       /* extended POSIX regular expression syntax */
+                       op->num |= REG_EXTENDED;
+                       break;
+
+               case 'i':
+                       /* case-insensitive matching */
+                       op->num |= REG_ICASE;
+                       break;
+
+               case 's':
+                       /* drop the default newline-sensitive matching */
+                       op->num &= ~REG_NEWLINE;
+                       break;
+
+               default:
+                       /* first character that is not a modifier */
+                       return len;
+               }
+
+               /* every accepted modifier is part of the consumed token */
+               len++;
+       }
+
+       return len;
+}
+
+
+/*
  * Parses a label from the given buffer.
  *
  * Returns a negative value on error, otherwise the amount of consumed
  * Parses a label from the given buffer.
  *
  * Returns a negative value on error, otherwise the amount of consumed
@@ -367,8 +434,10 @@ static const struct token tokens[] = {
        { T_LT,                 "<",     1 },
        { T_GT,                 ">",     1 },
        { T_EQ,                 "=",     1 },
        { T_LT,                 "<",     1 },
        { T_GT,                 ">",     1 },
        { T_EQ,                 "=",     1 },
+       { T_MATCH,              "~",     1 },
        { T_NOT,                "!",     1 },
        { T_WILDCARD,   "*",     1 },
        { T_NOT,                "!",     1 },
        { T_WILDCARD,   "*",     1 },
+       { T_REGEXP,             "/",     1, parse_regexp },
        { T_STRING,             "'",     1, parse_string },
        { T_STRING,             "\"",    1, parse_string },
        { T_LABEL,              "_",     1, parse_label  },
        { T_STRING,             "'",     1, parse_string },
        { T_STRING,             "\"",    1, parse_string },
        { T_LABEL,              "_",     1, parse_label  },
@@ -378,7 +447,7 @@ static const struct token tokens[] = {
        { T_NUMBER,             "09",    0, parse_number },
 };
 
        { T_NUMBER,             "09",    0, parse_number },
 };
 
-const char *tokennames[23] = {
+const char *tokennames[25] = {
        [0]                             = "End of file",
        [T_AND]                 = "'&&'",
        [T_OR]                  = "'||'",
        [0]                             = "End of file",
        [T_AND]                 = "'&&'",
        [T_OR]                  = "'||'",
@@ -389,12 +458,14 @@ const char *tokennames[23] = {
        [T_GE]                  = "'>='",
        [T_LT]                  = "'<'",
        [T_LE]                  = "'<='",
        [T_GE]                  = "'>='",
        [T_LT]                  = "'<'",
        [T_LE]                  = "'<='",
+       [T_MATCH]       = "'~'",
        [T_NOT]                 = "'!'",
        [T_LABEL]               = "Label",
        [T_ROOT]                = "'$'",
        [T_THIS]                = "'@'",
        [T_DOT]                 = "'.'",
        [T_WILDCARD]    = "'*'",
        [T_NOT]                 = "'!'",
        [T_LABEL]               = "Label",
        [T_ROOT]                = "'$'",
        [T_THIS]                = "'@'",
        [T_DOT]                 = "'.'",
        [T_WILDCARD]    = "'*'",
+       [T_REGEXP]      = "/.../",
        [T_BROPEN]              = "'['",
        [T_BRCLOSE]             = "']'",
        [T_BOOL]                = "Bool",
        [T_BROPEN]              = "'['",
        [T_BRCLOSE]             = "']'",
        [T_BOOL]                = "Bool",
diff --git a/lexer.h b/lexer.h
index 0906f76..a47c154 100644 (file)
--- a/lexer.h
+++ b/lexer.h
@@ -19,7 +19,7 @@
 
 #include "ast.h"
 
 
 #include "ast.h"
 
-extern const char *tokennames[23];
+extern const char *tokennames[25];
 
 struct jp_opcode *
 jp_get_token(struct jp_state *s, const char *input, int *mlen);
 
 struct jp_opcode *
 jp_get_token(struct jp_state *s, const char *input, int *mlen);
diff --git a/matcher.c b/matcher.c
index 85bd1c5..d2a8767 100644 (file)
--- a/matcher.c
+++ b/matcher.c
@@ -17,6 +17,7 @@
 #include "parser.h"
 #include "matcher.h"
 
 #include "parser.h"
 #include "matcher.h"
 
+
 static struct json_object *
 jp_match_next(struct jp_opcode *ptr,
               struct json_object *root, struct json_object *cur,
 static struct json_object *
 jp_match_next(struct jp_opcode *ptr,
               struct json_object *root, struct json_object *cur,
@@ -131,6 +132,99 @@ jp_cmp(struct jp_opcode *op, struct json_object *root, struct json_object *cur)
 }
 
 static bool
 }
 
 static bool
+jp_regmatch(struct jp_opcode *op, struct json_object *root, struct json_object *cur)
+{
+       struct jp_opcode left, right;
+       char lbuf[22], rbuf[22], *lval, *rval;
+       int err, rflags = REG_NOSUB | REG_NEWLINE;
+       regex_t preg;
+
+
+       if (!jp_resolve(root, cur, op->down, &left) ||
+           !jp_resolve(root, cur, op->down->sibling, &right))
+               return false;
+
+       if (left.type == T_REGEXP)
+       {
+               switch (right.type)
+               {
+               case T_BOOL:
+                       lval = right.num ? "true" : "false";
+                       break;
+
+               case T_NUMBER:
+                       snprintf(lbuf, sizeof(lbuf), "%d", right.num);
+                       lval = lbuf;
+                       break;
+
+               case T_STRING:
+                       lval = right.str;
+                       break;
+
+               default:
+                       return false;
+               }
+
+               rval = left.str;
+               rflags = left.num;
+       }
+       else
+       {
+               switch (left.type)
+               {
+               case T_BOOL:
+                       lval = left.num ? "true" : "false";
+                       break;
+
+               case T_NUMBER:
+                       snprintf(lbuf, sizeof(lbuf), "%d", left.num);
+                       lval = lbuf;
+                       break;
+
+               case T_STRING:
+                       lval = left.str;
+                       break;
+
+               default:
+                       return false;
+               }
+
+               switch (right.type)
+               {
+               case T_BOOL:
+                       rval = right.num ? "true" : "false";
+                       break;
+
+               case T_NUMBER:
+                       snprintf(rbuf, sizeof(rbuf), "%d", right.num);
+                       rval = rbuf;
+                       break;
+
+               case T_STRING:
+                       rval = right.str;
+                       break;
+
+               case T_REGEXP:
+                       rval = right.str;
+                       rflags = right.num;
+                       break;
+
+               default:
+                       return false;
+               }
+       }
+
+       if (regcomp(&preg, rval, rflags))
+               return false;
+
+       err = regexec(&preg, lval, 0, NULL, 0);
+
+       regfree(&preg);
+
+       return err ? false : true;
+}
+
+static bool
 jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur,
         int idx, const char *key, jp_match_cb_t cb, void *priv)
 {
 jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur,
         int idx, const char *key, jp_match_cb_t cb, void *priv)
 {
@@ -149,6 +243,9 @@ jp_expr(struct jp_opcode *op, struct json_object *root, struct json_object *cur,
        case T_GE:
                return jp_cmp(op, root, cur);
 
        case T_GE:
                return jp_cmp(op, root, cur);
 
+       case T_MATCH:
+               return jp_regmatch(op, root, cur);
+
        case T_ROOT:
                return !!jp_match(op, root, NULL, NULL);
 
        case T_ROOT:
                return !!jp_match(op, root, NULL, NULL);
 
diff --git a/matcher.h b/matcher.h
index 468ddf2..aac21b9 100644 (file)
--- a/matcher.h
+++ b/matcher.h
@@ -19,6 +19,8 @@
 
 #include <string.h>
 #include <stdbool.h>
 
 #include <string.h>
 #include <stdbool.h>
+#include <stdio.h>
+#include <regex.h>
 
 #ifdef JSONC
        #include <json.h>
 
 #ifdef JSONC
        #include <json.h>
diff --git a/parser.y b/parser.y
index 29b43ba..4d3581e 100644 (file)
--- a/parser.y
+++ b/parser.y
@@ -20,7 +20,7 @@
 %left T_AND.
 %left T_OR.
 %left T_UNION.
 %left T_AND.
 %left T_OR.
 %left T_UNION.
-%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE.
+%nonassoc T_EQ T_NE T_GT T_GE T_LT T_LE T_MATCH.
 %right T_NOT.
 
 %include {
 %right T_NOT.
 
 %include {
@@ -87,11 +87,13 @@ cmp_exp(A) ::= unary_exp(B) T_GT unary_exp(C).              { A = alloc_op(T_GT, 0, NULL, B,
 cmp_exp(A) ::= unary_exp(B) T_GE unary_exp(C).         { A = alloc_op(T_GE, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_EQ unary_exp(C).         { A = alloc_op(T_EQ, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_NE unary_exp(C).         { A = alloc_op(T_NE, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_GE unary_exp(C).         { A = alloc_op(T_GE, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_EQ unary_exp(C).         { A = alloc_op(T_EQ, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B) T_NE unary_exp(C).         { A = alloc_op(T_NE, 0, NULL, B, C); }
+cmp_exp(A) ::= unary_exp(B) T_MATCH unary_exp(C).      { A = alloc_op(T_MATCH, 0, NULL, B, C); }
 cmp_exp(A) ::= unary_exp(B).                                           { A = B; }
 
 unary_exp(A) ::= T_BOOL(B).                                                    { A = B; }
 unary_exp(A) ::= T_NUMBER(B).                                          { A = B; }
 unary_exp(A) ::= T_STRING(B).                                          { A = B; }
 cmp_exp(A) ::= unary_exp(B).                                           { A = B; }
 
 unary_exp(A) ::= T_BOOL(B).                                                    { A = B; }
 unary_exp(A) ::= T_NUMBER(B).                                          { A = B; }
 unary_exp(A) ::= T_STRING(B).                                          { A = B; }
+unary_exp(A) ::= T_REGEXP(B).                                          { A = B; }
 unary_exp(A) ::= T_WILDCARD(B).                                                { A = B; }
 unary_exp(A) ::= T_POPEN or_exps(B) T_PCLOSE.          { A = B; }
 unary_exp(A) ::= T_NOT unary_exp(B).                           { A = alloc_op(T_NOT, 0, NULL, B); }
 unary_exp(A) ::= T_WILDCARD(B).                                                { A = B; }
 unary_exp(A) ::= T_POPEN or_exps(B) T_PCLOSE.          { A = B; }
 unary_exp(A) ::= T_NOT unary_exp(B).                           { A = alloc_op(T_NOT, 0, NULL, B); }