Add support for JSON sequence MIME type

Per draft-ietf-json-text-sequence-07 (which soon will be published as an RFC).
author: Nicolas Williams <nico@cryptonector.com> 2014-10-10 22:19:38 -0500
committer: Nicolas Williams <nico@cryptonector.com> 2014-10-12 08:44:40 -0500
commit: 89791a000ba8fd614d8a8fa59a5ba76f21ea4d1d (patch)
tree: 6aab1b7f73f996652a8e11bd2c74ad633ae55925
parent: 3411167c03fba129d44a0e7c9699767c9e8fd5cd (diff)
5 files changed, 141 insertions, 21 deletions
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index a89feb67..31c7b2f0 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -92,6 +92,17 @@ sections:
 
         Output the jq version and exit with zero.
 
+      * `--seq`:
+
+        Use the `application/json-seq` MIME type scheme for separating
+        JSON texts in jq's input and output.  This means that an ASCII
+        RS (record separator) character is printed before each value on
+        output and an ASCII LF (line feed) is printed after every
+        output.  Input JSON texts that fail to parse are ignored (but
+        warned about), discarding all subsequent input until the next
+        RS.  This mode also parses the output of jq without the `--seq`
+        option.
+
       * `--slurp`/`-s`:
 
         Instead of running the filter for each JSON object in the
diff --git a/jv.h b/jv.h
index 465070ad..08b89aec 100644
--- a/jv.h
+++ b/jv.h
@@ -156,6 +156,8 @@ void jv_dump(jv, int flags);
 void jv_show(jv, int flags);
 jv jv_dump_string(jv, int flags);
 
+#define JV_PARSE_SEQ 1
+
 jv jv_parse(const char* string);
 jv jv_parse_sized(const char* string, int length);
 
diff --git a/jv_parse.c b/jv_parse.c
index 5b703fd7..e534e93b 100644
--- a/jv_parse.c
+++ b/jv_parse.c
@@ -24,6 +24,8 @@ struct jv_parser {
   int curr_buf_is_partial;
   unsigned bom_strip_position;
 
+  int flags;
+
   jv* stack;
   int stackpos;
   int stacklen;
@@ -40,12 +42,15 @@ struct jv_parser {
   enum {
     JV_PARSER_NORMAL,
     JV_PARSER_STRING,
-    JV_PARSER_STRING_ESCAPE
+    JV_PARSER_STRING_ESCAPE,
+    JV_PARSER_WAITING_FOR_RS // parse error, waiting for RS
   } st;
+  unsigned int last_ch_was_ws:1;
 };
 
 
 static void parser_init(struct jv_parser* p) {
+  p->flags = 0;
   p->stack = 0;
   p->stacklen = p->stackpos = 0;
   p->next = jv_invalid();
@@ -60,10 +65,18 @@ static void parser_init(struct jv_parser* p) {
   jvp_dtoa_context_init(&p->dtoa);
 }
 
-static void parser_free(struct jv_parser* p) {
+static void parser_reset(struct jv_parser* p) {
   jv_free(p->next);
+  p->next = jv_invalid();
   for (int i=0; i<p->stackpos; i++) 
     jv_free(p->stack[i]);
+  p->stackpos = 0;
+  p->tokenpos = 0;
+  p->st = JV_PARSER_NORMAL;
+}
+
+static void parser_free(struct jv_parser* p) {
+  parser_reset(p);
   jv_mem_free(p->stack);
   jv_mem_free(p->tokenbuf);
   jvp_dtoa_context_free(&p->dtoa);
@@ -330,9 +343,26 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) {
     p->line++;
     p->column = 0;
   }
+  if (ch == '\036' /* ASCII RS; see draft-ietf-json-text-sequence-07 */) {
+    TRY(check_literal(p));
+    if (p->st == JV_PARSER_NORMAL && check_done(p, out)) {
+      if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(*out) == JV_KIND_NUMBER) {
+        jv_free(*out);
+        *out = jv_invalid();
+        return "Potentially truncated top-level numeric value";
+      }
+      return OK;
+    }
+    parser_reset(p);
+    *out = jv_invalid();
+    return "Truncated value";
+  }
   presult answer = 0;
+  p->last_ch_was_ws = 0;
   if (p->st == JV_PARSER_NORMAL) {
     chclass cls = classify(ch);
+    if (cls == WHITESPACE)
+      p->last_ch_was_ws = 1;
     if (cls != LITERAL) {
       TRY(check_literal(p));
       if (check_done(p, out)) answer = OK;
@@ -373,6 +403,7 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) {
 struct jv_parser* jv_parser_new(int flags) {
   struct jv_parser* p = jv_mem_alloc(sizeof(struct jv_parser));
   parser_init(p);
+  p->flags = flags;
   return p;
 }
 
@@ -412,14 +443,22 @@ jv jv_parser_next(struct jv_parser* p) {
   assert(p->curr_buf && "a buffer must be provided");
   if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM"));
   jv value;
+  char ch;
   presult msg = 0;
   while (!msg && p->curr_buf_pos < p->curr_buf_length) {
-    char ch = p->curr_buf[p->curr_buf_pos++];
+    ch = p->curr_buf[p->curr_buf_pos++];
+    if (ch != '\036' && p->st == JV_PARSER_WAITING_FOR_RS)
+      continue; // need to resync, wait for RS
     msg = scan(p, ch, &value);
   }
   if (msg == OK) {
     return value;
   } else if (msg) {
+    parser_reset(p);
+    if (ch != '\036' && (p->flags & JV_PARSE_SEQ)) {
+      p->st = JV_PARSER_WAITING_FOR_RS;
+      return jv_invalid_with_msg(jv_string_fmt("%s at line %d, column %d (need RS to resync)", msg, p->line, p->column));
+    }
     return jv_invalid_with_msg(jv_string_fmt("%s at line %d, column %d", msg, p->line, p->column));
   } else if (p->curr_buf_is_partial) {
     assert(p->curr_buf_pos == p->curr_buf_length);
@@ -428,16 +467,31 @@ jv jv_parser_next(struct jv_parser* p) {
   } else {
     assert(p->curr_buf_pos == p->curr_buf_length);
     // at EOF
-    if (p->st != JV_PARSER_NORMAL) 
-      return jv_invalid_with_msg(jv_string("Unfinished string"));
-    if ((msg = check_literal(p)))
-      return jv_invalid_with_msg(jv_string(msg));
-    if (p->stackpos != 0)
-      return jv_invalid_with_msg(jv_string("Unfinished JSON term"));
+    if (p->st != JV_PARSER_WAITING_FOR_RS) {
+      if (p->st != JV_PARSER_NORMAL) {
+        parser_reset(p);
+        p->st = JV_PARSER_WAITING_FOR_RS;
+        return jv_invalid_with_msg(jv_string("Unfinished string"));
+      }
+      if ((msg = check_literal(p))) {
+        parser_reset(p);
+        p->st = JV_PARSER_WAITING_FOR_RS;
+        return jv_invalid_with_msg(jv_string(msg));
+      }
+      if (p->stackpos != 0) {
+        parser_reset(p);
+        p->st = JV_PARSER_WAITING_FOR_RS;
+        return jv_invalid_with_msg(jv_string("Unfinished JSON term"));
+      }
+    }
     // p->next is either invalid (nothing here but no syntax error)
     // or valid (this is the value). either way it's the thing to return
     value = p->next;
     p->next = jv_invalid();
+    if ((p->flags & JV_PARSE_SEQ) && !p->last_ch_was_ws && jv_get_kind(value) == JV_KIND_NUMBER) {
+      jv_free(value);
+      return jv_invalid_with_msg(jv_string("Potentially truncated top-level numeric value"));
+    }
     return value;
   }
 }
diff --git a/main.c b/main.c
index 335ca7b2..8ebdb9fc 100644
--- a/main.c
+++ b/main.c
@@ -90,8 +90,9 @@ enum {
   UNBUFFERED_OUTPUT     = 2048,
   EXIT_STATUS           = 4096,
   IN_PLACE              = 8192,
+  SEQ                   = 16384,
   /* debugging only */
-  DUMP_DISASM           = 16384,
+  DUMP_DISASM           = 32768,
 };
 static int options = 0;
 
@@ -122,6 +123,8 @@ static int process(jq_state *jq, jv value, int flags) {
         ret = 11;
       else
         ret = 0;
+      if (options & SEQ)
+        fwrite("\036", 1, 1, stdout);
       jv_dump(result, dumpopts);
     }
     if (!(options & RAW_NO_LF))
@@ -284,6 +287,10 @@ int main(int argc, char* argv[]) {
         options |= IN_PLACE;
         if (!short_opts) continue;
       }
+      if (isoption(argv[i], 0, "seq", &short_opts)) {
+        options |= SEQ;
+        if (!short_opts) continue;
+      }
       if (isoption(argv[i], 'e', "exit-status", &short_opts)) {
         options |= EXIT_STATUS;
         if (!short_opts) continue;
@@ -444,7 +451,7 @@ int main(int argc, char* argv[]) {
         slurped = jv_array();
       }
     }
-    struct jv_parser* parser = jv_parser_new(0);
+    struct jv_parser* parser = jv_parser_new((options & SEQ) ? JV_PARSE_SEQ : 0);
     char buf[4096];
     int is_last = 0;
     while (read_more(buf, sizeof(buf), &is_last)) {
@@ -461,22 +468,28 @@ int main(int argc, char* argv[]) {
       } else {
         jv_parser_set_buf(parser, buf, strlen(buf), !is_last);
         jv value;
-        while (jv_is_valid((value = jv_parser_next(parser)))) {
+        while (jv_is_valid(value = jv_parser_next(parser)) || jv_invalid_has_msg(jv_copy(value))) {
+          if (!jv_is_valid(value)) {
+            jv msg = jv_invalid_get_msg(value);
+            if (!(options & SEQ)) {
+              // We used to treat parse errors as fatal...
+              ret = 4;
+              fprintf(stderr, "parse error: %s\n", jv_string_value(msg));
+              jv_free(msg);
+              break;
+            }
+            fprintf(stderr, "ignoring parse error: %s\n", jv_string_value(msg));
+            jv_free(msg);
+            // ...but with --seq we attempt to recover.
+            continue;
+          }
           if (options & SLURP) {
             slurped = jv_array_append(slurped, value);
           } else {
             ret = process(jq, value, jq_flags);
+            value = jv_invalid();
           }
         }
-        if (jv_invalid_has_msg(jv_copy(value))) {
-          jv msg = jv_invalid_get_msg(value);
-          fprintf(stderr, "parse error: %s\n", jv_string_value(msg));
-          jv_free(msg);
-          ret = 4;
-          break;
-        } else {
-          jv_free(value);
-        }
       }
     }
     jv_parser_free(parser);
diff --git a/tests/run b/tests/run
index 0b5fcafc..252ad732 100755
--- a/tests/run
+++ b/tests/run
@@ -78,6 +78,46 @@ case "$v" in
 *) true;;
 esac
 
+## Test JSON sequence support
+
+cat > $d/expected <<EOF
+ignoring parse error: Potentially truncated top-level numeric value at line 1, column 2
+ignoring parse error: Truncated value at line 2, column 5
+ignoring parse error: Truncated value at line 2, column 25
+ignoring parse error: Truncated value at line 2, column 41
+EOF
+printf '1\0362 3\n[0,1\036[4,5]true"ab"{"c":4\036{}{"d":5,"e":6"\036false\n'|$VALGRIND $Q ./jq -ces --seq '. == [2,3,[4,5],true,"ab",{},false]' > /dev/null 2> $d/out
+cmp $d/out $d/expected
+
+cat > $d/expected <<EOF
+ignoring parse error: Potentially truncated top-level numeric value at line 1, column 2
+ignoring parse error: Truncated value at line 2, column 5
+ignoring parse error: Truncated value at line 2, column 25
+ignoring parse error: Invalid literal at line 3, column 1
+EOF
+printf '1\0362 3\n[0,1\036[4,5]true"ab"{"c":4\036{}{"d":5,"e":6"false\n\036null'|$VALGRIND $Q ./jq -ces --seq '. == [2,3,[4,5],true,"ab",{},null]' > /dev/null 2> $d/out
+cmp $d/out $d/expected
+
+# Note that here jq sees no inputs at all but it still succeeds because
+# --seq ignores parse errors
+cat > $d/expected <<EOF
+ignoring parse error: Unfinished string
+EOF
+printf '"foo'|./jq -ce --seq . > $d/out 2>&1
+cmp $d/out $d/expected
+
+# Numeric values truncated by EOF are ignored
+cat > $d/expected <<EOF
+ignoring parse error: Potentially truncated top-level numeric value
+EOF
+printf '1'|./jq -ce --seq . > $d/out 2>&1
+cmp $d/out $d/expected
+
+cat > $d/expected <<EOF
+EOF
+printf '1\n'|./jq -ces --seq '. == [1]' >/dev/null 2> $d/out
+cmp $d/out $d/expected
+
 ## Test library/module system
 
 mods=$PWD/tests/modules
author	Nicolas Williams <nico@cryptonector.com>	2014-10-10 22:19:38 -0500
committer	Nicolas Williams <nico@cryptonector.com>	2014-10-12 08:44:40 -0500
commit	89791a000ba8fd614d8a8fa59a5ba76f21ea4d1d (patch)
tree	6aab1b7f73f996652a8e11bd2c74ad633ae55925
parent	3411167c03fba129d44a0e7c9699767c9e8fd5cd (diff)