--raw-input ought to read NULs (partial fix for #760)

We can't know how many bytes fgets() read when we reach EOF and fgets() didn't see a newline; we can only assume that at least strlen(buf) bytes were read. This is quite obnoxious if one wants to use NULs in raw input, but at least we can make reading "a\0b\0c\0" with no newline yield "a\0b\0c", losing only the final sequence of NULs. We can't use getline() either, since it will want to allocate a buffer big enough for an entire line, and we might not have any newlines in our input. A complete fix will have to use getc() or read(), preferably the latter.
author: Nicolas Williams <nico@cryptonector.com> 2015-04-23 18:27:53 -0500
committer: Nicolas Williams <nico@cryptonector.com> 2015-04-23 23:43:44 -0500
commit: dad6e42934ab255bf07ff4ffbae093240d4f6e90 (patch)
tree: f5f625e9c00ccb68f8139453835293082388d369
parent: 7d938487dd61f129c1e1b40ff4fa9be27d1b24f5 (diff)
3 files changed, 56 insertions, 22 deletions
diff --git a/jq.h b/jq.h
index 9182cfae..493bfce8 100644
--- a/jq.h
+++ b/jq.h
@@ -44,7 +44,6 @@ void jq_util_input_set_parser(jq_util_input_state, jv_parser *, int);
 void jq_util_input_free(jq_util_input_state *);
 void jq_util_input_add_input(jq_util_input_state, jv);
 int jq_util_input_open_errors(jq_util_input_state);
-int jq_util_input_read_more(jq_util_input_state);
 jv jq_util_input_next_input(jq_util_input_state);
 jv jq_util_input_next_input_cb(jq_state *, void *);
 
diff --git a/tests/run b/tests/run
index bb7d0727..67229969 100755
--- a/tests/run
+++ b/tests/run
@@ -12,10 +12,9 @@ fi
 
 mods=$PWD/tests/modules
 
+# jq-coded tests here:
 cat $@ | $VALGRIND $Q ./jq -L "$mods" --run-tests
 
-set -x
-
 clean=true
 d=
 clean () {
@@ -32,6 +31,10 @@ if [ -z "$d" ]; then
     exit 0
 fi
 
+printf 'a\0b\nc\0d\ne' > $d/input
+$VALGRIND $Q ./jq -Rse '. == "a\u0000b\nc\u0000d\ne"' $d/input
+$VALGRIND $Q ./jq -Rne '[inputs] == ["a\u0000b", "c\u0000d", "e"]' $d/input
+
 ## Test constant folding
 
 ## XXX If we add a builtin to list the program's disassembly then we can
diff --git a/util.c b/util.c
index 31a17683..19b2b051 100644
--- a/util.c
+++ b/util.c
@@ -154,6 +154,7 @@ struct jq_util_input_state {
   int open_failures;
   jv slurped;
   char buf[4096];
+  size_t buf_valid_len;
 };
 
 static void fprinter(void *data, jv fname) {
@@ -176,6 +177,7 @@ jq_util_input_state jq_util_input_init(jq_msg_cb err_cb, void *err_cb_data) {
   new_state->files = jv_array();
   new_state->slurped = jv_invalid();
   new_state->buf[0] = 0;
+  new_state->buf_valid_len = 0;
 
   return new_state;
 }
@@ -220,7 +222,7 @@ static jv next_file(jq_util_input_state state) {
   return next;
 }
 
-int jq_util_input_read_more(jq_util_input_state state) {
+static int jq_util_input_read_more(jq_util_input_state state) {
   if (!state->current_input || feof(state->current_input) || ferror(state->current_input)) {
     if (state->current_input && ferror(state->current_input)) {
       // System-level input error on the stream. It will be closed (below).
@@ -252,9 +254,41 @@ int jq_util_input_read_more(jq_util_input_state state) {
   }
 
   state->buf[0] = 0;
+  state->buf_valid_len = 0;
   if (state->current_input) {
-    if (!fgets(state->buf, sizeof(state->buf), state->current_input))
+    memset(state->buf, 0, sizeof(state->buf));
+    if (!fgets(state->buf, sizeof(state->buf), state->current_input)) {
       state->buf[0] = 0;
+    } else {
+      const char *p = memchr(state->buf, '\n', sizeof(state->buf));
+      
+      if (p == NULL && state->parser != NULL) {
+        /* There should be no NULs in JSON texts */
+        state->buf_valid_len = strlen(state->buf);
+      } else if (p == NULL && feof(state->current_input)) {
+        size_t i;
+
+        /*
+         * XXX We can't know how many bytes we've read!
+         *
+         * We can't use getline() because there need not be any newlines
+         * in the input.  The only entirely correct choices are: use
+         * fgetc() or read(), and of those the latter will be the
+         * best-performing.
+         *
+         * For now we guess how much fgets() read.
+         */
+        for (p = state->buf, i = 0; i < sizeof(state->buf); i++) {
+          if (state->buf[i] != '\0')
+            p = &state->buf[i];
+        }
+        state->buf_valid_len = p - state->buf + 1;
+      } else if (p == NULL) {
+        state->buf_valid_len = sizeof(state->buf);
+      } else {
+        state->buf_valid_len = (p - state->buf) + 1;
+      }
+    }
   }
   return jv_array_length(jv_copy(state->files)) == 0 && (!state->current_input || feof(state->current_input));
 }
@@ -272,29 +306,27 @@ jv jq_util_input_next_input(jq_util_input_state state) {
     if (state->parser == NULL) {
       // Raw input
       is_last = jq_util_input_read_more(state);
-      if (state->buf[0] == '\0')
+      if (state->buf_valid_len == 0)
         continue;
-      int len = strlen(state->buf); // Raw input doesn't support NULs
-      if (len > 0) {
-        if (jv_is_valid(state->slurped)) {
-          // Slurped raw input
-          state->slurped = jv_string_concat(state->slurped, jv_string(state->buf));
-        } else {
-          if (!jv_is_valid(value))
-            value = jv_string("");
-          if (state->buf[len-1] == '\n') {
-            // whole line
-            state->buf[len-1] = 0;
-            return jv_string_concat(value, jv_string(state->buf));
-          }
-          value = jv_string_concat(value, jv_string(state->buf));
-          state->buf[0] = '\0';
+      if (jv_is_valid(state->slurped)) {
+        // Slurped raw input
+        state->slurped = jv_string_concat(state->slurped, jv_string_sized(state->buf, state->buf_valid_len));
+      } else {
+        if (!jv_is_valid(value))
+          value = jv_string("");
+        if (state->buf[state->buf_valid_len-1] == '\n') {
+          // whole line
+          state->buf[state->buf_valid_len-1] = 0;
+          return jv_string_concat(value, jv_string_sized(state->buf, state->buf_valid_len-1));
         }
+        value = jv_string_concat(value, jv_string_sized(state->buf, state->buf_valid_len));
+        state->buf[0] = '\0';
+        state->buf_valid_len = 0;
       }
     } else {
       if (jv_parser_remaining(state->parser) == 0) {
         is_last = jq_util_input_read_more(state);
-        jv_parser_set_buf(state->parser, state->buf, strlen(state->buf), !is_last);
+        jv_parser_set_buf(state->parser, state->buf, state->buf_valid_len, !is_last);
       }
       value = jv_parser_next(state->parser);
       if (jv_is_valid(state->slurped)) {
author	Nicolas Williams <nico@cryptonector.com>	2015-04-23 18:27:53 -0500
committer	Nicolas Williams <nico@cryptonector.com>	2015-04-23 23:43:44 -0500
commit	dad6e42934ab255bf07ff4ffbae093240d4f6e90 (patch)
tree	f5f625e9c00ccb68f8139453835293082388d369
parent	7d938487dd61f129c1e1b40ff4fa9be27d1b24f5 (diff)