Ignore a UTF-8 BOM if one appears at the start of a JSON document.

Closes #45.
author: Stephen Dolan <mu@netsoc.tcd.ie> 2012-12-02 23:53:55 +0000
committer: Stephen Dolan <mu@netsoc.tcd.ie> 2012-12-02 23:53:55 +0000
commit: 67f8ad943538e00826966c069d917b5bc99a4e47 (patch)
tree: 2db175b49ab9269c389f37d0d275a73b51d14287
parent: 5b45184a1a79372630df76c0e840f4fffcce9494 (diff)
3 files changed, 28 insertions, 0 deletions
diff --git a/jv_parse.c b/jv_parse.c
index 63cdf935..738beb94 100644
--- a/jv_parse.c
+++ b/jv_parse.c
@@ -24,6 +24,7 @@ void jv_parser_init(struct jv_parser* p) {
   p->st = JV_PARSER_NORMAL;
   p->curr_buf = 0;
   p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
+  p->bom_strip_position = 0;
   jvp_dtoa_context_init(&p->dtoa);
 }
 
@@ -332,9 +333,27 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) {
   return answer;
 }
 
+static unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
+
 void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
   assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
          && "previous buffer not exhausted");
+  while (p->bom_strip_position < sizeof(UTF8_BOM)) {
+    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
+      // matched a BOM character
+      buf++;
+      length--;
+      p->bom_strip_position++;
+    } else {
+      if (p->bom_strip_position == 0) {
+        // no BOM in this document
+        p->bom_strip_position = sizeof(UTF8_BOM);
+      } else {
+        // malformed BOM (prefix present, rest missing)
+        p->bom_strip_position = 0xff;
+      }
+    }
+  }
   p->curr_buf = buf;
   p->curr_buf_length = length;
   p->curr_buf_pos = 0;
@@ -343,6 +362,7 @@ void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_
 
 jv jv_parser_next(struct jv_parser* p) {
   assert(p->curr_buf && "a buffer must be provided");
+  if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM"));
   jv value;
   presult msg = 0;
   while (!msg && p->curr_buf_pos < p->curr_buf_length) {
diff --git a/jv_parse.h b/jv_parse.h
index 5b8e7cdf..5958316a 100644
--- a/jv_parse.h
+++ b/jv_parse.h
@@ -6,6 +6,7 @@ struct jv_parser {
   int curr_buf_length;
   int curr_buf_pos;
   int curr_buf_is_partial;
+  unsigned bom_strip_position;
 
   jv* stack;
   int stackpos;
diff --git a/testdata b/testdata
index 29bbe7a8..e08e6aa5 100644
--- a/testdata
+++ b/testdata
@@ -31,6 +31,13 @@ null
 null
 []
 
+# The input line below starts with a U+FEFF (byte order mark) codepoint,
+# encoded in UTF-8 as the bytes EF BB BF. A byte order mark serves no purpose
+# in UTF-8 text, but apparently people include one anyway, so jq shouldn't break on it.
+.
+"byte order mark"
+"byte order mark"
+
 # We test escapes by matching them against Unicode codepoints
 # FIXME: more tests needed for weird unicode stuff (e.g. utf16 pairs)
 "Aa\r\n\t\b\f\u03bc"
author	Stephen Dolan <mu@netsoc.tcd.ie>	2012-12-02 23:53:55 +0000
committer	Stephen Dolan <mu@netsoc.tcd.ie>	2012-12-02 23:53:55 +0000
commit	67f8ad943538e00826966c069d917b5bc99a4e47 (patch)
tree	2db175b49ab9269c389f37d0d275a73b51d14287
parent	5b45184a1a79372630df76c0e840f4fffcce9494 (diff)