From 67f8ad943538e00826966c069d917b5bc99a4e47 Mon Sep 17 00:00:00 2001 From: Stephen Dolan Date: Sun, 2 Dec 2012 23:53:55 +0000 Subject: Ignore a UTF-8 BOM if one appears at the start of a JSON document. Closes #45. --- jv_parse.c | 20 ++++++++++++++++++++ jv_parse.h | 1 + testdata | 7 +++++++ 3 files changed, 28 insertions(+) diff --git a/jv_parse.c b/jv_parse.c index 63cdf935..738beb94 100644 --- a/jv_parse.c +++ b/jv_parse.c @@ -24,6 +24,7 @@ void jv_parser_init(struct jv_parser* p) { p->st = JV_PARSER_NORMAL; p->curr_buf = 0; p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0; + p->bom_strip_position = 0; jvp_dtoa_context_init(&p->dtoa); } @@ -332,9 +333,27 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) { return answer; } +static unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF}; + void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) { assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length) && "previous buffer not exhausted"); + while (p->bom_strip_position < sizeof(UTF8_BOM)) { + if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) { + // matched a BOM character + buf++; + length--; + p->bom_strip_position++; + } else { + if (p->bom_strip_position == 0) { + // no BOM in this document + p->bom_strip_position = sizeof(UTF8_BOM); + } else { + // malformed BOM (prefix present, rest missing) + p->bom_strip_position = 0xff; + } + } + } p->curr_buf = buf; p->curr_buf_length = length; p->curr_buf_pos = 0; @@ -343,6 +362,7 @@ void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_ jv jv_parser_next(struct jv_parser* p) { assert(p->curr_buf && "a buffer must be provided"); + if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM")); jv value; presult msg = 0; while (!msg && p->curr_buf_pos < p->curr_buf_length) { diff --git a/jv_parse.h b/jv_parse.h index 5b8e7cdf..5958316a 100644 --- a/jv_parse.h +++ b/jv_parse.h @@ -6,6 +6,7 @@ struct jv_parser { int curr_buf_length; int curr_buf_pos; int curr_buf_is_partial; + unsigned bom_strip_position; jv* stack; int stackpos; diff --git a/testdata b/testdata index 29bbe7a8..e08e6aa5 100644 --- a/testdata +++ b/testdata @@ -31,6 +31,13 @@ null null [] +# The input line starts with a 0xFEFF (byte order mark) codepoint +# No, there is no reason to have a byte order mark in UTF8 text. +# But apparently people do, so jq shouldn't break on it. +. +"byte order mark" +"byte order mark" + # We test escapes by matching them against Unicode codepoints # FIXME: more tests needed for weird unicode stuff (e.g. utf16 pairs) "Aa\r\n\t\b\f\u03bc" -- cgit v1.2.3