diff options
author | Stephen Dolan <mu@netsoc.tcd.ie> | 2012-12-02 23:53:55 +0000 |
---|---|---|
committer | Stephen Dolan <mu@netsoc.tcd.ie> | 2012-12-02 23:53:55 +0000 |
commit | 67f8ad943538e00826966c069d917b5bc99a4e47 (patch) | |
tree | 2db175b49ab9269c389f37d0d275a73b51d14287 | |
parent | 5b45184a1a79372630df76c0e840f4fffcce9494 (diff) |
Ignore a UTF-8 BOM if one appears at the start of a JSON document.
Closes #45.
-rw-r--r-- | jv_parse.c | 20 | ||||
-rw-r--r-- | jv_parse.h | 1 | ||||
-rw-r--r-- | testdata | 7 |
3 files changed, 28 insertions, 0 deletions
@@ -24,6 +24,7 @@ void jv_parser_init(struct jv_parser* p) {
   p->st = JV_PARSER_NORMAL;
   p->curr_buf = 0;
   p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
+  p->bom_strip_position = 0;
   jvp_dtoa_context_init(&p->dtoa);
 }
 
@@ -332,9 +333,27 @@ static pfunc scan(struct jv_parser* p, char ch, jv* out) {
   return answer;
 }
 
+static unsigned char UTF8_BOM[] = {0xEF,0xBB,0xBF};
+
 void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
   assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
          && "previous buffer not exhausted");
+  while (p->bom_strip_position < sizeof(UTF8_BOM)) {
+    if ((unsigned char)*buf == UTF8_BOM[p->bom_strip_position]) {
+      // matched a BOM character
+      buf++;
+      length--;
+      p->bom_strip_position++;
+    } else {
+      if (p->bom_strip_position == 0) {
+        // no BOM in this document
+        p->bom_strip_position = sizeof(UTF8_BOM);
+      } else {
+        // malformed BOM (prefix present, rest missing)
+        p->bom_strip_position = 0xff;
+      }
+    }
+  }
   p->curr_buf = buf;
   p->curr_buf_length = length;
   p->curr_buf_pos = 0;
@@ -343,6 +362,7 @@ void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_
 
 jv jv_parser_next(struct jv_parser* p) {
   assert(p->curr_buf && "a buffer must be provided");
+  if (p->bom_strip_position == 0xff) return jv_invalid_with_msg(jv_string("Malformed BOM"));
   jv value;
   presult msg = 0;
   while (!msg && p->curr_buf_pos < p->curr_buf_length) {
@@ -6,6 +6,7 @@ struct jv_parser {
   int curr_buf_length;
   int curr_buf_pos;
   int curr_buf_is_partial;
+  unsigned bom_strip_position;
 
   jv* stack;
   int stackpos;
@@ -31,6 +31,13 @@ null
 null
 []
 
+# The input line starts with a 0xFEFF (byte order mark) codepoint
+# No, there is no reason to have a byte order mark in UTF8 text.
+# But apparently people do, so jq shouldn't break on it.
+.
+"byte order mark"
+"byte order mark"
+
 # We test escapes by matching them against Unicode codepoints
 # FIXME: more tests needed for weird unicode stuff (e.g. utf16 pairs)
 "Aa\r\n\t\b\f\u03bc" |