diff options
author | William Langford <wlangfor@gmail.com> | 2017-01-24 23:05:47 -0500 |
---|---|---|
committer | William Langford <wlangfor@gmail.com> | 2017-01-27 21:04:26 -0500 |
commit | e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch) | |
tree | a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_file.c | |
parent | fd4ae8304e23007672af9a37855c7a76de7c78cf (diff) |
Handle cut-off UTF-8 sequences when reading files
Read additional bytes from the file to complete the UTF-8 sequence so the bytes
in it don't get converted to U+FFFD replacement characters.
Diffstat (limited to 'src/jv_file.c')
-rw-r--r-- | src/jv_file.c | 19 |
1 file changed, 16 insertions, 3 deletions
diff --git a/src/jv_file.c b/src/jv_file.c
index 33d327c7..3159df53 100644
--- a/src/jv_file.c
+++ b/src/jv_file.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "jv.h"
+#include "jv_unicode.h"
 
 jv jv_load_file(const char* filename, int raw) {
   FILE* file = fopen(filename, "r");
@@ -20,11 +21,23 @@ jv jv_load_file(const char* filename, int raw) {
     data = jv_array();
     parser = jv_parser_new(0);
   }
+
+  // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
+  // buffer, we need to be able to read the remainder of a sequence and add that
+  // before appending.
+  const int max_utf8_len = 4;
+  char buf[4096+max_utf8_len];
   while (!feof(file) && !ferror(file)) {
-    char buf[4096];
-    size_t n = fread(buf, 1, sizeof(buf), file);
+    size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
+    int len = 0;
+    if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0) {
+      if (!feof(file) && !ferror(file)) {
+        n += fread(buf+n, 1, len, file);
+      }
+    }
+
     if (raw) {
-      data = jv_string_concat(data, jv_string_sized(buf, (int)n));
+      data = jv_string_append_buf(data, buf, n);
     } else {
       jv_parser_set_buf(parser, buf, n, !feof(file));
       jv value;