Handle cut-off UTF-8 sequences when reading files

When a buffered read ends partway through a multi-byte UTF-8 sequence, read the additional bytes needed from the file to complete that sequence, so that its bytes don't get converted to U+FFFD replacement characters.
author: William Langford <wlangfor@gmail.com> 2017-01-24 23:05:47 -0500
committer: William Langford <wlangfor@gmail.com> 2017-01-27 21:04:26 -0500
commit: e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch)
tree: a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_file.c
parent: fd4ae8304e23007672af9a37855c7a76de7c78cf (diff)
1 files changed, 16 insertions, 3 deletions
diff --git a/src/jv_file.c b/src/jv_file.c
index 33d327c7..3159df53 100644
--- a/src/jv_file.c
+++ b/src/jv_file.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "jv.h"
+#include "jv_unicode.h"
 
 jv jv_load_file(const char* filename, int raw) {
   FILE* file = fopen(filename, "r");
@@ -20,11 +21,23 @@ jv jv_load_file(const char* filename, int raw) {
     data = jv_array();
     parser = jv_parser_new(0);
   }
+
+  // To avoid mangling UTF-8 multi-byte sequences that cross the end of our read
+  // buffer, we need to be able to read the remainder of a sequence and add that
+  // before appending.
+  const int max_utf8_len = 4;
+  char buf[4096+max_utf8_len];
   while (!feof(file) && !ferror(file)) {
-    char buf[4096];
-    size_t n = fread(buf, 1, sizeof(buf), file);
+    size_t n = fread(buf, 1, sizeof(buf)-max_utf8_len, file);
+    int len = 0;
+    if (jvp_utf8_backtrack(buf+(n-1), buf, &len) && len > 0) {
+      if (!feof(file) && !ferror(file)) {
+        n += fread(buf+n, 1, len, file);
+      }
+    }
+
     if (raw) {
-      data = jv_string_concat(data, jv_string_sized(buf, (int)n));
+      data = jv_string_append_buf(data, buf, n);
     } else {
       jv_parser_set_buf(parser, buf, n, !feof(file));
       jv value;
author	William Langford <wlangfor@gmail.com>	2017-01-24 23:05:47 -0500
committer	William Langford <wlangfor@gmail.com>	2017-01-27 21:04:26 -0500
commit	e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch)
tree	a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_file.c
parent	fd4ae8304e23007672af9a37855c7a76de7c78cf (diff)