Handle cut-off UTF-8 sequences when reading files

Read additional bytes from the file to complete the UTF-8 sequence so the bytes in it don't get converted to U+FFFD replacement characters.
author: William Langford <wlangfor@gmail.com> 2017-01-24 23:05:47 -0500
committer: William Langford <wlangfor@gmail.com> 2017-01-27 21:04:26 -0500
commit: e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch)
tree: a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_unicode.c
parent: fd4ae8304e23007672af9a37855c7a76de7c78cf (diff)
1 files changed, 23 insertions, 0 deletions
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index fbf7454b..b3a50b2d 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -3,6 +3,29 @@
 #include "jv_unicode.h"
 #include "jv_utf8_tables.h"
 
+// jvp_utf8_backtrack returns a pointer to the first byte of the last codepoint
+// in [min, start], where start points at the last byte of the string.  On
+// success, *missing_bytes (if non-NULL) receives the number of bytes still
+// needed to complete that codepoint (0 if it is complete).  If no leading byte
+// or an invalid byte is found, NULL is returned and *missing_bytes is not altered.
+const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) {
+  assert(min <= start);  // min == start (one-byte input) is valid and handled below
+  if (min == start) {
+    return min;  // returned without inspecting the byte; *missing_bytes is not set
+  }
+  int length = 0;
+  int seen = 1;  // bytes of the trailing sequence seen so far, counting its lead byte
+  while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) {
+    start--;
+    seen++;
+  }
+  if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) {
+    return NULL;  // invalid byte, no lead byte at or after min, or too many continuation bytes
+  }
+  if (missing_bytes) *missing_bytes = length - seen;
+  return start;
+}
+
 const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
   assert(in <= end);
   if (in == end) {
author	William Langford <wlangfor@gmail.com>	2017-01-24 23:05:47 -0500
committer	William Langford <wlangfor@gmail.com>	2017-01-27 21:04:26 -0500
commit	e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch)
tree	a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_unicode.c
parent	fd4ae8304e23007672af9a37855c7a76de7c78cf (diff)