summary | refs | log | tree | commit | diff | stats
path: root/src/jv_unicode.c
diff options
context:
space:
mode:
author: William Langford <wlangfor@gmail.com> 2017-01-24 23:05:47 -0500
committer: William Langford <wlangfor@gmail.com> 2017-01-27 21:04:26 -0500
commit: e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch)
tree: a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_unicode.c
parent: fd4ae8304e23007672af9a37855c7a76de7c78cf (diff)
Handle cut-off UTF-8 sequences when reading files
Read additional bytes from the file to complete the UTF-8 sequence so the bytes in it don't get converted to U+FFFD replacement characters.
Diffstat (limited to 'src/jv_unicode.c')
-rw-r--r-- src/jv_unicode.c 23
1 files changed, 23 insertions, 0 deletions
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index fbf7454b..b3a50b2d 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -3,6 +3,29 @@
#include "jv_unicode.h"
#include "jv_utf8_tables.h"
+// jvp_utf8_backtrack locates the start of the last codepoint in a buffer.
+//
+// start points at the last byte of the buffer and min at its first byte, so
+// min <= start must hold (a one-byte buffer has min == start).
+//
+// Returns a pointer to the leading byte of the final codepoint. If that
+// codepoint is incomplete, the number of bytes still needed to finish it is
+// stored in *missing_bytes (0 when it is complete); missing_bytes may be NULL.
+//
+// Returns NULL, leaving *missing_bytes untouched, when no leading byte is
+// found at or after min, when an invalid byte is encountered, or when more
+// continuation bytes follow the leading byte than its encoding calls for.
+//
+// For a one-byte buffer, min is returned without inspecting the byte and
+// *missing_bytes is not written.
+const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) {
+  // A one-byte buffer is legal input, so the bound must be inclusive; a
+  // strict '<' would abort before the min == start case below is reached.
+  assert(min <= start);
+  if (min == start) {
+    return min;
+  }
+  int length = 0;
+  int seen = 1;  // bytes from the candidate leading byte through the end
+  // Step backwards over continuation bytes until something else is found.
+  while ((length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) {
+    if (start == min) {
+      // Only continuation bytes in the buffer: there is no leading byte.
+      // Stop here rather than decrementing to a pointer before min.
+      return NULL;
+    }
+    start--;
+    seen++;
+  }
+  // length == 0 marks an invalid byte; length < seen means the leading byte
+  // is followed by more continuation bytes than it encodes.
+  if (length == 0 || length - seen < 0) {
+    return NULL;
+  }
+  if (missing_bytes) {
+    *missing_bytes = length - seen;
+  }
+  return start;
+}
+
const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) {
assert(in <= end);
if (in == end) {