diff options
author | William Langford <wlangfor@gmail.com> | 2017-01-24 23:05:47 -0500 |
---|---|---|
committer | William Langford <wlangfor@gmail.com> | 2017-01-27 21:04:26 -0500 |
commit | e84d17196c03da6e6dc56f4fcd319a7fe84f8dbc (patch) | |
tree | a69bd988a82c256dbeb78c045137e08b1da55325 /src/jv_unicode.c | |
parent | fd4ae8304e23007672af9a37855c7a76de7c78cf (diff) |
Handle cut-off UTF-8 sequences when reading files
Read additional bytes from the file to complete the UTF-8 sequence so the bytes
in it don't get converted to U+FFFD replacement characters.
Diffstat (limited to 'src/jv_unicode.c')
-rw-r--r-- | src/jv_unicode.c | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/src/jv_unicode.c b/src/jv_unicode.c index fbf7454b..b3a50b2d 100644 --- a/src/jv_unicode.c +++ b/src/jv_unicode.c @@ -3,6 +3,29 @@ #include "jv_unicode.h" #include "jv_utf8_tables.h" +// jvp_utf8_backtrack returns the beginning of the last codepoint in the +// string, assuming that start is the last byte in the string. +// If the last codepoint is incomplete, returns the number of missing bytes via +// *missing_bytes. If there are no leading bytes or an invalid byte is +// encountered, NULL is returned and *missing_bytes is not altered. +const char* jvp_utf8_backtrack(const char* start, const char* min, int *missing_bytes) { + assert(min < start); + if (min == start) { + return min; + } + int length = 0; + int seen = 1; + while (start >= min && (length = utf8_coding_length[(unsigned char)*start]) == UTF8_CONTINUATION_BYTE) { + start--; + seen++; + } + if (length == 0 || length == UTF8_CONTINUATION_BYTE || length - seen < 0) { + return NULL; + } + if (missing_bytes) *missing_bytes = length - seen; + return start; +} + const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { assert(in <= end); if (in == end) { |