diff options
author | David Tolnay <dtolnay@gmail.com> | 2015-08-23 20:36:11 -0700 |
---|---|---|
committer | David Tolnay <dtolnay@gmail.com> | 2015-08-23 20:36:11 -0700 |
commit | 0c93eb3379241dc4775718a9d39f54a6c4de20d6 (patch) | |
tree | 67bb5510adb707d54c6f72b51b0718578a2caf5c /src/jv_unicode.c | |
parent | 891f28ef5e406a8d2156ad88d0244ab03fe490eb (diff) |
Move source files to src/
Diffstat (limited to 'src/jv_unicode.c')
-rw-r--r-- | src/jv_unicode.c | 97 |
1 files changed, 97 insertions, 0 deletions
diff --git a/src/jv_unicode.c b/src/jv_unicode.c new file mode 100644 index 00000000..fbf7454b --- /dev/null +++ b/src/jv_unicode.c @@ -0,0 +1,97 @@ +#include <stdio.h> +#include <assert.h> +#include "jv_unicode.h" +#include "jv_utf8_tables.h" + +const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { + assert(in <= end); + if (in == end) { + return 0; + } + int codepoint = -1; + unsigned char first = (unsigned char)in[0]; + int length = utf8_coding_length[first]; + if ((first & 0x80) == 0) { + /* Fast-path for ASCII */ + codepoint = first; + length = 1; + } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { + /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ + length = 1; + } else if (in + length > end) { + /* String ends before UTF8 sequence ends */ + length = end - in; + } else { + codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; + for (int i=1; i<length; i++) { + unsigned ch = (unsigned char)in[i]; + if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE){ + /* Invalid UTF8 sequence - not followed by the right number of continuation bytes */ + codepoint = -1; + length = i; + break; + } + codepoint = (codepoint << 6) | (ch & 0x3f); + } + if (codepoint < utf8_first_codepoint[length]) { + /* Overlong UTF8 sequence */ + codepoint = -1; + } + if (0xD800 <= codepoint && codepoint <= 0xDFFF) { + /* Surrogate codepoints can't be encoded in UTF8 */ + codepoint = -1; + } + if (codepoint > 0x10FFFF) { + /* Outside Unicode range */ + codepoint = -1; + } + } + assert(length > 0); + *codepoint_ret = codepoint; + return in + length; +} + +int jvp_utf8_is_valid(const char* in, const char* end) { + int codepoint; + while ((in = jvp_utf8_next(in, end, &codepoint))) { + if (codepoint == -1) return 0; + } + return 1; +} + +/* Assumes startchar is the first byte of a valid character sequence */ +int jvp_utf8_decode_length(char startchar) { + if ((startchar & 0x80) == 0) return 1; // 0___ ____ + else if ((startchar & 0xE0) == 0xC0) return 2; // 110_ ____ + else if ((startchar & 0xF0) == 0xE0) return 3; // 1110 ____ + else return 4; // 1111 ____ +} + +int jvp_utf8_encode_length(int codepoint) { + if (codepoint <= 0x7F) return 1; + else if (codepoint <= 0x7FF) return 2; + else if (codepoint <= 0xFFFF) return 3; + else return 4; +} + +int jvp_utf8_encode(int codepoint, char* out) { + assert(codepoint >= 0 && codepoint <= 0x10FFFF); + char* start = out; + if (codepoint <= 0x7F) { + *out++ = codepoint; + } else if (codepoint <= 0x7FF) { + *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); + *out++ = 0x80 + ((codepoint & 0x03F)); + } else if(codepoint <= 0xFFFF) { + *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); + *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x003F)); + } else { + *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); + *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); + *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x00003F)); + } + assert(out - start == jvp_utf8_encode_length(codepoint)); + return out - start; +} |