#include #include #include "jv_unicode.h" #include "jv_utf8_tables.h" const char* jvp_utf8_next(const char* in, const char* end, int* codepoint_ret) { assert(in <= end); if (in == end) { return 0; } int codepoint = -1; unsigned char first = (unsigned char)in[0]; int length = utf8_coding_length[first]; if ((first & 0x80) == 0) { /* Fast-path for ASCII */ codepoint = first; length = 1; } else if (length == 0 || length == UTF8_CONTINUATION_BYTE) { /* Bad single byte - either an invalid byte or an out-of-place continuation byte */ length = 1; } else if (in + length > end) { /* String ends before UTF8 sequence ends */ length = end - in; } else { codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; for (int i=1; i 0x10FFFF) { /* Outside Unicode range */ codepoint = -1; } } assert(length > 0); *codepoint_ret = codepoint; return in + length; } int jvp_utf8_is_valid(const char* in, const char* end) { int codepoint; while ((in = jvp_utf8_next(in, end, &codepoint))) { if (codepoint == -1) return 0; } return 1; } int jvp_utf8_decode_length(char startchar) { if ((startchar & 0x80) == 0) return 1; else if ((startchar & 0xC0) == 0xC0) return 2; else if ((startchar & 0xE0) == 0xE0) return 3; else return 4; } int jvp_utf8_encode_length(int codepoint) { if (codepoint <= 0x7F) return 1; else if (codepoint <= 0x7FF) return 2; else if (codepoint <= 0xFFFF) return 3; else return 4; } int jvp_utf8_encode(int codepoint, char* out) { assert(codepoint >= 0 && codepoint <= 0x10FFFF); char* start = out; if (codepoint <= 0x7F) { *out++ = codepoint; } else if (codepoint <= 0x7FF) { *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); *out++ = 0x80 + ((codepoint & 0x03F)); } else if(codepoint <= 0xFFFF) { *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); *out++ = 0x80 + ((codepoint & 0x003F)); } else { *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); *out++ = 0x80 + ((codepoint & 0x00003F)); } assert(out - start == jvp_utf8_encode_length(codepoint)); return out - start; }