diff options
author | Stephen Dolan <mu@netsoc.tcd.ie> | 2012-09-18 17:44:43 +0100 |
---|---|---|
committer | Stephen Dolan <mu@netsoc.tcd.ie> | 2012-09-18 17:44:43 +0100 |
commit | a4eea165bbab6d13f89b59707e835d58b7014a66 (patch) | |
tree | b99ee5dde8540f8dbe5de3d87b99e04ac4dd2673 /jv_unicode.c | |
parent | 25cbab056b1f73e96b636c88779a92400d92dc15 (diff) |
Move everything around - delete old Haskell code, clean up build.
Diffstat (limited to 'jv_unicode.c')
-rw-r--r-- | jv_unicode.c | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/jv_unicode.c b/jv_unicode.c new file mode 100644 index 00000000..b1417a2a --- /dev/null +++ b/jv_unicode.c @@ -0,0 +1,64 @@ +#include <stdio.h> +#include <assert.h> +#include "jv_unicode.h" +#include "jv_utf8_tables.gen.h" + +const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) { + if (in == end) { + codepoint = 0; + return 0; + } + unsigned char first = (unsigned char)in[0]; + int length = utf8_coding_length[first]; + if (length == 0 || length == UTF8_CONTINUATION_BYTE || in + length > end) { + *codepoint = -1; + return 0; + } + *codepoint = ((unsigned)in[0]) & utf8_coding_bits[first]; + for (int i=1; i<length; i++) { + int ch = (unsigned char)in[i]; + if (utf8_coding_length[(unsigned char)in[i]] != UTF8_CONTINUATION_BYTE){ + *codepoint = -1; + return 0; + } + *codepoint = (*codepoint << 6) | (ch & 0x3f); + } + return in + length; +} + +int jvp_utf8_verify(const char* in, const char* end) { + int codepoint = 0; + while ((in = jvp_utf8_next(in, end, &codepoint))) { + if (codepoint == -1) return 0; + } + return codepoint != -1; +} + +int jvp_utf8_encode_length(int codepoint) { + if (codepoint <= 0x7F) return 1; + else if (codepoint <= 0x7FF) return 2; + else if (codepoint <= 0xFFFF) return 3; + else return 4; +} + +int jvp_utf8_encode(int codepoint, char* out) { + assert(codepoint >= 0 && codepoint <= 0x10FFFF); + char* start = out; + if (codepoint <= 0x7F) { + *out++ = codepoint; + } else if (codepoint <= 0x7FF) { + *out++ = 0xC0 + ((codepoint & 0x7C0) >> 6); + *out++ = 0x80 + ((codepoint & 0x03F)); + } else if(codepoint <= 0xFFFF) { + *out++ = 0xE0 + ((codepoint & 0xF000) >> 12); + *out++ = 0x80 + ((codepoint & 0x0FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x003F)); + } else { + *out++ = 0xF0 + ((codepoint & 0x1C0000) >> 18); + *out++ = 0x80 + ((codepoint & 0x03F000) >> 12); + *out++ = 0x80 + ((codepoint & 0x000FC0) >> 6); + *out++ = 0x80 + ((codepoint & 0x00003F)); + } + assert(out - start == jvp_utf8_encode_length(codepoint)); + return out - start; +} |