Move everything around - delete old Haskell code, clean up build.

author: Stephen Dolan <mu@netsoc.tcd.ie> 2012-09-18 17:44:43 +0100
committer: Stephen Dolan <mu@netsoc.tcd.ie> 2012-09-18 17:44:43 +0100
commit: a4eea165bbab6d13f89b59707e835d58b7014a66 (patch)
tree: b99ee5dde8540f8dbe5de3d87b99e04ac4dd2673 /jv_unicode.c
parent: 25cbab056b1f73e96b636c88779a92400d92dc15 (diff)
1 files changed, 64 insertions, 0 deletions
diff --git a/jv_unicode.c b/jv_unicode.c
new file mode 100644
index 00000000..b1417a2a
--- /dev/null
+++ b/jv_unicode.c
@@ -0,0 +1,64 @@
+#include <stdio.h>
+#include <assert.h>
+#include "jv_unicode.h"
+#include "jv_utf8_tables.gen.h"
+
+/*
+ * Decode a single UTF-8 sequence from the byte range [in, end).
+ *
+ * in        - first byte of the sequence to decode
+ * end       - one past the last readable byte
+ * codepoint - out-parameter; receives the decoded value, 0 when the input
+ *             is exhausted (in == end), or -1 when the bytes at `in` are
+ *             not an acceptable sequence
+ *
+ * Returns a pointer to the byte following the decoded sequence, or 0 when
+ * nothing was decoded (end of input or invalid input; *codepoint tells the
+ * two apart).
+ *
+ * Rejected input: a lead byte whose utf8_coding_length entry is 0 or
+ * UTF8_CONTINUATION_BYTE, a sequence that runs past `end`, a trailing byte
+ * that is not a continuation byte, an overlong encoding, and any value
+ * above 0x10FFFF (the largest value jvp_utf8_encode accepts).
+ *
+ * NOTE(review): surrogate values (0xD800-0xDFFF) are still accepted, which
+ * keeps this decoder symmetric with jvp_utf8_encode; confirm whether callers
+ * want them rejected as well.
+ */
+const char* jvp_utf8_next(const char* in, const char* end, int* codepoint) {
+  /* Smallest value that legitimately needs a 2-, 3- or 4-byte encoding. */
+  static const int min_for_length[] = {0, 0, 0x80, 0x800, 0x10000};
+
+  if (in == end) {
+    /* Write through the pointer; assigning to the pointer itself would
+       leave the caller's variable untouched. */
+    *codepoint = 0;
+    return 0;
+  }
+  unsigned char first = (unsigned char)in[0];
+  int length = utf8_coding_length[first];
+  /* Compare against the number of remaining bytes rather than computing
+     in + length, which could point beyond one-past-the-end of the buffer. */
+  if (length == 0 || length == UTF8_CONTINUATION_BYTE || length > end - in) {
+    *codepoint = -1;
+    return 0;
+  }
+  /* Payload bits of the lead byte; `first` is already unsigned, so no
+     sign-extension can leak into the mask. */
+  int value = first & utf8_coding_bits[first];
+  for (int i = 1; i < length; i++) {
+    unsigned char ch = (unsigned char)in[i];
+    if (utf8_coding_length[ch] != UTF8_CONTINUATION_BYTE) {
+      *codepoint = -1;
+      return 0;
+    }
+    /* Each continuation byte contributes its low six bits. */
+    value = (value << 6) | (ch & 0x3f);
+  }
+  /* Overlong form: the value would have fit in a shorter sequence. */
+  if (length >= 2 && length <= 4 && value < min_for_length[length]) {
+    *codepoint = -1;
+    return 0;
+  }
+  /* Outside the range jvp_utf8_encode is able to write back. */
+  if (value > 0x10FFFF) {
+    *codepoint = -1;
+    return 0;
+  }
+  *codepoint = value;
+  return in + length;
+}
+
+/*
+ * Check the byte range [in, end) by decoding it one sequence at a time
+ * with jvp_utf8_next.
+ *
+ * Returns 0 if the decoder ever reports -1 as the codepoint, 1 otherwise
+ * (an empty range yields 1).
+ */
+int jvp_utf8_verify(const char* in, const char* end) {
+  int decoded = 0;
+  const char* cursor = in;
+  for (;;) {
+    cursor = jvp_utf8_next(cursor, end, &decoded);
+    if (cursor == 0) {
+      /* Decoder stopped: either clean end of input or an error. */
+      break;
+    }
+    if (decoded == -1) {
+      return 0;
+    }
+  }
+  /* -1 left behind by the final call means the stop was an error. */
+  return decoded == -1 ? 0 : 1;
+}
+
+/*
+ * Number of bytes jvp_utf8_encode writes for `codepoint`:
+ * 1 up to 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 for anything larger.
+ * No range check is performed here.
+ */
+int jvp_utf8_encode_length(int codepoint) {
+  if (codepoint > 0xFFFF) {
+    return 4;
+  }
+  if (codepoint > 0x7FF) {
+    return 3;
+  }
+  if (codepoint > 0x7F) {
+    return 2;
+  }
+  return 1;
+}
+
+/*
+ * Write the UTF-8 encoding of `codepoint` to `out` and return the number
+ * of bytes written (1 to 4). No terminator is appended.
+ *
+ * codepoint - value to encode; asserted to lie in 0..0x10FFFF
+ * out       - destination buffer; the caller must provide room for the
+ *             bytes written (jvp_utf8_encode_length gives the count)
+ */
+int jvp_utf8_encode(int codepoint, char* out) {
+  assert(codepoint >= 0 && codepoint <= 0x10FFFF);
+  char* cursor = out;
+  if (codepoint > 0xFFFF) {
+    /* 4 bytes: 3 + 6 + 6 + 6 payload bits */
+    *cursor++ = 0xF0 | ((codepoint >> 18) & 0x07);
+    *cursor++ = 0x80 | ((codepoint >> 12) & 0x3F);
+    *cursor++ = 0x80 | ((codepoint >> 6) & 0x3F);
+    *cursor++ = 0x80 | (codepoint & 0x3F);
+  } else if (codepoint > 0x7FF) {
+    /* 3 bytes: 4 + 6 + 6 payload bits */
+    *cursor++ = 0xE0 | ((codepoint >> 12) & 0x0F);
+    *cursor++ = 0x80 | ((codepoint >> 6) & 0x3F);
+    *cursor++ = 0x80 | (codepoint & 0x3F);
+  } else if (codepoint > 0x7F) {
+    /* 2 bytes: 5 + 6 payload bits */
+    *cursor++ = 0xC0 | ((codepoint >> 6) & 0x1F);
+    *cursor++ = 0x80 | (codepoint & 0x3F);
+  } else {
+    /* Single byte, stored as-is */
+    *cursor++ = codepoint;
+  }
+  /* The byte count must agree with the length helper. */
+  assert(cursor - out == jvp_utf8_encode_length(codepoint));
+  return cursor - out;
+}
author	Stephen Dolan <mu@netsoc.tcd.ie>	2012-09-18 17:44:43 +0100
committer	Stephen Dolan <mu@netsoc.tcd.ie>	2012-09-18 17:44:43 +0100
commit	a4eea165bbab6d13f89b59707e835d58b7014a66 (patch)
tree	b99ee5dde8540f8dbe5de3d87b99e04ac4dd2673 /jv_unicode.c
parent	25cbab056b1f73e96b636c88779a92400d92dc15 (diff)