Replace the UTF8 validation routine with a more compact version

author: pgen <p.gen.progs@gmail.com> 2016-12-22 21:48:17 +0100
committer: pgen <p.gen.progs@gmail.com> 2016-12-22 21:51:12 +0100
commit: 22843756191b41875ace7c25f22523978100880a (patch)
tree: e7cfdc836fe37cfd5d84aa543f720c579e981369
parent: eaf35966c23c02a4d6306b706de3b448be082dd9 (diff)
1 files changed, 83 insertions, 105 deletions
diff --git a/smenu.c b/smenu.c
index 26db32a..d0a3f2a 100644
--- a/smenu.c
+++ b/smenu.c
@@ -112,7 +112,7 @@ static void get_terminal_size(int * const r, int * const c);
 static char * mb_strprefix(char * d, char * s, int n, int * pos);
 static int mb_strlen(char * str);
 static wchar_t * mb_strtowcs(char * s);
-static void * validate_mb(const void * str);
+static int validate_mb(const char * str, int length);
 static int outch(int c);
 static void restore_term(int const fd);
 static void setup_term(int const fd);
@@ -2288,124 +2288,102 @@ count_leading_set_bits(unsigned char c)
     return 1;
 }
 
-/* ============================================================= */
-/* Thank you Neil (https://github.com/sheredom)                  */
-/* Return NULL if the UTF-8 sequence is valid or the position of */
-/* the invalid UTF-8 codepoint on failure.                       */
-/* ============================================================= */
-void *
-validate_mb(const void * str)
+static const char trailing_bytes_for_utf8[256] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+/* ================================================================= */
+/* UTF8 validation routine inspired by Jeff Bezanson                 */
+/*   placed in the public domain Fall 2005                           */
+/*   (https://github.com/JeffBezanson/cutef8)                        */
+/*                                                                   */
+/* Returns 1 if str contains a valid UTF8 byte sequence, 0 otherwise */
+/* ================================================================= */
+static int
+validate_mb(const char * str, int length)
 {
-  const char * s = (const char *)str;
+  const unsigned char *p, *pend = (unsigned char *)str + length;
+  unsigned char        c;
+  int                  ab;
 
-  while ('\0' != *s)
+  for (p = (unsigned char *)str; p < pend; p++)
   {
-    if (0xf0 == (0xf8 & *s))
-    {
-      /* Ensure that each of the 3 following bytes in this 4-byte */
-      /* UTF-8 codepoint bega, with 0b10xxxxxx                    */
-      /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
-      if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))
-          || (0x80 != (0xc0 & s[3])))
-      {
-        return (void *)s;
-      }
-
-      /* Ensure that our UTF-8 codepoint ended after 4 bytes */
-      /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
-      if (0x80 == (0xc0 & s[4]))
-      {
-        return (void *)s;
-      }
+    c = *p;
+    if (c < 128)
+      continue;
+    if ((c & 0xc0) != 0xc0)
+      return 0;
+    ab = trailing_bytes_for_utf8[c];
+    if (length < ab)
+      return 0;
+    length -= ab;
 
-      /* Ensure that the top 5 bits of this 4-byte UTF-8  */
-      /* codepoint were not 0, as then we could have used */
-      /* one of the smaller encodings                     */
-      /* """""""""""""""""""""""""""""""""""""""""""""""" */
-      if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1])))
-      {
-        return (void *)s;
-      }
+    p++;
+    /* Check top bits in the second byte */
+    /* """"""""""""""""""""""""""""""""" */
+    if ((*p & 0xc0) != 0x80)
+      return 0;
 
-      /* 4-byte UTF-8 code point (began with 0b11110xxx) */
-      /* """"""""""""""""""""""""""""""""""""""""""""""" */
-      s += 4;
-    }
-    else if (0xe0 == (0xf0 & *s))
+    /* Check for overlong sequences for each different length */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    switch (ab)
     {
-      /* Ensure each of the 2 following bytes in this 3-byte */
-      /* UTF-8 codepoint began with 0b10xxxxxx               */
-      /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
-      if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])))
-      {
-        return (void *)s;
-      }
-
-      /* Ensure that our UTF-8 codepoint ended after 3 bytes */
-      /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
-      if (0x80 == (0xc0 & s[3]))
-      {
-        return (void *)s;
-      }
-
-      /* Ensure that the top 5 bits of this 3-byte UTF-8  */
-      /* codepoint were not 0, as then we could have used */
-      /* one of the smaller encodings                     */
-      /* """""""""""""""""""""""""""""""""""""""""""""""" */
-      if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1])))
-      {
-        return (void *)s;
-      }
+      /* Check for xx00 000x */
+      /* """"""""""""""""""" */
+      case 1:
+        if ((c & 0x3e) == 0)
+          return 0;
+        continue; /* We know there aren't any more bytes to check */
 
-      /* 3-byte UTF-8 code point (began with 0b1110xxxx) */
-      /* """"""""""""""""""""""""""""""""""""""""""""""" */
-      s += 3;
-    }
-    else if (0xc0 == (0xe0 & *s))
-    {
-      /* Ensure the 1 following byte in this 2-byte */
-      /* UTF-8 codepoint began with 0b10xxxxxx      */
-      /* """""""""""""""""""""""""""""""""""""""""" */
-      if (0x80 != (0xc0 & s[1]))
-      {
-        return (void *)s;
-      }
+      /* Check for 1110 0000, xx0x xxxx */
+      /* """""""""""""""""""""""""""""" */
+      case 2:
+        if (c == 0xe0 && (*p & 0x20) == 0)
+          return 0;
+        break;
 
-      /* Ensure that our UTF-8 codepoint ended after 2 bytes */
-      /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
-      if (0x80 == (0xc0 & s[2]))
-      {
-        return (void *)s;
-      }
+      /* Check for 1111 0000, xx00 xxxx */
+      /* """""""""""""""""""""""""""""" */
+      case 3:
+        if (c == 0xf0 && (*p & 0x30) == 0)
+          return 0;
+        break;
 
-      /* Ensure that the top 4 bits of this 2-byte UTD-8  */
-      /* codepoint were not 0, as then we could have used */
-      /* one of the smaller encodings                     */
-      /* """""""""""""""""""""""""""""""""""""""""""""""" */
-      if (0 == (0x1e & s[0]))
-      {
-        return (void *)s;
-      }
+      /* Check for 1111 1000, xx00 0xxx */
+      /* """""""""""""""""""""""""""""" */
+      case 4:
+        if (c == 0xf8 && (*p & 0x38) == 0)
+          return 0;
+        break;
 
-      /* 2-byte UTF-8 code point (began with 0b110xxxxx) */
-      /* """"""""""""""""""""""""""""""""""""""""""""""" */
-      s += 2;
-    }
-    else if (0x00 == (0x80 & *s))
-    {
-      /* 1-byte ascii (began with 0b0xxxxxxx) */
-      /* """""""""""""""""""""""""""""""""""" */
-      s += 1;
+      /* Check for leading 0xfe or 0xff,   */
+      /* and then for 1111 1100, xx00 00xx */
+      /* """"""""""""""""""""""""""""""""" */
+      case 5:
+        if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0))
+          return 0;
+        break;
     }
-    else
+
+    /* Check for valid bytes after the 2nd, if any; all must start 10 */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    while (--ab > 0)
     {
-      /* We have an invalid 0b1xxxxxxx UTF-8 code point entry */
-      /* """""""""""""""""""""""""""""""""""""""""""""""""""" */
-      return (void *)s;
+      if ((*(++p) & 0xc0) != 0x80)
+        return 0;
     }
   }
 
-  return NULL;
+  return 1;
 }
 
 /* =============================================== */
@@ -2878,7 +2856,7 @@ get_bytes(FILE * input, char * mb_buffer, ll_t * word_delims_list,
   /* In this case the original sequence is lost (unsupported  */
   /* encoding).                                               */
   /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
-  if (langinfo->utf8 && validate_mb(mb_buffer) != NULL)
+  if (langinfo->utf8 && !validate_mb(mb_buffer, last))
   {
     byte = mb_buffer[0] = '.';
     mb_buffer[1]        = '\0';
author	pgen <p.gen.progs@gmail.com>	2016-12-22 21:48:17 +0100
committer	pgen <p.gen.progs@gmail.com>	2016-12-22 21:51:12 +0100
commit	22843756191b41875ace7c25f22523978100880a (patch)
tree	e7cfdc836fe37cfd5d84aa543f720c579e981369
parent	eaf35966c23c02a4d6306b706de3b448be082dd9 (diff)