summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpgen <p.gen.progs@gmail.com>2016-12-22 21:48:17 +0100
committerpgen <p.gen.progs@gmail.com>2016-12-22 21:51:12 +0100
commit22843756191b41875ace7c25f22523978100880a (patch)
treee7cfdc836fe37cfd5d84aa543f720c579e981369
parenteaf35966c23c02a4d6306b706de3b448be082dd9 (diff)
Replace the UTF8 validation routine with a more compact version
-rw-r--r--smenu.c188
1 files changed, 83 insertions, 105 deletions
diff --git a/smenu.c b/smenu.c
index 26db32a..d0a3f2a 100644
--- a/smenu.c
+++ b/smenu.c
@@ -112,7 +112,7 @@ static void get_terminal_size(int * const r, int * const c);
static char * mb_strprefix(char * d, char * s, int n, int * pos);
static int mb_strlen(char * str);
static wchar_t * mb_strtowcs(char * s);
-static void * validate_mb(const void * str);
+static int validate_mb(const char * str, int length);
static int outch(int c);
static void restore_term(int const fd);
static void setup_term(int const fd);
@@ -2288,124 +2288,102 @@ count_leading_set_bits(unsigned char c)
return 1;
}
-/* ============================================================= */
-/* Thank you Neil (https://github.com/sheredom) */
-/* Return NULL if the UTF-8 sequence is valid or the position of */
-/* the invalid UTF-8 codepoint on failure. */
-/* ============================================================= */
-void *
-validate_mb(const void * str)
+static const char trailing_bytes_for_utf8[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+/* ================================================================= */
+/* UTF8 validation routine inspired by Jeff Bezanson */
+/* placed in the public domain Fall 2005 */
+/* (https://github.com/JeffBezanson/cutef8) */
+/* */
+/* Returns 1 if str contains a valid UTF8 byte sequence, 0 otherwise */
+/* ================================================================= */
+static int
+validate_mb(const char * str, int length)
{
- const char * s = (const char *)str;
+ const unsigned char *p, *pend = (unsigned char *)str + length;
+ unsigned char c;
+ int ab;
- while ('\0' != *s)
+ for (p = (unsigned char *)str; p < pend; p++)
{
- if (0xf0 == (0xf8 & *s))
- {
- /* Ensure that each of the 3 following bytes in this 4-byte */
- /* UTF-8 codepoint bega, with 0b10xxxxxx */
- /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
- if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))
- || (0x80 != (0xc0 & s[3])))
- {
- return (void *)s;
- }
-
- /* Ensure that our UTF-8 codepoint ended after 4 bytes */
- /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
- if (0x80 == (0xc0 & s[4]))
- {
- return (void *)s;
- }
+ c = *p;
+ if (c < 128)
+ continue;
+ if ((c & 0xc0) != 0xc0)
+ return 0;
+ ab = trailing_bytes_for_utf8[c];
+ if (length < ab)
+ return 0;
+ length -= ab;
- /* Ensure that the top 5 bits of this 4-byte UTF-8 */
- /* codepoint were not 0, as then we could have used */
- /* one of the smaller encodings */
- /* """""""""""""""""""""""""""""""""""""""""""""""" */
- if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1])))
- {
- return (void *)s;
- }
+ p++;
+ /* Check top bits in the second byte */
+ /* """"""""""""""""""""""""""""""""" */
+ if ((*p & 0xc0) != 0x80)
+ return 0;
- /* 4-byte UTF-8 code point (began with 0b11110xxx) */
- /* """"""""""""""""""""""""""""""""""""""""""""""" */
- s += 4;
- }
- else if (0xe0 == (0xf0 & *s))
+ /* Check for overlong sequences for each different length */
+ /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */
+ switch (ab)
{
- /* Ensure each of the 2 following bytes in this 3-byte */
- /* UTF-8 codepoint began with 0b10xxxxxx */
- /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
- if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])))
- {
- return (void *)s;
- }
-
- /* Ensure that our UTF-8 codepoint ended after 3 bytes */
- /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
- if (0x80 == (0xc0 & s[3]))
- {
- return (void *)s;
- }
-
- /* Ensure that the top 5 bits of this 3-byte UTF-8 */
- /* codepoint were not 0, as then we could have used */
- /* one of the smaller encodings */
- /* """""""""""""""""""""""""""""""""""""""""""""""" */
- if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1])))
- {
- return (void *)s;
- }
+ /* Check for xx00 000x */
+ /* """"""""""""""""""" */
+ case 1:
+ if ((c & 0x3e) == 0)
+ return 0;
+ continue; /* We know there aren't any more bytes to check */
- /* 3-byte UTF-8 code point (began with 0b1110xxxx) */
- /* """"""""""""""""""""""""""""""""""""""""""""""" */
- s += 3;
- }
- else if (0xc0 == (0xe0 & *s))
- {
- /* Ensure the 1 following byte in this 2-byte */
- /* UTF-8 codepoint began with 0b10xxxxxx */
- /* """""""""""""""""""""""""""""""""""""""""" */
- if (0x80 != (0xc0 & s[1]))
- {
- return (void *)s;
- }
+ /* Check for 1110 0000, xx0x xxxx */
+ /* """""""""""""""""""""""""""""" */
+ case 2:
+ if (c == 0xe0 && (*p & 0x20) == 0)
+ return 0;
+ break;
- /* Ensure that our UTF-8 codepoint ended after 2 bytes */
- /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
- if (0x80 == (0xc0 & s[2]))
- {
- return (void *)s;
- }
+ /* Check for 1111 0000, xx00 xxxx */
+ /* """""""""""""""""""""""""""""" */
+ case 3:
+ if (c == 0xf0 && (*p & 0x30) == 0)
+ return 0;
+ break;
- /* Ensure that the top 4 bits of this 2-byte UTD-8 */
- /* codepoint were not 0, as then we could have used */
- /* one of the smaller encodings */
- /* """""""""""""""""""""""""""""""""""""""""""""""" */
- if (0 == (0x1e & s[0]))
- {
- return (void *)s;
- }
+ /* Check for 1111 1000, xx00 0xxx */
+ /* """""""""""""""""""""""""""""" */
+ case 4:
+ if (c == 0xf8 && (*p & 0x38) == 0)
+ return 0;
+ break;
- /* 2-byte UTF-8 code point (began with 0b110xxxxx) */
- /* """"""""""""""""""""""""""""""""""""""""""""""" */
- s += 2;
- }
- else if (0x00 == (0x80 & *s))
- {
- /* 1-byte ascii (began with 0b0xxxxxxx) */
- /* """""""""""""""""""""""""""""""""""" */
- s += 1;
+ /* Check for leading 0xfe or 0xff, */
+ /* and then for 1111 1100, xx00 00xx */
+ /* """"""""""""""""""""""""""""""""" */
+ case 5:
+ if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0))
+ return 0;
+ break;
}
- else
+
+ /* Check for valid bytes after the 2nd, if any; all must start 10 */
+ /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+ while (--ab > 0)
{
- /* We have an invalid 0b1xxxxxxx UTF-8 code point entry */
- /* """""""""""""""""""""""""""""""""""""""""""""""""""" */
- return (void *)s;
+ if ((*(++p) & 0xc0) != 0x80)
+ return 0;
}
}
- return NULL;
+ return 1;
}
/* =============================================== */
@@ -2878,7 +2856,7 @@ get_bytes(FILE * input, char * mb_buffer, ll_t * word_delims_list,
/* In this case the original sequence is lost (unsupported */
/* encoding). */
/* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
- if (langinfo->utf8 && validate_mb(mb_buffer) != NULL)
+ if (langinfo->utf8 && !validate_mb(mb_buffer, last))
{
byte = mb_buffer[0] = '.';
mb_buffer[1] = '\0';