summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpgen <p.gen.progs@gmail.com>2021-02-19 22:08:22 +0100
committerpgen <p.gen.progs@gmail.com>2021-02-19 23:31:04 +0100
commitdf9f37847eca8da7dbae177b8e70582af0ea8c6f (patch)
treeed31655543b01fe30bdc71f34961ec772b62bec1
parent0a398c7bb7deab04f61861fd4423c289a4a5db0b (diff)
Change the implementation of utf8_validate
Thanks to Markus Kuhn for this code. The old implementation was not foolproof.
-rw-r--r--smenu.c4
-rw-r--r--utf8.c150
-rw-r--r--utf8.h4
3 files changed, 66 insertions, 92 deletions
diff --git a/smenu.c b/smenu.c
index fed9515..8e80111 100644
--- a/smenu.c
+++ b/smenu.c
@@ -2398,7 +2398,7 @@ get_bytes(FILE * input, char * utf8_buffer, ll_t * zapped_glyphs_list,
/* In this case the original sequence is lost (unsupported */
/* encoding). */
/* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
- if (langinfo->utf8 && !utf8_validate(utf8_buffer, last))
+ if (langinfo->utf8 && utf8_validate(utf8_buffer) != NULL)
{
byte = utf8_buffer[0] = misc->invalid_char_substitute;
utf8_buffer[1] = '\0';
@@ -3025,6 +3025,8 @@ build_metadata(term_t * term, long count, win_t * win)
while (i < count)
{
/* Determine the number of screen positions used by the word. */
+ /* Note: mbstowcs will always succeed here as word_a[i].str */
+ /* has already been utf8_validated/repaired. */
/* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
word_len = mbstowcs(NULL, word_a[i].str, 0);
word_width = wcswidth((w = utf8_strtowcs(word_a[i].str)), word_len);
diff --git a/utf8.c b/utf8.c
index d33e9ab..03c9514 100644
--- a/utf8.c
+++ b/utf8.c
@@ -240,7 +240,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
/* Do they form a valid UTF-8 char? */
/* '''''''''''''''''''''''''''''''' */
- if (utf8_validate(tmp, utf8_ascii_len / 2))
+ if (utf8_validate(tmp) == NULL)
{
/* Put them back in the original string and move */
/* the remaining bytes after them. */
@@ -382,102 +382,74 @@ utf8_sanitize(char * s, char substitute)
}
}
-static const char trailing_bytes_for_utf8[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
-};
-
-/* =================================================================== */
-/* UTF-8 validation routine inspired by Jeff Bezanson */
-/* placed in the public domain Fall 2005 */
-/* (https://github.com/JeffBezanson/cutef8). */
-/* */
-/* Returns 1 if str contains a valid UTF-8 byte sequence, 0 otherwise. */
-/* =================================================================== */
-int
-utf8_validate(const char * str, size_t length)
+/* ======================================================================= */
+/* The utf8_validate() function scans the '\0'-terminated string starting */
+/* at s. */
+/* It returns a pointer to the first byte of the first malformed */
+/* or overlong UTF-8 sequence found, or NULL if the string contains only */
+/* correct UTF-8. */
+/* It also spots UTF-8 sequences that could cause trouble if converted to */
+/* UTF-16, namely surrogate characters (U+D800..U+DFFF) and non-Unicode */
+/* positions (U+FFFE..U+FFFF). */
+/* This routine is very likely to find a malformed sequence if the input */
+/* uses any other encoding than UTF-8. */
+/* It therefore can be used as a very effective heuristic for */
+/* distinguishing between UTF-8 and other encodings. */
+/* */
+/* I wrote this code mainly as a specification of functionality; there */
+/* are no doubt performance optimizations possible for certain CPUs. */
+/* */
+/* Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30 */
+/* License: http://www.cl.cam.ac.uk/~mgk25/short-license.html */
+/* ======================================================================= */
+unsigned char *
+utf8_validate(unsigned char * s)
{
- const unsigned char *p, *pend = (const unsigned char *)str + length;
- unsigned char c;
- size_t ab;
-
- for (p = (const unsigned char *)str; p < pend; p++)
+ /* clang-format off */
+ while (*s)
{
- c = *p;
- if (c < 128)
- continue;
- if ((c & 0xc0) != 0xc0)
- return 0;
- ab = trailing_bytes_for_utf8[c];
- if (length < ab)
- return 0;
- length -= ab;
-
- p++;
- /* Check top bits in the second byte. */
- /* """""""""""""""""""""""""""""""""" */
- if ((*p & 0xc0) != 0x80)
- return 0;
-
- /* Check for overlong sequences for each different length. */
- /* """"""""""""""""""""""""""""""""""""""""""""""""""""""" */
- switch (ab)
+ if (*s < 0x80)
+ /* 0xxxxxxx */
+ s++;
+ else if ((s[0] & 0xe0) == 0xc0)
{
- /* Check for xx00 000x. */
- /* """""""""""""""""""" */
- case 1:
- if ((c & 0x3e) == 0)
- return 0;
- continue; /* We know there aren't any more bytes to check. */
-
- /* Check for 1110 0000, xx0x xxxx. */
- /* """"""""""""""""""""""""""""""" */
- case 2:
- if (c == 0xe0 && (*p & 0x20) == 0)
- return 0;
- break;
-
- /* Check for 1111 0000, xx00 xxxx. */
- /* """"""""""""""""""""""""""""""" */
- case 3:
- if (c == 0xf0 && (*p & 0x30) == 0)
- return 0;
- break;
-
- /* Check for 1111 1000, xx00 0xxx. */
- /* """"""""""""""""""""""""""""""" */
- case 4:
- if (c == 0xf8 && (*p & 0x38) == 0)
- return 0;
- break;
-
- /* Check for leading 0xfe or 0xff, */
- /* and then for 1111 1100, xx00 00xx. */
- /* """""""""""""""""""""""""""""""""" */
- case 5:
- if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0))
- return 0;
- break;
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 || (s[0] & 0xfe) == 0xc0) /* overlong? */
+ return s;
+ else
+ s += 2;
}
-
- /* Check for valid bytes after the 2nd, if any; all must start with 10. */
- /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
- while (--ab > 0)
+ else if ((s[0] & 0xf0) == 0xe0)
{
- if ((*(++p) & 0xc0) != 0x80)
- return 0;
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return s;
+ else
+ s += 3;
}
+ else if ((s[0] & 0xf8) == 0xf0)
+ {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return s;
+ else
+ s += 4;
+ }
+ else
+ return s;
}
+ /* clang-format on */
- return 1;
+ return NULL;
}
/* ======================= */
diff --git a/utf8.h b/utf8.h
index a43d5af..855a3c8 100644
--- a/utf8.h
+++ b/utf8.h
@@ -42,8 +42,8 @@ cptoutf8(char * utf8_str, uint32_t c);
int
utf8_interpret(char * s, langinfo_t * langinfo, char sc);
-int
-utf8_validate(const char * str, size_t length);
+unsigned char *
+utf8_validate(unsigned char * str);
char *
utf8_prev(const char * str, const char * p);