summaryrefslogtreecommitdiffstats
path: root/utf8.c
diff options
context:
space:
mode:
authorpgen <p.gen.progs@gmail.com>2018-09-27 23:16:48 +0200
committerpgen <p.gen.progs@gmail.com>2018-10-01 19:36:56 +0200
commitf07e9d15916ac14721c9819c6d245a28f685f42b (patch)
tree20bf5c45117afcfeacf0e4f66d19ccf68c9e3d4f /utf8.c
parent1b52c4c75422742f04363048dda64999adb8402c (diff)
Create utf8.[ch] and change code accordingly
Change multibyte to utf8 to specify that we only support UTF-8 encoding.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c437
1 files changed, 437 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..1507660
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <langinfo.h>
+#include "xmalloc.h"
+#include "utf8.h"
+
+/* ======================================================================== */
+/* Tells if the two characters found at p are both hexadecimal digits.      */
+/* ======================================================================== */
+static int
+utf8_is_hex_pair(const char * p)
+{
+  /* The casts keep bytes >= 0x80 in the domain accepted by <ctype.h> */
+  /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  return isxdigit((unsigned char)p[0]) && isxdigit((unsigned char)p[1]);
+}
+
+/* ======================================================================== */
+/* Returns the byte spelled by the two hexadecimal digits found at p.       */
+/* The caller must have checked them with utf8_is_hex_pair before.          */
+/* ======================================================================== */
+static unsigned char
+utf8_hex_pair_to_byte(const char * p)
+{
+  char pair[3];
+
+  pair[0] = p[0];
+  pair[1] = p[1];
+  pair[2] = '\0';
+
+  return (unsigned char)strtoul(pair, NULL, 16);
+}
+
+/* ======================================================================== */
+/* Unicode (UTF-8) ascii representation interpreter.                        */
+/* Every \u escape found in s is replaced in place by the UTF-8 character   */
+/* spelled by the hexadecimal digits which follow it: two digits per byte,  */
+/* the first byte giving the number of bytes to expect (\uxx, \uxxxx,       */
+/* \uxxxxxx or \uxxxxxxxx).                                                 */
+/* The string is altered but never grows nor moves in memory.               */
+/*                                                                          */
+/* An escape is replaced by a single dot when:                              */
+/* - the two characters after \u are not hexadecimal digits (these two      */
+/*   characters are consumed with the escape),                              */
+/* - one of the following bytes is not spelled by two hexadecimal digits,   */
+/* - the bytes do not form a valid UTF-8 character,                         */
+/* - langinfo is NULL or its utf8 member is not set.                        */
+/* An escape cut short by the end of the string is replaced by a dot which  */
+/* becomes the last character of the string.                                */
+/*                                                                          */
+/* s        (in/out) string to interpret, nothing is done when it is NULL.  */
+/* langinfo (in)     only its utf8 member is read.                          */
+/* ======================================================================== */
+void
+utf8_interpret(char * s, langinfo_t * langinfo)
+{
+  char * cursor; /* where the search for the next \u resumes */
+  char * seq;    /* start of the \u escape being processed   */
+
+  /* Guard against the case where s is NULL */
+  /* """""""""""""""""""""""""""""""""""""" */
+  if (s == NULL)
+    return;
+
+  /* The search resumes after each replacement so that the bytes just */
+  /* produced are never interpreted a second time.                    */
+  /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  cursor = s;
+  while ((seq = strstr(cursor, "\\u")) != NULL)
+  {
+    size_t        seq_to_eos_len = strlen(seq); /* bytes from \u to the end */
+    char *        hex            = seq + 2;     /* first hexadecimal digit  */
+    unsigned char utf8_bytes[8]  = { 0 };       /* decoded bytes, the spare *
+                                                 * room remains zeroed      */
+    size_t        utf8_len;                     /* expected bytes: 1 to 4   */
+    size_t        out_len;                      /* bytes written in place   */
+    size_t        i;
+    int           valid;
+
+    /* Not even enough room for \uxx: put a dot and end the string there */
+    /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    if (seq_to_eos_len < 4)
+    {
+      seq[0] = '.';
+      seq[1] = '\0';
+      break;
+    }
+
+    /* Invalid first byte: the four characters \uxx collapse into a dot, */
+    /* the remaining of the string (trailing \0 included) follows it.    */
+    /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    if (!utf8_is_hex_pair(hex))
+    {
+      seq[0] = '.';
+      memmove(seq + 1, seq + 4, seq_to_eos_len - 4 + 1);
+      cursor = seq + 1;
+      continue;
+    }
+
+    /* The first byte is valid, deduce from it the length of the sequence */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    utf8_bytes[0] = utf8_hex_pair_to_byte(hex);
+    utf8_len      = (size_t)utf8_get_length(utf8_bytes[0]);
+
+    /* Check again if the string is long enough to hold all the digits */
+    /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    if (seq_to_eos_len - 2 < 2 * utf8_len)
+    {
+      seq[0] = '.';
+      seq[1] = '\0';
+      break;
+    }
+
+    /* Decode the remaining bytes, giving up on the first pair which is */
+    /* not made of hexadecimal digits.                                  */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    valid = (langinfo != NULL && langinfo->utf8);
+    for (i = 1; valid && i < utf8_len; i++)
+    {
+      if (utf8_is_hex_pair(hex + 2 * i))
+        utf8_bytes[i] = utf8_hex_pair_to_byte(hex + 2 * i);
+      else
+        valid = 0;
+    }
+
+    /* Do they form a valid UTF-8 char? */
+    /* """""""""""""""""""""""""""""""" */
+    if (valid)
+      valid = utf8_validate((const char *)utf8_bytes, utf8_len);
+
+    if (valid)
+    {
+      memcpy(seq, utf8_bytes, utf8_len);
+      out_len = utf8_len;
+    }
+    else
+    {
+      /* The invalid sequence is replaced by a dot */
+      /* """"""""""""""""""""""""""""""""""""""""" */
+      seq[0]  = '.';
+      out_len = 1;
+    }
+
+    /* Close the gap left by the escape, the + 1 carries the trailing \0 */
+    /* so the string always remains properly terminated.                 */
+    /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    memmove(seq + out_len, hex + 2 * utf8_len,
+            seq_to_eos_len - 2 - 2 * utf8_len + 1);
+
+    cursor = seq + out_len;
+  }
+}
+
+/* ============================================================ */
+/* Number of bytes announced by the first byte c of a character */
+/* (UTF-8):                                                     */
+/*   0xf0 and above -> 4                                        */
+/*   0xe0 to 0xef   -> 3                                        */
+/*   0xc2 to 0xdf   -> 2                                        */
+/*   anything else  -> 1                                        */
+/* ============================================================ */
+int
+utf8_get_length(unsigned char c)
+{
+  int length = 1;
+
+  /* Each threshold crossed by c adds one byte to the sequence */
+  /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  if (c >= 0xc2)
+    length++;
+  if (c >= 0xe0)
+    length++;
+  if (c >= 0xf0)
+    length++;
+
+  return length;
+}
+
+/* ==================================================================== */
+/* Return the byte offset of the nth UTF-8 glyph in s.                  */
+/* A glyph is one byte, followed by up to three continuation bytes      */
+/* (10xxxxxx) when this first byte has its high bit set.                */
+/* The scan never goes past the end of s: when s holds fewer than n     */
+/* glyphs, the offset of its terminating \0 is returned.                */
+/* ==================================================================== */
+size_t
+utf8_offset(char * s, size_t n)
+{
+  size_t i = 0;
+
+  while (n > 0 && s[i] != '\0')
+  {
+    if (s[i++] & 0x80)
+    {
+      int extra;
+
+      /* Skip the continuation bytes really present; the trailing \0 */
+      /* is not one of them so a truncated glyph ends the scan.      */
+      /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+      for (extra = 0; extra < 3 && (s[i] & 0xc0) == 0x80; extra++)
+        i++;
+    }
+    n--;
+  }
+  return i;
+}
+
+/* ================================================================== */
+/* Points to the previous UTF-8 glyph in a string from the given      */
+/* position.                                                          */
+/*                                                                    */
+/* str is the beginning of the string and p a position inside it.     */
+/* When p lies in the middle of a glyph, it is first moved back to    */
+/* the start of this glyph. Returns a pointer to the first byte of    */
+/* the glyph located before, or NULL when there is none.              */
+/* The bytes located before str are never read.                       */
+/* ================================================================== */
+char *
+utf8_prev(const char * str, const char * p)
+{
+  /* Move back to the first byte of the current glyph */
+  /* """""""""""""""""""""""""""""""""""""""""""""""" */
+  while (p > str && (*p & 0xc0) == 0x80)
+    p--;
+
+  /* Then look backward for the first byte which is not a continuation */
+  /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  while (p > str)
+  {
+    p--;
+    if ((*p & 0xc0) != 0x80)
+      return (char *)p;
+  }
+  return NULL;
+}
+
+/* ============================================================== */
+/* Points to the next UTF-8 glyph in a string from the current    */
+/* position p.                                                    */
+/* Returns NULL when p is already at the end of the string or     */
+/* when the current glyph is the last one.                        */
+/* ============================================================== */
+char *
+utf8_next(char * p)
+{
+  if (*p == '\0')
+    return NULL;
+
+  /* Leave the current byte then skip its continuation bytes */
+  /* """"""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  do
+  {
+    p++;
+  } while ((*p & 0xc0) == 0x80);
+
+  if (*p == '\0')
+    return NULL;
+
+  return p;
+}
+
+/* ================================================================= */
+/* Replace any UTF-8 glyph present in s by a dot in-place.           */
+/* s will be modified but its address in memory will not change.     */
+/*                                                                   */
+/* A glyph starts with a byte for which utf8_get_length announces    */
+/* more than one byte; this byte and the continuation bytes          */
+/* (10xxxxxx) really following it, within the announced length, are  */
+/* collapsed into one dot. A glyph cut short by the end of the       */
+/* string or by an unexpected byte only consumes the bytes which     */
+/* belong to it. All the other bytes are kept untouched.             */
+/* Nothing is done when s is NULL.                                   */
+/* ================================================================= */
+void
+utf8_sanitize(char * s)
+{
+  char * rd; /* next byte to examine */
+  char * wr; /* next byte to write   */
+
+  if (s == NULL)
+    return;
+
+  /* wr never overtakes rd as a glyph always shrinks to a single dot */
+  /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+  rd = s;
+  wr = s;
+  while (*rd != '\0')
+  {
+    int n = utf8_get_length((unsigned char)*rd);
+
+    if (n > 1)
+    {
+      int consumed = 1;
+
+      *wr++ = '.';
+      rd++;
+
+      /* The trailing \0 is not a continuation byte: this loop cannot */
+      /* run past the end of the string.                              */
+      /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+      while (consumed < n && (*rd & 0xc0) == 0x80)
+      {
+        rd++;
+        consumed++;
+      }
+    }
+    else
+      *wr++ = *rd++;
+  }
+  *wr = '\0';
+}
+
+/* ==================================================================== */
+/* Number of bytes expected after a given first byte, indexed by the    */
+/* value of this first byte.                                            */
+/* ==================================================================== */
+static const char trailing_bytes_for_utf8[256] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+/* ==================================================================== */
+/* UTF-8 validation routine inspired by Jeff Bezanson                   */
+/* placed in the public domain Fall 2005                                */
+/* (https://github.com/JeffBezanson/cutef8)                             */
+/*                                                                      */
+/* Only the first length bytes of str are examined; str does not need   */
+/* to be \0-terminated and no byte located after them is ever read.     */
+/*                                                                      */
+/* Returns 1 if these bytes only hold well-formed UTF-8 sequences as    */
+/* defined by RFC 3629, 0 otherwise. Are rejected:                      */
+/* - continuation bytes found where a first byte is expected,           */
+/* - sequences cut short by the end of the buffer,                      */
+/* - overlong encodings,                                                */
+/* - UTF-16 surrogates (U+D800 to U+DFFF),                              */
+/* - code points above U+10FFFF and the old 5 and 6 bytes forms.        */
+/* An empty buffer is valid.                                            */
+/* ==================================================================== */
+int
+utf8_validate(const char * str, size_t length)
+{
+  const unsigned char * p    = (const unsigned char *)str;
+  const unsigned char * pend = p + length;
+
+  while (p < pend)
+  {
+    unsigned char c = *p++; /* first byte of the sequence  */
+    unsigned char c2;       /* second byte of the sequence */
+    size_t        ab;       /* bytes expected after c      */
+
+    if (c < 0x80)
+      continue;
+
+    /* 0x80-0xbf are continuation bytes, 0xc0 and 0xc1 can only start */
+    /* an overlong sequence and nothing above 0xf4 can encode a code  */
+    /* point up to U+10FFFF.                                          */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    if (c < 0xc2 || c > 0xf4)
+      return 0;
+
+    /* All the expected bytes must be inside the buffer */
+    /* """""""""""""""""""""""""""""""""""""""""""""""" */
+    ab = (size_t)trailing_bytes_for_utf8[c];
+    if ((size_t)(pend - p) < ab)
+      return 0;
+
+    /* Check top bits in the second byte */
+    /* """"""""""""""""""""""""""""""""" */
+    c2 = *p++;
+    if ((c2 & 0xc0) != 0x80)
+      return 0;
+
+    /* The second byte range is narrower after some first bytes */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    switch (ab)
+    {
+      case 2:
+        if (c == 0xe0 && c2 < 0xa0) /* overlong           */
+          return 0;
+        if (c == 0xed && c2 > 0x9f) /* UTF-16 surrogates  */
+          return 0;
+        break;
+
+      case 3:
+        if (c == 0xf0 && c2 < 0x90) /* overlong           */
+          return 0;
+        if (c == 0xf4 && c2 > 0x8f) /* above U+10FFFF     */
+          return 0;
+        break;
+
+      default: /* 2 bytes sequences need no more check */
+        break;
+    }
+
+    /* Check for valid bytes after the 2nd, if any; all must start 10 */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    while (--ab > 0)
+    {
+      if ((*p++ & 0xc0) != 0x80)
+        return 0;
+    }
+  }
+
+  return 1;
+}
+
+/* ============================================================== */
+/* Multibyte UTF-8 strlen: returns the number of bytes of str     */
+/* which are not continuation bytes (10xxxxxx), that is to say    */
+/* the number of glyphs when str holds valid UTF-8.               */
+/* ============================================================== */
+size_t
+utf8_strlen(char * str)
+{
+  const char * byte;
+  size_t       glyphs = 0;
+
+  for (byte = str; *byte != '\0'; byte++)
+  {
+    /* Only the first byte of each glyph is counted */
+    /* """""""""""""""""""""""""""""""""""""""""""" */
+    if ((*byte & 0xc0) == 0x80)
+      continue;
+
+    glyphs++;
+  }
+  return glyphs;
+}
+
+/* =================================================================== */
+/* Multibytes extraction of the prefix of n UTF-8 glyphs from a string */
+/*                                                                     */
+/* d   (out) destination, must have been allocated before and be able  */
+/*           to hold the prefix and its trailing \0.                   */
+/* s   (in)  source string.                                            */
+/* n   (in)  number of glyphs wanted; the whole string is copied when  */
+/*           it holds fewer glyphs, nothing when n is not positive.    */
+/* pos (out) number of bytes copied, that is to say the position in s  */
+/*           AFTER the prefix.                                         */
+/*                                                                     */
+/* Returns d.                                                          */
+/* =================================================================== */
+char *
+utf8_strprefix(char * d, char * s, long n, long * pos)
+{
+  long bytes = 0; /* bytes copied so far */
+  long glyphs;    /* glyphs copied so far */
+
+  *pos = 0;
+
+  for (glyphs = 0; glyphs < n && s[bytes] != '\0'; glyphs++)
+  {
+    /* Copy the first byte of the glyph then all its continuation */
+    /* bytes; the trailing \0 is not one of them.                 */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    do
+    {
+      d[bytes] = s[bytes];
+      bytes++;
+    } while ((s[bytes] & 0xC0) == 0x80);
+  }
+
+  *pos = bytes;
+
+  d[bytes] = '\0';
+
+  return d;
+}
+
+/* ==================================================================== */
+/* Convert a UTF-8 glyph string to a wchar_t string.                    */
+/*                                                                      */
+/* The glyphs are decoded with mbtowc, hence according to the current   */
+/* locale. A byte which cannot be decoded is stored as is in one wide   */
+/* character and the conversion goes on with the next byte.             */
+/*                                                                      */
+/* Returns a \0-terminated wide string obtained from xmalloc; it is     */
+/* sized for one wide character per byte of s plus the terminator.      */
+/* ==================================================================== */
+wchar_t *
+utf8_strtowcs(char * s)
+{
+  size_t       size = strlen(s);
+  size_t       left = size; /* bytes of s not converted yet */
+  const char * ch   = s;
+  wchar_t *    w    = xmalloc((size + 1) * sizeof(wchar_t));
+  wchar_t *    wptr = w;
+
+  while (left > 0)
+  {
+    /* Never allow mbtowc to look past the end of s */
+    /* """""""""""""""""""""""""""""""""""""""""""" */
+    int converted = mbtowc(wptr, ch, left < 4 ? left : 4);
+
+    if (converted <= 0)
+    {
+      /* Undecodable byte: put the decoder back in its initial state */
+      /* and keep the raw byte.                                      */
+      /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+      (void)mbtowc(NULL, NULL, 0);
+      *wptr     = (wchar_t)(unsigned char)*ch;
+      converted = 1;
+    }
+
+    wptr++;
+    ch += converted;
+    left -= (size_t)converted;
+  }
+
+  *wptr = L'\0';
+
+  return w;
+}
+
+/* ================================================================ */
+/* Fill dst with a copy of src in which every ASCII character has   */
+/* been converted to lowercase with tolower; the bytes >= 0x80 are  */
+/* copied unchanged.                                                */
+/* dst must be preallocated before the call and be large enough to  */
+/* hold src and its trailing \0.                                    */
+/* ================================================================ */
+void
+utf8_strtolower(char * dst, char * src)
+{
+  for (; *src != '\0'; src++, dst++)
+  {
+    unsigned char byte = (unsigned char)*src;
+
+    /* Bytes of multibyte glyphs must not go through tolower */
+    /* """"""""""""""""""""""""""""""""""""""""""""""""""""" */
+    *dst = (byte < 0x80) ? (char)tolower(byte) : (char)byte;
+  }
+  *dst = '\0';
+}