Create utf8.[ch] and change code accordingly

Change multibyte to utf8 to specify that we only support UTF-8 encoding.
author: pgen <p.gen.progs@gmail.com> 2018-09-27 23:16:48 +0200
committer: pgen <p.gen.progs@gmail.com> 2018-10-01 19:36:56 +0200
commit: f07e9d15916ac14721c9819c6d245a28f685f42b (patch)
tree: 20bf5c45117afcfeacf0e4f66d19ccf68c9e3d4f /utf8.c
parent: 1b52c4c75422742f04363048dda64999adb8402c (diff)
1 files changed, 437 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..1507660
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,437 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <langinfo.h>
+#include "xmalloc.h"
+#include "utf8.h"
+
+/* ======================================================================== */
+/* Unicode (UTF-8) ascii representation interpreter.                        */
+/* The string passed will be altered but will not move in memory            */
+/* All sequence of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will be replace by */
+/* the corresponding UTF-8 character.                                       */
+/* ======================================================================== */
+void
+utf8_interpret(char * s, langinfo_t * langinfo)
+{
+  char * utf8_str;          /* \uxx...                                        */
+  size_t utf8_to_eos_len;   /* bytes in s starting from the first             *
+                             * occurrence of \u                               */
+  size_t init_len;          /* initial lengths of the string to interpret     */
+  size_t utf8_ascii_len;    /* 2,4,6 or 8 bytes                               */
+  size_t len_to_remove = 0; /* number of bytes to remove after the conversion */
+  char   tmp[9];            /* temporary string                               */
+
+  /* Guard against the case where s is NULL */
+  /* """""""""""""""""""""""""""""""""""""" */
+  if (s == NULL)
+    return;
+
+  init_len = strlen(s);
+
+  while ((utf8_str = strstr(s, "\\u")) != NULL)
+  {
+    utf8_to_eos_len = strlen(utf8_str);
+    if (utf8_to_eos_len < 4) /* string too short to contain *
+                              * a valid UTF-8 char          */
+    {
+      *utf8_str       = '.';
+      *(utf8_str + 1) = '\0';
+    }
+    else /* s is long enough */
+    {
+      unsigned byte;
+      char *   utf8_seq_offset = utf8_str + 2;
+
+      /* Get the first 2 utf8 bytes */
+      *tmp       = *utf8_seq_offset;
+      *(tmp + 1) = *(utf8_seq_offset + 1);
+      *(tmp + 2) = '\0';
+
+      /* If they are invalid, replace the \u sequence by a dot */
+      /* """"""""""""""""""""""""""""""""""""""""""""""""""""" */
+      if (!isxdigit(tmp[0]) || !isxdigit(tmp[1]))
+      {
+        *utf8_str = '.';
+        if (4 >= utf8_to_eos_len)
+          *(utf8_str + 1) = '\0';
+        else
+          memmove(utf8_str, utf8_str + 4, utf8_to_eos_len - 4);
+        return;
+      }
+      else
+      {
+        /* They are valid, deduce from them the length of the sequence */
+        /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+        sscanf(tmp, "%2x", &byte);
+        utf8_ascii_len = utf8_get_length(byte) * 2;
+
+        /* Check again if the inputs string is long enough */
+        /* """"""""""""""""""""""""""""""""""""""""""""""" */
+        if (utf8_to_eos_len - 2 < utf8_ascii_len)
+        {
+          *utf8_str       = '.';
+          *(utf8_str + 1) = '\0';
+        }
+        else
+        {
+          /* replace the \u sequence by the bytes forming the UTF-8 char */
+          /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+          size_t i;
+          *tmp = byte;
+
+          /* Put the bytes in the tmp string */
+          /* ''''''''''''''''''''''''''''''' */
+          if (langinfo->utf8)
+          {
+            for (i = 1; i < utf8_ascii_len / 2; i++)
+            {
+              sscanf(utf8_seq_offset + 2 * i, "%2x", &byte);
+              *(tmp + i) = byte;
+            }
+            tmp[utf8_ascii_len / 2] = '\0';
+          }
+
+          /* Does they form a valid UTF-8 char? */
+          /* '''''''''''''''''''''''''''''''''' */
+          if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2))
+          {
+            /* Put them back in the original string and move */
+            /* the remaining bytes after them                */
+            /* ''''''''''''''''''''''''''''''''''''''''''''' */
+            memmove(utf8_str, tmp, utf8_ascii_len / 2);
+
+            if (utf8_to_eos_len < utf8_ascii_len)
+              *(utf8_str + utf8_ascii_len / 2 + 1) = '\0';
+            else
+              memmove(utf8_str + utf8_ascii_len / 2,
+                      utf8_seq_offset + utf8_ascii_len,
+                      utf8_to_eos_len - utf8_ascii_len - 2 + 1);
+          }
+          else
+          {
+            /* The invalid sequence is replaced by a dot */
+            /* ''''''''''''''''''''''''''''''''''''''''' */
+            *utf8_str = '.';
+            if (utf8_to_eos_len < utf8_ascii_len)
+              *(utf8_str + 1) = '\0';
+            else
+              memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len,
+                      utf8_to_eos_len - utf8_ascii_len - 2 + 1);
+            utf8_ascii_len = 2;
+          }
+        }
+
+        /* Update the number of bytes to remove at the end */
+        /* of the initial string                           */
+        /* """"""""""""""""""""""""""""""""""""""""""""""" */
+        len_to_remove += 2 + utf8_ascii_len / 2;
+      }
+    }
+  }
+
+  /* Make sure that the string is well terminated */
+  /* """""""""""""""""""""""""""""""""""""""""""" */
+  *(s + init_len - len_to_remove) = '\0';
+
+  return;
+}
+
+/* ========================================================= */
+/* Decode the number of bytes taken by a character (UTF-8)   */
+/* It is the length of the leading sequence of bits set to 1 */
+/* (Count Leading Ones)                                      */
+/* ========================================================= */
+int
+utf8_get_length(unsigned char c)
+{
+  if (c >= 0xf0)
+    return 4;
+  else if (c >= 0xe0)
+    return 3;
+  else if (c >= 0xc2)
+    return 2;
+  else
+    return 1;
+}
+
+/* ================================================== */
+/* Return the byte offset of the nth UTF-8 glyph in s */
+/* ================================================== */
+size_t
+utf8_offset(char * s, size_t n)
+{
+  size_t i = 0;
+
+  while (n > 0)
+  {
+    if (s[i++] & 0x80)
+    {
+      (void)(((s[++i] & 0xc0) != 0x80) || ((s[++i] & 0xc0) != 0x80) || ++i);
+    }
+    n--;
+  }
+  return i;
+}
+
+/* ============================================== */
+/* Points to the previous UTF-8 glyph in a string */
+/* from the given position                        */
+/* ============================================== */
+char *
+utf8_prev(const char * str, const char * p)
+{
+  while ((*p & 0xc0) == 0x80)
+    p--;
+
+  for (--p; p >= str; --p)
+  {
+    if ((*p & 0xc0) != 0x80)
+      return (char *)p;
+  }
+  return NULL;
+}
+
+/* ========================================== */
+/* Points to the next UTF-8 glyph in a string */
+/* from the current position                  */
+/* ========================================== */
+char *
+utf8_next(char * p)
+{
+  if (*p)
+  {
+    for (++p; (*p & 0xc0) == 0x80; ++p)
+      ;
+  }
+  return (*p == '\0' ? NULL : p);
+}
+
+/* ============================================================ */
+/* Replace any UTF-8 glyph present in s by a dot in-place       */
+/* s will be modified but its address in memory will not change */
+/* ============================================================ */
+void
+utf8_sanitize(char * s)
+{
+  char * p = s;
+  int    n;
+  size_t len;
+
+  len = strlen(s);
+  while (*p)
+  {
+    n = utf8_get_length(*p);
+    if (n > 1)
+    {
+      *p = '.';
+      memmove(p + 1, p + n, len - (p - s) - n + 1);
+      len -= (n - 1);
+    }
+    p++;
+  }
+}
+
+static const char trailing_bytes_for_utf8[256] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
+};
+
+/* ================================================================== */
+/* UTF-8 validation routine inspired by Jeff Bezanson                 */
+/*   placed in the public domain Fall 2005                            */
+/*   (https://github.com/JeffBezanson/cutef8)                         */
+/*                                                                    */
+/* Returns 1 if str contains a valid UTF-8 byte sequence, 0 otherwise */
+/* ================================================================== */
+int
+utf8_validate(const char * str, size_t length)
+{
+  const unsigned char *p, *pend = (const unsigned char *)str + length;
+  unsigned char        c;
+  size_t               ab;
+
+  for (p = (const unsigned char *)str; p < pend; p++)
+  {
+    c = *p;
+    if (c < 128)
+      continue;
+    if ((c & 0xc0) != 0xc0)
+      return 0;
+    ab = trailing_bytes_for_utf8[c];
+    if (length < ab)
+      return 0;
+    length -= ab;
+
+    p++;
+    /* Check top bits in the second byte */
+    /* """"""""""""""""""""""""""""""""" */
+    if ((*p & 0xc0) != 0x80)
+      return 0;
+
+    /* Check for overlong sequences for each different length */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    switch (ab)
+    {
+      /* Check for xx00 000x */
+      /* """"""""""""""""""" */
+      case 1:
+        if ((c & 0x3e) == 0)
+          return 0;
+        continue; /* We know there aren't any more bytes to check */
+
+      /* Check for 1110 0000, xx0x xxxx */
+      /* """""""""""""""""""""""""""""" */
+      case 2:
+        if (c == 0xe0 && (*p & 0x20) == 0)
+          return 0;
+        break;
+
+      /* Check for 1111 0000, xx00 xxxx */
+      /* """""""""""""""""""""""""""""" */
+      case 3:
+        if (c == 0xf0 && (*p & 0x30) == 0)
+          return 0;
+        break;
+
+      /* Check for 1111 1000, xx00 0xxx */
+      /* """""""""""""""""""""""""""""" */
+      case 4:
+        if (c == 0xf8 && (*p & 0x38) == 0)
+          return 0;
+        break;
+
+      /* Check for leading 0xfe or 0xff,   */
+      /* and then for 1111 1100, xx00 00xx */
+      /* """"""""""""""""""""""""""""""""" */
+      case 5:
+        if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0))
+          return 0;
+        break;
+    }
+
+    /* Check for valid bytes after the 2nd, if any; all must start 10 */
+    /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+    while (--ab > 0)
+    {
+      if ((*(++p) & 0xc0) != 0x80)
+        return 0;
+    }
+  }
+
+  return 1;
+}
+
+/* ====================== */
+/* Multibyte UTF-8 strlen */
+/* ====================== */
+size_t
+utf8_strlen(char * str)
+{
+  size_t i = 0, j = 0;
+
+  while (str[i])
+  {
+    if ((str[i] & 0xc0) != 0x80)
+      j++;
+    i++;
+  }
+  return j;
+}
+
+/* =================================================================== */
+/* Multibytes extraction of the prefix of n UTF-8 glyphs from a string */
+/* The destination string d must have been allocated before.           */
+/* pos is updated to reflect the position AFTER the prefix.            */
+/* =================================================================== */
+char *
+utf8_strprefix(char * d, char * s, long n, long * pos)
+{
+  long i = 0;
+  long j = 0;
+
+  *pos = 0;
+
+  while (s[i] && j < n)
+  {
+    d[i] = s[i];
+    i++;
+    j++;
+    while (s[i] && (s[i] & 0xC0) == 0x80)
+    {
+      d[i] = s[i];
+      i++;
+    }
+  }
+
+  *pos = i;
+
+  d[i] = '\0';
+
+  return d;
+}
+
+/* ================================================ */
+/* Convert a UTF-8 glyph string to a wchar_t string */
+/* ================================================ */
+wchar_t *
+utf8_strtowcs(char * s)
+{
+  int             converted = 0;
+  unsigned char * ch;
+  wchar_t *       wptr, *w;
+  size_t          size;
+
+  size = (long)strlen(s);
+  w    = xmalloc((size + 1) * sizeof(wchar_t));
+  w[0] = L'\0';
+
+  wptr = w;
+  for (ch = (unsigned char *)s; *ch; ch += converted)
+  {
+    if ((converted = mbtowc(wptr, (char *)ch, 4)) > 0)
+      wptr++;
+    else
+    {
+      *wptr++   = (wchar_t)*ch;
+      converted = 1;
+    }
+  }
+
+  *wptr = L'\0';
+
+  return w;
+}
+
+/* ============================================================== */
+/* Fill dst whi a lowercase ocopy of src whar the character is an */
+/* ascci one. dsk must be preallocated before the call.           */
+/* ============================================================== */
+void
+utf8_strtolower(char * dst, char * src)
+{
+  unsigned char c;
+
+  while ((c = *src))
+  {
+    if (c >= 0x80)
+      *dst = c;
+    else
+      *dst = tolower(c);
+
+    src++;
+    dst++;
+  }
+  *dst = '\0';
+}
author	pgen <p.gen.progs@gmail.com>	2018-09-27 23:16:48 +0200
committer	pgen <p.gen.progs@gmail.com>	2018-10-01 19:36:56 +0200
commit	f07e9d15916ac14721c9819c6d245a28f685f42b (patch)
tree	20bf5c45117afcfeacf0e4f66d19ccf68c9e3d4f /utf8.c
parent	1b52c4c75422742f04363048dda64999adb8402c (diff)