Allow to change the substitution character

A pointer to the misc structure has to be passed to a certain number of functions for that, but this prepares the ground for future developments of this type. The code to interpret \u (UTF-8) sequences has also been made more robust for better manage incorrect entries.
author: pgen <p.gen.progs@gmail.com> 2020-07-12 00:11:00 +0200
committer: pgen <p.gen.progs@gmail.com> 2020-07-18 18:31:03 +0200
commit: ecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (patch)
tree: 1479a4ba9bc5df3bd0c56b23ae696804aad0e07e /utf8.c
parent: 01f9896fb17ea1cb7228dcd13b655f3812a72919 (diff)
1 files changed, 80 insertions, 74 deletions
diff --git a/utf8.c b/utf8.c
index bf007b2..504f9c9 100644
--- a/utf8.c
+++ b/utf8.c
@@ -23,21 +23,23 @@
 /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */
 /* be replace by the corresponding UTF-8 character.                        */
 /* ======================================================================= */
-void
-utf8_interpret(char * s, langinfo_t * langinfo)
+int
+utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
 {
-  char * utf8_str;          /* \uxx...                                        */
-  size_t utf8_to_eos_len;   /* bytes in s starting from the first             *
-                             * occurrence of \u                               */
-  size_t init_len;          /* initial lengths of the string to interpret     */
-  size_t utf8_ascii_len;    /* 2,4,6 or 8 bytes                               */
-  size_t len_to_remove = 0; /* number of bytes to remove after the conversion */
-  char   tmp[9];            /* temporary string                               */
-
-  /* Guard against the case where s is NULL */
-  /* """""""""""""""""""""""""""""""""""""" */
+  char * utf8_str;          /* \uxx...                                     */
+  size_t utf8_to_eos_len;   /* bytes in s starting from the first          *
+                             * occurrence of \u.                           */
+  size_t init_len;          /* initial lengths of the string to interpret  */
+  size_t utf8_ascii_len;    /* 2,4,6 or 8 bytes.                           */
+  size_t len_to_remove = 0; /* number of bytes to remove after the         *
+                             | conversion.                                 */
+  char tmp[9];              /* temporary string.                           */
+  int  rc = 1;              /* return code, 0: error, 1: fine.             */
+
+  /* Guard against the case where s is NULL. */
+  /* """"""""""""""""""""""""""""""""""""""" */
   if (s == NULL)
-    return;
+    return 0;
 
   init_len = strlen(s);
 
@@ -45,12 +47,13 @@ utf8_interpret(char * s, langinfo_t * langinfo)
   {
     utf8_to_eos_len = strlen(utf8_str);
     if (utf8_to_eos_len < 4) /* string too short to contain *
-                              * a valid UTF-8 char          */
+                              | a valid UTF-8 char.         */
     {
-      *utf8_str       = '.';
+      *utf8_str       = substitute;
       *(utf8_str + 1) = '\0';
+      rc              = 0;
     }
-    else /* s is long enough */
+    else /* s is long enough. */
     {
       unsigned byte;
       char *   utf8_seq_offset = utf8_str + 2;
@@ -60,78 +63,80 @@ utf8_interpret(char * s, langinfo_t * langinfo)
       *(tmp + 1) = *(utf8_seq_offset + 1);
       *(tmp + 2) = '\0';
 
-      /* If they are invalid, replace the \u sequence by a dot */
-      /* """"""""""""""""""""""""""""""""""""""""""""""""""""" */
+      /* If they are invalid, replace the \u sequence by the */
+      /* substitute character.                               */
+      /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
       if (!isxdigit(tmp[0]) || !isxdigit(tmp[1]))
       {
-        *utf8_str = '.';
+        *utf8_str = substitute;
         if (4 >= utf8_to_eos_len)
           *(utf8_str + 1) = '\0';
         else
-          memmove(utf8_str, utf8_str + 4, utf8_to_eos_len - 4);
-        return;
+        {
+          /* Do not forget the training \0 */
+          /* ''''''''''''''''''''''''''''' */
+          memmove(utf8_str + 1, utf8_str + 4, utf8_to_eos_len - 4 + 1);
+        }
+        rc = 0;
       }
       else
       {
+        int    n;
+        size_t i;
+
         /* They are valid, deduce from them the length of the sequence */
         /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
         sscanf(tmp, "%2x", &byte);
+
         utf8_ascii_len = utf8_get_length(byte) * 2;
 
-        /* Check again if the inputs string is long enough */
-        /* """"""""""""""""""""""""""""""""""""""""""""""" */
-        if (utf8_to_eos_len - 2 < utf8_ascii_len)
+        /* replace the \u sequence by the bytes forming the UTF-8 char */
+        /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+
+        *tmp = byte;
+
+        /* Put the bytes in the tmp string */
+        /* ''''''''''''''''''''''''''''''' */
+        for (i = 1; i < utf8_ascii_len / 2; i++)
         {
-          *utf8_str       = '.';
-          *(utf8_str + 1) = '\0';
+          n = sscanf(utf8_seq_offset + 2 * i, "%2x", &byte);
+          if (n == 0 || (byte & 0xc0) != 0x80)
+            utf8_ascii_len = 2 * i; /* Force the new length according to the *
+                                     | number of valid UTF-8 bytes read.     */
+          else
+            *(tmp + i) = byte;
+        }
+        tmp[utf8_ascii_len / 2] = '\0';
+
+        /* Does they form a valid UTF-8 char? */
+        /* '''''''''''''''''''''''''''''''''' */
+        if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2))
+        {
+          /* Put them back in the original string and move */
+          /* the remaining bytes after them                */
+          /* ''''''''''''''''''''''''''''''''''''''''''''' */
+          memmove(utf8_str, tmp, utf8_ascii_len / 2);
+
+          if (utf8_to_eos_len < utf8_ascii_len)
+            *(utf8_str + utf8_ascii_len / 2 + 1) = '\0';
+          else
+            memmove(utf8_str + utf8_ascii_len / 2,
+                    utf8_seq_offset + utf8_ascii_len,
+                    utf8_to_eos_len - utf8_ascii_len - 2 + 1);
         }
         else
         {
-          /* replace the \u sequence by the bytes forming the UTF-8 char */
-          /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
-          size_t i;
-          *tmp = byte;
-
-          /* Put the bytes in the tmp string */
-          /* ''''''''''''''''''''''''''''''' */
-          if (langinfo->utf8)
-          {
-            for (i = 1; i < utf8_ascii_len / 2; i++)
-            {
-              sscanf(utf8_seq_offset + 2 * i, "%2x", &byte);
-              *(tmp + i) = byte;
-            }
-            tmp[utf8_ascii_len / 2] = '\0';
-          }
-
-          /* Does they form a valid UTF-8 char? */
-          /* '''''''''''''''''''''''''''''''''' */
-          if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2))
-          {
-            /* Put them back in the original string and move */
-            /* the remaining bytes after them                */
-            /* ''''''''''''''''''''''''''''''''''''''''''''' */
-            memmove(utf8_str, tmp, utf8_ascii_len / 2);
-
-            if (utf8_to_eos_len < utf8_ascii_len)
-              *(utf8_str + utf8_ascii_len / 2 + 1) = '\0';
-            else
-              memmove(utf8_str + utf8_ascii_len / 2,
-                      utf8_seq_offset + utf8_ascii_len,
-                      utf8_to_eos_len - utf8_ascii_len - 2 + 1);
-          }
+          /* The invalid sequence is replaced by a */
+          /* substitution character.               */
+          /* ''''''''''''''''''''''''''''''''''''' */
+          *utf8_str = substitute;
+          if (utf8_to_eos_len < utf8_ascii_len)
+            *(utf8_str + 1) = '\0';
           else
-          {
-            /* The invalid sequence is replaced by a dot */
-            /* ''''''''''''''''''''''''''''''''''''''''' */
-            *utf8_str = '.';
-            if (utf8_to_eos_len < utf8_ascii_len)
-              *(utf8_str + 1) = '\0';
-            else
-              memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len,
-                      utf8_to_eos_len - utf8_ascii_len - 2 + 1);
-            utf8_ascii_len = 2;
-          }
+            memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len,
+                    utf8_to_eos_len - utf8_ascii_len - 2 + 1);
+          utf8_ascii_len = 2;
+          rc             = 0;
         }
 
         /* Update the number of bytes to remove at the end */
@@ -146,7 +151,7 @@ utf8_interpret(char * s, langinfo_t * langinfo)
   /* """""""""""""""""""""""""""""""""""""""""""" */
   *(s + init_len - len_to_remove) = '\0';
 
-  return;
+  return rc;
 }
 
 /* ========================================================= */
@@ -220,11 +225,12 @@ utf8_next(char * p)
 }
 
 /* ============================================================ */
-/* Replaces any UTF-8 glyph present in s by a dot in-place      */
+/* Replaces any UTF-8 glyph present in s by a substitution      */
+/* character in-place.                                          */
 /* s will be modified but its address in memory will not change */
 /* ============================================================ */
 void
-utf8_sanitize(char * s)
+utf8_sanitize(char * s, char substitute)
 {
   char * p = s;
   int    n;
@@ -236,7 +242,7 @@ utf8_sanitize(char * s)
     n = utf8_get_length(*p);
     if (n > 1)
     {
-      *p = '.';
+      *p = substitute;
       memmove(p + 1, p + n, len - (p - s) - n + 1);
       len -= (n - 1);
     }
author	pgen <p.gen.progs@gmail.com>	2020-07-12 00:11:00 +0200
committer	pgen <p.gen.progs@gmail.com>	2020-07-18 18:31:03 +0200
commit	ecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (patch)
tree	1479a4ba9bc5df3bd0c56b23ae696804aad0e07e /utf8.c
parent	01f9896fb17ea1cb7228dcd13b655f3812a72919 (diff)