Further UTF-8 entry fixes

author: pgen <p.gen.progs@gmail.com> 2020-07-12 01:59:11 +0200
committer: pgen <p.gen.progs@gmail.com> 2020-07-18 18:31:03 +0200
commit: d9a12b47298503813878d55d7dd8d998088fca09 (patch)
tree: 69883a744d153fd696aa4544bcff7c938acde642
parent: ecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (diff)
5 files changed, 36 insertions, 23 deletions
diff --git a/smenu.1 b/smenu.1
index b328c7b..0587a9b 100644
--- a/smenu.1
+++ b/smenu.1
@@ -128,15 +128,15 @@ There are nevertheless a possibilities to change this substitution
 character with another \fBASCII\fP printable one with the help of the
 command line option \fB-.\fP|\fB-dot\fP|\fB-invalid\fP.
 .PP
+\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted
+into the substitution character when the user locale is not \fBUTF-8\fP
+aware, like \fBPOSIX\fP or \fBC\fP for example.
+.PP
 Words containing only spaces, entered directly or resulting from a
 substitution, are also rejected unless they are not selectable.
 This allows special effects like creating blank lines for example.
 These words are also kept in column mode, selectable or not.
 .PP
-\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted
-into the substitution character when the user locale is not \fBUTF-8\fP
-aware like \fBPOSIX\fP or \fBC\fP by example.
-.PP
 smenu has an option to define a set of characters or UTF-8 sequences
 which should be ignored when reading words.
 This can be very useful when dealing with inputs where the EOL sequence
diff --git a/tests/utf8/data2 b/tests/utf8/data2
index 843db68..be1d7d9 100644
--- a/tests/utf8/data2
+++ b/tests/utf8/data2
@@ -1,5 +1,7 @@
 *\u3* *\u0a* *\u34* *\u345* *\u3456* *\uc3*4*
-*\u45* *\u451* *\u4512* *\u45123* 
-*\u45\u46* *\u451\u46* 
+*\u45* *\u451* *\u4512* *\u45123*
+*\u45\u46* *\u451\u46*
 *\uc3\u45* *\uc3a\u45*
-*\uefb899\uf0908589*
+*\uefb899ø\uf0908589* *ø\uc3a9\uc3aaabø*
+*\uc3a9*\uf0* *\uc3a9*\uf0aa* *\uc3a9*\uf0aaab*
+*\uc3a9*\uF09D849E* *\uC3A9*\uF09D84aa* 
diff --git a/tests/utf8/t0002.good b/tests/utf8/t0002.good
index e4cff28..ee21714 100644
--- a/tests/utf8/t0002.good
+++ b/tests/utf8/t0002.good
@@ -1,19 +1,23 @@
-$ OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in)
+$ OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in)
 
-*.    *\n*   *4*   *45*   *456* *.* 
-0:07 1:07 2:07 3:07 4:07 
-*E*   *E1*   *E12* *E123* 
+*.     *\n*     *4*   *45*   *456* *.*4* 
 
-*EF*  *E1F*  
+*E*    *E1*     *E12* *E123* 
 
-*.45* *.u45* 
+*EF*   *E1F*    
 
-*︙𐅉* 
+*.E*   *.aE*    
 
+*︙ø𐅉* *øéêabø* 
+
+*é*.*  *é*.*    *é*.* 
+
+*é*𝄞*  *é*𝄪*    
+0:07 1:07 2:07 3:07 4:07 5:07 
 $ 
 
 $ echo ":$OUT:"
 
-:*.:
+:*é*𝄞*:
 
 $ exit 0
diff --git a/tests/utf8/t0002.tst b/tests/utf8/t0002.tst
index 6949f77..a7fbab8 100644
--- a/tests/utf8/t0002.tst
+++ b/tests/utf8/t0002.tst
@@ -1,4 +1,4 @@
-\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in)
-\S[150]\s[150]\r
+\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in)
+\S[200]\s[200]jjjjjj\r
 \S[150]\s[10]echo ":$\s[10]OUT:"
 exit 0
diff --git a/utf8.c b/utf8.c
index 504f9c9..406bea6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -21,7 +21,9 @@
 /* Unicode (UTF-8) ascii representation interpreter.                       */
 /* The string passed will be altered but its address will not change.      */
 /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */
-/* be replace by the corresponding UTF-8 character.                        */
+/* be replaced by the corresponding UTF-8 character when possible.         */
+/* When not possible the substitution character is substituted in place.   */
+/* Returns 0 if the conversion has failed, else 1.                         */
 /* ======================================================================= */
 int
 utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
@@ -83,6 +85,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
       {
         int    n;
         size_t i;
+        char   b[2] = { ' ', ' ' };
 
         /* They are valid, deduce from them the length of the sequence */
         /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
@@ -93,14 +96,16 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
         /* replace the \u sequence by the bytes forming the UTF-8 char */
         /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
 
-        *tmp = byte;
-
         /* Put the bytes in the tmp string */
         /* ''''''''''''''''''''''''''''''' */
+        *tmp = byte; /* Reuse the tmp array. */
+
         for (i = 1; i < utf8_ascii_len / 2; i++)
         {
-          n = sscanf(utf8_seq_offset + 2 * i, "%2x", &byte);
-          if (n == 0 || (byte & 0xc0) != 0x80)
+          n = sscanf(utf8_seq_offset + 2 * i, "%c%c", &b[0], &b[1]);
+          sscanf(b, "%x", &byte);
+
+          if (n < 2 || (byte & 0xc0) != 0x80)
             utf8_ascii_len = 2 * i; /* Force the new length according to the *
                                      | number of valid UTF-8 bytes read.     */
           else
@@ -110,7 +115,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
 
         /* Does they form a valid UTF-8 char? */
         /* '''''''''''''''''''''''''''''''''' */
-        if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2))
+        if (utf8_validate(tmp, utf8_ascii_len / 2))
         {
           /* Put them back in the original string and move */
           /* the remaining bytes after them                */
@@ -130,11 +135,13 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
           /* substitution character.               */
           /* ''''''''''''''''''''''''''''''''''''' */
           *utf8_str = substitute;
+
           if (utf8_to_eos_len < utf8_ascii_len)
             *(utf8_str + 1) = '\0';
           else
             memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len,
                     utf8_to_eos_len - utf8_ascii_len - 2 + 1);
+
           utf8_ascii_len = 2;
           rc             = 0;
         }
author	pgen <p.gen.progs@gmail.com>	2020-07-12 01:59:11 +0200
committer	pgen <p.gen.progs@gmail.com>	2020-07-18 18:31:03 +0200
commit	d9a12b47298503813878d55d7dd8d998088fca09 (patch)
tree	69883a744d153fd696aa4544bcff7c938acde642
parent	ecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (diff)