diff options
author | pgen <p.gen.progs@gmail.com> | 2020-07-12 01:59:11 +0200 |
---|---|---|
committer | pgen <p.gen.progs@gmail.com> | 2020-07-18 18:31:03 +0200 |
commit | d9a12b47298503813878d55d7dd8d998088fca09 (patch) | |
tree | 69883a744d153fd696aa4544bcff7c938acde642 | |
parent | ecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (diff) |
Further UTF-8 entry fixes
-rw-r--r-- | smenu.1 | 8 | ||||
-rw-r--r-- | tests/utf8/data2 | 8 | ||||
-rw-r--r-- | tests/utf8/t0002.good | 20 | ||||
-rw-r--r-- | tests/utf8/t0002.tst | 4 | ||||
-rw-r--r-- | utf8.c | 19 |
5 files changed, 36 insertions, 23 deletions
@@ -128,15 +128,15 @@ There are nevertheless a possibilities to change this substitution character with another \fBASCII\fP printable one with the help of the command line option \fB-.\fP|\fB-dot\fP|\fB-invalid\fP. .PP +\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted +into the substitution character when the user locale is not \fBUTF-8\fP +aware like \fBPOSIX\fP or \fBC\fP by example. +.PP Words containing only spaces, entered directly or resulting from a substitution, are also rejected unless they are not selectable. This allows special effects like creating blank lines for example. These words are also kept in column mode, selectable or not. .PP -\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted -into the substitution character when the user locale is not \fBUTF-8\fP -aware like \fBPOSIX\fP or \fBC\fP by example. -.PP smenu has an option to define a set of characters or UTF-8 sequences which should be ignored when reading words. This can be very useful when dealing with inputs where the EOL sequence diff --git a/tests/utf8/data2 b/tests/utf8/data2 index 843db68..be1d7d9 100644 --- a/tests/utf8/data2 +++ b/tests/utf8/data2 @@ -1,5 +1,7 @@ *\u3* *\u0a* *\u34* *\u345* *\u3456* *\uc3*4* -*\u45* *\u451* *\u4512* *\u45123* -*\u45\u46* *\u451\u46* +*\u45* *\u451* *\u4512* *\u45123* +*\u45\u46* *\u451\u46* *\uc3\u45* *\uc3a\u45* -*\uefb899\uf0908589* +*\uefb899ø\uf0908589* *ø\uc3a9\uc3aaabø* +*\uc3a9*\uf0* *\uc3a9*\uf0aa* *\uc3a9*\uf0aaab* +*\uc3a9*\uF09D849E* *\uC3A9*\uF09D84aa* diff --git a/tests/utf8/t0002.good b/tests/utf8/t0002.good index e4cff28..ee21714 100644 --- a/tests/utf8/t0002.good +++ b/tests/utf8/t0002.good @@ -1,19 +1,23 @@ -$ OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in) +$ OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in) -*. *\n* *4* *45* *456* *.* -0:07 1:07 2:07 3:07 4:07 -*E* *E1* *E12* *E123* +*. 
*\n* *4* *45* *456* *.*4* -*EF* *E1F* +*E* *E1* *E12* *E123* -*.45* *.u45* +*EF* *E1F* -*︙𐅉* +*.E* *.aE* +*︙ø𐅉* *øéêabø* + +*é*.* *é*.* *é*.* + +*é*𝄞* *é*𝄪* +0:07 1:07 2:07 3:07 4:07 5:07 $ $ echo ":$OUT:" -:*.: +:*é*𝄞*: $ exit 0 diff --git a/tests/utf8/t0002.tst b/tests/utf8/t0002.tst index 6949f77..a7fbab8 100644 --- a/tests/utf8/t0002.tst +++ b/tests/utf8/t0002.tst @@ -1,4 +1,4 @@ -\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in) -\S[150]\s[150]\r +\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in) +\S[200]\s[200]jjjjjj\r \S[150]\s[10]echo ":$\s[10]OUT:" exit 0 @@ -21,7 +21,9 @@ /* Unicode (UTF-8) ascii representation interpreter. */ /* The string passed will be altered but its address will not change. */ /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */ -/* be replace by the corresponding UTF-8 character. */ +/* be replaced by the corresponding UTF-8 character when possible. */ +/* When not possible the substitution character is substituted in place. */ +/* Returns 0 if the conversion has failed, else 1. */ /* ======================================================================= */ int utf8_interpret(char * s, langinfo_t * langinfo, char substitute) @@ -83,6 +85,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute) { int n; size_t i; + char b[2] = { ' ', ' ' }; /* They are valid, deduce from them the length of the sequence */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ @@ -93,14 +96,16 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute) /* replace the \u sequence by the bytes forming the UTF-8 char */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ - *tmp = byte; - /* Put the bytes in the tmp string */ /* ''''''''''''''''''''''''''''''' */ + *tmp = byte; /* Reuse the tmp array. 
*/ + for (i = 1; i < utf8_ascii_len / 2; i++) { - n = sscanf(utf8_seq_offset + 2 * i, "%2x", &byte); - if (n == 0 || (byte & 0xc0) != 0x80) + n = sscanf(utf8_seq_offset + 2 * i, "%c%c", &b[0], &b[1]); + sscanf(b, "%x", &byte); + + if (n < 2 || (byte & 0xc0) != 0x80) utf8_ascii_len = 2 * i; /* Force the new length according to the * | number of valid UTF-8 bytes read. */ else @@ -110,7 +115,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute) /* Does they form a valid UTF-8 char? */ /* '''''''''''''''''''''''''''''''''' */ - if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2)) + if (utf8_validate(tmp, utf8_ascii_len / 2)) { /* Put them back in the original string and move */ /* the remaining bytes after them */ @@ -130,11 +135,13 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute) /* substitution character. */ /* ''''''''''''''''''''''''''''''''''''' */ *utf8_str = substitute; + if (utf8_to_eos_len < utf8_ascii_len) *(utf8_str + 1) = '\0'; else memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len, utf8_to_eos_len - utf8_ascii_len - 2 + 1); + utf8_ascii_len = 2; rc = 0; } |