summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpgen <p.gen.progs@gmail.com>2020-07-12 01:59:11 +0200
committerpgen <p.gen.progs@gmail.com>2020-07-18 18:31:03 +0200
commitd9a12b47298503813878d55d7dd8d998088fca09 (patch)
tree69883a744d153fd696aa4544bcff7c938acde642
parentecb544da53f6c7cba0191f4d7c5764dcd6b864a2 (diff)
Further UTF-8 entry fixes
-rw-r--r--smenu.18
-rw-r--r--tests/utf8/data28
-rw-r--r--tests/utf8/t0002.good20
-rw-r--r--tests/utf8/t0002.tst4
-rw-r--r--utf8.c19
5 files changed, 36 insertions, 23 deletions
diff --git a/smenu.1 b/smenu.1
index b328c7b..0587a9b 100644
--- a/smenu.1
+++ b/smenu.1
@@ -128,15 +128,15 @@ There are nevertheless a possibilities to change this substitution
character with another \fBASCII\fP printable one with the help of the
command line option \fB-.\fP|\fB-dot\fP|\fB-invalid\fP.
.PP
+\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted
+into the substitution character when the user locale is not \fBUTF-8\fP
+aware, like \fBPOSIX\fP or \fBC\fP for example.
+.PP
Words containing only spaces, entered directly or resulting from a
substitution, are also rejected unless they are not selectable.
This allows special effects like creating blank lines for example.
These words are also kept in column mode, selectable or not.
.PP
-\fBWarning\fP, \fBUTF-8\fP encoded codepoints are quietly converted
-into the substitution character when the user locale is not \fBUTF-8\fP
-aware like \fBPOSIX\fP or \fBC\fP by example.
-.PP
smenu has an option to define a set of characters or UTF-8 sequences
which should be ignored when reading words.
This can be very useful when dealing with inputs where the EOL sequence
diff --git a/tests/utf8/data2 b/tests/utf8/data2
index 843db68..be1d7d9 100644
--- a/tests/utf8/data2
+++ b/tests/utf8/data2
@@ -1,5 +1,7 @@
*\u3* *\u0a* *\u34* *\u345* *\u3456* *\uc3*4*
-*\u45* *\u451* *\u4512* *\u45123*
-*\u45\u46* *\u451\u46*
+*\u45* *\u451* *\u4512* *\u45123*
+*\u45\u46* *\u451\u46*
*\uc3\u45* *\uc3a\u45*
-*\uefb899\uf0908589*
+*\uefb899ø\uf0908589* *ø\uc3a9\uc3aaabø*
+*\uc3a9*\uf0* *\uc3a9*\uf0aa* *\uc3a9*\uf0aaab*
+*\uc3a9*\uF09D849E* *\uC3A9*\uF09D84aa*
diff --git a/tests/utf8/t0002.good b/tests/utf8/t0002.good
index e4cff28..ee21714 100644
--- a/tests/utf8/t0002.good
+++ b/tests/utf8/t0002.good
@@ -1,19 +1,23 @@
-$ OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in)
+$ OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in)
-*. *\n* *4* *45* *456* *.*
-0:07 1:07 2:07 3:07 4:07
-*E* *E1* *E12* *E123*
+*. *\n* *4* *45* *456* *.*4*
-*EF* *E1F*
+*E* *E1* *E12* *E123*
-*.45* *.u45*
+*EF* *E1F*
-*︙𐅉*
+*.E* *.aE*
+*︙ø𐅉* *øéêabø*
+
+*é*.* *é*.* *é*.*
+
+*é*𝄞* *é*𝄪*
+0:07 1:07 2:07 3:07 4:07 5:07
$
$ echo ":$OUT:"
-:*.:
+:*é*𝄞*:
$ exit 0
diff --git a/tests/utf8/t0002.tst b/tests/utf8/t0002.tst
index 6949f77..a7fbab8 100644
--- a/tests/utf8/t0002.tst
+++ b/tests/utf8/t0002.tst
@@ -1,4 +1,4 @@
-\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -c t0002.in)
-\S[150]\s[150]\r
+\S[150]\s[10]OUT=$(LC_ALL=en_US.UTF-8 smenu -n 7 -c t0002.in)
+\S[200]\s[200]jjjjjj\r
\S[150]\s[10]echo ":$\s[10]OUT:"
exit 0
diff --git a/utf8.c b/utf8.c
index 504f9c9..406bea6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -21,7 +21,9 @@
/* Unicode (UTF-8) ascii representation interpreter. */
/* The string passed will be altered but its address will not change. */
/* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */
-/* be replace by the corresponding UTF-8 character. */
+/* be replaced by the corresponding UTF-8 character when possible. */
+/* When not possible the substitution character is substituted in place. */
+/* Returns 0 if the conversion has failed else 1. */
/* ======================================================================= */
int
utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
@@ -83,6 +85,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
{
int n;
size_t i;
+ char b[2] = { ' ', ' ' };
/* They are valid, deduce from them the length of the sequence */
/* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
@@ -93,14 +96,16 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
/* replace the \u sequence by the bytes forming the UTF-8 char */
/* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
- *tmp = byte;
-
/* Put the bytes in the tmp string */
/* ''''''''''''''''''''''''''''''' */
+ *tmp = byte; /* Reuse the tmp array. */
+
for (i = 1; i < utf8_ascii_len / 2; i++)
{
- n = sscanf(utf8_seq_offset + 2 * i, "%2x", &byte);
- if (n == 0 || (byte & 0xc0) != 0x80)
+ n = sscanf(utf8_seq_offset + 2 * i, "%c%c", &b[0], &b[1]);
+ sscanf(b, "%x", &byte);
+
+ if (n < 2 || (byte & 0xc0) != 0x80)
utf8_ascii_len = 2 * i; /* Force the new length according to the *
| number of valid UTF-8 bytes read. */
else
@@ -110,7 +115,7 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
/* Does they form a valid UTF-8 char? */
/* '''''''''''''''''''''''''''''''''' */
- if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2))
+ if (utf8_validate(tmp, utf8_ascii_len / 2))
{
/* Put them back in the original string and move */
/* the remaining bytes after them */
@@ -130,11 +135,13 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute)
/* substitution character. */
/* ''''''''''''''''''''''''''''''''''''' */
*utf8_str = substitute;
+
if (utf8_to_eos_len < utf8_ascii_len)
*(utf8_str + 1) = '\0';
else
memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len,
utf8_to_eos_len - utf8_ascii_len - 2 + 1);
+
utf8_ascii_len = 2;
rc = 0;
}