Initial support for UTF-8 ext. grapheme clusters

author: pgen <p.gen.progs@gmail.com> 2024-02-18 00:14:57 +0100
committer: pgen <p.gen.progs@gmail.com> 2024-02-18 23:14:02 +0100
commit: 5c63131b72c214ea8e5f5505f49ae7fff6fab44b (patch)
tree: 6b05afdd1183fea636a150518c8e0d4f9ad1cb9a
parent: 58d6a889133d9b6820471bebed029edce6a03212 (diff)
13 files changed, 110 insertions, 20 deletions
diff --git a/README.rst b/README.rst
index e0445d1..5d5680f 100644
--- a/README.rst
+++ b/README.rst
@@ -30,8 +30,11 @@ previous contents of the terminal are neither modified nor lost.
 I've tried to make it as easy to use as possible.
 It should work on all terminals managed in the ``terminfo`` database.
 
-The ``UTF-8`` encoding is supported, including for large characters,
-with the exception of grapheme groups for the time being.
+``UTF-8`` encoding is supported, including double-width characters.
+Support for extended grapheme clusters is experimental but seems to work,
+with best results on terminals that render them correctly, such as
+wezterm or iTerm.
+
 The encoding of ``UTF-8`` glyphs must also be in canonical form, as no
 effort will be made to put them in this form.
 
diff --git a/smenu.1 b/smenu.1
index fe06b46..aa74110 100755
--- a/smenu.1
+++ b/smenu.1
@@ -245,6 +245,12 @@ This can block sole shifting operations.
 With many terminal emulators, it is possible to use the mouse to interact
 with the screen content.
 
+\fBWarning\fP: if extended grapheme clusters are present in the input,
+mouse-based selection is only accurate if the terminal displays these
+clusters correctly.
+For example, xterm does not display them correctly, whereas wezterm
+does.
+
 When the mouse is supported, the cursor can turn into an arrow (but
 not always) and the mouse can then be used as a point and click device
 as follows:
diff --git a/smenu.c b/smenu.c
index e09a0ae..a8bec6d 100755
--- a/smenu.c
+++ b/smenu.c
@@ -4025,7 +4025,7 @@ get_message_lines(char *message,
 
     /* If needed, update the message maximum width. */
     /* """""""""""""""""""""""""""""""""""""""""""" */
-    n = wcswidth((w = utf8_strtowcs(str)), utf8_strlen(str));
+    n = my_wcswidth((w = utf8_strtowcs(str)), utf8_strlen(str));
     free(w);
 
     if (n > *message_max_width)
@@ -4046,7 +4046,7 @@ get_message_lines(char *message,
   {
     ll_append(message_lines_list, xstrdup(ptr));
 
-    n = wcswidth((w = utf8_strtowcs(ptr)), utf8_strlen(ptr));
+    n = my_wcswidth((w = utf8_strtowcs(ptr)), utf8_strlen(ptr));
     free(w);
 
     if (n > *message_max_width)
@@ -4134,7 +4134,7 @@ build_metadata(term_t *term, long count, win_t *win)
     /*       has already been utf8_validated/repaired.             */
     /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
     word_len   = mbstowcs(NULL, word_a[i].str, 0);
-    word_width = wcswidth((w = utf8_strtowcs(word_a[i].str)), word_len);
+    word_width = my_wcswidth((w = utf8_strtowcs(word_a[i].str)), word_len);
 
     /* Manage the case where the word is larger than the terminal width. */
     /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
@@ -4144,7 +4144,7 @@ build_metadata(term_t *term, long count, win_t *win)
       /* """"""""""""""""""""""""""""""" */
       do
       {
-        word_width = wcswidth(w, word_len--);
+        word_width = my_wcswidth(w, word_len--);
       } while (word_len > 0 && word_width >= term->ncolumns - 2);
     }
     free(w);
@@ -4718,9 +4718,9 @@ disp_message(ll_t       *message_lines_list,
 
     /* Adjust size and len if the terminal is not large enough. */
     /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
-    size = wcswidth(w, len);
+    size = my_wcswidth(w, len);
     while (len > 0 && size > term->ncolumns)
-      size = wcswidth(w, --len);
+      size = my_wcswidth(w, --len);
 
     free(w);
 
@@ -7052,7 +7052,7 @@ gutter_action(char  *ctx_name,
       win->gutter_a[i] = xcalloc(1, mblength + 1);
       memcpy(win->gutter_a[i], gutter + offset, mblength);
 
-      n = wcswidth((w = utf8_strtowcs(win->gutter_a[i])), 1);
+      n = my_wcswidth((w = utf8_strtowcs(win->gutter_a[i])), 1);
       free(w);
 
       if (n > 1)
@@ -7798,7 +7798,7 @@ da_options_action(char  *ctx_name,
           ctxopt_ctx_disp_usage(ctx_name, exit_after);
         }
 
-        n = wcswidth((w = utf8_strtowcs(daccess.left)), 1);
+        n = my_wcswidth((w = utf8_strtowcs(daccess.left)), 1);
         free(w);
 
         if (n > 1)
@@ -7823,7 +7823,7 @@ da_options_action(char  *ctx_name,
           ctxopt_ctx_disp_usage(ctx_name, exit_after);
         }
 
-        n = wcswidth((w = utf8_strtowcs(daccess.right)), 1);
+        n = my_wcswidth((w = utf8_strtowcs(daccess.right)), 1);
         free(w);
 
         if (n > 1)
@@ -7976,7 +7976,7 @@ da_options_action(char  *ctx_name,
           ctxopt_ctx_disp_usage(ctx_name, exit_after);
         }
 
-        n = wcswidth((w = utf8_strtowcs(daccess.num_sep)), 1);
+        n = my_wcswidth((w = utf8_strtowcs(daccess.num_sep)), 1);
         free(w);
 
         if (n > 1)
@@ -11817,7 +11817,7 @@ main(int argc, char *argv[])
       }
 
       s = (long)mbstowcs(NULL, word->str, 0);
-      s = wcswidth((tmpw = utf8_strtowcs(word->str)), s);
+      s = my_wcswidth((tmpw = utf8_strtowcs(word->str)), s);
       free(tmpw);
 
       if (s > col_max_size[col_index])
@@ -11845,7 +11845,7 @@ main(int argc, char *argv[])
       /* """"""""""""""""""""""""""""" */
       size = (long)mbstowcs(NULL, word->str, 0);
 
-      if ((size = wcswidth((tmpw = utf8_strtowcs(word->str)), size))
+      if ((size = my_wcswidth((tmpw = utf8_strtowcs(word->str)), size))
           > tab_max_size)
         tab_max_size = size;
 
@@ -12135,7 +12135,7 @@ main(int argc, char *argv[])
 
       s1         = (long)strlen(word_a[wi].str);
       word_width = mbstowcs(NULL, word_a[wi].str, 0);
-      s2         = wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width);
+      s2         = my_wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width);
       free(w);
 
       /* Use the 0x05 character as a placeholder to preserve the internal    */
@@ -12179,7 +12179,7 @@ main(int argc, char *argv[])
 
       s1         = (long)strlen(word_a[wi].str);
       word_width = mbstowcs(NULL, word_a[wi].str, 0);
-      s2         = wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width);
+      s2         = my_wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width);
       free(w);
       temp = xcalloc(1, tab_real_max_size + s1 - s2 + 1);
       memset(temp, ' ', tab_max_size + s1 - s2);
@@ -14153,7 +14153,7 @@ main(int argc, char *argv[])
                 str = ((output_t *)(node->data))->output_str;
 
                 fprintf(old_stdout, "%s", str);
-                width += wcswidth((w = utf8_strtowcs(str)), 65535);
+                width += my_wcswidth((w = utf8_strtowcs(str)), 65535);
                 free(w);
                 free(str);
                 free(node->data);
@@ -14161,7 +14161,7 @@ main(int argc, char *argv[])
                 if (win.sel_sep != NULL)
                 {
                   fprintf(old_stdout, "%s", win.sel_sep);
-                  width += wcswidth((w = utf8_strtowcs(win.sel_sep)), 65535);
+                  width += my_wcswidth((w = utf8_strtowcs(win.sel_sep)), 65535);
                   free(w);
                 }
                 else
@@ -14175,7 +14175,7 @@ main(int argc, char *argv[])
 
               str = ((output_t *)(node->data))->output_str;
               fprintf(old_stdout, "%s", str);
-              width += wcswidth((w = utf8_strtowcs(str)), 65535);
+              width += my_wcswidth((w = utf8_strtowcs(str)), 65535);
               free(w);
               free(str);
               free(node->data);
@@ -14216,7 +14216,7 @@ main(int argc, char *argv[])
                 rtrim(output_str, " \t", 0);
               }
 
-              width = wcswidth((w = utf8_strtowcs(output_str)), 65535);
+              width = my_wcswidth((w = utf8_strtowcs(output_str)), 65535);
               free(w);
 
               /* And print it. */
diff --git a/tests/utf8/data7 b/tests/utf8/data7
new file mode 100644
index 0000000..aa81599
--- /dev/null
+++ b/tests/utf8/data7
@@ -0,0 +1,3 @@
+x \U01F1FA\U01F1F8 x
+\uf09f87ab\uf09f87b7 x \uf09f87ab\uf09f87b7
+x \U01F1FA\U01F1F8 x
diff --git a/tests/utf8/data8 b/tests/utf8/data8
new file mode 100644
index 0000000..7fc45b5
--- /dev/null
+++ b/tests/utf8/data8
@@ -0,0 +1,2 @@
+x  y \ue29da4\uefb88f\ue2808d\uf09f94a5 x y
+\uf09f91a8\ue2808d\uf09f91a9\ue2808d\uf09f91a7 x y 👩\U00200d💻
diff --git a/tests/utf8/t0009.good b/tests/utf8/t0009.good
new file mode 100644
index 0000000..10598e9
--- /dev/null
+++ b/tests/utf8/t0009.good
@@ -0,0 +1,15 @@
+$ OUT=$(smenu -c t0009.in)
+
+x  🇺🇸 x  
+
+🇫🇷 x  🇫🇷 
+
+x  🇺🇸 x  
+6:07 7:07 
+$ 
+
+$ echo ":$OUT:"
+
+:x:
+
+$ exit 0
diff --git a/tests/utf8/t0009.in b/tests/utf8/t0009.in
new file mode 120000
index 0000000..592b7e2
--- /dev/null
+++ b/tests/utf8/t0009.in
@@ -0,0 +1 @@
+data7
+\ No newline at end of file
diff --git a/tests/utf8/t0009.tst b/tests/utf8/t0009.tst
new file mode 100644
index 0000000..9063d3a
--- /dev/null
+++ b/tests/utf8/t0009.tst
@@ -0,0 +1,4 @@
+\S[300]\s[80]OUT=$(smenu -c t0009.in)
+\S[300]\s[200]ljjl\r
+\S[300]\s[80]echo ":$\s[80]OUT:"
+exit 0
diff --git a/tests/utf8/t0010.good b/tests/utf8/t0010.good
new file mode 100644
index 0000000..3f58828
--- /dev/null
+++ b/tests/utf8/t0010.good
@@ -0,0 +1,13 @@
+$ OUT=$(smenu -c t0010.in)
+
+x  y ❤️‍🔥 x  y 
+
+👨‍👩‍👧 x y  👩‍💻 
+11:07 12:07 13:07 
+$ 
+
+$ echo ":$OUT:"
+
+:👩‍💻:
+
+$ exit 0
diff --git a/tests/utf8/t0010.in b/tests/utf8/t0010.in
new file mode 120000
index 0000000..3ce4e61
--- /dev/null
+++ b/tests/utf8/t0010.in
@@ -0,0 +1 @@
+data8
+\ No newline at end of file
diff --git a/tests/utf8/t0010.tst b/tests/utf8/t0010.tst
new file mode 100644
index 0000000..b66f6b9
--- /dev/null
+++ b/tests/utf8/t0010.tst
@@ -0,0 +1,4 @@
+\S[300]\s[80]OUT=$(smenu -c t0010.in)
+\S[300]\s[200]lljhhlll\r
+\S[300]\s[80]echo ":$\s[80]OUT:"
+exit 0
diff --git a/utils.c b/utils.c
index 010d16a..59adb75 100644
--- a/utils.c
+++ b/utils.c
@@ -20,6 +20,7 @@
 #include <stdarg.h>
 #include <wctype.h>
 #include "xmalloc.h"
+#include "wchar.h"
 #include "list.h"
 #include "utf8.h"
 #include "utils.h"
@@ -467,3 +468,37 @@ hexdump(const char *buf, FILE *fp, const char *prefix, size_t size)
     fprintf(fp, "\n");
   }
 }
+
+/* ===================================================================== */
+/* Version of wcswidth which tries to support extended grapheme clusters */
+/* by taking zero-width characters into account.                         */
+/* ===================================================================== */
+int
+my_wcswidth(const wchar_t *s, size_t n)
+{
+  int len = 0; /* Accumulated width, returned to the caller.     */
+  int l   = 0; /* wcwidth() result for the current character.    */
+  int m   = 0; /* Number of wide characters examined so far.     */
+
+  if (s == NULL || *s == L'\0')
+    return 0; /* NULL or empty string: width is 0. */
+
+  while (*s && m < n) /* Stop at the terminating NUL or after n characters. */
+  {
+    if ((l = wcwidth(*s)) >= 0)
+    {
+      /* Skip ZWJ (U+200D) and glyphs followed by U+200D, U+FE0F or U+20E3. */
+      /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+      if (*s != L'\x200d' && *(s + 1) != L'\x200d' && *(s + 1) != L'\xfe0f'
+          && *(s + 1) != L'\x20e3')
+        len += l;
+    }
+    else
+      return -1; /* wcwidth returned -1. */
+
+    s++;
+    m++;
+  }
+
+  return len;
+}
diff --git a/utils.h b/utils.h
index 775bad1..c77d4c3 100644
--- a/utils.h
+++ b/utils.h
@@ -83,4 +83,7 @@ strprint(char const *s);
 void
 hexdump(const char *buf, FILE *fp, const char *prefix, size_t size);
 
+int
+my_wcswidth(const wchar_t *s, size_t n);
+
 #endif
author	pgen <p.gen.progs@gmail.com>	2024-02-18 00:14:57 +0100
committer	pgen <p.gen.progs@gmail.com>	2024-02-18 23:14:02 +0100
commit	5c63131b72c214ea8e5f5505f49ae7fff6fab44b (patch)
tree	6b05afdd1183fea636a150518c8e0d4f9ad1cb9a
parent	58d6a889133d9b6820471bebed029edce6a03212 (diff)