From 5c63131b72c214ea8e5f5505f49ae7fff6fab44b Mon Sep 17 00:00:00 2001 From: pgen Date: Sun, 18 Feb 2024 00:14:57 +0100 Subject: Initial support for UTF-8 ext. grapheme clusters --- README.rst | 7 +++++-- smenu.1 | 6 ++++++ smenu.c | 36 ++++++++++++++++++------------------ tests/utf8/data7 | 3 +++ tests/utf8/data8 | 2 ++ tests/utf8/t0009.good | 15 +++++++++++++++ tests/utf8/t0009.in | 1 + tests/utf8/t0009.tst | 4 ++++ tests/utf8/t0010.good | 13 +++++++++++++ tests/utf8/t0010.in | 1 + tests/utf8/t0010.tst | 4 ++++ utils.c | 35 +++++++++++++++++++++++++++++++++++ utils.h | 3 +++ 13 files changed, 110 insertions(+), 20 deletions(-) create mode 100644 tests/utf8/data7 create mode 100644 tests/utf8/data8 create mode 100644 tests/utf8/t0009.good create mode 120000 tests/utf8/t0009.in create mode 100644 tests/utf8/t0009.tst create mode 100644 tests/utf8/t0010.good create mode 120000 tests/utf8/t0010.in create mode 100644 tests/utf8/t0010.tst diff --git a/README.rst b/README.rst index e0445d1..5d5680f 100644 --- a/README.rst +++ b/README.rst @@ -30,8 +30,11 @@ previous contents of the terminal are neither modified nor lost. I've tried to make it as easy to use as possible. It should work on all terminals managed in the ``terminfo`` database. -The ``UTF-8`` encoding is supported, including for large characters, -with the exception of grapheme groups for the time being. +``UTF-8`` encoding is supported, including for double-width characters. +Support for extended grapheme clusters is experimental but seems to work, +with best results when appropriate terminals are used such as wezterm +or iTerm. + The encoding of ``UTF-8`` glyphs must also be in canonical form, as no effort will be made to put them in this form. diff --git a/smenu.1 b/smenu.1 index fe06b46..aa74110 100755 --- a/smenu.1 +++ b/smenu.1 @@ -245,6 +245,12 @@ This can block sole shifting operations. With many terminal emulators, it is possible to use the mouse to interact with the screen content. 
+\fBWarning\fP, if groups of extended graphemes are present in the input, +mouse-based selection is only accurate if the terminal correctly displays +these graphemes. +An example of a non-functional terminal is xterm, an example of a +functional terminal is wezterm. + When the mouse is supported, the cursor can turn into an arrow (but not always) and the mouse can then be used as a point and click device as follows: diff --git a/smenu.c b/smenu.c index e09a0ae..a8bec6d 100755 --- a/smenu.c +++ b/smenu.c @@ -4025,7 +4025,7 @@ get_message_lines(char *message, /* If needed, update the message maximum width. */ /* """""""""""""""""""""""""""""""""""""""""""" */ - n = wcswidth((w = utf8_strtowcs(str)), utf8_strlen(str)); + n = my_wcswidth((w = utf8_strtowcs(str)), utf8_strlen(str)); free(w); if (n > *message_max_width) @@ -4046,7 +4046,7 @@ get_message_lines(char *message, { ll_append(message_lines_list, xstrdup(ptr)); - n = wcswidth((w = utf8_strtowcs(ptr)), utf8_strlen(ptr)); + n = my_wcswidth((w = utf8_strtowcs(ptr)), utf8_strlen(ptr)); free(w); if (n > *message_max_width) @@ -4134,7 +4134,7 @@ build_metadata(term_t *term, long count, win_t *win) /* has already been utf8_validated/repaired. */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ word_len = mbstowcs(NULL, word_a[i].str, 0); - word_width = wcswidth((w = utf8_strtowcs(word_a[i].str)), word_len); + word_width = my_wcswidth((w = utf8_strtowcs(word_a[i].str)), word_len); /* Manage the case where the word is larger than the terminal width. 
*/ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ @@ -4144,7 +4144,7 @@ build_metadata(term_t *term, long count, win_t *win) /* """"""""""""""""""""""""""""""" */ do { - word_width = wcswidth(w, word_len--); + word_width = my_wcswidth(w, word_len--); } while (word_len > 0 && word_width >= term->ncolumns - 2); } free(w); @@ -4718,9 +4718,9 @@ disp_message(ll_t *message_lines_list, /* Adjust size and len if the terminal is not large enough. */ /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */ - size = wcswidth(w, len); + size = my_wcswidth(w, len); while (len > 0 && size > term->ncolumns) - size = wcswidth(w, --len); + size = my_wcswidth(w, --len); free(w); @@ -7052,7 +7052,7 @@ gutter_action(char *ctx_name, win->gutter_a[i] = xcalloc(1, mblength + 1); memcpy(win->gutter_a[i], gutter + offset, mblength); - n = wcswidth((w = utf8_strtowcs(win->gutter_a[i])), 1); + n = my_wcswidth((w = utf8_strtowcs(win->gutter_a[i])), 1); free(w); if (n > 1) @@ -7798,7 +7798,7 @@ da_options_action(char *ctx_name, ctxopt_ctx_disp_usage(ctx_name, exit_after); } - n = wcswidth((w = utf8_strtowcs(daccess.left)), 1); + n = my_wcswidth((w = utf8_strtowcs(daccess.left)), 1); free(w); if (n > 1) @@ -7823,7 +7823,7 @@ da_options_action(char *ctx_name, ctxopt_ctx_disp_usage(ctx_name, exit_after); } - n = wcswidth((w = utf8_strtowcs(daccess.right)), 1); + n = my_wcswidth((w = utf8_strtowcs(daccess.right)), 1); free(w); if (n > 1) @@ -7976,7 +7976,7 @@ da_options_action(char *ctx_name, ctxopt_ctx_disp_usage(ctx_name, exit_after); } - n = wcswidth((w = utf8_strtowcs(daccess.num_sep)), 1); + n = my_wcswidth((w = utf8_strtowcs(daccess.num_sep)), 1); free(w); if (n > 1) @@ -11817,7 +11817,7 @@ main(int argc, char *argv[]) } s = (long)mbstowcs(NULL, word->str, 0); - s = wcswidth((tmpw = utf8_strtowcs(word->str)), s); + s = my_wcswidth((tmpw = utf8_strtowcs(word->str)), s); free(tmpw); if (s > col_max_size[col_index]) @@ -11845,7 +11845,7 @@ main(int argc, char 
*argv[]) /* """"""""""""""""""""""""""""" */ size = (long)mbstowcs(NULL, word->str, 0); - if ((size = wcswidth((tmpw = utf8_strtowcs(word->str)), size)) + if ((size = my_wcswidth((tmpw = utf8_strtowcs(word->str)), size)) > tab_max_size) tab_max_size = size; @@ -12135,7 +12135,7 @@ main(int argc, char *argv[]) s1 = (long)strlen(word_a[wi].str); word_width = mbstowcs(NULL, word_a[wi].str, 0); - s2 = wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width); + s2 = my_wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width); free(w); /* Use the 0x05 character as a placeholder to preserve the internal */ @@ -12179,7 +12179,7 @@ main(int argc, char *argv[]) s1 = (long)strlen(word_a[wi].str); word_width = mbstowcs(NULL, word_a[wi].str, 0); - s2 = wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width); + s2 = my_wcswidth((w = utf8_strtowcs(word_a[wi].str)), word_width); free(w); temp = xcalloc(1, tab_real_max_size + s1 - s2 + 1); memset(temp, ' ', tab_max_size + s1 - s2); @@ -14153,7 +14153,7 @@ main(int argc, char *argv[]) str = ((output_t *)(node->data))->output_str; fprintf(old_stdout, "%s", str); - width += wcswidth((w = utf8_strtowcs(str)), 65535); + width += my_wcswidth((w = utf8_strtowcs(str)), 65535); free(w); free(str); free(node->data); @@ -14161,7 +14161,7 @@ main(int argc, char *argv[]) if (win.sel_sep != NULL) { fprintf(old_stdout, "%s", win.sel_sep); - width += wcswidth((w = utf8_strtowcs(win.sel_sep)), 65535); + width += my_wcswidth((w = utf8_strtowcs(win.sel_sep)), 65535); free(w); } else @@ -14175,7 +14175,7 @@ main(int argc, char *argv[]) str = ((output_t *)(node->data))->output_str; fprintf(old_stdout, "%s", str); - width += wcswidth((w = utf8_strtowcs(str)), 65535); + width += my_wcswidth((w = utf8_strtowcs(str)), 65535); free(w); free(str); free(node->data); @@ -14216,7 +14216,7 @@ main(int argc, char *argv[]) rtrim(output_str, " \t", 0); } - width = wcswidth((w = utf8_strtowcs(output_str)), 65535); + width = my_wcswidth((w = 
utf8_strtowcs(output_str)), 65535); free(w); /* And print it. */ diff --git a/tests/utf8/data7 b/tests/utf8/data7 new file mode 100644 index 0000000..aa81599 --- /dev/null +++ b/tests/utf8/data7 @@ -0,0 +1,3 @@ +x \U01F1FA\U01F1F8 x +\uf09f87ab\uf09f87b7 x \uf09f87ab\uf09f87b7 +x \U01F1FA\U01F1F8 x diff --git a/tests/utf8/data8 b/tests/utf8/data8 new file mode 100644 index 0000000..7fc45b5 --- /dev/null +++ b/tests/utf8/data8 @@ -0,0 +1,2 @@ +x y \ue29da4\uefb88f\ue2808d\uf09f94a5 x y +\uf09f91a8\ue2808d\uf09f91a9\ue2808d\uf09f91a7 x y 👩\U00200d💻 diff --git a/tests/utf8/t0009.good b/tests/utf8/t0009.good new file mode 100644 index 0000000..10598e9 --- /dev/null +++ b/tests/utf8/t0009.good @@ -0,0 +1,15 @@ +$ OUT=$(smenu -c t0009.in) + +x 🇺🇸 x + +🇫🇷 x 🇫🇷 + +x 🇺🇸 x +6:07 7:07 +$ + +$ echo ":$OUT:" + +:x: + +$ exit 0 diff --git a/tests/utf8/t0009.in b/tests/utf8/t0009.in new file mode 120000 index 0000000..592b7e2 --- /dev/null +++ b/tests/utf8/t0009.in @@ -0,0 +1 @@ +data7 \ No newline at end of file diff --git a/tests/utf8/t0009.tst b/tests/utf8/t0009.tst new file mode 100644 index 0000000..9063d3a --- /dev/null +++ b/tests/utf8/t0009.tst @@ -0,0 +1,4 @@ +\S[300]\s[80]OUT=$(smenu -c t0009.in) +\S[300]\s[200]ljjl\r +\S[300]\s[80]echo ":$\s[80]OUT:" +exit 0 diff --git a/tests/utf8/t0010.good b/tests/utf8/t0010.good new file mode 100644 index 0000000..3f58828 --- /dev/null +++ b/tests/utf8/t0010.good @@ -0,0 +1,13 @@ +$ OUT=$(smenu -c t0010.in) + +x y ❤️‍🔥 x y + +👨‍👩‍👧 x y 👩‍💻 +11:07 12:07 13:07 +$ + +$ echo ":$OUT:" + +:👩‍💻: + +$ exit 0 diff --git a/tests/utf8/t0010.in b/tests/utf8/t0010.in new file mode 120000 index 0000000..3ce4e61 --- /dev/null +++ b/tests/utf8/t0010.in @@ -0,0 +1 @@ +data8 \ No newline at end of file diff --git a/tests/utf8/t0010.tst b/tests/utf8/t0010.tst new file mode 100644 index 0000000..b66f6b9 --- /dev/null +++ b/tests/utf8/t0010.tst @@ -0,0 +1,4 @@ +\S[300]\s[80]OUT=$(smenu -c
t0010.in) +\S[300]\s[200]lljhhlll\r +\S[300]\s[80]echo ":$\s[80]OUT:" +exit 0 diff --git a/utils.c b/utils.c index 010d16a..59adb75 100644 --- a/utils.c +++ b/utils.c @@ -20,6 +20,7 @@ #include #include #include "xmalloc.h" +#include "wchar.h" #include "list.h" #include "utf8.h" #include "utils.h" @@ -467,3 +468,37 @@ hexdump(const char *buf, FILE *fp, const char *prefix, size_t size) fprintf(fp, "\n"); } } + +/* ===================================================================== */ +/* Version of wcswidth which tries to support extended grapheme clusters */ +/* by taking zero-width characters into account (-1 on wcwidth error). */ +/* ===================================================================== */ +int +my_wcswidth(const wchar_t *s, size_t n) +{ + int len = 0; + int l = 0; + int m = 0; + + if (s == NULL || *s == L'\0') + return 0; + + while (*s && m < n) + { + if ((l = wcwidth(*s)) >= 0) + { + /* Skip U+200D and any glyph followed by U+200D, U+FE0F, or U+20E3. */ + /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + if (*s != L'\x200d' && *(s + 1) != L'\x200d' && *(s + 1) != L'\xfe0f' + && *(s + 1) != L'\x20e3') + len += l; + } + else + return -1; /* wcwidth returned -1. */ + + s++; + m++; + } + + return len; +} diff --git a/utils.h b/utils.h index 775bad1..c77d4c3 100644 --- a/utils.h +++ b/utils.h @@ -83,4 +83,7 @@ strprint(char const *s); void hexdump(const char *buf, FILE *fp, const char *prefix, size_t size); +int +my_wcswidth(const wchar_t *s, size_t n); + #endif -- cgit v1.2.3