summary refs log tree commit diff stats
path: root/utf8.c
diff options
context:
space:
mode:
author nicm <nicm> 2023-09-15 15:49:05 +0000
committer nicm <nicm> 2023-09-15 15:49:05 +0000
commit f09cde2542470e5c1a292cc6871c4f0e00cedde5 (patch)
tree fa31d93e697ab40131a1c3613f6f63028c0cee8c /utf8.c
parent d394293ba59fc932085eb8c01592822a9b1ec1f7 (diff)
Change UTF-8 combining to inspect the previous character at the cursor
position rather than keeping the last character from the input stream. This is how most terminals work, and it fixes problems with displaying these characters in vim. GitHub issue 3600.
Diffstat (limited to 'utf8.c')
-rw-r--r-- utf8.c 218
1 files changed, 209 insertions, 9 deletions
diff --git a/utf8.c b/utf8.c
index d26a49e4..0abb7875 100644
--- a/utf8.c
+++ b/utf8.c
@@ -23,10 +23,174 @@
#include <stdlib.h>
#include <string.h>
#include <vis.h>
-#include <wchar.h>
#include "tmux.h"
+static const wchar_t utf8_force_wide[] = {
+ 0x0261D,
+ 0x026F9,
+ 0x0270A,
+ 0x0270B,
+ 0x0270C,
+ 0x0270D,
+ 0x1F1E6,
+ 0x1F1E7,
+ 0x1F1E8,
+ 0x1F1E9,
+ 0x1F1EA,
+ 0x1F1EB,
+ 0x1F1EC,
+ 0x1F1ED,
+ 0x1F1EE,
+ 0x1F1EF,
+ 0x1F1F0,
+ 0x1F1F1,
+ 0x1F1F2,
+ 0x1F1F3,
+ 0x1F1F4,
+ 0x1F1F5,
+ 0x1F1F6,
+ 0x1F1F7,
+ 0x1F1F8,
+ 0x1F1F9,
+ 0x1F1FA,
+ 0x1F1FB,
+ 0x1F1FC,
+ 0x1F1FD,
+ 0x1F1FE,
+ 0x1F1FF,
+ 0x1F385,
+ 0x1F3C2,
+ 0x1F3C3,
+ 0x1F3C4,
+ 0x1F3C7,
+ 0x1F3CA,
+ 0x1F3CB,
+ 0x1F3CC,
+ 0x1F3FB,
+ 0x1F3FC,
+ 0x1F3FD,
+ 0x1F3FE,
+ 0x1F3FF,
+ 0x1F442,
+ 0x1F443,
+ 0x1F446,
+ 0x1F447,
+ 0x1F448,
+ 0x1F449,
+ 0x1F44A,
+ 0x1F44B,
+ 0x1F44C,
+ 0x1F44D,
+ 0x1F44E,
+ 0x1F44F,
+ 0x1F450,
+ 0x1F466,
+ 0x1F467,
+ 0x1F468,
+ 0x1F469,
+ 0x1F46B,
+ 0x1F46C,
+ 0x1F46D,
+ 0x1F46E,
+ 0x1F470,
+ 0x1F471,
+ 0x1F472,
+ 0x1F473,
+ 0x1F474,
+ 0x1F475,
+ 0x1F476,
+ 0x1F477,
+ 0x1F478,
+ 0x1F47C,
+ 0x1F481,
+ 0x1F482,
+ 0x1F483,
+ 0x1F485,
+ 0x1F486,
+ 0x1F487,
+ 0x1F48F,
+ 0x1F491,
+ 0x1F4AA,
+ 0x1F574,
+ 0x1F575,
+ 0x1F57A,
+ 0x1F590,
+ 0x1F595,
+ 0x1F596,
+ 0x1F645,
+ 0x1F646,
+ 0x1F647,
+ 0x1F64B,
+ 0x1F64C,
+ 0x1F64D,
+ 0x1F64E,
+ 0x1F64F,
+ 0x1F6A3,
+ 0x1F6B4,
+ 0x1F6B5,
+ 0x1F6B6,
+ 0x1F6C0,
+ 0x1F6CC,
+ 0x1F90C,
+ 0x1F90F,
+ 0x1F918,
+ 0x1F919,
+ 0x1F91A,
+ 0x1F91B,
+ 0x1F91C,
+ 0x1F91D,
+ 0x1F91E,
+ 0x1F91F,
+ 0x1F926,
+ 0x1F930,
+ 0x1F931,
+ 0x1F932,
+ 0x1F933,
+ 0x1F934,
+ 0x1F935,
+ 0x1F936,
+ 0x1F937,
+ 0x1F938,
+ 0x1F939,
+ 0x1F93D,
+ 0x1F93E,
+ 0x1F977,
+ 0x1F9B5,
+ 0x1F9B6,
+ 0x1F9B8,
+ 0x1F9B9,
+ 0x1F9BB,
+ 0x1F9CD,
+ 0x1F9CE,
+ 0x1F9CF,
+ 0x1F9D1,
+ 0x1F9D2,
+ 0x1F9D3,
+ 0x1F9D4,
+ 0x1F9D5,
+ 0x1F9D6,
+ 0x1F9D7,
+ 0x1F9D8,
+ 0x1F9D9,
+ 0x1F9DA,
+ 0x1F9DB,
+ 0x1F9DC,
+ 0x1F9DD,
+ 0x1FAC3,
+ 0x1FAC4,
+ 0x1FAC5,
+ 0x1FAF0,
+ 0x1FAF1,
+ 0x1FAF2,
+ 0x1FAF3,
+ 0x1FAF4,
+ 0x1FAF5,
+ 0x1FAF6,
+ 0x1FAF7,
+ 0x1FAF8
+};
+
struct utf8_item {
RB_ENTRY(utf8_item) index_entry;
u_int index;
@@ -123,6 +287,28 @@ utf8_put_item(const u_char *data, size_t size, u_int *index)
return (0);
}
+static int
+utf8_table_cmp(const void *vp1, const void *vp2)
+{
+ const wchar_t *wc1 = vp1, *wc2 = vp2;
+
+ if (*wc1 < *wc2)
+ return (-1);
+ if (*wc1 > *wc2)
+ return (1);
+ return (0);
+}
+
+/* Check if character in table. */
+int
+utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
+{
+ wchar_t *found;
+
+ found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
+ return (found != NULL);
+}
+
/* Get UTF-8 character from data. */
enum utf8_state
utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@@ -217,16 +403,13 @@ utf8_width(struct utf8_data *ud, int *width)
{
wchar_t wc;
- switch (mbtowc(&wc, ud->data, ud->size)) {
- case -1:
- log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
- errno);
- mbtowc(NULL, NULL, MB_CUR_MAX);
- return (UTF8_ERROR);
- case 0:
+ if (utf8_towc(ud, &wc) != UTF8_DONE)
return (UTF8_ERROR);
+ if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
+ *width = 2;
+ return (UTF8_DONE);
}
- log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)wc);
+
*width = wcwidth(wc);
log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
if (*width < 0) {
@@ -241,6 +424,23 @@ utf8_width(struct utf8_data *ud, int *width)
return (UTF8_ERROR);
}
+/* Convert UTF-8 character to wide character. */
+enum utf8_state
+utf8_towc(const struct utf8_data *ud, wchar_t *wc)
+{
+ switch (mbtowc(wc, ud->data, ud->size)) {
+ case -1:
+ log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
+ errno);
+ mbtowc(NULL, NULL, MB_CUR_MAX);
+ return (UTF8_ERROR);
+ case 0:
+ return (UTF8_ERROR);
+ }
+ log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
+ return (UTF8_DONE);
+}
+
/*
* Open UTF-8 sequence.
*