Change UTF-8 combining to inspect the previous character at the cursor position rather than keeping the last character from the input stream. This is how most terminals work, and it fixes problems with displaying these characters in vim. GitHub issue 3600.
author: nicm <nicm> 2023-09-15 15:49:05 +0000
committer: nicm <nicm> 2023-09-15 15:49:05 +0000
commit: f09cde2542470e5c1a292cc6871c4f0e00cedde5 (patch)
tree: fa31d93e697ab40131a1c3613f6f63028c0cee8c /utf8.c
parent: d394293ba59fc932085eb8c01592822a9b1ec1f7 (diff)
1 files changed, 209 insertions, 9 deletions
diff --git a/utf8.c b/utf8.c
index d26a49e4..0abb7875 100644
--- a/utf8.c
+++ b/utf8.c
@@ -23,10 +23,174 @@
 #include <stdlib.h>
 #include <string.h>
 #include <vis.h>
-#include <wchar.h>
 
 #include "tmux.h"
 
+static const wchar_t utf8_force_wide[] = { /* kept sorted for bsearch() */
+	0x0261D,
+	0x026F9,
+	0x0270A,
+	0x0270B,
+	0x0270C,
+	0x0270D,
+	0x1F1E6,
+	0x1F1E7,
+	0x1F1E8,
+	0x1F1E9,
+	0x1F1EA,
+	0x1F1EB,
+	0x1F1EC,
+	0x1F1ED,
+	0x1F1EE,
+	0x1F1EF,
+	0x1F1F0,
+	0x1F1F1,
+	0x1F1F2,
+	0x1F1F3,
+	0x1F1F4,
+	0x1F1F5,
+	0x1F1F6,
+	0x1F1F7,
+	0x1F1F8,
+	0x1F1F9,
+	0x1F1FA,
+	0x1F1FB,
+	0x1F1FC,
+	0x1F1FD,
+	0x1F1FE,
+	0x1F1FF,
+	0x1F385,
+	0x1F3C2,
+	0x1F3C3,
+	0x1F3C4,
+	0x1F3C7,
+	0x1F3CA,
+	0x1F3CB,
+	0x1F3CC,
+	0x1F3FB,
+	0x1F3FC,
+	0x1F3FD,
+	0x1F3FE,
+	0x1F3FF,
+	0x1F442,
+	0x1F443,
+	0x1F446,
+	0x1F447,
+	0x1F448,
+	0x1F449,
+	0x1F44A,
+	0x1F44B,
+	0x1F44C,
+	0x1F44D,
+	0x1F44E,
+	0x1F44F,
+	0x1F450,
+	0x1F466,
+	0x1F467,
+	0x1F468,
+	0x1F469,
+	0x1F46B,
+	0x1F46C,
+	0x1F46D,
+	0x1F46E,
+	0x1F470,
+	0x1F471,
+	0x1F472,
+	0x1F473,
+	0x1F474,
+	0x1F475,
+	0x1F476,
+	0x1F477,
+	0x1F478,
+	0x1F47C,
+	0x1F481,
+	0x1F482,
+	0x1F483,
+	0x1F485,
+	0x1F486,
+	0x1F487,
+	0x1F48F,
+	0x1F491,
+	0x1F4AA,
+	0x1F574,
+	0x1F575,
+	0x1F57A,
+	0x1F590,
+	0x1F595,
+	0x1F596,
+	0x1F645,
+	0x1F646,
+	0x1F647,
+	0x1F64B,
+	0x1F64C,
+	0x1F64D,
+	0x1F64E,
+	0x1F64F,
+	0x1F6A3,
+	0x1F6B4,
+	0x1F6B5,
+	0x1F6B6,
+	0x1F6C0,
+	0x1F6CC,
+	0x1F90C,
+	0x1F90F,
+	0x1F918,
+	0x1F919,
+	0x1F91A,
+	0x1F91B,
+	0x1F91C,
+	0x1F91D,
+	0x1F91E,
+	0x1F91F,
+	0x1F926,
+	0x1F930,
+	0x1F931,
+	0x1F932,
+	0x1F933,
+	0x1F934,
+	0x1F935,
+	0x1F936,
+	0x1F937,
+	0x1F938,
+	0x1F939,
+	0x1F93D,
+	0x1F93E,
+	0x1F977,
+	0x1F9B5,
+	0x1F9B6,
+	0x1F9B8,
+	0x1F9B9,
+	0x1F9BB,
+	0x1F9CD,
+	0x1F9CE,
+	0x1F9CF,
+	0x1F9D1,
+	0x1F9D2,
+	0x1F9D3,
+	0x1F9D4,
+	0x1F9D5,
+	0x1F9D6,
+	0x1F9D7,
+	0x1F9D8,
+	0x1F9D9,
+	0x1F9DA,
+	0x1F9DB,
+	0x1F9DC,
+	0x1F9DD,
+	0x1FAC3,
+	0x1FAC4,
+	0x1FAC5,
+	0x1FAF0,
+	0x1FAF1,
+	0x1FAF2,
+	0x1FAF3,
+	0x1FAF4,
+	0x1FAF5,
+	0x1FAF6,
+	0x1FAF7,
+	0x1FAF8
+};
+
 struct utf8_item {
 	RB_ENTRY(utf8_item)	index_entry;
 	u_int			index;
@@ -123,6 +287,28 @@ utf8_put_item(const u_char *data, size_t size, u_int *index)
 	return (0);
 }
 
+static int
+utf8_table_cmp(const void *vp1, const void *vp2) /* bsearch() comparator */
+{
+	const wchar_t	*wc1 = vp1, *wc2 = vp2;
+
+	if (*wc1 < *wc2)
+		return (-1);
+	if (*wc1 > *wc2)
+		return (1);
+	return (0);
+}
+
+/* Check if character is in table, which must be sorted for bsearch(). */
+int
+utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
+{
+	wchar_t	*found;
+
+	found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
+	return (found != NULL); /* 1 if present, 0 if not */
+}
+
 /* Get UTF-8 character from data. */
 enum utf8_state
 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
@@ -217,16 +403,13 @@ utf8_width(struct utf8_data *ud, int *width)
 {
 	wchar_t	wc;
 
-	switch (mbtowc(&wc, ud->data, ud->size)) {
-	case -1:
-		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
-		    errno);
-		mbtowc(NULL, NULL, MB_CUR_MAX);
-		return (UTF8_ERROR);
-	case 0:
+	if (utf8_towc(ud, &wc) != UTF8_DONE)
 		return (UTF8_ERROR);
+	if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
+		*width = 2;
+		return (UTF8_DONE);
 	}
-	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)wc);
+
 	*width = wcwidth(wc);
 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
 	if (*width < 0) {
@@ -241,6 +424,23 @@ utf8_width(struct utf8_data *ud, int *width)
 	return (UTF8_ERROR);
 }
 
+/* Convert UTF-8 to a wide character; UTF8_ERROR if mbtowc() gives -1 or 0. */
+enum utf8_state
+utf8_towc(const struct utf8_data *ud, wchar_t *wc)
+{
+	switch (mbtowc(wc, ud->data, ud->size)) {
+	case -1:
+		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
+		    errno);
+		mbtowc(NULL, NULL, MB_CUR_MAX); /* reset conversion state */
+		return (UTF8_ERROR);
+	case 0: /* mbtowc() reports 0 for a null character */
+		return (UTF8_ERROR);
+	}
+	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
+	return (UTF8_DONE);
+}
+
 /*
  * Open UTF-8 sequence.
  *
author	nicm <nicm>	2023-09-15 15:49:05 +0000
committer	nicm <nicm>	2023-09-15 15:49:05 +0000
commit	f09cde2542470e5c1a292cc6871c4f0e00cedde5 (patch)
tree	fa31d93e697ab40131a1c3613f6f63028c0cee8c /utf8.c
parent	d394293ba59fc932085eb8c01592822a9b1ec1f7 (diff)