Instead of storing all UTF-8 characters in the extended cell, which means that 14 bytes are wasted for each character in the BMP, only store characters of three bytes or less in the cell itself and store others (outside the BMP or with combining characters) in a separate global tree. This can reduce grid memory use for heavy Unicode users by around 30%.
author: nicm <nicm> 2020-05-25 09:32:10 +0000
committer: nicm <nicm> 2020-05-25 09:32:10 +0000
commit: 3a5219c6d0c1a85ac3cf7a6b938f724650001a4d (patch)
tree: 87ac834df4315372c6e569fd0fbd95e8d23d9d68 /utf8.c
parent: 14a9fd58d56211f9ee1ee9347d135fc00e03d4bd (diff)
1 files changed, 174 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
index 3f378fb3..68c970d0 100644
--- a/utf8.c
+++ b/utf8.c
@@ -29,6 +29,180 @@
 
 static int	utf8_width(wchar_t);
 
+struct utf8_big_item {
+	u_int			index;	/* this item's slot in utf8_big_list */
+	RB_ENTRY(utf8_big_item)	entry;	/* linkage for utf8_big_tree */
+
+	char			data[UTF8_SIZE];	/* raw bytes; only the first size bytes are set */
+	u_char			size;	/* number of bytes used in data */
+};
+RB_HEAD(utf8_big_tree, utf8_big_item);
+
+static int
+utf8_big_cmp(struct utf8_big_item *bi1, struct utf8_big_item *bi2)
+{
+	if (bi1->size < bi2->size)	/* shorter sequences sort first */
+		return (-1);
+	if (bi1->size > bi2->size)
+		return (1);
+	return (memcmp(bi1->data, bi2->data, bi1->size));	/* equal sizes: order by raw bytes */
+}
+RB_GENERATE_STATIC(utf8_big_tree, utf8_big_item, entry, utf8_big_cmp);	/* tree keyed on (size, data) */
+static struct utf8_big_tree utf8_big_tree = RB_INITIALIZER(utf8_big_tree);	/* the single global tree */
+
+static struct utf8_big_item *utf8_big_list;	/* array of items, grown with xreallocarray */
+static u_int utf8_big_list_size;	/* allocated slots in utf8_big_list */
+static u_int utf8_big_list_used;	/* slots in use; also the next index handed out */
+
+union utf8_big_map {
+	uint32_t	value;	/* the whole mapping viewed as one 32-bit value */
+	struct {
+		u_char	flags;	/* UTF8_BIG_SIZE bits hold the size; UTF8_BIG_WIDTH2 marks width 2 */
+#define UTF8_BIG_SIZE 0x1f
+#define UTF8_BIG_WIDTH2 0x20
+
+		u_char	data[3];	/* the bytes themselves if size <= 3, else a 24-bit list index, low byte first */
+	};
+} __packed;
+
+static const union utf8_big_map utf8_big_space1 = {	/* fallback: one space, width 1 */
+	.flags = 1,
+	.data = " "
+};
+static const union utf8_big_map utf8_big_space2 = {	/* fallback: two spaces, width 2 */
+	.flags = UTF8_BIG_WIDTH2|2,
+	.data = "  "
+};
+
+/* Look up a big item in the tree by its data and size (not by index). */
+static struct utf8_big_item *
+utf8_get_big_item(const char *data, size_t size)
+{
+	struct utf8_big_item bi;	/* temporary search key; only data and size are filled in */
+
+	memcpy(bi.data, data, size);	/* caller must ensure size fits in bi.data */
+	bi.size = size;
+
+	return (RB_FIND(utf8_big_tree, &utf8_big_tree, &bi));
+}
+
+/* Add a big item unless already present; store its index in *index. Returns 0, or -1 if the list is full. */
+static int
+utf8_put_big_item(const char *data, size_t size, u_int *index)
+{
+	struct utf8_big_item	*bi;
+
+	bi = utf8_get_big_item(data, size);
+	if (bi != NULL) {	/* already stored: reuse the existing index */
+		*index = bi->index;
+		log_debug("%s: have %.*s at %u", __func__, (int)size, data,
+		    *index);
+		return (0);
+	}
+
+	if (utf8_big_list_used == utf8_big_list_size) {	/* no free slot: grow the list */
+		if (utf8_big_list_size == 0xffffff)	/* cap: the index must fit in three bytes */
+			return (-1);
+		if (utf8_big_list_size == 0)
+			utf8_big_list_size = 256;
+		else if (utf8_big_list_size > 0x7fffff)	/* doubling would pass the cap, so clamp */
+			utf8_big_list_size = 0xffffff;
+		else
+			utf8_big_list_size *= 2;
+		utf8_big_list = xreallocarray(utf8_big_list, utf8_big_list_size,
+		    sizeof *utf8_big_list);	/* NOTE(review): nodes already in utf8_big_tree are elements of this array; if it moves their links look stale - confirm */
+	}
+	*index = utf8_big_list_used++;
+
+	bi = &utf8_big_list[*index];	/* the new item lives directly in the list array */
+	bi->index = *index;
+	memcpy(bi->data, data, size);
+	bi->size = size;
+	RB_INSERT(utf8_big_tree, &utf8_big_tree, bi);
+
+	log_debug("%s: added %.*s at %u", __func__, (int)size, data, *index);
+	return (0);
+}
+
+/* Pack UTF-8 data into 32 bits: inline if three bytes or fewer, otherwise as an index into the big item list. */
+uint32_t
+utf8_map_big(const struct utf8_data *ud)
+{
+	union utf8_big_map	 m = { .value = 0 };
+	u_int			 o;
+	const char		*data = ud->data;
+	size_t			 size = ud->size;
+
+	if (ud->width != 1 && ud->width != 2)	/* any other width becomes a single space */
+		return (utf8_big_space1.value);
+
+	if (size > UTF8_BIG_SIZE)	/* size would not fit in the flags bits */
+		goto fail;
+	if (size == 1)
+		return (utf8_set_big(data[0], 1));	/* note: width 1 is passed regardless of ud->width */
+
+	m.flags = size;
+	if (ud->width == 2)
+		m.flags |= UTF8_BIG_WIDTH2;
+
+	if (size <= 3) {	/* short enough to store the bytes inline */
+		memcpy(&m.data, data, size);
+		return (m.value);
+	}
+
+	if (utf8_put_big_item(data, size, &o) != 0)
+		goto fail;
+	m.data[0] = (o & 0xff);	/* store the index low byte first */
+	m.data[1] = (o >> 8) & 0xff;
+	m.data[2] = (o >> 16);
+	return (m.value);
+
+fail:	/* fall back to spaces matching the width */
+	if (ud->width == 1)
+		return (utf8_big_space1.value);
+	return (utf8_big_space2.value);
+}
+
+/* Unpack a 32-bit value built by utf8_map_big or utf8_set_big into *ud. */
+void
+utf8_get_big(uint32_t v, struct utf8_data *ud)
+{
+	union utf8_big_map	 m = { .value = v };
+	struct utf8_big_item	*bi;
+	u_int			 o;
+
+	memset(ud, 0, sizeof *ud);
+	ud->size = ud->have = (m.flags & UTF8_BIG_SIZE);
+	if (m.flags & UTF8_BIG_WIDTH2)
+		ud->width = 2;
+	else
+		ud->width = 1;
+
+	if (ud->size <= 3) {	/* bytes are stored inline in the value */
+		memcpy(ud->data, m.data, ud->size);
+		return;
+	}
+
+	o = ((uint32_t)m.data[2] << 16)|((uint32_t)m.data[1] << 8)|m.data[0];	/* rebuild the 24-bit list index */
+	if (o >= utf8_big_list_used)	/* index not in the list: fill with spaces */
+		memset(ud->data, ' ', ud->size);
+	else {
+		bi = &utf8_big_list[o];
+		memcpy(ud->data, bi->data, ud->size);
+	}
+}
+
+/* Build the packed 32-bit value for a single-byte character; width 2 sets UTF8_BIG_WIDTH2. */
+uint32_t
+utf8_set_big(char c, u_int width)
+{
+	union utf8_big_map	m = { .flags = 1, .data[0] = c };	/* size 1, byte stored inline */
+
+	if (width == 2)
+		m.flags |= UTF8_BIG_WIDTH2;
+	return (m.value);
+}
+
 /* Set a single character. */
 void
 utf8_set(struct utf8_data *ud, u_char ch)
author	nicm <nicm>	2020-05-25 09:32:10 +0000
committer	nicm <nicm>	2020-05-25 09:32:10 +0000
commit	3a5219c6d0c1a85ac3cf7a6b938f724650001a4d (patch)
tree	87ac834df4315372c6e569fd0fbd95e8d23d9d68 /utf8.c
parent	14a9fd58d56211f9ee1ee9347d135fc00e03d4bd (diff)