Allow UTF-8 characters of width 0 to be stored; it is useful to be able
to put padding cells in as width 0.
author: nicm <nicm> 2020-06-02 20:10:23 +0000
committer: nicm <nicm> 2020-06-02 20:10:23 +0000
commit: 2a4d4bda2b94602e9f999ff0b59efa92613f75a9 (patch)
tree: 86cd9c1ee3eac4f1a9aacb9b9d1a3062ec106a09 /utf8.c
parent: f5366ff828cde78c140efa2dd9453956b59f5241 (diff)
1 files changed, 24 insertions, 30 deletions
diff --git a/utf8.c b/utf8.c
index 5c11b7ca..c33b6690 100644
--- a/utf8.c
+++ b/utf8.c
@@ -56,19 +56,26 @@ union utf8_map {
 	utf8_char	uc;
 	struct {
 		u_char	flags;
-#define UTF8_FLAG_SIZE 0x1f
-#define UTF8_FLAG_WIDTH2 0x20
-
 		u_char	data[3];
 	};
 } __packed;
 
+#define UTF8_GET_SIZE(flags) ((flags) & 0x1f)
+#define UTF8_GET_WIDTH(flags) (((flags) >> 5) - 1)
+
+#define UTF8_SET_SIZE(size) (size)
+#define UTF8_SET_WIDTH(width) ((width + 1) << 5)
+
+static const union utf8_map utf8_space0 = {
+	.flags = UTF8_SET_WIDTH(0)|UTF8_SET_SIZE(0),
+	.data = ""
+};
 static const union utf8_map utf8_space1 = {
-	.flags = 1,
+	.flags = UTF8_SET_WIDTH(1)|UTF8_SET_SIZE(1),
 	.data = " "
 };
 static const union utf8_map utf8_space2 = {
-	.flags = UTF8_FLAG_WIDTH2|2,
+	.flags = UTF8_SET_WIDTH(2)|UTF8_SET_SIZE(2),
 	.data = "  "
 };
 
@@ -135,24 +142,12 @@ utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 	union utf8_map	 m = { .uc = 0 };
 	u_int		 offset;
 
-	if (ud->width == 0)
-		goto fail;
-	if (ud->width != 1 && ud->width != 2)
+	if (ud->width > 2)
 		fatalx("invalid UTF-8 width");
-	if (ud->size == 0)
-		fatalx("invalid UTF-8 size");
 
-	if (ud->size > UTF8_FLAG_SIZE)
+	if (ud->size > UTF8_SIZE)
 		goto fail;
-	if (ud->size == 1) {
-		*uc = utf8_build_one(ud->data[0], 1);
-		return (UTF8_DONE);
-	}
-
-	m.flags = ud->size;
-	if (ud->width == 2)
-		m.flags |= UTF8_FLAG_WIDTH2;
-
+	m.flags = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width);
 	if (ud->size <= 3)
 		memcpy(m.data, ud->data, ud->size);
 	else {
@@ -166,7 +161,9 @@ utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
 	return (UTF8_DONE);
 
 fail:
-	if (ud->width == 1)
+	if (ud->width == 0)
+		*uc = htonl(utf8_space0.uc);
+	else if (ud->width == 1)
 		*uc = htonl(utf8_space1.uc);
 	else
 		*uc = htonl(utf8_space2.uc);
@@ -182,11 +179,8 @@ utf8_to_data(utf8_char uc, struct utf8_data *ud)
 	u_int			 offset;
 
 	memset(ud, 0, sizeof *ud);
-	ud->size = ud->have = (m.flags & UTF8_FLAG_SIZE);
-	if (m.flags & UTF8_FLAG_WIDTH2)
-		ud->width = 2;
-	else
-		ud->width = 1;
+	ud->size = ud->have = UTF8_GET_SIZE(m.flags);
+	ud->width = UTF8_GET_WIDTH(m.flags);
 
 	if (ud->size <= 3) {
 		memcpy(ud->data, m.data, ud->size);
@@ -204,12 +198,12 @@ utf8_to_data(utf8_char uc, struct utf8_data *ud)
 
 /* Get UTF-8 character from a single ASCII character. */
 u_int
-utf8_build_one(char c, u_int width)
+utf8_build_one(u_char ch)
 {
-	union utf8_map	m = { .flags = 1, .data[0] = c };
+	union utf8_map	m;
 
-	if (width == 2)
-		m.flags |= UTF8_FLAG_WIDTH2;
+	m.flags = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1);
+	m.data[0] = ch;
 	return (htonl(m.uc));
 }
author	nicm <nicm>	2020-06-02 20:10:23 +0000
committer	nicm <nicm>	2020-06-02 20:10:23 +0000
commit	2a4d4bda2b94602e9f999ff0b59efa92613f75a9 (patch)
tree	86cd9c1ee3eac4f1a9aacb9b9d1a3062ec106a09 /utf8.c
parent	f5366ff828cde78c140efa2dd9453956b59f5241 (diff)