Use the internal representation for UTF-8 keys instead of wchar_t, and
drop some code that was only needed for wchar_t.
author: nicm <nicm> 2020-05-25 18:57:24 +0000
committer: nicm <nicm> 2020-05-25 18:57:24 +0000
commit: 6f03e49e68dfe0d9c0c7d49079c4383b26aca916 (patch)
tree: 86a94f09a878fe2d32cd3ef29a69db242208897f /utf8.c
parent: 35779d655d7eec4b904eeb3a670bbef02aba016d (diff)
1 files changed, 19 insertions, 48 deletions
diff --git a/utf8.c b/utf8.c
index 38827e3d..9df74590 100644
--- a/utf8.c
+++ b/utf8.c
@@ -230,17 +230,27 @@ utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 }
 
 /* Get width of Unicode character. */
-static int
-utf8_width(wchar_t wc)
+static enum utf8_state
+utf8_width(struct utf8_data *ud, int *width)
 {
-	int	width;
+	wchar_t	wc;
 
-	width = wcwidth(wc);
-	if (width < 0 || width > 0xff) {
-		log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);
-		return (-1);
+	switch (mbtowc(&wc, ud->data, ud->size)) {
+	case -1:
+		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
+		    errno);
+		mbtowc(NULL, NULL, MB_CUR_MAX);
+		return (UTF8_ERROR);
+	case 0:
+		return (UTF8_ERROR);
 	}
-	return (width);
+	*width = wcwidth(wc);
+	if (*width < 0 || *width > 0xff) {
+		log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
+		    *width);
+		return (UTF8_ERROR);
+	}
+	return (UTF8_DONE);
 }
 
 /*
@@ -270,7 +280,6 @@ utf8_open(struct utf8_data *ud, u_char ch)
 enum utf8_state
 utf8_append(struct utf8_data *ud, u_char ch)
 {
-	wchar_t	wc;
 	int	width;
 
 	if (ud->have >= ud->size)
@@ -287,51 +296,13 @@ utf8_append(struct utf8_data *ud, u_char ch)
 
 	if (ud->width == 0xff)
 		return (UTF8_ERROR);
-
-	if (utf8_combine(ud, &wc) != UTF8_DONE)
-		return (UTF8_ERROR);
-	if ((width = utf8_width(wc)) < 0)
+	if (utf8_width(ud, &width) != UTF8_DONE)
 		return (UTF8_ERROR);
 	ud->width = width;
 
 	return (UTF8_DONE);
 }
 
-/* Combine UTF-8 into Unicode. */
-enum utf8_state
-utf8_combine(const struct utf8_data *ud, wchar_t *wc)
-{
-	switch (mbtowc(wc, ud->data, ud->size)) {
-	case -1:
-		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
-		    errno);
-		mbtowc(NULL, NULL, MB_CUR_MAX);
-		return (UTF8_ERROR);
-	case 0:
-		return (UTF8_ERROR);
-	default:
-		return (UTF8_DONE);
-	}
-}
-
-/* Split Unicode into UTF-8. */
-enum utf8_state
-utf8_split(wchar_t wc, struct utf8_data *ud)
-{
-	char	s[MB_LEN_MAX];
-	int	slen;
-
-	slen = wctomb(s, wc);
-	if (slen <= 0 || slen > (int)sizeof ud->data)
-		return (UTF8_ERROR);
-
-	memcpy(ud->data, s, slen);
-	ud->size = slen;
-
-	ud->width = utf8_width(wc);
-	return (UTF8_DONE);
-}
-
 /*
  * Encode len characters from src into dst, which is guaranteed to have four
  * bytes available for each character from src (for \abc or UTF-8) plus space
author	nicm <nicm>	2020-05-25 18:57:24 +0000
committer	nicm <nicm>	2020-05-25 18:57:24 +0000
commit	6f03e49e68dfe0d9c0c7d49079c4383b26aca916 (patch)
tree	86a94f09a878fe2d32cd3ef29a69db242208897f /utf8.c
parent	35779d655d7eec4b904eeb3a670bbef02aba016d (diff)