From 6f03e49e68dfe0d9c0c7d49079c4383b26aca916 Mon Sep 17 00:00:00 2001
From: nicm
Date: Mon, 25 May 2020 18:57:24 +0000
Subject: Use the internal representation for UTF-8 keys instead of wchar_t and drop some code only needed for that.

---
 utf8.c | 67 +++++++++++++++++++-----------------------------------------------
 1 file changed, 19 insertions(+), 48 deletions(-)

(limited to 'utf8.c')

diff --git a/utf8.c b/utf8.c
index 38827e3d..9df74590 100644
--- a/utf8.c
+++ b/utf8.c
@@ -230,17 +230,27 @@ utf8_copy(struct utf8_data *to, const struct utf8_data *from)
 }
 
 /* Get width of Unicode character. */
-static int
-utf8_width(wchar_t wc)
+static enum utf8_state
+utf8_width(struct utf8_data *ud, int *width)
 {
-	int width;
+	wchar_t wc;
 
-	width = wcwidth(wc);
-	if (width < 0 || width > 0xff) {
-		log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);
-		return (-1);
+	switch (mbtowc(&wc, ud->data, ud->size)) {
+	case -1:
+		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
+		    errno);
+		mbtowc(NULL, NULL, MB_CUR_MAX);
+		return (UTF8_ERROR);
+	case 0:
+		return (UTF8_ERROR);
 	}
-	return (width);
+	*width = wcwidth(wc);
+	if (*width < 0 || *width > 0xff) {
+		log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
+		    *width);
+		return (UTF8_ERROR);
+	}
+	return (UTF8_DONE);
 }
 
 /*
@@ -270,7 +280,6 @@ utf8_open(struct utf8_data *ud, u_char ch)
 enum utf8_state
 utf8_append(struct utf8_data *ud, u_char ch)
 {
-	wchar_t wc;
 	int width;
 
 	if (ud->have >= ud->size)
@@ -287,51 +296,13 @@ utf8_append(struct utf8_data *ud, u_char ch)
 
 	if (ud->width == 0xff)
 		return (UTF8_ERROR);
-
-	if (utf8_combine(ud, &wc) != UTF8_DONE)
-		return (UTF8_ERROR);
-	if ((width = utf8_width(wc)) < 0)
+	if (utf8_width(ud, &width) != UTF8_DONE)
 		return (UTF8_ERROR);
 	ud->width = width;
 
 	return (UTF8_DONE);
 }
 
-/* Combine UTF-8 into Unicode. */
-enum utf8_state
-utf8_combine(const struct utf8_data *ud, wchar_t *wc)
-{
-	switch (mbtowc(wc, ud->data, ud->size)) {
-	case -1:
-		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
-		    errno);
-		mbtowc(NULL, NULL, MB_CUR_MAX);
-		return (UTF8_ERROR);
-	case 0:
-		return (UTF8_ERROR);
-	default:
-		return (UTF8_DONE);
-	}
-}
-
-/* Split Unicode into UTF-8. */
-enum utf8_state
-utf8_split(wchar_t wc, struct utf8_data *ud)
-{
-	char s[MB_LEN_MAX];
-	int slen;
-
-	slen = wctomb(s, wc);
-	if (slen <= 0 || slen > (int)sizeof ud->data)
-		return (UTF8_ERROR);
-
-	memcpy(ud->data, s, slen);
-	ud->size = slen;
-
-	ud->width = utf8_width(wc);
-	return (UTF8_DONE);
-}
-
 /*
  * Encode len characters from src into dst, which is guaranteed to have four
  * bytes available for each character from src (for \abc or UTF-8) plus space
-- 
cgit v1.2.3