summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Naïm Favier <n@monade.li> 2023-02-16 19:34:24 +0100
committer GitHub <noreply@github.com> 2023-02-16 19:34:24 +0100
commit f6248c6ea9f3243986798f8fb1b48fb582d64517 (patch)
tree 6641cf7afb0a6658ef9db225e11f3b4ff3c86476
parent 1b1aa37f20d1748b161bc70a8a4151204c17d17e (diff)
More Unicode normalization with `-normalize-match` (#1813)
Normalize the string to a fully decomposed form, then filter out mark/accent characters.
-rw-r--r-- source/helper.c 23
1 files changed, 9 insertions, 14 deletions
diff --git a/source/helper.c b/source/helper.c
index 4b14f361..a84a7d9e 100644
--- a/source/helper.c
+++ b/source/helper.c
@@ -175,30 +175,25 @@ static gchar *prefix_regex(const char *input) {
return retv;
}
-static char *utf8_helper_simplify_string(const char *s) {
- gunichar buf2[G_UNICHAR_MAX_DECOMPOSITION_LENGTH] = {
- 0,
- };
+static char *utf8_helper_simplify_string(const char *os) {
char buf[6] = {
0,
};
- // Compose the string in maximally composed form.
+
+ // Normalize the string to a fully decomposed form, then filter out mark/accent characters.
+ char *s = g_utf8_normalize(os, -1, G_NORMALIZE_ALL);
ssize_t str_size = (g_utf8_strlen(s, -1) * 6 + 2 + 1) * sizeof(char);
char *str = g_malloc0(str_size);
char *striter = str;
for (const char *iter = s; iter && *iter; iter = g_utf8_next_char(iter)) {
gunichar uc = g_utf8_get_char(iter);
- int l = 0;
- gsize dl = g_unichar_fully_decompose(uc, FALSE, buf2,
- G_UNICHAR_MAX_DECOMPOSITION_LENGTH);
- if (dl) {
- l = g_unichar_to_utf8(buf2[0], buf);
- } else {
- l = g_unichar_to_utf8(uc, buf);
+ if (!g_unichar_ismark(uc)) {
+ int l = g_unichar_to_utf8(uc, buf);
+ memcpy(striter, buf, l);
+ striter += l;
}
- memcpy(striter, buf, l);
- striter += l;
}
+ g_free(s);
return str;
}