diff options
author | David Bremner <david@tethera.net> | 2019-03-25 23:07:24 -0300 |
---|---|---|
committer | David Bremner <david@tethera.net> | 2019-05-25 06:51:12 -0300 |
commit | 781125c9e92a2b9a2b9fbe54adec28ddb60f35b1 (patch) | |
tree | 5625ddaa918acc0eec1b22a5d551c624fcefb311 | |
parent | 46ab6013a29233b32dba49cf9c50e70fd02db1c3 (diff) |
util: add unicode_word_utf8
This originally used Xapian::Unicode::is_wordchar, but that forces
clients to link directly to libxapian, which seems like it might be
busywork if nothing else.
-rw-r--r-- | util/Makefile.local | 3 | ||||
-rw-r--r-- | util/unicode-util.c | 43 | ||||
-rw-r--r-- | util/unicode-util.h | 12 |
3 files changed, 57 insertions, 1 deletions
diff --git a/util/Makefile.local b/util/Makefile.local index ba03230e..46f8af3a 100644 --- a/util/Makefile.local +++ b/util/Makefile.local @@ -5,7 +5,8 @@ extra_cflags += -I$(srcdir)/$(dir) libnotmuch_util_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c \ $(dir)/string-util.c $(dir)/talloc-extra.c $(dir)/zlib-extra.c \ - $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c + $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c \ + $(dir)/unicode-util.c libnotmuch_util_modules := $(libnotmuch_util_c_srcs:.c=.o) diff --git a/util/unicode-util.c b/util/unicode-util.c new file mode 100644 index 00000000..312e900f --- /dev/null +++ b/util/unicode-util.c @@ -0,0 +1,43 @@ +#include "unicode-util.h" + +/* Based on Xapian::Unicode::is_wordchar, to avoid forcing clients to + link directly to libxapian. +*/ + +static bool +unicode_is_wordchar (notmuch_unichar ch) +{ + switch (g_unichar_type (ch)) { + case G_UNICODE_UPPERCASE_LETTER: + case G_UNICODE_LOWERCASE_LETTER: + case G_UNICODE_TITLECASE_LETTER: + case G_UNICODE_MODIFIER_LETTER: + case G_UNICODE_OTHER_LETTER: + case G_UNICODE_NON_SPACING_MARK: + case G_UNICODE_ENCLOSING_MARK: + case G_UNICODE_SPACING_MARK: + case G_UNICODE_DECIMAL_NUMBER: + case G_UNICODE_LETTER_NUMBER: + case G_UNICODE_OTHER_NUMBER: + case G_UNICODE_CONNECT_PUNCTUATION: + return true; + default: + return false; + } +} + +bool +unicode_word_utf8 (const char *utf8_str) +{ + gunichar *decoded = g_utf8_to_ucs4_fast (utf8_str, -1, NULL); + const gunichar *p = decoded; + bool ret; + + while (*p && unicode_is_wordchar (*p)) + p++; + + ret = (*p == '\0'); + + g_free (decoded); + return ret; +} diff --git a/util/unicode-util.h b/util/unicode-util.h new file mode 100644 index 00000000..32d1e6ef --- /dev/null +++ b/util/unicode-util.h @@ -0,0 +1,12 @@ +#ifndef UNICODE_UTIL_H +#define UNICODE_UTIL_H + +#include <stdbool.h> +#include <gmodule.h> + +/* The utf8 encoded string would tokenize as a single word, according + * to xapian. 
*/ +bool unicode_word_utf8 (const char *str); +typedef gunichar notmuch_unichar; + +#endif |