diff options
author | pgen <p.gen.progs@gmail.com> | 2020-08-01 23:41:39 +0200 |
---|---|---|
committer | pgen <p.gen.progs@gmail.com> | 2020-08-01 23:55:21 +0200 |
commit | c78abaf23302b7ed66665a9239f05c7df32295be (patch) | |
tree | 1d1a70be91f875a92f58b3e30099b0b559c39b0c /utf8.c | |
parent | 0a4bb0c1b71ad59d0685529e9df7ff26ea7b9662 (diff) |
Allow to enter unicode UCS-4 codepoints using \U
\U must be followed by exactly 6 hexadecimal digits with leading zeros
if necessary.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 112 |
1 files changed, 111 insertions, 1 deletions
@@ -17,11 +17,61 @@ #include "xmalloc.h" #include "utf8.h" +/* =========================================================== */ +/* UTF-8 byte sequence generation from a given UCS-4 codepoint */ +/* utf8_str must be preallocated with a size of at least 5 */ +/* bytes. */ +/* return the length of the generated sequence or 0 if c is */ +/* not a valid codepoint. */ +/* =========================================================== */ +int +cptoutf8(char * utf8_str, uint32_t c) +{ + int len = 0; + int first; + int i; + + if (c < 0x80) + { + first = 0; + len = 1; + } + else if (c < 0x800) + { + first = 0xc0; + len = 2; + } + else if (c < 0x10000) + { + first = 0xe0; + len = 3; + } + else if (c < 0x200000) + { + first = 0xf0; + len = 4; + } + + if (utf8_str) + { + for (i = len - 1; i > 0; --i) + { + utf8_str[i] = (c & 0x3f) | 0x80; + c >>= 6; + } + utf8_str[0] = c | first; + } + + return len; +} + /* ======================================================================= */ /* Unicode (UTF-8) ascii representation interpreter. */ /* The string passed will be altered but its address will not change. */ /* All hexadecimal sequences of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will */ -/* be replace by the corresponding UTF-8 character when possible. */ +/* be replaced by the corresponding UTF-8 character when possible. */ +/* All hexadecimal sequences of \Uxxxxx will be replaced with the UTF-8 */ +/* sequence corresponding to the given UCS-4 codepoint. */ /* When not possible the substitution character is substituted in place. */ /* Returns 0 if the conversion has faild else 1. */ /* ======================================================================= */ @@ -45,6 +95,66 @@ utf8_interpret(char * s, langinfo_t * langinfo, char substitute) init_len = strlen(s); + /* Manage \U codepoints. */ + /* """"""""""""""""""""" */ + while ((utf8_str = strstr(s, "\\U")) != NULL) + { + char str[7]; + int utf8_str_len; + int len; + int n; + uint32_t cp; + int subst; /* 0, the \U sequance is valid, else 1. */ + + utf8_to_eos_len = strlen(utf8_str); + utf8_str_len = 0; + + n = sscanf(utf8_str + 2, + "%6[" + "0123456789" + "abcdef" + "ABCDEF" + "]%n", + tmp, &utf8_str_len); + + subst = 0; + + if (n == 1 && utf8_str_len == 6) + { + sscanf(tmp, "%x", &cp); + if (cp > 0x10FFFF) + subst = 1; /* Invalid range. */ + else + { + len = cptoutf8(str, cp); + str[len] = '\0'; + *(utf8_str + 1) = 'u'; + memmove(utf8_str, str, len); + memmove(utf8_str + len, utf8_str + 8, utf8_to_eos_len - 8); + len_to_remove += 8 - len; + } + } + else + subst = 1; /* Invalid sequence. */ + + /* In case of invalid \U sequence, replace it with the */ + /* substitution character. */ + /* ''''''''''''''''''''''''''''''''''''''''''''''''''' */ + if (subst) + { + *utf8_str = substitute; + memmove(utf8_str + 1, utf8_str + 2 + utf8_str_len, + utf8_to_eos_len - (utf8_str_len + 2 - 1)); + len_to_remove += utf8_str_len + 2 - 1; + } + } + + /* Make sure that the string is well terminated */ + /* """""""""""""""""""""""""""""""""""""""""""" */ + *(s + init_len - len_to_remove) = '\0'; + + /* Manage \u UTF-8 byte sequences. */ + /* """"""""""""""""""""""""""""""" */ while ((utf8_str = strstr(s, "\\u")) != NULL) { utf8_to_eos_len = strlen(utf8_str); |