/* ########################################################### */ /* This Software is licensed under the GPL licensed Version 2, */ /* please read http://www.gnu.org/copyleft/gpl.html */ /* ########################################################### */ /* ************************************ */ /* Various UTF-8 manipulation functions */ /* ************************************ */ #include #include #include #include #include #include #include #include "xmalloc.h" #include "utf8.h" /* ======================================================================== */ /* Unicode (UTF-8) ascii representation interpreter. */ /* The string passed will be altered but will not move in memory */ /* All sequence of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will be replace by */ /* the corresponding UTF-8 character. */ /* ======================================================================== */ void utf8_interpret(char * s, langinfo_t * langinfo) { char * utf8_str; /* \uxx... */ size_t utf8_to_eos_len; /* bytes in s starting from the first * * occurrence of \u */ size_t init_len; /* initial lengths of the string to interpret */ size_t utf8_ascii_len; /* 2,4,6 or 8 bytes */ size_t len_to_remove = 0; /* number of bytes to remove after the conversion */ char tmp[9]; /* temporary string */ /* Guard against the case where s is NULL */ /* """""""""""""""""""""""""""""""""""""" */ if (s == NULL) return; init_len = strlen(s); while ((utf8_str = strstr(s, "\\u")) != NULL) { utf8_to_eos_len = strlen(utf8_str); if (utf8_to_eos_len < 4) /* string too short to contain * * a valid UTF-8 char */ { *utf8_str = '.'; *(utf8_str + 1) = '\0'; } else /* s is long enough */ { unsigned byte; char * utf8_seq_offset = utf8_str + 2; /* Get the first 2 utf8 bytes */ *tmp = *utf8_seq_offset; *(tmp + 1) = *(utf8_seq_offset + 1); *(tmp + 2) = '\0'; /* If they are invalid, replace the \u sequence by a dot */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""" */ if (!isxdigit(tmp[0]) || !isxdigit(tmp[1])) { *utf8_str = '.'; if (4 >= utf8_to_eos_len) *(utf8_str + 1) = '\0'; else memmove(utf8_str, utf8_str + 4, utf8_to_eos_len - 4); return; } else { /* They are valid, deduce from them the length of the sequence */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ sscanf(tmp, "%2x", &byte); utf8_ascii_len = utf8_get_length(byte) * 2; /* Check again if the inputs string is long enough */ /* """"""""""""""""""""""""""""""""""""""""""""""" */ if (utf8_to_eos_len - 2 < utf8_ascii_len) { *utf8_str = '.'; *(utf8_str + 1) = '\0'; } else { /* replace the \u sequence by the bytes forming the UTF-8 char */ /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ size_t i; *tmp = byte; /* Put the bytes in the tmp string */ /* ''''''''''''''''''''''''''''''' */ if (langinfo->utf8) { for (i = 1; i < utf8_ascii_len / 2; i++) { sscanf(utf8_seq_offset + 2 * i, "%2x", &byte); *(tmp + i) = byte; } tmp[utf8_ascii_len / 2] = '\0'; } /* Does they form a valid UTF-8 char? */ /* '''''''''''''''''''''''''''''''''' */ if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2)) { /* Put them back in the original string and move */ /* the remaining bytes after them */ /* ''''''''''''''''''''''''''''''''''''''''''''' */ memmove(utf8_str, tmp, utf8_ascii_len / 2); if (utf8_to_eos_len < utf8_ascii_len) *(utf8_str + utf8_ascii_len / 2 + 1) = '\0'; else memmove(utf8_str + utf8_ascii_len / 2, utf8_seq_offset + utf8_ascii_len, utf8_to_eos_len - utf8_ascii_len - 2 + 1); } else { /* The invalid sequence is replaced by a dot */ /* ''''''''''''''''''''''''''''''''''''''''' */ *utf8_str = '.'; if (utf8_to_eos_len < utf8_ascii_len) *(utf8_str + 1) = '\0'; else memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len, utf8_to_eos_len - utf8_ascii_len - 2 + 1); utf8_ascii_len = 2; } } /* Update the number of bytes to remove at the end */ /* of the initial string */ /* """"""""""""""""""""""""""""""""""""""""""""""" */ len_to_remove += 2 + utf8_ascii_len / 2; } } } /* Make sure that the string is well terminated */ /* """""""""""""""""""""""""""""""""""""""""""" */ *(s + init_len - len_to_remove) = '\0'; return; } /* ========================================================= */ /* Decode the number of bytes taken by a character (UTF-8) */ /* It is the length of the leading sequence of bits set to 1 */ /* (Count Leading Ones) */ /* ========================================================= */ int utf8_get_length(unsigned char c) { if (c < 0x80) return 1; else if (c < 0xe0) return 2; else if (c < 0xf0) return 3; else return 4; } /* ================================================== */ /* Return the byte offset of the nth UTF-8 glyph in s */ /* ================================================== */ size_t utf8_offset(char * s, size_t n) { size_t i = 0; while (n > 0) { if (s[i++] & 0x80) { (void)(((s[++i] & 0xc0) != 0x80) || ((s[++i] & 0xc0) != 0x80) || ++i); } n--; } return i; } /* ============================================== */ /* Points to the previous UTF-8 glyph in a string */ /* from the given position */ /* ============================================== */ char * utf8_prev(const char * str, const char * p) { while ((*p & 0xc0) == 0x80) p--; for (--p; p >= str; --p) { if ((*p & 0xc0) != 0x80) return (char *)p; } return NULL; } /* ========================================== */ /* Points to the next UTF-8 glyph in a string */ /* from the current position */ /* ========================================== */ char * utf8_next(char * p) { if (*p) { for (++p; (*p & 0xc0) == 0x80; ++p) ; } return (*p == '\0' ? NULL : p); } /* ============================================================ */ /* Replace any UTF-8 glyph present in s by a dot in-place */ /* s will be modified but its address in memory will not change */ /* ============================================================ */ void utf8_sanitize(char * s) { char * p = s; int n; size_t len; len = strlen(s); while (*p) { n = utf8_get_length(*p); if (n > 1) { *p = '.'; memmove(p + 1, p + n, len - (p - s) - n + 1); len -= (n - 1); } p++; } } static const char trailing_bytes_for_utf8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; /* ================================================================== */ /* UTF-8 validation routine inspired by Jeff Bezanson */ /* placed in the public domain Fall 2005 */ /* (https://github.com/JeffBezanson/cutef8) */ /* */ /* Returns 1 if str contains a valid UTF-8 byte sequence, 0 otherwise */ /* ================================================================== */ int utf8_validate(const char * str, size_t length) { const unsigned char *p, *pend = (const unsigned char *)str + length; unsigned char c; size_t ab; for (p = (const unsigned char *)str; p < pend; p++) { c = *p; if (c < 128) continue; if ((c & 0xc0) != 0xc0) return 0; ab = trailing_bytes_for_utf8[c]; if (length < ab) return 0; length -= ab; p++; /* Check top bits in the second byte */ /* """"""""""""""""""""""""""""""""" */ if ((*p & 0xc0) != 0x80) return 0; /* Check for overlong sequences for each different length */ /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */ switch (ab) { /* Check for xx00 000x */ /* """"""""""""""""""" */ case 1: if ((c & 0x3e) == 0) return 0; continue; /* We know there aren't any more bytes to check */ /* Check for 1110 0000, xx0x xxxx */ /* """""""""""""""""""""""""""""" */ case 2: if (c == 0xe0 && (*p & 0x20) == 0) return 0; break; /* Check for 1111 0000, xx00 xxxx */ /* """""""""""""""""""""""""""""" */ case 3: if (c == 0xf0 && (*p & 0x30) == 0) return 0; break; /* Check for 1111 1000, xx00 0xxx */ /* """""""""""""""""""""""""""""" */ case 4: if (c == 0xf8 && (*p & 0x38) == 0) return 0; break; /* Check for leading 0xfe or 0xff, */ /* and then for 1111 1100, xx00 00xx */ /* """"""""""""""""""""""""""""""""" */ case 5: if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0)) return 0; break; } /* Check for valid bytes after the 2nd, if any; all must start 10 */ /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ while (--ab > 0) { if ((*(++p) & 0xc0) != 0x80) return 0; } } return 1; } /* ====================== */ /* Multibyte UTF-8 strlen */ /* ====================== */ size_t utf8_strlen(char * str) { size_t i = 0, j = 0; while (str[i]) { if ((str[i] & 0xc0) != 0x80) j++; i++; } return j; } /* =================================================================== */ /* Multibytes extraction of the prefix of n UTF-8 glyphs from a string */ /* The destination string d must have been allocated before. */ /* pos is updated to reflect the position AFTER the prefix. */ /* =================================================================== */ char * utf8_strprefix(char * d, char * s, long n, long * pos) { long i = 0; long j = 0; *pos = 0; while (s[i] && j < n) { d[i] = s[i]; i++; j++; while (s[i] && (s[i] & 0xC0) == 0x80) { d[i] = s[i]; i++; } } *pos = i; d[i] = '\0'; return d; } /* ================================================ */ /* Convert a UTF-8 glyph string to a wchar_t string */ /* ================================================ */ wchar_t * utf8_strtowcs(char * s) { int converted = 0; unsigned char * ch; wchar_t * wptr, *w; size_t size; size = (long)strlen(s); w = xmalloc((size + 1) * sizeof(wchar_t)); w[0] = L'\0'; wptr = w; for (ch = (unsigned char *)s; *ch; ch += converted) { if ((converted = mbtowc(wptr, (char *)ch, 4)) > 0) wptr++; else { *wptr++ = (wchar_t)*ch; converted = 1; } } *wptr = L'\0'; return w; } /* ============================================================== */ /* Fill dst whi a lowercase ocopy of src whar the character is an */ /* ascci one. dsk must be preallocated before the call. */ /* ============================================================== */ void utf8_strtolower(char * dst, char * src) { unsigned char c; while ((c = *src)) { if (c >= 0x80) *dst = c; else *dst = tolower(c); src++; dst++; } *dst = '\0'; }