diff options
author | pgen <p.gen.progs@gmail.com> | 2018-09-27 23:16:48 +0200 |
---|---|---|
committer | pgen <p.gen.progs@gmail.com> | 2018-10-01 19:36:56 +0200 |
commit | f07e9d15916ac14721c9819c6d245a28f685f42b (patch) | |
tree | 20bf5c45117afcfeacf0e4f66d19ccf68c9e3d4f /utf8.c | |
parent | 1b52c4c75422742f04363048dda64999adb8402c (diff) |
Create utf8.[ch] and change code accordingly
Change multibyte to utf8 to specify that we only support UTF-8 encoding.
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 437 |
1 files changed, 437 insertions, 0 deletions
@@ -0,0 +1,437 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> +#include <langinfo.h> +#include "xmalloc.h" +#include "utf8.h" + +/* ======================================================================== */ +/* Unicode (UTF-8) ascii representation interpreter. */ +/* The string passed will be altered but will not move in memory */ +/* All sequence of \uxx, \uxxxx, \uxxxxxx and \uxxxxxxxx will be replace by */ +/* the corresponding UTF-8 character. */ +/* ======================================================================== */ +void +utf8_interpret(char * s, langinfo_t * langinfo) +{ + char * utf8_str; /* \uxx... */ + size_t utf8_to_eos_len; /* bytes in s starting from the first * + * occurrence of \u */ + size_t init_len; /* initial lengths of the string to interpret */ + size_t utf8_ascii_len; /* 2,4,6 or 8 bytes */ + size_t len_to_remove = 0; /* number of bytes to remove after the conversion */ + char tmp[9]; /* temporary string */ + + /* Guard against the case where s is NULL */ + /* """""""""""""""""""""""""""""""""""""" */ + if (s == NULL) + return; + + init_len = strlen(s); + + while ((utf8_str = strstr(s, "\\u")) != NULL) + { + utf8_to_eos_len = strlen(utf8_str); + if (utf8_to_eos_len < 4) /* string too short to contain * + * a valid UTF-8 char */ + { + *utf8_str = '.'; + *(utf8_str + 1) = '\0'; + } + else /* s is long enough */ + { + unsigned byte; + char * utf8_seq_offset = utf8_str + 2; + + /* Get the first 2 utf8 bytes */ + *tmp = *utf8_seq_offset; + *(tmp + 1) = *(utf8_seq_offset + 1); + *(tmp + 2) = '\0'; + + /* If they are invalid, replace the \u sequence by a dot */ + /* """"""""""""""""""""""""""""""""""""""""""""""""""""" */ + if (!isxdigit(tmp[0]) || !isxdigit(tmp[1])) + { + *utf8_str = '.'; + if (4 >= utf8_to_eos_len) + *(utf8_str + 1) = '\0'; + else + memmove(utf8_str, utf8_str + 4, utf8_to_eos_len - 4); + return; + } + else + { + /* They are valid, deduce from them the length of the sequence */ + /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + sscanf(tmp, "%2x", &byte); + utf8_ascii_len = utf8_get_length(byte) * 2; + + /* Check again if the inputs string is long enough */ + /* """"""""""""""""""""""""""""""""""""""""""""""" */ + if (utf8_to_eos_len - 2 < utf8_ascii_len) + { + *utf8_str = '.'; + *(utf8_str + 1) = '\0'; + } + else + { + /* replace the \u sequence by the bytes forming the UTF-8 char */ + /* """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + size_t i; + *tmp = byte; + + /* Put the bytes in the tmp string */ + /* ''''''''''''''''''''''''''''''' */ + if (langinfo->utf8) + { + for (i = 1; i < utf8_ascii_len / 2; i++) + { + sscanf(utf8_seq_offset + 2 * i, "%2x", &byte); + *(tmp + i) = byte; + } + tmp[utf8_ascii_len / 2] = '\0'; + } + + /* Does they form a valid UTF-8 char? */ + /* '''''''''''''''''''''''''''''''''' */ + if (langinfo->utf8 && utf8_validate(tmp, utf8_ascii_len / 2)) + { + /* Put them back in the original string and move */ + /* the remaining bytes after them */ + /* ''''''''''''''''''''''''''''''''''''''''''''' */ + memmove(utf8_str, tmp, utf8_ascii_len / 2); + + if (utf8_to_eos_len < utf8_ascii_len) + *(utf8_str + utf8_ascii_len / 2 + 1) = '\0'; + else + memmove(utf8_str + utf8_ascii_len / 2, + utf8_seq_offset + utf8_ascii_len, + utf8_to_eos_len - utf8_ascii_len - 2 + 1); + } + else + { + /* The invalid sequence is replaced by a dot */ + /* ''''''''''''''''''''''''''''''''''''''''' */ + *utf8_str = '.'; + if (utf8_to_eos_len < utf8_ascii_len) + *(utf8_str + 1) = '\0'; + else + memmove(utf8_str + 1, utf8_seq_offset + utf8_ascii_len, + utf8_to_eos_len - utf8_ascii_len - 2 + 1); + utf8_ascii_len = 2; + } + } + + /* Update the number of bytes to remove at the end */ + /* of the initial string */ + /* """"""""""""""""""""""""""""""""""""""""""""""" */ + len_to_remove += 2 + utf8_ascii_len / 2; + } + } + } + + /* Make sure that the string is well terminated */ + /* """""""""""""""""""""""""""""""""""""""""""" */ + *(s + init_len - len_to_remove) = '\0'; + + return; +} + +/* ========================================================= */ +/* Decode the number of bytes taken by a character (UTF-8) */ +/* It is the length of the leading sequence of bits set to 1 */ +/* (Count Leading Ones) */ +/* ========================================================= */ +int +utf8_get_length(unsigned char c) +{ + if (c >= 0xf0) + return 4; + else if (c >= 0xe0) + return 3; + else if (c >= 0xc2) + return 2; + else + return 1; +} + +/* ================================================== */ +/* Return the byte offset of the nth UTF-8 glyph in s */ +/* ================================================== */ +size_t +utf8_offset(char * s, size_t n) +{ + size_t i = 0; + + while (n > 0) + { + if (s[i++] & 0x80) + { + (void)(((s[++i] & 0xc0) != 0x80) || ((s[++i] & 0xc0) != 0x80) || ++i); + } + n--; + } + return i; +} + +/* ============================================== */ +/* Points to the previous UTF-8 glyph in a string */ +/* from the given position */ +/* ============================================== */ +char * +utf8_prev(const char * str, const char * p) +{ + while ((*p & 0xc0) == 0x80) + p--; + + for (--p; p >= str; --p) + { + if ((*p & 0xc0) != 0x80) + return (char *)p; + } + return NULL; +} + +/* ========================================== */ +/* Points to the next UTF-8 glyph in a string */ +/* from the current position */ +/* ========================================== */ +char * +utf8_next(char * p) +{ + if (*p) + { + for (++p; (*p & 0xc0) == 0x80; ++p) + ; + } + return (*p == '\0' ? NULL : p); +} + +/* ============================================================ */ +/* Replace any UTF-8 glyph present in s by a dot in-place */ +/* s will be modified but its address in memory will not change */ +/* ============================================================ */ +void +utf8_sanitize(char * s) +{ + char * p = s; + int n; + size_t len; + + len = strlen(s); + while (*p) + { + n = utf8_get_length(*p); + if (n > 1) + { + *p = '.'; + memmove(p + 1, p + n, len - (p - s) - n + 1); + len -= (n - 1); + } + p++; + } +} + +static const char trailing_bytes_for_utf8[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 +}; + +/* ================================================================== */ +/* UTF-8 validation routine inspired by Jeff Bezanson */ +/* placed in the public domain Fall 2005 */ +/* (https://github.com/JeffBezanson/cutef8) */ +/* */ +/* Returns 1 if str contains a valid UTF-8 byte sequence, 0 otherwise */ +/* ================================================================== */ +int +utf8_validate(const char * str, size_t length) +{ + const unsigned char *p, *pend = (const unsigned char *)str + length; + unsigned char c; + size_t ab; + + for (p = (const unsigned char *)str; p < pend; p++) + { + c = *p; + if (c < 128) + continue; + if ((c & 0xc0) != 0xc0) + return 0; + ab = trailing_bytes_for_utf8[c]; + if (length < ab) + return 0; + length -= ab; + + p++; + /* Check top bits in the second byte */ + /* """"""""""""""""""""""""""""""""" */ + if ((*p & 0xc0) != 0x80) + return 0; + + /* Check for overlong sequences for each different length */ + /* """""""""""""""""""""""""""""""""""""""""""""""""""""" */ + switch (ab) + { + /* Check for xx00 000x */ + /* """"""""""""""""""" */ + case 1: + if ((c & 0x3e) == 0) + return 0; + continue; /* We know there aren't any more bytes to check */ + + /* Check for 1110 0000, xx0x xxxx */ + /* """""""""""""""""""""""""""""" */ + case 2: + if (c == 0xe0 && (*p & 0x20) == 0) + return 0; + break; + + /* Check for 1111 0000, xx00 xxxx */ + /* """""""""""""""""""""""""""""" */ + case 3: + if (c == 0xf0 && (*p & 0x30) == 0) + return 0; + break; + + /* Check for 1111 1000, xx00 0xxx */ + /* """""""""""""""""""""""""""""" */ + case 4: + if (c == 0xf8 && (*p & 0x38) == 0) + return 0; + break; + + /* Check for leading 0xfe or 0xff, */ + /* and then for 1111 1100, xx00 00xx */ + /* """"""""""""""""""""""""""""""""" */ + case 5: + if (c == 0xfe || c == 0xff || (c == 0xfc && (*p & 0x3c) == 0)) + return 0; + break; + } + + /* Check for valid bytes after the 2nd, if any; all must start 10 */ + /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + while (--ab > 0) + { + if ((*(++p) & 0xc0) != 0x80) + return 0; + } + } + + return 1; +} + +/* ====================== */ +/* Multibyte UTF-8 strlen */ +/* ====================== */ +size_t +utf8_strlen(char * str) +{ + size_t i = 0, j = 0; + + while (str[i]) + { + if ((str[i] & 0xc0) != 0x80) + j++; + i++; + } + return j; +} + +/* =================================================================== */ +/* Multibytes extraction of the prefix of n UTF-8 glyphs from a string */ +/* The destination string d must have been allocated before. */ +/* pos is updated to reflect the position AFTER the prefix. */ +/* =================================================================== */ +char * +utf8_strprefix(char * d, char * s, long n, long * pos) +{ + long i = 0; + long j = 0; + + *pos = 0; + + while (s[i] && j < n) + { + d[i] = s[i]; + i++; + j++; + while (s[i] && (s[i] & 0xC0) == 0x80) + { + d[i] = s[i]; + i++; + } + } + + *pos = i; + + d[i] = '\0'; + + return d; +} + +/* ================================================ */ +/* Convert a UTF-8 glyph string to a wchar_t string */ +/* ================================================ */ +wchar_t * +utf8_strtowcs(char * s) +{ + int converted = 0; + unsigned char * ch; + wchar_t * wptr, *w; + size_t size; + + size = (long)strlen(s); + w = xmalloc((size + 1) * sizeof(wchar_t)); + w[0] = L'\0'; + + wptr = w; + for (ch = (unsigned char *)s; *ch; ch += converted) + { + if ((converted = mbtowc(wptr, (char *)ch, 4)) > 0) + wptr++; + else + { + *wptr++ = (wchar_t)*ch; + converted = 1; + } + } + + *wptr = L'\0'; + + return w; +} + +/* ============================================================== */ +/* Fill dst whi a lowercase ocopy of src whar the character is an */ +/* ascci one. dsk must be preallocated before the call. */ +/* ============================================================== */ +void +utf8_strtolower(char * dst, char * src) +{ + unsigned char c; + + while ((c = *src)) + { + if (c >= 0x80) + *dst = c; + else + *dst = tolower(c); + + src++; + dst++; + } + *dst = '\0'; +} |