diff options
author | pgen <p.gen.progs@gmail.com> | 2024-02-06 00:00:25 +0100 |
---|---|---|
committer | pgen <p.gen.progs@gmail.com> | 2024-02-06 00:00:46 +0100 |
commit | 85d783c4e669e14adccc5c009ca71180c630458c (patch) | |
tree | 6139a42b7f785c8d40782d8e46346e8969689d54 | |
parent | 482b39a82661f9cf5d1ff2e9d6457d156b177f2f (diff) |
Take UTF-8 spaces into account for empty strings
-rwxr-xr-x | smenu.c | 188 | ||||
-rwxr-xr-x | smenu.h | 6 |
2 files changed, 179 insertions, 15 deletions
@@ -83,6 +83,7 @@ int marked = -1; /* Index of the marked word or -1. */ char *word_buffer; int (*my_isprint)(int); +int (*my_isempty)(const unsigned char *); /* UTF-8 useful symbols. */ /* """"""""""""""""""""" */ @@ -1692,20 +1693,164 @@ read: /* ====================================================== */ /* Returns 1 if a string is empty or only made of spaces. */ -/* TODO: take all UTF-8 spaces into account. */ +/* Non UTF-8 version. */ /* ====================================================== */ int -isempty(const char *s) +isempty_non_utf8(const unsigned char *s) { while (*s != '\0') { - if (my_isprint(*s) && *s != ' ' && *s != '\t') + if (*s != ' ' && *s != '\t') return 0; s++; } return 1; } +/* ====================================================== */ +/* Returns 1 if a string is empty or only made of spaces. */ +/* UTF-8 version. */ +/* ====================================================== */ +int +isempty_utf8(const unsigned char *s) +{ + unsigned char c, d; + + while (*s != '\0') + { + if (*s == ' ' || *s == '\t') /* Normal ASCII spaces. */ + goto next; + + if (*s < 0xc2) /* Not an UTF-8 space -> return FALSE. */ + return 0; + + /* Scanning for a potential non UTF-8 spaces scanning. */ + /* """"""""""""""""""""""""""""""""""""""""""""""""""" */ + if ((c = *(s + 1)) != '\0') + { + if (*s == 0xc2 && (c == 0x85 || c == 0xa0)) + { + s++; + goto next; /* Unnamed control character or NO-BREAK SPACE. */ + } + + if ((d = *(s + 2)) == '\0') + return 0; + + if (*s == 0xe1 && c == 0x9a && d == 0x80) + { + s += 2; + goto next; /* OGHAM SPACE MARK. */ + } + + if (*s == 0xe1 && c == 0xa0 && d == 0x8e) + { + s += 2; + goto next; /* MONGOLIAN VOWEL SEPARATOR. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x80) + { + s += 2; + goto next; /* EN QUAD. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x81) + { + s += 2; + goto next; /* EM QUAD. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x82) + { + s += 2; + goto next; /* EN SPACE. */ + } + if (*s == 0xe2 && c == 0x80 && d == 0x83) + { + s += 2; + goto next; /* EM SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x84) + { + s += 2; + goto next; /* THREE-PER-EM SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x85) + { + s += 2; + goto next; /* FOUR-PER-EM SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x86) + { + s += 2; + goto next; /* SIX-PER-EM SPACE. */ + } + if (*s == 0xe2 && c == 0x80 && d == 0x87) + { + s += 2; + goto next; /* FIGURE SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x88) + { + s += 2; + goto next; /* PUNCTUATION SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x89) + { + s += 2; + goto next; /* THIN SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0x8a) + { + s += 2; + goto next; /* HAIR SPACE. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0xa8) + { + s += 2; + goto next; /* LINE SEPARATOR. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0xa9) + { + s += 2; + goto next; /* PARAGRAPH SEPARATOR. */ + } + + if (*s == 0xe2 && c == 0x80 && d == 0xaf) + { + s += 2; + goto next; /* NARROW NO-BREAK SPACE. */ + } + + if (*s == 0xe2 && c == 0x81 && d == 0x9f) + { + s += 2; + goto next; /* MEDIUM MATHEMATICAL SPACE. */ + } + + if (*s == 0xe3 && c == 0x80 && d == 0x80) + { + s += 2; + goto next; /* IDEOGRAPHIC SPACE. */ + } + + return 0; + } + next: + s++; + } + return 1; +} + /* ======================================================================== */ /* Parse a regular expression based selector. */ /* The string to parse is bounded by a delimiter so we must parse something */ @@ -3550,16 +3695,19 @@ get_word(FILE *input, && ll_find(line_delims_list, utf8_buffer, buffer_cmp) == NULL) byte = get_bytes(input, utf8_buffer, zapped_glyphs_list, langinfo, misc); - if (langinfo->utf8 && utf8_get_length(utf8_buffer[0]) > 1) + if (byte != EOF) { - size_t pos; + if (langinfo->utf8 && utf8_get_length(utf8_buffer[0]) > 1) + { + size_t pos; - pos = strlen(utf8_buffer); - while (pos > 0) - my_ungetc(utf8_buffer[--pos], input); + pos = strlen(utf8_buffer); + while (pos > 0) + my_ungetc(utf8_buffer[--pos], input); + } + else + my_ungetc(byte, input); } - else - my_ungetc(byte, input); } /* Mark it as the last word of a record if its sequence matches a */ @@ -8833,14 +8981,24 @@ main(int argc, char *argv[]) else langinfo.utf8 = 0; - /* my_isprint is a function pointer that points to the 7 or 8-bit */ - /* version of isprint according to the content of UTF-8. */ - /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + /* my_isprint is a function pointers that points to */ + /* the 7 or 8-bit version of isprint according to the */ + /* current locale. */ + /* """""""""""""""""""""""""""""""""""""""""""""""""" */ if (langinfo.utf8 || langinfo.bits == 8) my_isprint = isprint8; else my_isprint = isprint7; + /* my_isempty is a function pointers that points to */ + /* the utf8 or non_utf8 version of isprint according to the */ + /* current locale. */ + /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */ + if (langinfo.utf8) + my_isempty = isempty_utf8; + else + my_isempty = isempty_non_utf8; + /* Set terminal in noncanonical, noecho mode and */ /* if TERM is unset or unknown, vt100 is assumed. */ /* """""""""""""""""""""""""""""""""""""""""""""" */ @@ -11322,7 +11480,7 @@ main(int argc, char *argv[]) long *word_pos = xmalloc(sizeof(long)); int may_number; - if (!isempty(word->str)) + if (!my_isempty(word->str)) { *word_pos = wi; @@ -11774,7 +11932,7 @@ main(int argc, char *argv[]) else len = 0; - if (!isempty(word_a[wi + offset].str + len)) + if (!my_isempty(word_a[wi + offset].str + len)) break; /* Keep non selectable empty words to allow special effects. */ @@ -822,3 +822,9 @@ get_clicked_index(win_t *win, void align_word(word_t *word, alignment_t alignment, size_t prerfix, char sp); + +int +isempty_non_utf8(const unsigned char * s); + +int +isempty_utf8(const unsigned char * s); |