summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpgen <p.gen.progs@gmail.com>2024-02-06 00:00:25 +0100
committerpgen <p.gen.progs@gmail.com>2024-02-06 00:00:46 +0100
commit85d783c4e669e14adccc5c009ca71180c630458c (patch)
tree6139a42b7f785c8d40782d8e46346e8969689d54
parent482b39a82661f9cf5d1ff2e9d6457d156b177f2f (diff)
Take UTF-8 spaces into account for empty strings
-rwxr-xr-xsmenu.c188
-rwxr-xr-xsmenu.h6
2 files changed, 179 insertions, 15 deletions
diff --git a/smenu.c b/smenu.c
index 4a08024..3266f8e 100755
--- a/smenu.c
+++ b/smenu.c
@@ -83,6 +83,7 @@ int marked = -1; /* Index of the marked word or -1. */
char *word_buffer;
int (*my_isprint)(int);
+int (*my_isempty)(const unsigned char *);
/* UTF-8 useful symbols. */
/* """"""""""""""""""""" */
@@ -1692,20 +1693,164 @@ read:
/* ====================================================== */
/* Returns 1 if a string is empty or only made of spaces. */
-/* TODO: take all UTF-8 spaces into account. */
+/* Non UTF-8 version. */
/* ====================================================== */
int
-isempty(const char *s)
+isempty_non_utf8(const unsigned char *s)
{
while (*s != '\0')
{
- if (my_isprint(*s) && *s != ' ' && *s != '\t')
+ if (*s != ' ' && *s != '\t')
return 0;
s++;
}
return 1;
}
+/* ====================================================== */
+/* Returns 1 if a string is empty or only made of spaces. */
+/* UTF-8 version. */
+/* ====================================================== */
+int
+isempty_utf8(const unsigned char *s)
+{
+ unsigned char c, d;
+
+ while (*s != '\0')
+ {
+ if (*s == ' ' || *s == '\t') /* Normal ASCII spaces. */
+ goto next;
+
+ if (*s < 0xc2) /* Not an UTF-8 space -> return FALSE. */
+ return 0;
+
+ /* Scanning for a potential non UTF-8 spaces scanning. */
+ /* """"""""""""""""""""""""""""""""""""""""""""""""""" */
+ if ((c = *(s + 1)) != '\0')
+ {
+ if (*s == 0xc2 && (c == 0x85 || c == 0xa0))
+ {
+ s++;
+ goto next; /* Unnamed control character or NO-BREAK SPACE. */
+ }
+
+ if ((d = *(s + 2)) == '\0')
+ return 0;
+
+ if (*s == 0xe1 && c == 0x9a && d == 0x80)
+ {
+ s += 2;
+ goto next; /* OGHAM SPACE MARK. */
+ }
+
+ if (*s == 0xe1 && c == 0xa0 && d == 0x8e)
+ {
+ s += 2;
+ goto next; /* MONGOLIAN VOWEL SEPARATOR. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x80)
+ {
+ s += 2;
+ goto next; /* EN QUAD. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x81)
+ {
+ s += 2;
+ goto next; /* EM QUAD. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x82)
+ {
+ s += 2;
+ goto next; /* EN SPACE. */
+ }
+ if (*s == 0xe2 && c == 0x80 && d == 0x83)
+ {
+ s += 2;
+ goto next; /* EM SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x84)
+ {
+ s += 2;
+ goto next; /* THREE-PER-EM SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x85)
+ {
+ s += 2;
+ goto next; /* FOUR-PER-EM SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x86)
+ {
+ s += 2;
+ goto next; /* SIX-PER-EM SPACE. */
+ }
+ if (*s == 0xe2 && c == 0x80 && d == 0x87)
+ {
+ s += 2;
+ goto next; /* FIGURE SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x88)
+ {
+ s += 2;
+ goto next; /* PUNCTUATION SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x89)
+ {
+ s += 2;
+ goto next; /* THIN SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0x8a)
+ {
+ s += 2;
+ goto next; /* HAIR SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0xa8)
+ {
+ s += 2;
+ goto next; /* LINE SEPARATOR. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0xa9)
+ {
+ s += 2;
+ goto next; /* PARAGRAPH SEPARATOR. */
+ }
+
+ if (*s == 0xe2 && c == 0x80 && d == 0xaf)
+ {
+ s += 2;
+ goto next; /* NARROW NO-BREAK SPACE. */
+ }
+
+ if (*s == 0xe2 && c == 0x81 && d == 0x9f)
+ {
+ s += 2;
+ goto next; /* MEDIUM MATHEMATICAL SPACE. */
+ }
+
+ if (*s == 0xe3 && c == 0x80 && d == 0x80)
+ {
+ s += 2;
+ goto next; /* IDEOGRAPHIC SPACE. */
+ }
+
+ return 0;
+ }
+ next:
+ s++;
+ }
+ return 1;
+}
+
/* ======================================================================== */
/* Parse a regular expression based selector. */
/* The string to parse is bounded by a delimiter so we must parse something */
@@ -3550,16 +3695,19 @@ get_word(FILE *input,
&& ll_find(line_delims_list, utf8_buffer, buffer_cmp) == NULL)
byte = get_bytes(input, utf8_buffer, zapped_glyphs_list, langinfo, misc);
- if (langinfo->utf8 && utf8_get_length(utf8_buffer[0]) > 1)
+ if (byte != EOF)
{
- size_t pos;
+ if (langinfo->utf8 && utf8_get_length(utf8_buffer[0]) > 1)
+ {
+ size_t pos;
- pos = strlen(utf8_buffer);
- while (pos > 0)
- my_ungetc(utf8_buffer[--pos], input);
+ pos = strlen(utf8_buffer);
+ while (pos > 0)
+ my_ungetc(utf8_buffer[--pos], input);
+ }
+ else
+ my_ungetc(byte, input);
}
- else
- my_ungetc(byte, input);
}
/* Mark it as the last word of a record if its sequence matches a */
@@ -8833,14 +8981,24 @@ main(int argc, char *argv[])
else
langinfo.utf8 = 0;
- /* my_isprint is a function pointer that points to the 7 or 8-bit */
- /* version of isprint according to the content of UTF-8. */
- /* """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+ /* my_isprint is a function pointers that points to */
+ /* the 7 or 8-bit version of isprint according to the */
+ /* current locale. */
+ /* """""""""""""""""""""""""""""""""""""""""""""""""" */
if (langinfo.utf8 || langinfo.bits == 8)
my_isprint = isprint8;
else
my_isprint = isprint7;
+ /* my_isempty is a function pointers that points to */
+ /* the utf8 or non_utf8 version of isprint according to the */
+ /* current locale. */
+ /* """""""""""""""""""""""""""""""""""""""""""""""""""""""" */
+ if (langinfo.utf8)
+ my_isempty = isempty_utf8;
+ else
+ my_isempty = isempty_non_utf8;
+
/* Set terminal in noncanonical, noecho mode and */
/* if TERM is unset or unknown, vt100 is assumed. */
/* """""""""""""""""""""""""""""""""""""""""""""" */
@@ -11322,7 +11480,7 @@ main(int argc, char *argv[])
long *word_pos = xmalloc(sizeof(long));
int may_number;
- if (!isempty(word->str))
+ if (!my_isempty(word->str))
{
*word_pos = wi;
@@ -11774,7 +11932,7 @@ main(int argc, char *argv[])
else
len = 0;
- if (!isempty(word_a[wi + offset].str + len))
+ if (!my_isempty(word_a[wi + offset].str + len))
break;
/* Keep non selectable empty words to allow special effects. */
diff --git a/smenu.h b/smenu.h
index 9994b54..c242ab1 100755
--- a/smenu.h
+++ b/smenu.h
@@ -822,3 +822,9 @@ get_clicked_index(win_t *win,
void
align_word(word_t *word, alignment_t alignment, size_t prerfix, char sp);
+
+int
+isempty_non_utf8(const unsigned char * s);
+
+int
+isempty_utf8(const unsigned char * s);