diff options
-rw-r--r-- | src/mbyte.c | 9 | ||||
-rw-r--r-- | src/proto/mbyte.pro | 1 | ||||
-rw-r--r-- | src/regexp.c | 11 | ||||
-rw-r--r-- | src/regexp_bt.c | 8 | ||||
-rw-r--r-- | src/regexp_nfa.c | 9 | ||||
-rw-r--r-- | src/testdir/test_regexp_utf8.vim | 28 | ||||
-rw-r--r-- | src/version.c | 2 | ||||
-rw-r--r-- | src/vim.h | 1 |
8 files changed, 66 insertions, 3 deletions
diff --git a/src/mbyte.c b/src/mbyte.c index d6fb7ecc76..3be75099f1 100644 --- a/src/mbyte.c +++ b/src/mbyte.c @@ -3801,6 +3801,15 @@ utf_strnicmp( * two characters otherwise. */ int +mb_strnicmp2(char_u *s1, char_u *s2, int n1, int n2) +{ + if (n1 == n2 || !enc_utf8) + return mb_strnicmp(s1, s2, n1); + else + return utf_strnicmp(s1, s2, n1, n2); +} + + int mb_strnicmp(char_u *s1, char_u *s2, size_t nn) { int i, l; diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro index 7883b3b4c7..c49f7e7072 100644 --- a/src/proto/mbyte.pro +++ b/src/proto/mbyte.pro @@ -48,6 +48,7 @@ int utf_islower(int a); int utf_tolower(int a); int utf_isupper(int a); int mb_strnicmp(char_u *s1, char_u *s2, size_t nn); +int mb_strnicmp2(char_u *s1, char_u *s2, int n1, int n2); void show_utf8(void); int latin_head_off(char_u *base, char_u *p); int dbcs_screen_head_off(char_u *base, char_u *p); diff --git a/src/regexp.c b/src/regexp.c index 4373ae0cfa..4e85ebc29e 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -1606,7 +1606,9 @@ mb_decompose(int c, int *c1, int *c2, int *c3) /* * Compare two strings, ignore case if rex.reg_ic set. * Return 0 if strings match, non-zero otherwise. - * Correct the length "*n" when composing characters are ignored. + * Correct the length "*n" when composing characters are ignored + * or for utf8 when both utf codepoints are considered equal because of + * case-folding but have different length (e.g. 's' and 'ſ') */ static int cstrncmp(char_u *s1, char_u *s2, int *n) @@ -1615,6 +1617,13 @@ cstrncmp(char_u *s1, char_u *s2, int *n) if (!rex.reg_ic) result = STRNCMP(s1, s2, *n); + else if (enc_utf8) + { + int l2 = mb_ptr2len(s2); + result = MB_STRNICMP2(s1, s2, *n, l2); + if (result == 0 && l2 < *n) + *n = l2; + } else result = MB_STRNICMP(s1, s2, *n); diff --git a/src/regexp_bt.c b/src/regexp_bt.c index 5d9450d871..2a03fec579 100644 --- a/src/regexp_bt.c +++ b/src/regexp_bt.c @@ -3816,6 +3816,14 @@ regmatch( } } } + else if (enc_utf8) + { + if (cstrncmp(opnd, rex.input, &len) != 0) + { + status = RA_NOMATCH; + break; + } + } else for (i = 0; i < len; ++i) if (opnd[i] != rex.input[i]) diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c index 5e4fadd028..451720a09d 100644 --- a/src/regexp_nfa.c +++ b/src/regexp_nfa.c @@ -5666,7 +5666,12 @@ find_match_text(colnr_T *startcol, int regstart, char_u *match_text) for (;;) { match = TRUE; - len2 = MB_CHAR2LEN(regstart); // skip regstart + // skip regstart + len2 = MB_CHAR2LEN(regstart); + if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2) + // because of case-folding of the previously matched text, we may need + // to skip fewer bytes than mb_char2len(regstart) + len2 = mb_char2len(utf_fold(regstart)); for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1)) { c1 = PTR2CHAR(match_text + len1); @@ -7503,7 +7508,7 @@ nfa_regexec_both( // If match_text is set it contains the full text that must match. // Nothing else to try. Doesn't handle combining chars well. - if (prog->match_text != NULL && !rex.reg_icombine) + if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) { retval = find_match_text(&col, prog->regstart, prog->match_text); if (REG_MULTI) diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim index 6669dee57e..9980e5b7f5 100644 --- a/src/testdir/test_regexp_utf8.vim +++ b/src/testdir/test_regexp_utf8.vim @@ -587,4 +587,32 @@ func Test_combining_chars_in_collection() bw! endfunc +func Test_search_multibyte_match_ascii() + new + " Match single 'ſ' and 's' + call setline(1, 'das abc heraus abc ſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re) + endfor + " Match several 'ſſ' and 'ss' + call setline(1, 'das abc herauss abc ſſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + + call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re) + call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re) + endfor + bw! +endfunc + " vim: shiftwidth=2 sts=2 expandtab diff --git a/src/version.c b/src/version.c index 2c6e7d02fd..c63b141aa9 100644 --- a/src/version.c +++ b/src/version.c @@ -705,6 +705,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ /**/ + 296, +/**/ 295, /**/ 294, @@ -1751,6 +1751,7 @@ void *vim_memset(void *, int, size_t); # define MB_STRICMP(d, s) mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL) # define MB_STRNICMP(d, s, n) mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n)) +# define MB_STRNICMP2(d, s, n1, n2) mb_strnicmp2((char_u *)(d), (char_u *)(s), (int)(n1), (int)(n2)) #define STRCAT(d, s) strcat((char *)(d), (char *)(s)) #define STRNCAT(d, s, n) strncat((char *)(d), (char *)(s), (size_t)(n)) |