summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Christian Brabandt <cb@256bit.org>, 2024-01-04 22:54:08 +0100
committer: Christian Brabandt <cb@256bit.org>, 2024-01-04 22:54:08 +0100
commit: d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf (patch)
tree: caaf1ea4beed67081e63e5b82299d7078a5098df /src
parent: 81642d9d6ff5cd6a90a012b1b98632ce51eeb1a8 (diff)
patch 9.1.0011: regexp cannot match combining chars in collection (tag: v9.1.0011)
Problem:  regexp cannot match combining chars in collection
Solution: Check for combining characters in regex collections for the
          NFA and BT Regex Engine

Also, while at it, make debug mode work again.

fixes #10286
closes: #12871

Signed-off-by: Christian Brabandt <cb@256bit.org>
Diffstat (limited to 'src')
-rw-r--r-- src/regexp.c 10
-rw-r--r-- src/regexp.h 4
-rw-r--r-- src/regexp_bt.c 39
-rw-r--r-- src/regexp_nfa.c 111
-rw-r--r-- src/testdir/test_regexp_utf8.vim 11
-rw-r--r-- src/version.c 2
6 files changed, 166 insertions, 11 deletions
diff --git a/src/regexp.c b/src/regexp.c
index a64672856c..c3bc4966c7 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2686,7 +2686,10 @@ static regengine_T bt_regengine =
bt_regcomp,
bt_regfree,
bt_regexec_nl,
- bt_regexec_multi,
+ bt_regexec_multi
+#ifdef DEBUG
+ ,(char_u *)""
+#endif
};
#include "regexp_nfa.c"
@@ -2696,7 +2699,10 @@ static regengine_T nfa_regengine =
nfa_regcomp,
nfa_regfree,
nfa_regexec_nl,
- nfa_regexec_multi,
+ nfa_regexec_multi
+#ifdef DEBUG
+ ,(char_u *)""
+#endif
};
// Which regexp engine to use? Needed for vim_regcomp().
diff --git a/src/regexp.h b/src/regexp.h
index d6c8f48c7b..1ff2e1b6ef 100644
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -178,7 +178,9 @@ struct regengine
int (*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
// bt_regexec_mult or nfa_regexec_mult
long (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
- //char_u *expr;
+#ifdef DEBUG
+ char_u *expr;
+#endif
};
// Flags used by vim_regsub() and vim_regsub_both()
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 522cf37e2d..198946e0dc 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3743,13 +3743,38 @@ regmatch(
case ANYOF:
case ANYBUT:
- if (c == NUL)
- status = RA_NOMATCH;
- else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
- status = RA_NOMATCH;
- else
- ADVANCE_REGINPUT();
- break;
+ {
+ char_u *q = OPERAND(scan);
+
+ if (c == NUL)
+ status = RA_NOMATCH;
+ else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
+ status = RA_NOMATCH;
+ else
+ {
+ // Check following combining characters
+ int len = 0;
+ int i;
+
+ if (enc_utf8)
+ len = utfc_ptr2len(q) - utf_ptr2len(q);
+
+ MB_CPTR_ADV(rex.input);
+ MB_CPTR_ADV(q);
+
+ if (!enc_utf8 || len == 0)
+ break;
+
+ for (i = 0; i < len; ++i)
+ if (q[i] != rex.input[i])
+ {
+ status = RA_NOMATCH;
+ break;
+ }
+ rex.input += len;
+ }
+ break;
+ }
case MULTIBYTECODE:
if (has_mbyte)
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index d724d527b6..ff54348905 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ collection:
endp = skip_anyof(p);
if (*endp == ']')
{
+ int plen;
/*
* Try to reverse engineer character classes. For example,
* recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2033,13 +2034,43 @@ collection:
else
{
if (got_coll_char == TRUE && startc == 0)
+ {
EMIT(0x0a);
+ EMIT(NFA_CONCAT);
+ }
else
+ {
EMIT(startc);
- EMIT(NFA_CONCAT);
+ if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+ {
+ EMIT(NFA_CONCAT);
+ }
+ }
}
}
+ if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+ {
+ int i = utf_ptr2len(regparse);
+
+ c = utf_ptr2char(regparse + i);
+
+ // Add composing characters
+ for (;;)
+ {
+ if (c == 0)
+ // \x00 is translated to \x0a, start at \x01.
+ EMIT(1);
+ else
+ EMIT(c);
+ EMIT(NFA_CONCAT);
+ if ((i += utf_char2len(c)) >= plen)
+ break;
+ c = utf_ptr2char(regparse + i);
+ }
+ EMIT(NFA_COMPOSING);
+ EMIT(NFA_CONCAT);
+ }
MB_PTR_ADV(regparse);
} // while (p < endp)
@@ -6418,6 +6449,84 @@ nfa_regmatch(
result_if_matched = (t->state->c == NFA_START_COLL);
for (;;)
{
+ if (state->c == NFA_COMPOSING)
+ {
+ int mc = curc;
+ int len = 0;
+ nfa_state_T *end;
+ nfa_state_T *sta;
+ int cchars[MAX_MCO];
+ int ccount = 0;
+ int j;
+
+ sta = t->state->out->out;
+ len = 0;
+ if (utf_iscomposing(sta->c))
+ {
+ // Only match composing character(s), ignore base
+ // character. Used for ".{composing}" and "{composing}"
+ // (no preceding character).
+ len += mb_char2len(mc);
+ }
+ if (rex.reg_icombine && len == 0)
+ {
+ // If \Z was present, then ignore composing characters.
+ // When ignoring the base character this always matches.
+ if (sta->c != curc)
+ result = FAIL;
+ else
+ result = OK;
+ while (sta->c != NFA_END_COMPOSING)
+ sta = sta->out;
+ }
+ // Check base character matches first, unless ignored.
+ else if (len > 0 || mc == sta->c)
+// if (len > 0 || mc == sta->c)
+ {
+ if (len == 0)
+ {
+ len += mb_char2len(mc);
+ sta = sta->out;
+ }
+
+ // We don't care about the order of composing characters.
+ // Get them into cchars[] first.
+ while (len < clen)
+ {
+ mc = mb_ptr2char(rex.input + len);
+ cchars[ccount++] = mc;
+ len += mb_char2len(mc);
+ if (ccount == MAX_MCO)
+ break;
+ }
+
+ // Check that each composing char in the pattern matches a
+ // composing char in the text. We do not check if all
+ // composing chars are matched.
+ result = OK;
+ while (sta->c != NFA_END_COMPOSING)
+ {
+ for (j = 0; j < ccount; ++j)
+ if (cchars[j] == sta->c)
+ break;
+ if (j == ccount)
+ {
+ result = FAIL;
+ break;
+ }
+ sta = sta->out;
+ }
+ }
+ else
+ result = FAIL;
+
+ if (t->state->out->out1->c == NFA_END_COMPOSING)
+ {
+ end = t->state->out->out1;
+ ADD_STATE_IF_MATCH(end);
+ }
+ break;
+ }
if (state->c == NFA_END_COLL)
{
result = !result_if_matched;
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index b591aedbb7..6669dee57e 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -575,5 +575,16 @@ func Test_match_too_complicated()
set regexpengine=0
endfunc
+func Test_combining_chars_in_collection()
+ new
+ for i in range(0,2)
+ exe "set re=".i
+ put =['ɔ̃', 'ɔ', '̃ ã', 'abcd']
+ :%s/[ɔ̃]//
+ call assert_equal(['', '', 'ɔ', '̃ ã', 'abcd'], getline(1,'$'))
+ %d
+ endfor
+ bw!
+endfunc
" vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c
index c31fbf6358..d45181d9d7 100644
--- a/src/version.c
+++ b/src/version.c
@@ -705,6 +705,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
+ 11,
+/**/
10,
/**/
9,