patch 9.1.0011: regexp cannot match combining chars in collectionv9.1.0011

Problem: regexp cannot match combining chars in collection Solution: Check for combining characters in regex collections for the NFA and BT Regex Engine Also, while at it, make debug mode work again. fixes #10286 closes: #12871 Signed-off-by: Christian Brabandt <cb@256bit.org>
author: Christian Brabandt <cb@256bit.org> 2024-01-04 22:54:08 +0100
committer: Christian Brabandt <cb@256bit.org> 2024-01-04 22:54:08 +0100
commit: d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf (patch)
tree: caaf1ea4beed67081e63e5b82299d7078a5098df /src
parent: 81642d9d6ff5cd6a90a012b1b98632ce51eeb1a8 (diff)
6 files changed, 166 insertions, 11 deletions
diff --git a/src/regexp.c b/src/regexp.c
index a64672856c..c3bc4966c7 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2686,7 +2686,10 @@ static regengine_T bt_regengine =
     bt_regcomp,
     bt_regfree,
     bt_regexec_nl,
-    bt_regexec_multi,
+    bt_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 #include "regexp_nfa.c"
@@ -2696,7 +2699,10 @@ static regengine_T nfa_regengine =
     nfa_regcomp,
     nfa_regfree,
     nfa_regexec_nl,
-    nfa_regexec_multi,
+    nfa_regexec_multi
+#ifdef DEBUG
+    ,(char_u *)""
+#endif
 };
 
 // Which regexp engine to use? Needed for vim_regcomp().
diff --git a/src/regexp.h b/src/regexp.h
index d6c8f48c7b..1ff2e1b6ef 100644
--- a/src/regexp.h
+++ b/src/regexp.h
@@ -178,7 +178,9 @@ struct regengine
     int		(*regexec_nl)(regmatch_T *, char_u *, colnr_T, int);
     // bt_regexec_mult or nfa_regexec_mult
     long	(*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, int *);
-    //char_u	*expr;
+#ifdef DEBUG
+    char_u	*expr;
+#endif
 };
 
 // Flags used by vim_regsub() and vim_regsub_both()
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 522cf37e2d..198946e0dc 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3743,13 +3743,38 @@ regmatch(
 
 	  case ANYOF:
 	  case ANYBUT:
-	    if (c == NUL)
-		status = RA_NOMATCH;
-	    else if ((cstrchr(OPERAND(scan), c) == NULL) == (op == ANYOF))
-		status = RA_NOMATCH;
-	    else
-		ADVANCE_REGINPUT();
-	    break;
+	    {
+		char_u  *q = OPERAND(scan);
+
+		if (c == NUL)
+		    status = RA_NOMATCH;
+		else if ((cstrchr(q, c) == NULL) == (op == ANYOF))
+		    status = RA_NOMATCH;
+		else
+		{
+		    // Check following combining characters
+		    int	len = 0;
+		    int i;
+
+		    if (enc_utf8)
+			len = utfc_ptr2len(q) - utf_ptr2len(q);
+
+		    MB_CPTR_ADV(rex.input);
+		    MB_CPTR_ADV(q);
+
+		    if (!enc_utf8 || len == 0)
+			break;
+
+		    for (i = 0; i < len; ++i)
+			if (q[i] != rex.input[i])
+			{
+			    status = RA_NOMATCH;
+			    break;
+			}
+		    rex.input += len;
+		}
+		break;
+	    }
 
 	  case MULTIBYTECODE:
 	    if (has_mbyte)
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index d724d527b6..ff54348905 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -1764,6 +1764,7 @@ collection:
 	    endp = skip_anyof(p);
 	    if (*endp == ']')
 	    {
+		int plen;
 		/*
 		 * Try to reverse engineer character classes. For example,
 		 * recognize that [0-9] stands for \d and [A-Za-z_] for \h,
@@ -2033,13 +2034,43 @@ collection:
 			else
 			{
 			    if (got_coll_char == TRUE && startc == 0)
+			    {
 				EMIT(0x0a);
+				EMIT(NFA_CONCAT);
+			    }
 			    else
+			    {
 				EMIT(startc);
-			    EMIT(NFA_CONCAT);
+				if (!(enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse)))))
+				{
+				    EMIT(NFA_CONCAT);
+				}
+			    }
 			}
 		    }
 
+		    if (enc_utf8 && (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))))
+		    {
+			int i = utf_ptr2len(regparse);
+
+			c = utf_ptr2char(regparse + i);
+
+			// Add composing characters
+			for (;;)
+			{
+			    if (c == 0)
+				// \x00 is translated to \x0a, start at \x01.
+				EMIT(1);
+			    else
+				EMIT(c);
+			    EMIT(NFA_CONCAT);
+			    if ((i += utf_char2len(c)) >= plen)
+				break;
+			    c = utf_ptr2char(regparse + i);
+			}
+			EMIT(NFA_COMPOSING);
+			EMIT(NFA_CONCAT);
+		    }
 		    MB_PTR_ADV(regparse);
 		} // while (p < endp)
 
@@ -6418,6 +6449,84 @@ nfa_regmatch(
 		result_if_matched = (t->state->c == NFA_START_COLL);
 		for (;;)
 		{
+		    if (state->c == NFA_COMPOSING)
+		    {
+			int	    mc = curc;
+			int	    len = 0;
+			nfa_state_T *end;
+			nfa_state_T *sta;
+			int	    cchars[MAX_MCO];
+			int	    ccount = 0;
+			int	    j;
+
+			sta = t->state->out->out;
+			len = 0;
+			if (utf_iscomposing(sta->c))
+			{
+			    // Only match composing character(s), ignore base
+			    // character.  Used for ".{composing}" and "{composing}"
+			    // (no preceding character).
+			    len += mb_char2len(mc);
+			}
+			if (rex.reg_icombine && len == 0)
+			{
+			    // If \Z was present, then ignore composing characters.
+			    // When ignoring the base character this always matches.
+			    if (sta->c != curc)
+				result = FAIL;
+			    else
+				result = OK;
+			    while (sta->c != NFA_END_COMPOSING)
+				sta = sta->out;
+			}
+			// Check base character matches first, unless ignored.
+			else if (len > 0 || mc == sta->c)
+//			if (len > 0 || mc == sta->c)
+			{
+			    if (len == 0)
+			    {
+				len += mb_char2len(mc);
+				sta = sta->out;
+			    }
+
+			    // We don't care about the order of composing characters.
+			    // Get them into cchars[] first.
+			    while (len < clen)
+			    {
+				mc = mb_ptr2char(rex.input + len);
+				cchars[ccount++] = mc;
+				len += mb_char2len(mc);
+				if (ccount == MAX_MCO)
+				    break;
+			    }
+
+			    // Check that each composing char in the pattern matches a
+			    // composing char in the text.  We do not check if all
+			    // composing chars are matched.
+			    result = OK;
+			    while (sta->c != NFA_END_COMPOSING)
+			    {
+				for (j = 0; j < ccount; ++j)
+				    if (cchars[j] == sta->c)
+					break;
+				if (j == ccount)
+				{
+				    result = FAIL;
+				    break;
+				}
+				sta = sta->out;
+			    }
+			}
+			else
+			    result = FAIL;
+
+			if (t->state->out->out1->c == NFA_END_COMPOSING)
+			{
+			    end = t->state->out->out1;
+			    ADD_STATE_IF_MATCH(end);
+			}
+			break;
+		    }
 		    if (state->c == NFA_END_COLL)
 		    {
 			result = !result_if_matched;
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index b591aedbb7..6669dee57e 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -575,5 +575,16 @@ func Test_match_too_complicated()
   set regexpengine=0
 endfunc
 
+func Test_combining_chars_in_collection()
+  new
+  for i in range(0,2)
+    exe "set re=".i
+    put =['ɔ̃', 'ɔ',  '̃  ã', 'abcd']
+    :%s/[ɔ̃]//
+    call assert_equal(['', '', 'ɔ', '̃  ã', 'abcd'], getline(1,'$'))
+    %d
+  endfor
+  bw!
+endfunc
 
 " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c
index c31fbf6358..d45181d9d7 100644
--- a/src/version.c
+++ b/src/version.c
@@ -705,6 +705,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    11,
+/**/
     10,
 /**/
     9,
author	Christian Brabandt <cb@256bit.org>	2024-01-04 22:54:08 +0100
committer	Christian Brabandt <cb@256bit.org>	2024-01-04 22:54:08 +0100
commit	d2cc51f9a1a5a30ef5d2e732f49d7f495cae24cf (patch)
tree	caaf1ea4beed67081e63e5b82299d7078a5098df /src
parent	81642d9d6ff5cd6a90a012b1b98632ce51eeb1a8 (diff)