summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBram Moolenaar <Bram@vim.org>2020-09-11 22:25:15 +0200
committerBram Moolenaar <Bram@vim.org>2020-09-11 22:25:15 +0200
commit635414dd2f3ae7d4d972d79b806348a6516cb91a (patch)
tree70ba4988a414e4c1b76eb9f59ace2c3afba39ad3
parentc2c820563441499892359da949db3c8f7f16d109 (diff)
patch 8.2.1665: cannot do fuzzy string matchingv8.2.1665
Problem: Cannot do fuzzy string matching. Solution: Add matchfuzzy(). (Yegappan Lakshmanan, closes #6932)
-rw-r--r--runtime/doc/eval.txt24
-rw-r--r--runtime/doc/usr_41.txt1
-rw-r--r--src/evalfunc.c1
-rw-r--r--src/proto/search.pro1
-rw-r--r--src/search.c340
-rw-r--r--src/testdir/test_functions.vim24
-rw-r--r--src/version.c2
7 files changed, 393 insertions, 0 deletions
diff --git a/runtime/doc/eval.txt b/runtime/doc/eval.txt
index e72a388302..68884ba1ed 100644
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -2641,6 +2641,7 @@ matcharg({nr}) List arguments of |:match|
matchdelete({id} [, {win}]) Number delete match identified by {id}
matchend({expr}, {pat} [, {start} [, {count}]])
Number position where {pat} ends in {expr}
+matchfuzzy({list}, {str}) List fuzzy match {str} in {list}
matchlist({expr}, {pat} [, {start} [, {count}]])
List match and submatches of {pat} in {expr}
matchstr({expr}, {pat} [, {start} [, {count}]])
@@ -7307,6 +7308,29 @@ matchend({expr}, {pat} [, {start} [, {count}]]) *matchend()*
Can also be used as a |method|: >
GetText()->matchend('word')
+
+matchfuzzy({list}, {str}) *matchfuzzy()*
+ Returns a list with all the strings in {list} that fuzzy
+ match {str}. The strings in the returned list are sorted
+ based on the matching score. {str} is treated as a literal
+ string and regular expression matching is NOT supported.
+ The maximum supported {str} length is 256.
+
+ If there are no matching strings or there is an error, then an
+ empty list is returned. If length of {str} is greater than
+ 256, then returns an empty list.
+
+ Example: >
+ :echo matchfuzzy(["clay", "crow"], "cay")
+< results in ["clay"]. >
+ :echo getbufinfo()->map({_, v -> v.name})->matchfuzzy("ndl")
+< results in a list of buffer names fuzzy matching "ndl". >
+ :echo v:oldfiles->matchfuzzy("test")
+< results in a list of file names fuzzy matching "test". >
+ :let l = readfile("buffer.c")->matchfuzzy("str")
+< results in a list of lines in "buffer.c" fuzzy matching "str".
+
+
matchlist({expr}, {pat} [, {start} [, {count}]]) *matchlist()*
Same as |match()|, but return a |List|. The first item in the
list is the matched string, same as what matchstr() would
diff --git a/runtime/doc/usr_41.txt b/runtime/doc/usr_41.txt
index 1dbfba128b..1e5915f41b 100644
--- a/runtime/doc/usr_41.txt
+++ b/runtime/doc/usr_41.txt
@@ -603,6 +603,7 @@ String manipulation: *string-functions*
charclass() class of a character
match() position where a pattern matches in a string
matchend() position where a pattern match ends in a string
+ matchfuzzy() fuzzy matches a string in a list of strings
matchstr() match of a pattern in a string
matchstrpos() match and positions of a pattern in a string
matchlist() like matchstr() and also return submatches
diff --git a/src/evalfunc.c b/src/evalfunc.c
index 619abba986..533f5b95dd 100644
--- a/src/evalfunc.c
+++ b/src/evalfunc.c
@@ -750,6 +750,7 @@ static funcentry_T global_functions[] =
{"matcharg", 1, 1, FEARG_1, ret_list_string, f_matcharg},
{"matchdelete", 1, 2, FEARG_1, ret_number, f_matchdelete},
{"matchend", 2, 4, FEARG_1, ret_number, f_matchend},
+ {"matchfuzzy", 2, 2, FEARG_1, ret_list_string, f_matchfuzzy},
{"matchlist", 2, 4, FEARG_1, ret_list_string, f_matchlist},
{"matchstr", 2, 4, FEARG_1, ret_string, f_matchstr},
{"matchstrpos", 2, 4, FEARG_1, ret_list_any, f_matchstrpos},
diff --git a/src/proto/search.pro b/src/proto/search.pro
index ccad0fb8eb..1b62254199 100644
--- a/src/proto/search.pro
+++ b/src/proto/search.pro
@@ -36,4 +36,5 @@ void find_pattern_in_path(char_u *ptr, int dir, int len, int whole, int skip_com
spat_T *get_spat(int idx);
int get_spat_last_idx(void);
void f_searchcount(typval_T *argvars, typval_T *rettv);
+void f_matchfuzzy(typval_T *argvars, typval_T *rettv);
/* vim: set ft=c : */
diff --git a/src/search.c b/src/search.c
index 28e0adc7ae..9572785b7e 100644
--- a/src/search.c
+++ b/src/search.c
@@ -4165,4 +4165,344 @@ f_searchcount(typval_T *argvars, typval_T *rettv)
the_end:
restore_last_search_pattern();
}
+
+/*
+ * Fuzzy string matching
+ *
+ * Ported from the lib_fts library authored by Forrest Smith.
+ * https://github.com/forrestthewoods/lib_fts/tree/master/code
+ *
+ * Blog describing the algorithm:
+ * https://www.forrestthewoods.com/blog/reverse_engineering_sublime_texts_fuzzy_match/
+ *
+ * Each matching string is assigned a score. The following factors are checked:
+ * Matched letter
+ * Unmatched letter
+ * Consecutively matched letters
+ * Proximity to start
+ * Letter following a separator (space, underscore)
+ * Uppercase letter following lowercase (aka CamelCase)
+ *
+ * Matched letters are good. Unmatched letters are bad. Matching near the start
+ * is good. Matching the first letter in the middle of a phrase is good.
+ * Matching the uppercase letters in camel case entries is good.
+ *
+ * The score assigned for each factor is explained below.
+ * File paths are different from file names. File extensions may be ignorable.
+ * Single words care about consecutive matches but not separators or camel
+ * case.
+ * Score starts at 0
+ * Matched letter: +0 points
+ * Unmatched letter: -1 point
+ * Consecutive match bonus: +5 points
+ * Separator bonus: +10 points
+ * Camel case bonus: +10 points
+ * Unmatched leading letter: -3 points (max: -9)
+ *
+ * There is some nuance to this. Scores don’t have an intrinsic meaning. The
+ * score range isn’t 0 to 100. It’s roughly [-50, 50]. Longer words have a
+ * lower minimum score due to unmatched letter penalty. Longer search patterns
+ * have a higher maximum score due to match bonuses.
+ *
+ * Separator and camel case bonus is worth a LOT. Consecutive matches are worth
+ * quite a bit.
+ *
+ * There is a penalty if you DON’T match the first three letters. Which
+ * effectively rewards matching near the start. However there’s no difference
+ * in matching between the middle and end.
+ *
+ * There is not an explicit bonus for an exact match. Unmatched letters receive
+ * a penalty. So shorter strings and closer matches are worth more.
+ */
+typedef struct
+{
+ listitem_T *item;
+ int score;
+} fuzzyItem_T;
+
+ static int
+fuzzy_match_recursive(
+ char_u *fuzpat,
+ char_u *str,
+ int *outScore,
+ char_u *strBegin,
+ char_u *srcMatches,
+ char_u *matches,
+ int maxMatches,
+ int nextMatch,
+ int *recursionCount,
+ int recursionLimit)
+{
+ // Recursion params
+ int recursiveMatch = FALSE;
+ char_u bestRecursiveMatches[256];
+ int bestRecursiveScore = 0;
+ int first_match;
+ int matched;
+
+ // Count recursions
+ ++*recursionCount;
+ if (*recursionCount >= recursionLimit)
+ return FALSE;
+
+ // Detect end of strings
+ if (*fuzpat == '\0' || *str == '\0')
+ return FALSE;
+
+ // Loop through fuzpat and str looking for a match
+ first_match = TRUE;
+ while (*fuzpat != '\0' && *str != '\0')
+ {
+ // Found match
+ if (vim_tolower(*fuzpat) == vim_tolower(*str))
+ {
+ char_u recursiveMatches[256];
+ int recursiveScore = 0;
+
+ // Supplied matches buffer was too short
+ if (nextMatch >= maxMatches)
+ return FALSE;
+
+ // "Copy-on-Write" srcMatches into matches
+ if (first_match && srcMatches)
+ {
+ memcpy(matches, srcMatches, nextMatch);
+ first_match = FALSE;
+ }
+
+ // Recursive call that "skips" this match
+ if (fuzzy_match_recursive(fuzpat, str + 1, &recursiveScore,
+ strBegin, matches, recursiveMatches,
+ sizeof(recursiveMatches), nextMatch, recursionCount,
+ recursionLimit))
+ {
+ // Pick best recursive score
+ if (!recursiveMatch || recursiveScore > bestRecursiveScore)
+ {
+ memcpy(bestRecursiveMatches, recursiveMatches, 256);
+ bestRecursiveScore = recursiveScore;
+ }
+ recursiveMatch = TRUE;
+ }
+
+ // Advance
+ matches[nextMatch++] = (char_u)(str - strBegin);
+ ++fuzpat;
+ }
+ ++str;
+ }
+
+ // Determine if full fuzpat was matched
+ matched = *fuzpat == '\0' ? TRUE : FALSE;
+
+ // Calculate score
+ if (matched)
+ {
+ // bonus for adjacent matches
+ int sequential_bonus = 15;
+ // bonus if match occurs after a separator
+ int separator_bonus = 30;
+ // bonus if match is uppercase and prev is lower
+ int camel_bonus = 30;
+ // bonus if the first letter is matched
+ int first_letter_bonus = 15;
+ // penalty applied for every letter in str before the first match
+ int leading_letter_penalty = -5;
+ // maximum penalty for leading letters
+ int max_leading_letter_penalty = -15;
+ // penalty for every letter that doesn't matter
+ int unmatched_letter_penalty = -1;
+ int penalty;
+ int unmatched;
+ int i;
+
+ // Iterate str to end
+ while (*str != '\0')
+ ++str;
+
+ // Initialize score
+ *outScore = 100;
+
+ // Apply leading letter penalty
+ penalty = leading_letter_penalty * matches[0];
+ if (penalty < max_leading_letter_penalty)
+ penalty = max_leading_letter_penalty;
+ *outScore += penalty;
+
+ // Apply unmatched penalty
+ unmatched = (int)(str - strBegin) - nextMatch;
+ *outScore += unmatched_letter_penalty * unmatched;
+
+ // Apply ordering bonuses
+ for (i = 0; i < nextMatch; ++i)
+ {
+ char_u currIdx = matches[i];
+
+ if (i > 0)
+ {
+ char_u prevIdx = matches[i - 1];
+
+ // Sequential
+ if (currIdx == (prevIdx + 1))
+ *outScore += sequential_bonus;
+ }
+
+ // Check for bonuses based on neighbor character value
+ if (currIdx > 0)
+ {
+ // Camel case
+ char_u neighbor = strBegin[currIdx - 1];
+ char_u curr = strBegin[currIdx];
+ int neighborSeparator;
+
+ if (islower(neighbor) && isupper(curr))
+ *outScore += camel_bonus;
+
+ // Separator
+ neighborSeparator = neighbor == '_' || neighbor == ' ';
+ if (neighborSeparator)
+ *outScore += separator_bonus;
+ }
+ else
+ {
+ // First letter
+ *outScore += first_letter_bonus;
+ }
+ }
+ }
+
+ // Return best result
+ if (recursiveMatch && (!matched || bestRecursiveScore > *outScore))
+ {
+ // Recursive score is better than "this"
+ memcpy(matches, bestRecursiveMatches, maxMatches);
+ *outScore = bestRecursiveScore;
+ return TRUE;
+ }
+ else if (matched)
+ return TRUE; // "this" score is better than recursive
+
+ return FALSE; // no match
+}
+
+/*
+ * fuzzy_match()
+ *
+ * Performs exhaustive search via recursion to find all possible matches and
+ * match with highest score.
+ * Scores values have no intrinsic meaning. Possible score range is not
+ * normalized and varies with pattern.
+ * Recursion is limited internally (default=10) to prevent degenerate cases
+ * (fuzpat="aaaaaa" str="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa").
+ * Uses char_u for match indices. Therefore patterns are limited to 256
+ * characters.
+ *
+ * Returns TRUE if fuzpat is found AND calculates a score.
+ */
+ static int
+fuzzy_match(char_u *str, char_u *fuzpat, int *outScore)
+{
+ char_u matches[256];
+ int recursionCount = 0;
+ int recursionLimit = 10;
+
+ *outScore = 0;
+
+ return fuzzy_match_recursive(fuzpat, str, outScore, str, NULL, matches,
+ sizeof(matches), 0, &recursionCount, recursionLimit);
+}
+
+/*
+ * Sort the fuzzy matches in the descending order of the match score.
+ */
+ static int
+fuzzy_item_compare(const void *s1, const void *s2)
+{
+ int v1 = ((fuzzyItem_T *)s1)->score;
+ int v2 = ((fuzzyItem_T *)s2)->score;
+
+ return v1 == v2 ? 0 : v1 > v2 ? -1 : 1;
+}
+
+/*
+ * Fuzzy search the string 'str' in 'strlist' and return the matching strings
+ * in 'fmatchlist'.
+ */
+ static void
+match_fuzzy(list_T *strlist, char_u *str, list_T *fmatchlist)
+{
+ long len;
+ fuzzyItem_T *ptrs;
+ listitem_T *li;
+ long i = 0;
+ int found_match = FALSE;
+
+ len = list_len(strlist);
+ if (len == 0)
+ return;
+
+ ptrs = ALLOC_MULT(fuzzyItem_T, len);
+ if (ptrs == NULL)
+ return;
+
+ // For all the string items in strlist, get the fuzzy matching score
+ FOR_ALL_LIST_ITEMS(strlist, li)
+ {
+ int score;
+
+ ptrs[i].item = li;
+ ptrs[i].score = -9999;
+ // ignore non-string items in the list
+ if (li->li_tv.v_type == VAR_STRING && li->li_tv.vval.v_string != NULL)
+ if (fuzzy_match(li->li_tv.vval.v_string, str, &score))
+ {
+ ptrs[i].score = score;
+ found_match = TRUE;
+ }
+ ++i;
+ }
+
+ if (found_match)
+ {
+ // Sort the list by the descending order of the match score
+ qsort((void *)ptrs, (size_t)len, sizeof(fuzzyItem_T),
+ fuzzy_item_compare);
+
+ // Copy the matching strings with 'score != -9999' to the return list
+ for (i = 0; i < len; i++)
+ {
+ if (ptrs[i].score == -9999)
+ break;
+ list_append_string(fmatchlist, ptrs[i].item->li_tv.vval.v_string,
+ -1);
+ }
+ }
+
+ vim_free(ptrs);
+}
+
+/*
+ * "matchfuzzy()" function
+ */
+ void
+f_matchfuzzy(typval_T *argvars, typval_T *rettv)
+{
+ if (argvars[0].v_type != VAR_LIST)
+ {
+ emsg(_(e_listreq));
+ return;
+ }
+ if (argvars[0].vval.v_list == NULL)
+ return;
+ if (argvars[1].v_type != VAR_STRING
+ || argvars[1].vval.v_string == NULL)
+ {
+ semsg(_(e_invarg2), tv_get_string(&argvars[1]));
+ return;
+ }
+ if (rettv_list_alloc(rettv) == OK)
+ match_fuzzy(argvars[0].vval.v_list, tv_get_string(&argvars[1]),
+ rettv->vval.v_list);
+}
+
#endif
diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim
index 7ff9b7a819..e31b63cc43 100644
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -2554,4 +2554,28 @@ func Test_browsedir()
call assert_fails('call browsedir("open", [])', 'E730:')
endfunc
+" Test for matchfuzzy()
+func Test_matchfuzzy()
+ call assert_fails('call matchfuzzy(10, "abc")', 'E714:')
+ call assert_fails('call matchfuzzy(["abc"], [])', 'E730:')
+ call assert_equal([], matchfuzzy([], 'abc'))
+ call assert_equal([], matchfuzzy(['abc'], ''))
+ call assert_equal(['abc'], matchfuzzy(['abc', 10], 'ac'))
+ call assert_equal([], matchfuzzy([10, 20], 'ac'))
+ call assert_equal(['abc'], matchfuzzy(['abc'], 'abc'))
+ call assert_equal(['crayon', 'camera'], matchfuzzy(['camera', 'crayon'], 'cra'))
+ call assert_equal(['aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa', 'aba'], matchfuzzy(['aba', 'aabbaa', 'aaabbbaaa', 'aaaabbbbaaaa'], 'aa'))
+ call assert_equal(['one'], matchfuzzy(['one', 'two'], 'one'))
+ call assert_equal(['oneTwo', 'onetwo'], matchfuzzy(['onetwo', 'oneTwo'], 'oneTwo'))
+ call assert_equal(['one_two', 'onetwo'], matchfuzzy(['onetwo', 'one_two'], 'oneTwo'))
+ call assert_equal(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], matchfuzzy(['aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'], 'aa'))
+ call assert_equal([], matchfuzzy([repeat('a', 300)], repeat('a', 257)))
+
+ %bw!
+ eval ['somebuf', 'anotherone', 'needle', 'yetanotherone']->map({_, v -> bufadd(v) + bufload(v)})
+ let l = getbufinfo()->map({_, v -> v.name})->matchfuzzy('ndl')
+ call assert_equal(1, len(l))
+ call assert_match('needle', l[0])
+endfunc
+
" vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c
index 735b8f6252..e0afdf8352 100644
--- a/src/version.c
+++ b/src/version.c
@@ -751,6 +751,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
+ 1665,
+/**/
1664,
/**/
1663,