diff options
author | Canop <cano.petrole@gmail.com> | 2021-03-05 14:47:29 +0100 |
---|---|---|
committer | Canop <cano.petrole@gmail.com> | 2021-03-05 14:47:29 +0100 |
commit | a965b995fc845e9404e018ff6ccb40fcc8a2adf0 (patch) | |
tree | 0040e637b49d85fa9ff9708b800a64f44a664137 /src/pattern | |
parent | 4d44b1585cde2b95e4742ca8615bd9c0b2772072 (diff) |
improves and simplifies fuzzy matching (WIP)
I've removed the separation between a scoring algorithm and
the matching one. This means we lose a few % in performances
but it's way easier to change the matching algorighm when it
doesn't have to be reproduced with a lighter logic.
I've tuned the algorithm to avoid singled chars if possible
and to have a malus when they can't be avoided.
Diffstat (limited to 'src/pattern')
-rw-r--r-- | src/pattern/fuzzy_pattern.rs | 224 |
1 files changed, 30 insertions, 194 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs index 52a1e24..031e104 100644 --- a/src/pattern/fuzzy_pattern.rs +++ b/src/pattern/fuzzy_pattern.rs @@ -16,6 +16,7 @@ const BONUS_START_WORD: i32 = 5; const BONUS_CANDIDATE_LENGTH: i32 = -1; // per char const BONUS_MATCH_LENGTH: i32 = -10; // per char of length of the match const BONUS_NB_HOLES: i32 = -30; // there's also a max on that number +const BONUS_SINGLED_CHAR: i32 = -40; // when there's a char, neither first not last, isolated /// A pattern for fuzzy matching #[derive(Debug, Clone)] @@ -38,12 +39,6 @@ enum MatchSearchResult { Some(NameMatch), None, } -enum ScoreSearchResult { - Perfect(i32), // no need to test other positions - Some(i32), - None, - NoneToEnd, -} fn is_word_separator(c: char) -> bool { matches!(c, '_' | ' ' | '-') @@ -88,8 +83,9 @@ impl FuzzyPattern { pos.push(start_idx); let mut d = 1; let mut nb_holes = 0; + let mut nb_singled_chars = 0; for pat_idx in 1..self.chars.len() { - let hole_start = d; + let group_start = d; loop { let cand_idx = start_idx + d; if cand_idx == cand_chars.len() { @@ -101,17 +97,38 @@ impl FuzzyPattern { break; } } - if hole_start + 1 != d { + if group_start + 1 != d { // note that there's no absolute guarantee we found the minimal // number of holes. The algorithm isn't perfect - if nb_holes >= self.max_nb_holes { - return MatchSearchResult::None; + if nb_holes > 0 { + if nb_holes >= self.max_nb_holes { + return MatchSearchResult::None; + } + // we're just after a hole which wasn't the first one + // we check whether the previous group (which wasn't the + // initial one) was a single char. + let last_idx = pos.len() - 1; + // The last char of the previous group is at index last_idx-1 + if pos[last_idx-1] > pos[last_idx-2] + 1 { + debug!("singled char in {:?}", cand_chars.iter().collect::<String>()); + debug!("before merge: {:?}", &pos); + // The last group was of size 1, it's a singled char. + // But maybe it can be rattached to the new group ? + if self.chars[pat_idx-1] == cand_chars[pos[last_idx]-1] { + pos[last_idx-1] = pos[last_idx] - 1; + debug!("merging singled char -> {:?}", &pos); + nb_holes -= 1; + } else { + nb_singled_chars += 1; + } + } } nb_holes += 1; } } let mut score = BONUS_MATCH; score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32); + score += BONUS_SINGLED_CHAR * (nb_singled_chars as i32); score += BONUS_NB_HOLES * (nb_holes as i32); score += d as i32 * BONUS_MATCH_LENGTH; if start_idx == 0 { @@ -129,6 +146,7 @@ impl FuzzyPattern { } } } + debug!("{} -> s={}, pos={:?}", cand_chars.iter().collect::<String>(), score, &pos); MatchSearchResult::Some(NameMatch { score, pos }) } @@ -164,151 +182,11 @@ impl FuzzyPattern { best_match } - /// compute the score in the specific case the pattern is of length 1 - fn score_1_char(&self, candidate: &str, pat_chr: char) -> Option<i32> { - let mut cand_chars = candidate.chars().map(secular::lower_lay_char); - match cand_chars.next() { - None => None, // empty candidate: this looks pathological but might be valid - Some(chr) if pat_chr == chr => { - let cand_len = cand_chars.count() as i32 + 1; - let score = BONUS_MATCH - + BONUS_START - + BONUS_START_WORD - + cand_len * BONUS_CANDIDATE_LENGTH - + BONUS_MATCH_LENGTH; - Some(if cand_len == 1 { - score + BONUS_EXACT - } else { - score - }) - } - Some(chr) => { - let mut starts_word = is_word_separator(chr); - while let Some(cand_chr) = cand_chars.next() { - if cand_chr == pat_chr { - let cand_len = candidate.chars().count() as i32; - let score = - BONUS_MATCH + cand_len * BONUS_CANDIDATE_LENGTH + BONUS_MATCH_LENGTH; - if starts_word { - return Some(score + BONUS_START_WORD); - } else { - // looking for another match after a space - for cand_chr in cand_chars { - if cand_chr == pat_chr && starts_word { - return Some(score + BONUS_START_WORD); - } else { - starts_word = is_word_separator(cand_chr); - } - } - return Some(score); - } - } else { - starts_word = is_word_separator(cand_chr); - } - } - None - } - } - } - - /// if a match starts at the given char index, return its score. - /// Return it as "Perfect" if no better match can be found further - /// in the string. - /// The pattern is assumed to be of length 2 or more - fn score_starting_at( - &self, - cand_chars: &[char], - start_idx: usize, // start index in candidate - ) -> ScoreSearchResult { - if cand_chars[start_idx] != self.chars[0] { - return ScoreSearchResult::None; - } - let mut d = 1; - let mut nb_holes = 0; - for pat_idx in 1..self.chars.len() { - let hole_start = d; - loop { - let cand_idx = start_idx + d; - if cand_idx == cand_chars.len() { - return ScoreSearchResult::NoneToEnd; - } - d += 1; - if cand_chars[cand_idx] == self.chars[pat_idx] { - break; - } - } - if hole_start + 1 != d { - // note that there's no absolute guarantee we found the minimal - // number of holes. The algorithm isn't perfect - if nb_holes >= self.max_nb_holes { - return ScoreSearchResult::None; - } - nb_holes += 1; - } - } - let mut score = BONUS_MATCH - + BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32) - + BONUS_NB_HOLES * (nb_holes as i32) - + d as i32 * BONUS_MATCH_LENGTH; - if start_idx == 0 { - score += BONUS_START + BONUS_START_WORD; - if cand_chars.len() == self.chars.len() { - score += BONUS_EXACT; - return ScoreSearchResult::Perfect(score); - } - } else { - let previous = cand_chars[start_idx - 1]; - if is_word_separator(previous) { - score += BONUS_START_WORD; - if cand_chars.len() - start_idx == self.chars.len() { - return ScoreSearchResult::Perfect(score); - } - } - } - ScoreSearchResult::Some(score) - } - - fn score_n_chars(&self, candidate: &str) -> Option<i32> { - if candidate.len() < self.chars.len() { - return None; - } - let mut cand_chars: Vec<char> = Vec::with_capacity(candidate.len()); - cand_chars.extend(candidate.chars().map(secular::lower_lay_char)); - if cand_chars.len() < self.chars.len() { - return None; - } - let mut best_score = 0; - let n = cand_chars.len() - self.chars.len(); - for start_idx in 0..=n { - match self.score_starting_at(&cand_chars, start_idx) { - ScoreSearchResult::Perfect(s) => { - return Some(s); - } - ScoreSearchResult::Some(score) => { - if score > best_score { - best_score = score; - } - } - ScoreSearchResult::NoneToEnd => { - break; - } - _ => {} - } - } - if best_score > 0 { - Some(best_score) - } else { - None - } - } - /// compute the score of the best match, in a way mostly similar to `find` but /// faster by not storing match positions pub fn score_of(&self, candidate: &str) -> Option<i32> { - match self.chars.len() { - 1 => self.score_1_char(candidate, self.chars[0]), - _ => self.score_n_chars(candidate), - } + self.find(candidate) + .map(|nm| nm.score) } } @@ -317,53 +195,11 @@ mod fuzzy_pattern_tests { use super::*; - #[test] - fn check_equal_scores() { - static PATTERNS: &[&str] = &["reveil", "dystroy", "broot", "AB", "z", "é", "év", "a"]; - static NAMES: &[&str] = &[ - " brr ooT", - "Reveillon", - "una a", - "dys", - "test", - " a reveil", - "a rbrroot", - "Ab", - "Eve", - "zeévr", - "alnékjhz vaoi", - "jfhf br mleh & é kn rr o hrzpqôùù", - "ÅΩ", - ]; - for pattern in PATTERNS { - let fp = FuzzyPattern::from(pattern); - for name in NAMES { - println!("checking pattern {:?} on name {:?}", pattern, name); - assert_eq!(fp.score_of(name), fp.find(name).map(|m| m.score)); - } - } - } - /// check that the scores of all names are strictly decreasing /// (pattern is first tested against itself). /// We verify this property with both computation functions. fn check_ordering_for(pattern: &str, names: &[&str]) { let fp = FuzzyPattern::from(pattern); - // checking using score_of - let mut last_score = fp.score_of(pattern); - let mut last_name = pattern; - for name in names { - let score = fp.score_of(name); - assert!( - score < last_score, - "score({:?}) should be lower than score({:?}) (using score_of)", - name, - last_name - ); - last_name = name; - last_score = score; - } - // checking using find let mut last_score = fp.find(pattern).map(|m| m.score); let mut last_name = pattern; for name in names { |