summaryrefslogtreecommitdiffstats
path: root/src/pattern
diff options
context:
space:
mode:
authorCanop <cano.petrole@gmail.com>2021-03-05 14:47:29 +0100
committerCanop <cano.petrole@gmail.com>2021-03-05 14:47:29 +0100
commita965b995fc845e9404e018ff6ccb40fcc8a2adf0 (patch)
tree0040e637b49d85fa9ff9708b800a64f44a664137 /src/pattern
parent4d44b1585cde2b95e4742ca8615bd9c0b2772072 (diff)
improves and simplifies fuzzy matching (WIP)
I've removed the separation between a scoring algorithm and the matching one. This means we lose a few % in performances but it's way easier to change the matching algorighm when it doesn't have to be reproduced with a lighter logic. I've tuned the algorithm to avoid singled chars if possible and to have a malus when they can't be avoided.
Diffstat (limited to 'src/pattern')
-rw-r--r--src/pattern/fuzzy_pattern.rs224
1 files changed, 30 insertions, 194 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs
index 52a1e24..031e104 100644
--- a/src/pattern/fuzzy_pattern.rs
+++ b/src/pattern/fuzzy_pattern.rs
@@ -16,6 +16,7 @@ const BONUS_START_WORD: i32 = 5;
const BONUS_CANDIDATE_LENGTH: i32 = -1; // per char
const BONUS_MATCH_LENGTH: i32 = -10; // per char of length of the match
const BONUS_NB_HOLES: i32 = -30; // there's also a max on that number
+const BONUS_SINGLED_CHAR: i32 = -40; // when there's a char, neither first not last, isolated
/// A pattern for fuzzy matching
#[derive(Debug, Clone)]
@@ -38,12 +39,6 @@ enum MatchSearchResult {
Some(NameMatch),
None,
}
-enum ScoreSearchResult {
- Perfect(i32), // no need to test other positions
- Some(i32),
- None,
- NoneToEnd,
-}
fn is_word_separator(c: char) -> bool {
matches!(c, '_' | ' ' | '-')
@@ -88,8 +83,9 @@ impl FuzzyPattern {
pos.push(start_idx);
let mut d = 1;
let mut nb_holes = 0;
+ let mut nb_singled_chars = 0;
for pat_idx in 1..self.chars.len() {
- let hole_start = d;
+ let group_start = d;
loop {
let cand_idx = start_idx + d;
if cand_idx == cand_chars.len() {
@@ -101,17 +97,38 @@ impl FuzzyPattern {
break;
}
}
- if hole_start + 1 != d {
+ if group_start + 1 != d {
// note that there's no absolute guarantee we found the minimal
// number of holes. The algorithm isn't perfect
- if nb_holes >= self.max_nb_holes {
- return MatchSearchResult::None;
+ if nb_holes > 0 {
+ if nb_holes >= self.max_nb_holes {
+ return MatchSearchResult::None;
+ }
+ // we're just after a hole which wasn't the first one
+ // we check whether the previous group (which wasn't the
+ // initial one) was a single char.
+ let last_idx = pos.len() - 1;
+ // The last char of the previous group is at index last_idx-1
+ if pos[last_idx-1] > pos[last_idx-2] + 1 {
+ debug!("singled char in {:?}", cand_chars.iter().collect::<String>());
+ debug!("before merge: {:?}", &pos);
+ // The last group was of size 1, it's a singled char.
+ // But maybe it can be rattached to the new group ?
+ if self.chars[pat_idx-1] == cand_chars[pos[last_idx]-1] {
+ pos[last_idx-1] = pos[last_idx] - 1;
+ debug!("merging singled char -> {:?}", &pos);
+ nb_holes -= 1;
+ } else {
+ nb_singled_chars += 1;
+ }
+ }
}
nb_holes += 1;
}
}
let mut score = BONUS_MATCH;
score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32);
+ score += BONUS_SINGLED_CHAR * (nb_singled_chars as i32);
score += BONUS_NB_HOLES * (nb_holes as i32);
score += d as i32 * BONUS_MATCH_LENGTH;
if start_idx == 0 {
@@ -129,6 +146,7 @@ impl FuzzyPattern {
}
}
}
+ debug!("{} -> s={}, pos={:?}", cand_chars.iter().collect::<String>(), score, &pos);
MatchSearchResult::Some(NameMatch { score, pos })
}
@@ -164,151 +182,11 @@ impl FuzzyPattern {
best_match
}
- /// compute the score in the specific case the pattern is of length 1
- fn score_1_char(&self, candidate: &str, pat_chr: char) -> Option<i32> {
- let mut cand_chars = candidate.chars().map(secular::lower_lay_char);
- match cand_chars.next() {
- None => None, // empty candidate: this looks pathological but might be valid
- Some(chr) if pat_chr == chr => {
- let cand_len = cand_chars.count() as i32 + 1;
- let score = BONUS_MATCH
- + BONUS_START
- + BONUS_START_WORD
- + cand_len * BONUS_CANDIDATE_LENGTH
- + BONUS_MATCH_LENGTH;
- Some(if cand_len == 1 {
- score + BONUS_EXACT
- } else {
- score
- })
- }
- Some(chr) => {
- let mut starts_word = is_word_separator(chr);
- while let Some(cand_chr) = cand_chars.next() {
- if cand_chr == pat_chr {
- let cand_len = candidate.chars().count() as i32;
- let score =
- BONUS_MATCH + cand_len * BONUS_CANDIDATE_LENGTH + BONUS_MATCH_LENGTH;
- if starts_word {
- return Some(score + BONUS_START_WORD);
- } else {
- // looking for another match after a space
- for cand_chr in cand_chars {
- if cand_chr == pat_chr && starts_word {
- return Some(score + BONUS_START_WORD);
- } else {
- starts_word = is_word_separator(cand_chr);
- }
- }
- return Some(score);
- }
- } else {
- starts_word = is_word_separator(cand_chr);
- }
- }
- None
- }
- }
- }
-
- /// if a match starts at the given char index, return its score.
- /// Return it as "Perfect" if no better match can be found further
- /// in the string.
- /// The pattern is assumed to be of length 2 or more
- fn score_starting_at(
- &self,
- cand_chars: &[char],
- start_idx: usize, // start index in candidate
- ) -> ScoreSearchResult {
- if cand_chars[start_idx] != self.chars[0] {
- return ScoreSearchResult::None;
- }
- let mut d = 1;
- let mut nb_holes = 0;
- for pat_idx in 1..self.chars.len() {
- let hole_start = d;
- loop {
- let cand_idx = start_idx + d;
- if cand_idx == cand_chars.len() {
- return ScoreSearchResult::NoneToEnd;
- }
- d += 1;
- if cand_chars[cand_idx] == self.chars[pat_idx] {
- break;
- }
- }
- if hole_start + 1 != d {
- // note that there's no absolute guarantee we found the minimal
- // number of holes. The algorithm isn't perfect
- if nb_holes >= self.max_nb_holes {
- return ScoreSearchResult::None;
- }
- nb_holes += 1;
- }
- }
- let mut score = BONUS_MATCH
- + BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32)
- + BONUS_NB_HOLES * (nb_holes as i32)
- + d as i32 * BONUS_MATCH_LENGTH;
- if start_idx == 0 {
- score += BONUS_START + BONUS_START_WORD;
- if cand_chars.len() == self.chars.len() {
- score += BONUS_EXACT;
- return ScoreSearchResult::Perfect(score);
- }
- } else {
- let previous = cand_chars[start_idx - 1];
- if is_word_separator(previous) {
- score += BONUS_START_WORD;
- if cand_chars.len() - start_idx == self.chars.len() {
- return ScoreSearchResult::Perfect(score);
- }
- }
- }
- ScoreSearchResult::Some(score)
- }
-
- fn score_n_chars(&self, candidate: &str) -> Option<i32> {
- if candidate.len() < self.chars.len() {
- return None;
- }
- let mut cand_chars: Vec<char> = Vec::with_capacity(candidate.len());
- cand_chars.extend(candidate.chars().map(secular::lower_lay_char));
- if cand_chars.len() < self.chars.len() {
- return None;
- }
- let mut best_score = 0;
- let n = cand_chars.len() - self.chars.len();
- for start_idx in 0..=n {
- match self.score_starting_at(&cand_chars, start_idx) {
- ScoreSearchResult::Perfect(s) => {
- return Some(s);
- }
- ScoreSearchResult::Some(score) => {
- if score > best_score {
- best_score = score;
- }
- }
- ScoreSearchResult::NoneToEnd => {
- break;
- }
- _ => {}
- }
- }
- if best_score > 0 {
- Some(best_score)
- } else {
- None
- }
- }
-
/// compute the score of the best match, in a way mostly similar to `find` but
/// faster by not storing match positions
pub fn score_of(&self, candidate: &str) -> Option<i32> {
- match self.chars.len() {
- 1 => self.score_1_char(candidate, self.chars[0]),
- _ => self.score_n_chars(candidate),
- }
+ self.find(candidate)
+ .map(|nm| nm.score)
}
}
@@ -317,53 +195,11 @@ mod fuzzy_pattern_tests {
use super::*;
- #[test]
- fn check_equal_scores() {
- static PATTERNS: &[&str] = &["reveil", "dystroy", "broot", "AB", "z", "é", "év", "a"];
- static NAMES: &[&str] = &[
- " brr ooT",
- "Reveillon",
- "una a",
- "dys",
- "test",
- " a reveil",
- "a rbrroot",
- "Ab",
- "Eve",
- "zeévr",
- "alnékjhz vaoi",
- "jfhf br mleh & é kn rr o hrzpqôùù",
- "ÅΩ",
- ];
- for pattern in PATTERNS {
- let fp = FuzzyPattern::from(pattern);
- for name in NAMES {
- println!("checking pattern {:?} on name {:?}", pattern, name);
- assert_eq!(fp.score_of(name), fp.find(name).map(|m| m.score));
- }
- }
- }
-
/// check that the scores of all names are strictly decreasing
/// (pattern is first tested against itself).
/// We verify this property with both computation functions.
fn check_ordering_for(pattern: &str, names: &[&str]) {
let fp = FuzzyPattern::from(pattern);
- // checking using score_of
- let mut last_score = fp.score_of(pattern);
- let mut last_name = pattern;
- for name in names {
- let score = fp.score_of(name);
- assert!(
- score < last_score,
- "score({:?}) should be lower than score({:?}) (using score_of)",
- name,
- last_name
- );
- last_name = name;
- last_score = score;
- }
- // checking using find
let mut last_score = fp.find(pattern).map(|m| m.score);
let mut last_name = pattern;
for name in names {