//! a fuzzy pattern matcher for filename filtering / sorting. //! It's not meant for file contents but for small strings (less than 1000 chars) //! such as file names. use { super::NameMatch, secular, smallvec::{smallvec, SmallVec}, std::fmt::{self, Write}, }; type CandChars = SmallVec<[char; 32]>; // weights used in match score computing const BONUS_MATCH: i32 = 50_000; const BONUS_EXACT: i32 = 1_000; const BONUS_START: i32 = 10; const BONUS_START_WORD: i32 = 5; const BONUS_CANDIDATE_LENGTH: i32 = -1; // per char const BONUS_MATCH_LENGTH: i32 = -10; // per char of length of the match const BONUS_NB_HOLES: i32 = -30; // there's also a max on that number const BONUS_SINGLED_CHAR: i32 = -15; // when there's a char, neither first not last, isolated /// A pattern for fuzzy matching #[derive(Debug, Clone)] pub struct FuzzyPattern { chars: Box<[char]>, // secularized characters max_nb_holes: usize, } impl fmt::Display for FuzzyPattern { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for &c in self.chars.iter() { f.write_char(c)? } Ok(()) } } enum MatchSearchResult { Perfect(NameMatch), // no need to test other positions Some(NameMatch), None, } fn is_word_separator(c: char) -> bool { matches!(c, '_' | ' ' | '-') } impl FuzzyPattern { /// build a pattern which will later be usable for fuzzy search. /// A pattern should be reused pub fn from(pat: &str) -> Self { let chars = secular::normalized_lower_lay_string(pat) .chars() .collect::>() .into_boxed_slice(); let max_nb_holes = match chars.len() { 1 => 0, 2 => 1, 3 => 2, 4 => 2, 5 => 2, 6 => 3, 7 => 3, 8 => 4, _ => chars.len() * 4 / 7, }; FuzzyPattern { chars, max_nb_holes, } } /// an "empty" pattern is one which accepts everything because /// it has no discriminant pub fn is_empty(&self) -> bool { self.chars.is_empty() } fn tight_match_from_index( &self, cand_chars: &CandChars, start_idx: usize, // start index in candidate, in chars ) -> MatchSearchResult { let mut pos = smallvec![0; self.chars.len()]; // positions of matching chars in candidate let mut cand_idx = start_idx; let mut pat_idx = 0; // index both in self.chars and pos let mut in_hole = false; loop { if cand_chars[cand_idx] == self.chars[pat_idx] { pos[pat_idx] = cand_idx; if in_hole { // We're no more in a hole. // Let's look if we can bring back the chars before the hole let mut rev_idx = 1; loop { if pat_idx < rev_idx { break; } if cand_chars[cand_idx-rev_idx] == self.chars[pat_idx-rev_idx] { // we move the pos forward pos[pat_idx-rev_idx] = cand_idx-rev_idx; } else { break; } rev_idx += 1; } in_hole = false; } pat_idx += 1; if pat_idx == self.chars.len() { break; // match, finished } } else { // there's a hole if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx { return MatchSearchResult::None; } in_hole = true; } cand_idx += 1; } let mut nb_holes = 0; let mut nb_singled_chars = 0; for idx in 1..pos.len() { if pos[idx] > 1 + pos[idx-1] { nb_holes += 1; if idx > 1 && pos[idx-1] > 1 + pos[idx-2] { // we improve a simple case: the one of a singleton which was created // by pushing forward a char if cand_chars[pos[idx-2]+1] == cand_chars[pos[idx-1]] { // in some cases we're really removing another singletons but // let's forget this pos[idx-1] = pos[idx-2]+1; nb_holes -= 1; } else { nb_singled_chars += 1; } } } } if nb_holes > self.max_nb_holes { return MatchSearchResult::None; } let match_len = 1 + cand_idx - pos[0]; let mut score = BONUS_MATCH; score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32); score += BONUS_SINGLED_CHAR * nb_singled_chars; score += BONUS_NB_HOLES * (nb_holes as i32); score += match_len as i32 * BONUS_MATCH_LENGTH; if pos[0] == 0 { score += BONUS_START + BONUS_START_WORD; if cand_chars.len() == self.chars.len() { score += BONUS_EXACT; return MatchSearchResult::Perfect(NameMatch { score, pos }); } } else { let previous = cand_chars[pos[0] - 1]; if is_word_separator(previous) { score += BONUS_START_WORD; if cand_chars.len() - pos[0] == self.chars.len() { return MatchSearchResult::Perfect(NameMatch { score, pos }); } } } MatchSearchResult::Some(NameMatch { score, pos }) } /// return a match if the pattern can be found in the candidate string. /// The algorithm tries to return the best one. For example if you search /// "abc" in "ababca-abc", the returned match would be at the end. pub fn find(&self, candidate: &str) -> Option { if candidate.len() < self.chars.len() { return None; } let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len()); cand_chars.extend(candidate.chars().map(secular::lower_lay_char)); if cand_chars.len() < self.chars.len() { return None; } let mut best_score = 0; let mut best_match: Option = None; let n = cand_chars.len() + 1 - self.chars.len(); for start_idx in 0..n { if cand_chars[start_idx] == self.chars[0] { match self.tight_match_from_index(&cand_chars, start_idx) { MatchSearchResult::Perfect(m) => { return Some(m); } MatchSearchResult::Some(m) => { if m.score > best_score { best_score = m.score; best_match = Some(m); } // we could make start_idx jump to pos[0] here // but it doesn't improve the perfs (it's rare // anyway to have pos[0] much greater than the // start of the search) } _ => {} } } } best_match } /// compute the score of the best match pub fn score_of(&self, candidate: &str) -> Option { self.find(candidate) .map(|nm| nm.score) } } #[cfg(test)] mod fuzzy_pattern_tests { use { super::*, crate::pattern::Pos, }; /// check position of the match of the pattern in name fn check_pos(pattern: &str, name: &str, pos: &str) { let pat = FuzzyPattern::from(pattern); let match_pos = pat.find(name).unwrap().pos; let target_pos: Pos = pos.chars() .enumerate() .filter(|(_, c)| *c=='^') .map(|(i, _)| i) .collect(); assert_eq!(match_pos, target_pos); } /// test the quality of match length and groups number minimization #[test] fn check_match_pos() { check_pos( "ba", "babababaaa", "^^ ", ); check_pos( "baa", "babababaaa", " ^^^ ", ); check_pos( "bbabaa", "babababaaa", " ^ ^^^^^ ", ); check_pos( "aoml", "bacon.coml", " ^ ^^^", ); check_pos( "broot", "ab br bro oxt ", " ^^^ ^ ^ ", ); check_pos( "broot", "b_broooooot_broot", " ^^^^^", ); check_pos( "buityp", "builder-styles-less-typing.d.ts", "^^^ ^^^ ", ); check_pos( "broot", "ветер br x o vent ootot", " ^^ ^^^ ", ); check_pos( "broot", "brbroorrbbbbbrrooorobrototooooot.txt", " ^^^ ^^ ", ); check_pos( "besh", "benches/shared", "^^ ^^ ", ); } /// check that the scores of all names are strictly decreasing /// (pattern is first tested against itself). /// We verify this property with both computation functions. fn check_ordering_for(pattern: &str, names: &[&str]) { let fp = FuzzyPattern::from(pattern); let mut last_score = fp.find(pattern).map(|m| m.score); let mut last_name = pattern; for name in names { let score = fp.find(name).map(|m| m.score); assert!( score < last_score, "score({name:?}) should be lower than score({last_name:?}) (using find)" ); last_name = name; last_score = score; } } #[test] fn check_orderings() { check_ordering_for( "broot", &[ "a broot", "abbroot", "abcbroot", " abdbroot", "1234broot1", "12345brrrroooottt", "12345brrr roooottt", "brot", ], ); check_ordering_for( "Abc", &[ "abCd", "aBdc", " abdc", " abdbccccc", " a b c", "nothing", ], ); check_ordering_for( "réveil", &[ "Réveillon", "Réveillons", " réveils", "πréveil", "déréveil", "Rêve-t-il ?", " rêves", ], ); } /// check that we don't fail to find strings differing by diacritics /// or unicode normalization. /// /// Note that we can't go past the limits of unicode normalization /// and for example I don't know how to make 'B́' one char (help welcome). /// This should normally not cause any problem for the user as searching /// `"ab"` in `"aB́"` will still match. #[test] fn check_equivalences() { fn check_equivalences_in(arr: &[&str]) { for pattern in arr.iter() { let fp = FuzzyPattern::from(pattern); for name in arr.iter() { println!("looking for pattern {pattern:?} in name {name:?}"); assert!(fp.find(name).unwrap().score > 0); } } } check_equivalences_in(&["aB", "ab", "àb", "âB"]); let c12 = "Comunicações"; assert_eq!(c12.len(), 14); assert_eq!(c12.chars().count(), 12); let c14 = "Comunicações"; assert_eq!(c14.len(), 16); assert_eq!(c14.chars().count(), 14); check_equivalences_in(&["comunicacoes", c12, c14]); } }