diff options
author | Canop <cano.petrole@gmail.com> | 2021-03-12 18:29:29 +0100 |
---|---|---|
committer | Canop <cano.petrole@gmail.com> | 2021-03-12 20:41:20 +0100 |
commit | f98e055ecd7ee315ba812520cacea54ef8c83b94 (patch) | |
tree | 77da76bfc6e5db6d4aae020fdf7c6e0a383cecfd /src/pattern | |
parent | d863f3f17745e77b426e9c583085b94d79de679c (diff) |
improve fuzzy matching, holes minimization
Diffstat (limited to 'src/pattern')
-rw-r--r-- | src/pattern/fuzzy_pattern.rs | 206 | ||||
-rw-r--r-- | src/pattern/mod.rs | 2 | ||||
-rw-r--r-- | src/pattern/name_match.rs | 5 |
3 files changed, 137 insertions, 76 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs index af7f75c..076aaff 100644 --- a/src/pattern/fuzzy_pattern.rs +++ b/src/pattern/fuzzy_pattern.rs @@ -1,14 +1,16 @@ -//! a simple fuzzy pattern matcher for filename filtering / sorting. +//! a fuzzy pattern matcher for filename filtering / sorting. //! It's not meant for file contents but for small strings (less than 1000 chars) //! such as file names. use { super::NameMatch, secular, - smallvec::smallvec, + smallvec::{smallvec, SmallVec}, std::fmt::{self, Write}, }; +type CandChars = SmallVec<[char; 32]>; + // weights used in match score computing const BONUS_MATCH: i32 = 50_000; const BONUS_EXACT: i32 = 1_000; @@ -71,90 +73,75 @@ impl FuzzyPattern { } } - /// look for a match starting at a given character - fn match_starting_at_index( + fn tight_match_from_index( &self, - cand_chars: &[char], + cand_chars: &CandChars, start_idx: usize, // start index in candidate, in chars ) -> MatchSearchResult { - if cand_chars[start_idx] != self.chars[0] { - return MatchSearchResult::None; - } let mut pos = smallvec![0; self.chars.len()]; // positions of matching chars in candidate - let mut nb_holes = 0; - let mut nb_singled_chars = 0; - let mut cand_idx = 1 + start_idx; - if self.chars.len() > 1 { - let mut group_len = 1; - let mut pat_idx = 1; // index both in self.chars and pos - let mut in_hole = false; - loop { - if cand_chars[cand_idx] == self.chars[pat_idx] { - if in_hole { - // we're no more in a hole - in_hole = false; - if nb_holes > 0 { - // it's not the first hole. We see whether it's possible - // to join the previous group to the new one - let mut can_be_joined = true; - for ri in 0..group_len { - if cand_chars[cand_idx-ri-1] != self.chars[pat_idx-ri-1] { - can_be_joined = false; - break; - } - } - if can_be_joined { - for ri in 0..group_len { - pos[pat_idx-ri-1] = cand_idx-ri-1; - } - } else { - if group_len == 1 { - nb_singled_chars += 1; - } - nb_holes += 1; - group_len = 0; - } + let mut cand_idx = start_idx; + let mut pat_idx = 0; // index both in self.chars and pos + let mut in_hole = false; + loop { + if cand_chars[cand_idx] == self.chars[pat_idx] { + pos[pat_idx] = cand_idx; + if in_hole { + // We're no more in a hole. + // Let's look if we can bring back the chars before the hole + let mut rev_idx = 1; + loop { + if pat_idx < rev_idx { + break; + } + if cand_chars[cand_idx-rev_idx] == self.chars[pat_idx-rev_idx] { + pos[pat_idx-rev_idx] = cand_idx-rev_idx; } else { - // first hole - nb_holes += 1; - group_len = 0; + break; } + rev_idx += 1; } - pos[pat_idx] = cand_idx; - pat_idx += 1; - if pat_idx == self.chars.len() { - break; // match, finished - } - cand_idx += 1; - group_len += 1; - } else { - // there's a hole - if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx { - return MatchSearchResult::None; - } - cand_idx += 1; - in_hole = true; + in_hole = false; + } + pat_idx += 1; + if pat_idx == self.chars.len() { + break; // match, finished + } + } else { + // there's a hole + if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx { + return MatchSearchResult::None; } + in_hole = true; } + cand_idx += 1; } - pos[0] = start_idx; - let match_len = 1 + cand_idx - start_idx; + let mut nb_holes = 0; + let mut nb_singled_chars = 0; + for idx in 1..pos.len() { + if pos[idx] > 1 + pos[idx-1] { + nb_holes += 1; + if idx > 1 && pos[idx-1] > 1 + pos[idx-2] { + nb_singled_chars += 1; + } + } + } + let match_len = 1 + cand_idx - pos[0]; let mut score = BONUS_MATCH; score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32); score += BONUS_SINGLED_CHAR * (nb_singled_chars as i32); score += BONUS_NB_HOLES * (nb_holes as i32); score += match_len as i32 * BONUS_MATCH_LENGTH; - if start_idx == 0 { + if pos[0] == 0 { score += BONUS_START + BONUS_START_WORD; if cand_chars.len() == self.chars.len() { score += BONUS_EXACT; return MatchSearchResult::Perfect(NameMatch { score, pos }); } } else { - let previous = cand_chars[start_idx - 1]; + let previous = cand_chars[pos[0] - 1]; if is_word_separator(previous) { score += BONUS_START_WORD; - if cand_chars.len() - start_idx == self.chars.len() { + if cand_chars.len() - pos[0] == self.chars.len() { return MatchSearchResult::Perfect(NameMatch { score, pos }); } } @@ -169,7 +156,7 @@ impl FuzzyPattern { if candidate.len() < self.chars.len() { return None; } - let mut cand_chars: Vec<char> = Vec::with_capacity(candidate.len()); + let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len()); cand_chars.extend(candidate.chars().map(secular::lower_lay_char)); if cand_chars.len() < self.chars.len() { return None; @@ -178,17 +165,23 @@ impl FuzzyPattern { let mut best_match: Option<NameMatch> = None; let n = cand_chars.len() - self.chars.len(); for start_idx in 0..=n { - match self.match_starting_at_index(&cand_chars, start_idx) { - MatchSearchResult::Perfect(m) => { - return Some(m); - } - MatchSearchResult::Some(m) => { - if m.score > best_score { - best_score = m.score; - best_match = Some(m); + if cand_chars[start_idx] == self.chars[0] { + match self.tight_match_from_index(&cand_chars, start_idx) { + MatchSearchResult::Perfect(m) => { + return Some(m); + } + MatchSearchResult::Some(m) => { + if m.score > best_score { + best_score = m.score; + best_match = Some(m); + } + // we could make start_idx jump to pos[0] here + // but it doesn't improve the perfs (it's rare + // anyway to have pos[0] much greater than the + // start of the search) } + _ => {} } - _ => {} } } best_match @@ -205,7 +198,72 @@ impl FuzzyPattern { #[cfg(test)] mod fuzzy_pattern_tests { - use super::*; + use { + super::*, + crate::pattern::Pos, + }; + + /// check position of the match of the pattern in name + fn check_pos(pattern: &str, name: &str, pos: &str) { + let pat = FuzzyPattern::from(pattern); + let match_pos = pat.find(name).unwrap().pos; + let target_pos: Pos = pos.chars() + .enumerate() + .filter(|(_, c)| *c=='^') + .map(|(i, _)| i) + .collect(); + assert_eq!(match_pos, target_pos); + } + + /// test the quality of match length and groups number minimization + #[test] + fn check_match_pos() { + check_pos( + "ba", + "babababaaa", + "^^ ", + ); + check_pos( + "baa", + "babababaaa", + " ^^^ ", + ); + check_pos( + "bbabaa", + "babababaaa", + " ^ ^^^^^ ", + ); + check_pos( + "aoml", + "bacon.coml", + " ^ ^^^", + ); + check_pos( + "broot", + "ab br bro oxt ", + " ^^^ ^ ^ ", + ); + check_pos( + "broot", + "b_broooooot_broot", + " ^^^^^", + ); + check_pos( + "buityp", + "builder-styles-less-typing.d.ts", + "^^^ ^^^ ", + ); + check_pos( + "broot", + "ветер br x o vent ootot", + " ^^ ^^^ ", + ); + check_pos( + "broot", + "brbroorrbbbbbrrooorobrototooooot.txt", + " ^^^ ^^ ", + ); + } /// check that the scores of all names are strictly decreasing /// (pattern is first tested against itself). diff --git a/src/pattern/mod.rs b/src/pattern/mod.rs index d05c671..43c2698 100644 --- a/src/pattern/mod.rs +++ b/src/pattern/mod.rs @@ -22,7 +22,7 @@ pub use { exact_pattern::ExactPattern, fuzzy_pattern::FuzzyPattern, input_pattern::InputPattern, - name_match::NameMatch, + name_match::{NameMatch, Pos}, pattern::Pattern, pattern_object::PatternObject, pattern_parts::PatternParts, diff --git a/src/pattern/name_match.rs b/src/pattern/name_match.rs index 0350806..2459252 100644 --- a/src/pattern/name_match.rs +++ b/src/pattern/name_match.rs @@ -2,12 +2,15 @@ use { smallvec::SmallVec, }; +/// a vector of indexes of the matching characters (not bytes) +pub type Pos = SmallVec<[usize; 8]>; + /// A NameMatch is a positive result of pattern matching inside /// a filename or subpath #[derive(Debug, Clone)] pub struct NameMatch { pub score: i32, // score of the match, guaranteed strictly positive, bigger is better - pub pos: SmallVec<[usize; 8]>, // positions of the matching chars + pub pos: Pos, // positions of the matching chars } impl NameMatch { |