summaryrefslogtreecommitdiffstats
path: root/src/pattern
diff options
context:
space:
mode:
authorCanop <cano.petrole@gmail.com>2021-03-12 18:29:29 +0100
committerCanop <cano.petrole@gmail.com>2021-03-12 20:41:20 +0100
commitf98e055ecd7ee315ba812520cacea54ef8c83b94 (patch)
tree77da76bfc6e5db6d4aae020fdf7c6e0a383cecfd /src/pattern
parentd863f3f17745e77b426e9c583085b94d79de679c (diff)
improve fuzzy matching, holes minimization
Diffstat (limited to 'src/pattern')
-rw-r--r--src/pattern/fuzzy_pattern.rs206
-rw-r--r--src/pattern/mod.rs2
-rw-r--r--src/pattern/name_match.rs5
3 files changed, 137 insertions, 76 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs
index af7f75c..076aaff 100644
--- a/src/pattern/fuzzy_pattern.rs
+++ b/src/pattern/fuzzy_pattern.rs
@@ -1,14 +1,16 @@
-//! a simple fuzzy pattern matcher for filename filtering / sorting.
+//! a fuzzy pattern matcher for filename filtering / sorting.
//! It's not meant for file contents but for small strings (less than 1000 chars)
//! such as file names.
use {
super::NameMatch,
secular,
- smallvec::smallvec,
+ smallvec::{smallvec, SmallVec},
std::fmt::{self, Write},
};
+type CandChars = SmallVec<[char; 32]>;
+
// weights used in match score computing
const BONUS_MATCH: i32 = 50_000;
const BONUS_EXACT: i32 = 1_000;
@@ -71,90 +73,75 @@ impl FuzzyPattern {
}
}
- /// look for a match starting at a given character
- fn match_starting_at_index(
+ fn tight_match_from_index(
&self,
- cand_chars: &[char],
+ cand_chars: &CandChars,
start_idx: usize, // start index in candidate, in chars
) -> MatchSearchResult {
- if cand_chars[start_idx] != self.chars[0] {
- return MatchSearchResult::None;
- }
let mut pos = smallvec![0; self.chars.len()]; // positions of matching chars in candidate
- let mut nb_holes = 0;
- let mut nb_singled_chars = 0;
- let mut cand_idx = 1 + start_idx;
- if self.chars.len() > 1 {
- let mut group_len = 1;
- let mut pat_idx = 1; // index both in self.chars and pos
- let mut in_hole = false;
- loop {
- if cand_chars[cand_idx] == self.chars[pat_idx] {
- if in_hole {
- // we're no more in a hole
- in_hole = false;
- if nb_holes > 0 {
- // it's not the first hole. We see whether it's possible
- // to join the previous group to the new one
- let mut can_be_joined = true;
- for ri in 0..group_len {
- if cand_chars[cand_idx-ri-1] != self.chars[pat_idx-ri-1] {
- can_be_joined = false;
- break;
- }
- }
- if can_be_joined {
- for ri in 0..group_len {
- pos[pat_idx-ri-1] = cand_idx-ri-1;
- }
- } else {
- if group_len == 1 {
- nb_singled_chars += 1;
- }
- nb_holes += 1;
- group_len = 0;
- }
+ let mut cand_idx = start_idx;
+ let mut pat_idx = 0; // index both in self.chars and pos
+ let mut in_hole = false;
+ loop {
+ if cand_chars[cand_idx] == self.chars[pat_idx] {
+ pos[pat_idx] = cand_idx;
+ if in_hole {
+ // We're no more in a hole.
+ // Let's look if we can bring back the chars before the hole
+ let mut rev_idx = 1;
+ loop {
+ if pat_idx < rev_idx {
+ break;
+ }
+ if cand_chars[cand_idx-rev_idx] == self.chars[pat_idx-rev_idx] {
+ pos[pat_idx-rev_idx] = cand_idx-rev_idx;
} else {
- // first hole
- nb_holes += 1;
- group_len = 0;
+ break;
}
+ rev_idx += 1;
}
- pos[pat_idx] = cand_idx;
- pat_idx += 1;
- if pat_idx == self.chars.len() {
- break; // match, finished
- }
- cand_idx += 1;
- group_len += 1;
- } else {
- // there's a hole
- if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx {
- return MatchSearchResult::None;
- }
- cand_idx += 1;
- in_hole = true;
+ in_hole = false;
+ }
+ pat_idx += 1;
+ if pat_idx == self.chars.len() {
+ break; // match, finished
+ }
+ } else {
+ // there's a hole
+ if cand_chars.len() - cand_idx <= self.chars.len() - pat_idx {
+ return MatchSearchResult::None;
}
+ in_hole = true;
}
+ cand_idx += 1;
}
- pos[0] = start_idx;
- let match_len = 1 + cand_idx - start_idx;
+ let mut nb_holes = 0;
+ let mut nb_singled_chars = 0;
+ for idx in 1..pos.len() {
+ if pos[idx] > 1 + pos[idx-1] {
+ nb_holes += 1;
+ if idx > 1 && pos[idx-1] > 1 + pos[idx-2] {
+ nb_singled_chars += 1;
+ }
+ }
+ }
+ let match_len = 1 + cand_idx - pos[0];
let mut score = BONUS_MATCH;
score += BONUS_CANDIDATE_LENGTH * (cand_chars.len() as i32);
score += BONUS_SINGLED_CHAR * (nb_singled_chars as i32);
score += BONUS_NB_HOLES * (nb_holes as i32);
score += match_len as i32 * BONUS_MATCH_LENGTH;
- if start_idx == 0 {
+ if pos[0] == 0 {
score += BONUS_START + BONUS_START_WORD;
if cand_chars.len() == self.chars.len() {
score += BONUS_EXACT;
return MatchSearchResult::Perfect(NameMatch { score, pos });
}
} else {
- let previous = cand_chars[start_idx - 1];
+ let previous = cand_chars[pos[0] - 1];
if is_word_separator(previous) {
score += BONUS_START_WORD;
- if cand_chars.len() - start_idx == self.chars.len() {
+ if cand_chars.len() - pos[0] == self.chars.len() {
return MatchSearchResult::Perfect(NameMatch { score, pos });
}
}
@@ -169,7 +156,7 @@ impl FuzzyPattern {
if candidate.len() < self.chars.len() {
return None;
}
- let mut cand_chars: Vec<char> = Vec::with_capacity(candidate.len());
+ let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len());
cand_chars.extend(candidate.chars().map(secular::lower_lay_char));
if cand_chars.len() < self.chars.len() {
return None;
@@ -178,17 +165,23 @@ impl FuzzyPattern {
let mut best_match: Option<NameMatch> = None;
let n = cand_chars.len() - self.chars.len();
for start_idx in 0..=n {
- match self.match_starting_at_index(&cand_chars, start_idx) {
- MatchSearchResult::Perfect(m) => {
- return Some(m);
- }
- MatchSearchResult::Some(m) => {
- if m.score > best_score {
- best_score = m.score;
- best_match = Some(m);
+ if cand_chars[start_idx] == self.chars[0] {
+ match self.tight_match_from_index(&cand_chars, start_idx) {
+ MatchSearchResult::Perfect(m) => {
+ return Some(m);
+ }
+ MatchSearchResult::Some(m) => {
+ if m.score > best_score {
+ best_score = m.score;
+ best_match = Some(m);
+ }
+ // we could make start_idx jump to pos[0] here
+ // but it doesn't improve the perfs (it's rare
+ // anyway to have pos[0] much greater than the
+ // start of the search)
}
+ _ => {}
}
- _ => {}
}
}
best_match
@@ -205,7 +198,72 @@ impl FuzzyPattern {
#[cfg(test)]
mod fuzzy_pattern_tests {
- use super::*;
+ use {
+ super::*,
+ crate::pattern::Pos,
+ };
+
+ /// check position of the match of the pattern in name
+ fn check_pos(pattern: &str, name: &str, pos: &str) {
+ let pat = FuzzyPattern::from(pattern);
+ let match_pos = pat.find(name).unwrap().pos;
+ let target_pos: Pos = pos.chars()
+ .enumerate()
+ .filter(|(_, c)| *c=='^')
+ .map(|(i, _)| i)
+ .collect();
+ assert_eq!(match_pos, target_pos);
+ }
+
+ /// test the quality of match length and groups number minimization
+ #[test]
+ fn check_match_pos() {
+ check_pos(
+ "ba",
+ "babababaaa",
+ "^^ ",
+ );
+ check_pos(
+ "baa",
+ "babababaaa",
+ " ^^^ ",
+ );
+ check_pos(
+ "bbabaa",
+ "babababaaa",
+ " ^ ^^^^^ ",
+ );
+ check_pos(
+ "aoml",
+ "bacon.coml",
+ " ^ ^^^",
+ );
+ check_pos(
+ "broot",
+ "ab br bro oxt ",
+ " ^^^ ^ ^ ",
+ );
+ check_pos(
+ "broot",
+ "b_broooooot_broot",
+ " ^^^^^",
+ );
+ check_pos(
+ "buityp",
+ "builder-styles-less-typing.d.ts",
+ "^^^ ^^^ ",
+ );
+ check_pos(
+ "broot",
+ "ветер br x o vent ootot",
+ " ^^ ^^^ ",
+ );
+ check_pos(
+ "broot",
+ "brbroorrbbbbbrrooorobrototooooot.txt",
+ " ^^^ ^^ ",
+ );
+ }
/// check that the scores of all names are strictly decreasing
/// (pattern is first tested against itself).
diff --git a/src/pattern/mod.rs b/src/pattern/mod.rs
index d05c671..43c2698 100644
--- a/src/pattern/mod.rs
+++ b/src/pattern/mod.rs
@@ -22,7 +22,7 @@ pub use {
exact_pattern::ExactPattern,
fuzzy_pattern::FuzzyPattern,
input_pattern::InputPattern,
- name_match::NameMatch,
+ name_match::{NameMatch, Pos},
pattern::Pattern,
pattern_object::PatternObject,
pattern_parts::PatternParts,
diff --git a/src/pattern/name_match.rs b/src/pattern/name_match.rs
index 0350806..2459252 100644
--- a/src/pattern/name_match.rs
+++ b/src/pattern/name_match.rs
@@ -2,12 +2,15 @@ use {
smallvec::SmallVec,
};
+/// a vector of indexes of the matching characters (not bytes)
+pub type Pos = SmallVec<[usize; 8]>;
+
/// A NameMatch is a positive result of pattern matching inside
/// a filename or subpath
#[derive(Debug, Clone)]
pub struct NameMatch {
pub score: i32, // score of the match, guaranteed strictly positive, bigger is better
- pub pos: SmallVec<[usize; 8]>, // positions of the matching chars
+ pub pos: Pos, // positions of the matching chars
}
impl NameMatch {