diff options
author | Canop <cano.petrole@gmail.com> | 2021-05-04 19:50:53 +0200 |
---|---|---|
committer | Canop <cano.petrole@gmail.com> | 2021-05-04 19:50:53 +0200 |
commit | d97b05a9536b08cd48dec4d3e244e14d4ed080ef (patch) | |
tree | 941b7f42b80421c06d4c2122916cb384ae25c760 /src/pattern | |
parent | 57277840522e463546476c0bda34314b9e9b2c21 (diff) |
first implementation of a "tokens" pattern
Diffstat (limited to 'src/pattern')
-rw-r--r-- | src/pattern/fuzzy_pattern.rs | 3 | ||||
-rw-r--r-- | src/pattern/mod.rs | 6 | ||||
-rw-r--r-- | src/pattern/name_match.rs | 5 | ||||
-rw-r--r-- | src/pattern/pattern.rs | 18 | ||||
-rw-r--r-- | src/pattern/pos.rs | 7 | ||||
-rw-r--r-- | src/pattern/search_mode.rs | 32 | ||||
-rw-r--r-- | src/pattern/tok_pattern.rs | 190 |
7 files changed, 243 insertions, 18 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs index 45857b0..b0be711 100644 --- a/src/pattern/fuzzy_pattern.rs +++ b/src/pattern/fuzzy_pattern.rs @@ -186,8 +186,7 @@ impl FuzzyPattern { best_match } - /// compute the score of the best match, in a way mostly similar to `find` but - /// faster by not storing match positions + /// compute the score of the best match pub fn score_of(&self, candidate: &str) -> Option<i32> { self.find(candidate) .map(|nm| nm.score) diff --git a/src/pattern/mod.rs b/src/pattern/mod.rs index 43c2698..f867f7d 100644 --- a/src/pattern/mod.rs +++ b/src/pattern/mod.rs @@ -11,8 +11,10 @@ mod operator; mod pattern; mod pattern_object; mod pattern_parts; +mod pos; mod regex_pattern; mod search_mode; +mod tok_pattern; pub use { candidate::Candidate, @@ -22,13 +24,15 @@ pub use { exact_pattern::ExactPattern, fuzzy_pattern::FuzzyPattern, input_pattern::InputPattern, - name_match::{NameMatch, Pos}, + name_match::NameMatch, pattern::Pattern, pattern_object::PatternObject, pattern_parts::PatternParts, + pos::*, operator::PatternOperator, regex_pattern::RegexPattern, search_mode::*, + tok_pattern::*, }; use crate::errors::PatternError; diff --git a/src/pattern/name_match.rs b/src/pattern/name_match.rs index 2459252..a4bffcf 100644 --- a/src/pattern/name_match.rs +++ b/src/pattern/name_match.rs @@ -1,10 +1,7 @@ use { - smallvec::SmallVec, + super::Pos, }; -/// a vector of indexes of the matching characters (not bytes) -pub type Pos = SmallVec<[usize; 8]>; - /// A NameMatch is a positive result of pattern matching inside /// a filename or subpath #[derive(Debug, Clone)] diff --git a/src/pattern/pattern.rs b/src/pattern/pattern.rs index 6bf7c75..157c0ac 100644 --- a/src/pattern/pattern.rs +++ b/src/pattern/pattern.rs @@ -17,9 +17,11 @@ pub enum Pattern { NameExact(ExactPattern), NameFuzzy(FuzzyPattern), NameRegex(RegexPattern), + NameTokens(TokPattern), PathExact(ExactPattern), PathFuzzy(FuzzyPattern), PathRegex(RegexPattern), + PathTokens(TokPattern), ContentExact(ContentExactPattern), ContentRegex(ContentRegexPattern), Composite(CompositePattern), @@ -51,6 +53,9 @@ impl Pattern { SearchMode::NameRegex => Self::NameRegex( RegexPattern::from(core, flags.unwrap_or(""))? ), + SearchMode::NameTokens => Self::NameTokens( + TokPattern::new(core, ',') + ), SearchMode::PathExact => Self::PathExact( ExactPattern::from(core) ), @@ -60,6 +65,9 @@ impl Pattern { SearchMode::PathRegex => Self::PathRegex( RegexPattern::from(core, flags.unwrap_or(""))? ), + SearchMode::PathTokens => Self::PathTokens( + TokPattern::new(core, ',') + ), SearchMode::ContentExact => Self::ContentExact( ContentExactPattern::from(core) ), @@ -83,10 +91,10 @@ impl Pattern { let mut object = PatternObject::default(); match self { Self::None => {} - Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) => { + Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) | Self::NameTokens(_) => { object.name = true; } - Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) => { + Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) | Self::PathTokens(_) => { object.subpath = true; } Self::ContentExact(_) | Self::ContentRegex(_) => { @@ -109,9 +117,11 @@ impl Pattern { Self::NameExact(ep) => ep.find(candidate), Self::NameFuzzy(fp) => fp.find(candidate), Self::NameRegex(rp) => rp.find(candidate), + Self::NameTokens(tp) => tp.find(candidate), Self::PathExact(ep) => ep.find(candidate), Self::PathFuzzy(fp) => fp.find(candidate), Self::PathRegex(rp) => rp.find(candidate), + Self::PathTokens(tp) => tp.find(candidate), Self::Composite(cp) => cp.search_string(candidate), _ => None, } @@ -137,9 +147,11 @@ impl Pattern { Self::NameExact(ep) => ep.score_of(&candidate.name), Self::NameFuzzy(fp) => fp.score_of(&candidate.name), Self::NameRegex(rp) => rp.find(&candidate.name).map(|m| m.score), + Self::NameTokens(tp) => tp.score_of(&candidate.name), Self::PathExact(ep) => ep.score_of(&candidate.subpath), Self::PathFuzzy(fp) => fp.score_of(&candidate.subpath), Self::PathRegex(rp) => rp.find(&candidate.subpath).map(|m| m.score), + Self::PathTokens(tp) => tp.score_of(&candidate.subpath), Self::ContentExact(cp) => cp.score_of(candidate), Self::ContentRegex(cp) => cp.score_of(candidate), Self::Composite(cp) => cp.score_of(candidate), @@ -152,9 +164,11 @@ impl Pattern { Self::NameExact(ep) => ep.score_of(&candidate), Self::NameFuzzy(fp) => fp.score_of(&candidate), Self::NameRegex(rp) => rp.find(&candidate).map(|m| m.score), + Self::NameTokens(tp) => tp.score_of(&candidate), Self::PathExact(ep) => ep.score_of(&candidate), Self::PathFuzzy(fp) => fp.score_of(&candidate), Self::PathRegex(rp) => rp.find(&candidate).map(|m| m.score), + Self::PathTokens(tp) => tp.score_of(&candidate), Self::ContentExact(_) => None, // this isn't suitable Self::ContentRegex(_) => None, // this isn't suitable Self::Composite(cp) => cp.score_of_string(candidate), diff --git a/src/pattern/pos.rs b/src/pattern/pos.rs new file mode 100644 index 0000000..cee14a5 --- /dev/null +++ b/src/pattern/pos.rs @@ -0,0 +1,7 @@ +use { + smallvec::SmallVec, +}; + +/// a vector of indexes of the matching characters (not bytes) +pub type Pos = SmallVec<[usize; 8]>; + diff --git a/src/pattern/search_mode.rs b/src/pattern/search_mode.rs index 1254c62..a3267fe 100644 --- a/src/pattern/search_mode.rs +++ b/src/pattern/search_mode.rs @@ -20,6 +20,7 @@ pub enum SearchKind { Exact, Fuzzy, Regex, + Tokens, Unspecified, } @@ -30,9 +31,11 @@ pub enum SearchMode { NameExact, NameFuzzy, NameRegex, + NameTokens, PathExact, PathFuzzy, PathRegex, + PathTokens, ContentExact, ContentRegex, } @@ -41,9 +44,11 @@ pub static SEARCH_MODES: &[SearchMode] = &[ SearchMode::NameFuzzy, SearchMode::NameRegex, SearchMode::NameExact, + SearchMode::NameTokens, + SearchMode::PathExact, SearchMode::PathFuzzy, SearchMode::PathRegex, - SearchMode::PathExact, + SearchMode::PathTokens, SearchMode::ContentExact, SearchMode::ContentRegex, ]; @@ -59,22 +64,25 @@ impl SearchMode { (Name, Exact) => Some(Self::NameExact), (Name, Fuzzy) => Some(Self::NameFuzzy), (Name, Regex) => Some(Self::NameRegex), + (Name, Tokens) => Some(Self::NameTokens), (Path, Unspecified) => Some(Self::PathFuzzy), (Path, Exact) => Some(Self::PathExact), (Path, Fuzzy) => Some(Self::PathFuzzy), (Path, Regex) => Some(Self::PathRegex), + (Path, Tokens) => Some(Self::PathTokens), (Content, Unspecified) => Some(Self::ContentExact), (Content, Exact) => Some(Self::ContentExact), (Content, Fuzzy) => None, // unsupported for now - could be but why ? (Content, Regex) => Some(Self::ContentRegex), + (Content, Tokens) => None, // unsupported for now - could be but need bench } } pub fn object(&self) -> SearchObject { match self { - Self::NameExact | Self::NameFuzzy | Self::NameRegex => SearchObject::Name, - Self::PathExact | Self::PathFuzzy | Self::PathRegex => SearchObject::Path, + Self::NameExact | Self::NameFuzzy | Self::NameRegex | Self::NameTokens => SearchObject::Name, + Self::PathExact | Self::PathFuzzy | Self::PathRegex | Self::PathTokens => SearchObject::Path, Self::ContentExact | Self::ContentRegex => SearchObject::Content, } } @@ -83,9 +91,11 @@ impl SearchMode { Self::NameExact => SearchKind::Exact, Self::NameFuzzy => SearchKind::Fuzzy, Self::NameRegex => SearchKind::Regex, + Self::NameTokens => SearchKind::Tokens, Self::PathExact => SearchKind::Exact, Self::PathFuzzy => SearchKind::Fuzzy, Self::PathRegex => SearchKind::Regex, + Self::PathTokens => SearchKind::Tokens, Self::ContentExact => SearchKind::Exact, Self::ContentRegex => SearchKind::Regex, } @@ -130,14 +140,16 @@ impl SearchModeMapEntry { let exact = s.contains("exact"); let fuzzy = s.contains("fuzzy"); let regex = s.contains("regex"); - let search_kind = match (exact, fuzzy, regex) { - (false, false, false) => SearchKind::Unspecified, - (true, false, false) => SearchKind::Exact, - (false, true, false) => SearchKind::Fuzzy, - (false, false, true) => SearchKind::Regex, + let tokens = s.contains("tokens"); + let search_kind = match (exact, fuzzy, regex, tokens) { + (false, false, false, false) => SearchKind::Unspecified, + (true, false, false, false) => SearchKind::Exact, + (false, true, false, false) => SearchKind::Fuzzy, + (false, false, true, false) => SearchKind::Regex, + (false, false, false, true) => SearchKind::Tokens, _ => { return Err(ConfError::InvalidSearchMode { - details: "you may have at most one of \"exact\", \"fuzzy\" or \"regex\"".to_string() + details: "you may have at most one of \"exact\", \"fuzzy\", \"regex\", or \"tokens\"".to_string() }); } }; @@ -181,6 +193,8 @@ impl Default for SearchModeMap { smm.setm(&["pr", "rp"], SearchMode::PathRegex); smm.setm(&["ce", "ec", "c"], SearchMode::ContentExact); smm.setm(&["rx", "cr"], SearchMode::ContentRegex); + smm.setm(&["pt", "tp", "t"], SearchMode::PathTokens); + smm.setm(&["pn", "np"], SearchMode::NameTokens); smm.set(SearchModeMapEntry { key: None, mode: SearchMode::NameFuzzy }); smm } diff --git a/src/pattern/tok_pattern.rs b/src/pattern/tok_pattern.rs new file mode 100644 index 0000000..fc98d68 --- /dev/null +++ b/src/pattern/tok_pattern.rs @@ -0,0 +1,190 @@ +use { + super::NameMatch, + secular, + smallvec::{smallvec, SmallVec}, + std::{ + cmp::Reverse, + ops::Range, + }, +}; + +type CandChars = SmallVec<[char; 32]>; + +/// a list of tokens we want to find, non overlapping +/// and in any order, in strings +#[derive(Debug, Clone)] +pub struct TokPattern { + toks: Vec<Box<[char]>>, + sum_len: usize, +} + +// optimizations to test: +// - score_of doesn't need to compute pos, just the ranges +// - try the first tok before creating the matching_ranges vec +// (the first one is also impler to test) +// - if there's no room for any of the other toks before the +// first matching tok, start other loops after the first +// matching range +// But until I add some scoring and multiple positions tests +// there should not be any perf concerns +// scoring basis ? +// - number of parts of the candidats (separated by / for example) +// that are touched by a tok ? +// - malus for adjacent ranges +// - bonus for ranges starting just after a separator +// - bonus for order ? +impl TokPattern { + + pub fn new(pattern: &str, sep: char) -> Self { + let mut toks: Vec<Box<[char]>> = pattern.split(sep) + .filter(|s| s.len() > 0) + .map(|s| { + secular::normalized_lower_lay_string(s) + .chars() + .collect::<Vec<char>>() + .into_boxed_slice() + }) + .collect(); + // we sort the tokens from biggest to smallest + // because the current algorithm stops at the + // first match for any tok. Thus it would fail + // to find "abc,b" in "abcdb" if it looked first + // at the "b" token + toks.sort_by_key(|t| Reverse(t.len())); + let sum_len = toks.iter().map(|s| s.len()).sum(); + Self { + toks, + sum_len, + } + } + + pub fn find(&self, candidate: &str) -> Option<NameMatch> { + if candidate.len() < self.sum_len { + return None; + } + let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len()); + cand_chars.extend(candidate.chars().map(secular::lower_lay_char)); + let mut matching_ranges: Vec<Range<usize>> = Vec::with_capacity(self.toks.len()); + for tok in &self.toks { + let l = tok.len(); + let matching_range = (0..cand_chars.len()+1-l) + .map(|idx| idx..idx+l) + .filter(|r| { + &cand_chars[r.start..r.end] == tok.as_ref() + }) + .filter(|r| { + // check we're not intersecting a previous range + for pr in &matching_ranges { + if pr.contains(&r.start) || pr.contains(&(r.end-1)) { + return false; + } + } + true + }) + .next(); + if let Some(r) = matching_range { + matching_ranges.push(r); + } else { + return None; + } + } + let mut pos = smallvec![0; self.sum_len]; + let mut i = 0; + for r in matching_ranges { + for p in r { + pos[i] = p; + i += 1; + } + } + pos.sort(); + let score = 42; // maybe find a better scoring + Some(NameMatch { score, pos }) + } + + /// compute the score of the best match + pub fn score_of(&self, candidate: &str) -> Option<i32> { + self.find(candidate) + .map(|nm| nm.score) + } +} + +#[cfg(test)] +mod tok_pattern_tests { + + use { + super::*, + crate::pattern::Pos, + }; + + /// check position of the match of the pattern in name + fn check_pos(pattern: &str, name: &str, pos: &str) { + let pat = TokPattern::new(pattern, ','); + let match_pos = pat.find(name).unwrap().pos; + let target_pos: Pos = pos.chars() + .enumerate() + .filter(|(_, c)| *c=='^') + .map(|(i, _)| i) + .collect(); + assert_eq!(match_pos, target_pos); + } + + #[test] + fn check_match_pos() { + check_pos( + "m,", + "miaou", + "^ ", + ); + check_pos( + "bat", + "cabat", + " ^^^", + ); + check_pos( + "ba", + "babababaaa", + "^^ ", + ); + check_pos( + "ba,ca", + "bababacaa", + "^^ ^^ ", + ); + check_pos( + "sub,doc,2", + "/home/user/path2/subpath/Documents/", + " ^ ^^^ ^^^", + ); + check_pos( + "ab,abc", + "0123/abc/ab/cdg", + " ^^^ ^^ ", + ); + } + + fn check_match(pattern: &str, name: &str, do_match: bool) { + assert_eq!( + TokPattern::new(pattern, ',').find(name).is_some(), + do_match, + ); + } + + #[test] + fn test_match() { + check_match("mia", "android/phonegap", false); + check_match("mi", "a", false); + check_match("mi", "π", false); + check_match("mi", "miaou/a", true); + } + + #[test] + fn test_tok_repetitions() { + check_match("sub", "rasub", true); + check_match("sub,sub", "rasub", false); + check_match("sub,sub", "rasubandsub", true); + check_match("sub,sub,sub", "rasubandsub", false); + check_match("ghi,abc,def,ccc", "abccc/Defghi", false); + check_match("ghi,abc,def,ccc", "abcccc/Defghi", true); + } + +} |