summaryrefslogtreecommitdiffstats
path: root/src/pattern
diff options
context:
space:
mode:
authorCanop <cano.petrole@gmail.com>2021-05-04 19:50:53 +0200
committerCanop <cano.petrole@gmail.com>2021-05-04 19:50:53 +0200
commitd97b05a9536b08cd48dec4d3e244e14d4ed080ef (patch)
tree941b7f42b80421c06d4c2122916cb384ae25c760 /src/pattern
parent57277840522e463546476c0bda34314b9e9b2c21 (diff)
first implementation of a "tokens" pattern
Diffstat (limited to 'src/pattern')
-rw-r--r--src/pattern/fuzzy_pattern.rs3
-rw-r--r--src/pattern/mod.rs6
-rw-r--r--src/pattern/name_match.rs5
-rw-r--r--src/pattern/pattern.rs18
-rw-r--r--src/pattern/pos.rs7
-rw-r--r--src/pattern/search_mode.rs32
-rw-r--r--src/pattern/tok_pattern.rs190
7 files changed, 243 insertions, 18 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs
index 45857b0..b0be711 100644
--- a/src/pattern/fuzzy_pattern.rs
+++ b/src/pattern/fuzzy_pattern.rs
@@ -186,8 +186,7 @@ impl FuzzyPattern {
best_match
}
- /// compute the score of the best match, in a way mostly similar to `find` but
- /// faster by not storing match positions
+ /// compute the score of the best match
pub fn score_of(&self, candidate: &str) -> Option<i32> {
self.find(candidate)
.map(|nm| nm.score)
diff --git a/src/pattern/mod.rs b/src/pattern/mod.rs
index 43c2698..f867f7d 100644
--- a/src/pattern/mod.rs
+++ b/src/pattern/mod.rs
@@ -11,8 +11,10 @@ mod operator;
mod pattern;
mod pattern_object;
mod pattern_parts;
+mod pos;
mod regex_pattern;
mod search_mode;
+mod tok_pattern;
pub use {
candidate::Candidate,
@@ -22,13 +24,15 @@ pub use {
exact_pattern::ExactPattern,
fuzzy_pattern::FuzzyPattern,
input_pattern::InputPattern,
- name_match::{NameMatch, Pos},
+ name_match::NameMatch,
pattern::Pattern,
pattern_object::PatternObject,
pattern_parts::PatternParts,
+ pos::*,
operator::PatternOperator,
regex_pattern::RegexPattern,
search_mode::*,
+ tok_pattern::*,
};
use crate::errors::PatternError;
diff --git a/src/pattern/name_match.rs b/src/pattern/name_match.rs
index 2459252..a4bffcf 100644
--- a/src/pattern/name_match.rs
+++ b/src/pattern/name_match.rs
@@ -1,10 +1,7 @@
use {
- smallvec::SmallVec,
+ super::Pos,
};
-/// a vector of indexes of the matching characters (not bytes)
-pub type Pos = SmallVec<[usize; 8]>;
-
/// A NameMatch is a positive result of pattern matching inside
/// a filename or subpath
#[derive(Debug, Clone)]
diff --git a/src/pattern/pattern.rs b/src/pattern/pattern.rs
index 6bf7c75..157c0ac 100644
--- a/src/pattern/pattern.rs
+++ b/src/pattern/pattern.rs
@@ -17,9 +17,11 @@ pub enum Pattern {
NameExact(ExactPattern),
NameFuzzy(FuzzyPattern),
NameRegex(RegexPattern),
+ NameTokens(TokPattern),
PathExact(ExactPattern),
PathFuzzy(FuzzyPattern),
PathRegex(RegexPattern),
+ PathTokens(TokPattern),
ContentExact(ContentExactPattern),
ContentRegex(ContentRegexPattern),
Composite(CompositePattern),
@@ -51,6 +53,9 @@ impl Pattern {
SearchMode::NameRegex => Self::NameRegex(
RegexPattern::from(core, flags.unwrap_or(""))?
),
+ SearchMode::NameTokens => Self::NameTokens(
+ TokPattern::new(core, ',')
+ ),
SearchMode::PathExact => Self::PathExact(
ExactPattern::from(core)
),
@@ -60,6 +65,9 @@ impl Pattern {
SearchMode::PathRegex => Self::PathRegex(
RegexPattern::from(core, flags.unwrap_or(""))?
),
+ SearchMode::PathTokens => Self::PathTokens(
+ TokPattern::new(core, ',')
+ ),
SearchMode::ContentExact => Self::ContentExact(
ContentExactPattern::from(core)
),
@@ -83,10 +91,10 @@ impl Pattern {
let mut object = PatternObject::default();
match self {
Self::None => {}
- Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) => {
+ Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) | Self::NameTokens(_) => {
object.name = true;
}
- Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) => {
+ Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) | Self::PathTokens(_) => {
object.subpath = true;
}
Self::ContentExact(_) | Self::ContentRegex(_) => {
@@ -109,9 +117,11 @@ impl Pattern {
Self::NameExact(ep) => ep.find(candidate),
Self::NameFuzzy(fp) => fp.find(candidate),
Self::NameRegex(rp) => rp.find(candidate),
+ Self::NameTokens(tp) => tp.find(candidate),
Self::PathExact(ep) => ep.find(candidate),
Self::PathFuzzy(fp) => fp.find(candidate),
Self::PathRegex(rp) => rp.find(candidate),
+ Self::PathTokens(tp) => tp.find(candidate),
Self::Composite(cp) => cp.search_string(candidate),
_ => None,
}
@@ -137,9 +147,11 @@ impl Pattern {
Self::NameExact(ep) => ep.score_of(&candidate.name),
Self::NameFuzzy(fp) => fp.score_of(&candidate.name),
Self::NameRegex(rp) => rp.find(&candidate.name).map(|m| m.score),
+ Self::NameTokens(tp) => tp.score_of(&candidate.name),
Self::PathExact(ep) => ep.score_of(&candidate.subpath),
Self::PathFuzzy(fp) => fp.score_of(&candidate.subpath),
Self::PathRegex(rp) => rp.find(&candidate.subpath).map(|m| m.score),
+ Self::PathTokens(tp) => tp.score_of(&candidate.subpath),
Self::ContentExact(cp) => cp.score_of(candidate),
Self::ContentRegex(cp) => cp.score_of(candidate),
Self::Composite(cp) => cp.score_of(candidate),
@@ -152,9 +164,11 @@ impl Pattern {
Self::NameExact(ep) => ep.score_of(&candidate),
Self::NameFuzzy(fp) => fp.score_of(&candidate),
Self::NameRegex(rp) => rp.find(&candidate).map(|m| m.score),
+ Self::NameTokens(tp) => tp.score_of(&candidate),
Self::PathExact(ep) => ep.score_of(&candidate),
Self::PathFuzzy(fp) => fp.score_of(&candidate),
Self::PathRegex(rp) => rp.find(&candidate).map(|m| m.score),
+ Self::PathTokens(tp) => tp.score_of(&candidate),
Self::ContentExact(_) => None, // this isn't suitable
Self::ContentRegex(_) => None, // this isn't suitable
Self::Composite(cp) => cp.score_of_string(candidate),
diff --git a/src/pattern/pos.rs b/src/pattern/pos.rs
new file mode 100644
index 0000000..cee14a5
--- /dev/null
+++ b/src/pattern/pos.rs
@@ -0,0 +1,7 @@
+use {
+ smallvec::SmallVec,
+};
+
+/// a vector of indexes of the matching characters (not bytes)
+pub type Pos = SmallVec<[usize; 8]>;
+
diff --git a/src/pattern/search_mode.rs b/src/pattern/search_mode.rs
index 1254c62..a3267fe 100644
--- a/src/pattern/search_mode.rs
+++ b/src/pattern/search_mode.rs
@@ -20,6 +20,7 @@ pub enum SearchKind {
Exact,
Fuzzy,
Regex,
+ Tokens,
Unspecified,
}
@@ -30,9 +31,11 @@ pub enum SearchMode {
NameExact,
NameFuzzy,
NameRegex,
+ NameTokens,
PathExact,
PathFuzzy,
PathRegex,
+ PathTokens,
ContentExact,
ContentRegex,
}
@@ -41,9 +44,11 @@ pub static SEARCH_MODES: &[SearchMode] = &[
SearchMode::NameFuzzy,
SearchMode::NameRegex,
SearchMode::NameExact,
+ SearchMode::NameTokens,
+ SearchMode::PathExact,
SearchMode::PathFuzzy,
SearchMode::PathRegex,
- SearchMode::PathExact,
+ SearchMode::PathTokens,
SearchMode::ContentExact,
SearchMode::ContentRegex,
];
@@ -59,22 +64,25 @@ impl SearchMode {
(Name, Exact) => Some(Self::NameExact),
(Name, Fuzzy) => Some(Self::NameFuzzy),
(Name, Regex) => Some(Self::NameRegex),
+ (Name, Tokens) => Some(Self::NameTokens),
(Path, Unspecified) => Some(Self::PathFuzzy),
(Path, Exact) => Some(Self::PathExact),
(Path, Fuzzy) => Some(Self::PathFuzzy),
(Path, Regex) => Some(Self::PathRegex),
+ (Path, Tokens) => Some(Self::PathTokens),
(Content, Unspecified) => Some(Self::ContentExact),
(Content, Exact) => Some(Self::ContentExact),
(Content, Fuzzy) => None, // unsupported for now - could be but why ?
(Content, Regex) => Some(Self::ContentRegex),
+ (Content, Tokens) => None, // unsupported for now - could be but need bench
}
}
pub fn object(&self) -> SearchObject {
match self {
- Self::NameExact | Self::NameFuzzy | Self::NameRegex => SearchObject::Name,
- Self::PathExact | Self::PathFuzzy | Self::PathRegex => SearchObject::Path,
+ Self::NameExact | Self::NameFuzzy | Self::NameRegex | Self::NameTokens => SearchObject::Name,
+ Self::PathExact | Self::PathFuzzy | Self::PathRegex | Self::PathTokens => SearchObject::Path,
Self::ContentExact | Self::ContentRegex => SearchObject::Content,
}
}
@@ -83,9 +91,11 @@ impl SearchMode {
Self::NameExact => SearchKind::Exact,
Self::NameFuzzy => SearchKind::Fuzzy,
Self::NameRegex => SearchKind::Regex,
+ Self::NameTokens => SearchKind::Tokens,
Self::PathExact => SearchKind::Exact,
Self::PathFuzzy => SearchKind::Fuzzy,
Self::PathRegex => SearchKind::Regex,
+ Self::PathTokens => SearchKind::Tokens,
Self::ContentExact => SearchKind::Exact,
Self::ContentRegex => SearchKind::Regex,
}
@@ -130,14 +140,16 @@ impl SearchModeMapEntry {
let exact = s.contains("exact");
let fuzzy = s.contains("fuzzy");
let regex = s.contains("regex");
- let search_kind = match (exact, fuzzy, regex) {
- (false, false, false) => SearchKind::Unspecified,
- (true, false, false) => SearchKind::Exact,
- (false, true, false) => SearchKind::Fuzzy,
- (false, false, true) => SearchKind::Regex,
+ let tokens = s.contains("tokens");
+ let search_kind = match (exact, fuzzy, regex, tokens) {
+ (false, false, false, false) => SearchKind::Unspecified,
+ (true, false, false, false) => SearchKind::Exact,
+ (false, true, false, false) => SearchKind::Fuzzy,
+ (false, false, true, false) => SearchKind::Regex,
+ (false, false, false, true) => SearchKind::Tokens,
_ => {
return Err(ConfError::InvalidSearchMode {
- details: "you may have at most one of \"exact\", \"fuzzy\" or \"regex\"".to_string()
+ details: "you may have at most one of \"exact\", \"fuzzy\", \"regex\", or \"tokens\"".to_string()
});
}
};
@@ -181,6 +193,8 @@ impl Default for SearchModeMap {
smm.setm(&["pr", "rp"], SearchMode::PathRegex);
smm.setm(&["ce", "ec", "c"], SearchMode::ContentExact);
smm.setm(&["rx", "cr"], SearchMode::ContentRegex);
+ smm.setm(&["pt", "tp", "t"], SearchMode::PathTokens);
+ smm.setm(&["pn", "np"], SearchMode::NameTokens);
smm.set(SearchModeMapEntry { key: None, mode: SearchMode::NameFuzzy });
smm
}
diff --git a/src/pattern/tok_pattern.rs b/src/pattern/tok_pattern.rs
new file mode 100644
index 0000000..fc98d68
--- /dev/null
+++ b/src/pattern/tok_pattern.rs
@@ -0,0 +1,190 @@
+use {
+ super::NameMatch,
+ secular,
+ smallvec::{smallvec, SmallVec},
+ std::{
+ cmp::Reverse,
+ ops::Range,
+ },
+};
+
+type CandChars = SmallVec<[char; 32]>;
+
+/// a list of tokens we want to find, non overlapping
+/// and in any order, in strings
+#[derive(Debug, Clone)]
+pub struct TokPattern {
+ toks: Vec<Box<[char]>>,
+ sum_len: usize,
+}
+
+// optimizations to test:
+// - score_of doesn't need to compute pos, just the ranges
+// - try the first tok before creating the matching_ranges vec
+// (the first one is also impler to test)
+// - if there's no room for any of the other toks before the
+// first matching tok, start other loops after the first
+// matching range
+// But until I add some scoring and multiple positions tests
+// there should not be any perf concerns
+// scoring basis ?
+// - number of parts of the candidats (separated by / for example)
+// that are touched by a tok ?
+// - malus for adjacent ranges
+// - bonus for ranges starting just after a separator
+// - bonus for order ?
+impl TokPattern {
+
+ pub fn new(pattern: &str, sep: char) -> Self {
+ let mut toks: Vec<Box<[char]>> = pattern.split(sep)
+ .filter(|s| s.len() > 0)
+ .map(|s| {
+ secular::normalized_lower_lay_string(s)
+ .chars()
+ .collect::<Vec<char>>()
+ .into_boxed_slice()
+ })
+ .collect();
+ // we sort the tokens from biggest to smallest
+ // because the current algorithm stops at the
+ // first match for any tok. Thus it would fail
+ // to find "abc,b" in "abcdb" if it looked first
+ // at the "b" token
+ toks.sort_by_key(|t| Reverse(t.len()));
+ let sum_len = toks.iter().map(|s| s.len()).sum();
+ Self {
+ toks,
+ sum_len,
+ }
+ }
+
+ pub fn find(&self, candidate: &str) -> Option<NameMatch> {
+ if candidate.len() < self.sum_len {
+ return None;
+ }
+ let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len());
+ cand_chars.extend(candidate.chars().map(secular::lower_lay_char));
+ let mut matching_ranges: Vec<Range<usize>> = Vec::with_capacity(self.toks.len());
+ for tok in &self.toks {
+ let l = tok.len();
+ let matching_range = (0..cand_chars.len()+1-l)
+ .map(|idx| idx..idx+l)
+ .filter(|r| {
+ &cand_chars[r.start..r.end] == tok.as_ref()
+ })
+ .filter(|r| {
+ // check we're not intersecting a previous range
+ for pr in &matching_ranges {
+ if pr.contains(&r.start) || pr.contains(&(r.end-1)) {
+ return false;
+ }
+ }
+ true
+ })
+ .next();
+ if let Some(r) = matching_range {
+ matching_ranges.push(r);
+ } else {
+ return None;
+ }
+ }
+ let mut pos = smallvec![0; self.sum_len];
+ let mut i = 0;
+ for r in matching_ranges {
+ for p in r {
+ pos[i] = p;
+ i += 1;
+ }
+ }
+ pos.sort();
+ let score = 42; // maybe find a better scoring
+ Some(NameMatch { score, pos })
+ }
+
+ /// compute the score of the best match
+ pub fn score_of(&self, candidate: &str) -> Option<i32> {
+ self.find(candidate)
+ .map(|nm| nm.score)
+ }
+}
+
+#[cfg(test)]
+mod tok_pattern_tests {
+
+ use {
+ super::*,
+ crate::pattern::Pos,
+ };
+
+ /// check position of the match of the pattern in name
+ fn check_pos(pattern: &str, name: &str, pos: &str) {
+ let pat = TokPattern::new(pattern, ',');
+ let match_pos = pat.find(name).unwrap().pos;
+ let target_pos: Pos = pos.chars()
+ .enumerate()
+ .filter(|(_, c)| *c=='^')
+ .map(|(i, _)| i)
+ .collect();
+ assert_eq!(match_pos, target_pos);
+ }
+
+ #[test]
+ fn check_match_pos() {
+ check_pos(
+ "m,",
+ "miaou",
+ "^ ",
+ );
+ check_pos(
+ "bat",
+ "cabat",
+ " ^^^",
+ );
+ check_pos(
+ "ba",
+ "babababaaa",
+ "^^ ",
+ );
+ check_pos(
+ "ba,ca",
+ "bababacaa",
+ "^^ ^^ ",
+ );
+ check_pos(
+ "sub,doc,2",
+ "/home/user/path2/subpath/Documents/",
+ " ^ ^^^ ^^^",
+ );
+ check_pos(
+ "ab,abc",
+ "0123/abc/ab/cdg",
+ " ^^^ ^^ ",
+ );
+ }
+
+ fn check_match(pattern: &str, name: &str, do_match: bool) {
+ assert_eq!(
+ TokPattern::new(pattern, ',').find(name).is_some(),
+ do_match,
+ );
+ }
+
+ #[test]
+ fn test_match() {
+ check_match("mia", "android/phonegap", false);
+ check_match("mi", "a", false);
+ check_match("mi", "π", false);
+ check_match("mi", "miaou/a", true);
+ }
+
+ #[test]
+ fn test_tok_repetitions() {
+ check_match("sub", "rasub", true);
+ check_match("sub,sub", "rasub", false);
+ check_match("sub,sub", "rasubandsub", true);
+ check_match("sub,sub,sub", "rasubandsub", false);
+ check_match("ghi,abc,def,ccc", "abccc/Defghi", false);
+ check_match("ghi,abc,def,ccc", "abcccc/Defghi", true);
+ }
+
+}