first implementation of a "tokens" pattern

author: Canop <cano.petrole@gmail.com> 2021-05-04 19:50:53 +0200
committer: Canop <cano.petrole@gmail.com> 2021-05-04 19:50:53 +0200
commit: d97b05a9536b08cd48dec4d3e244e14d4ed080ef (patch)
tree: 941b7f42b80421c06d4c2122916cb384ae25c760 /src/pattern
parent: 57277840522e463546476c0bda34314b9e9b2c21 (diff)
7 files changed, 243 insertions, 18 deletions
diff --git a/src/pattern/fuzzy_pattern.rs b/src/pattern/fuzzy_pattern.rs
index 45857b0..b0be711 100644
--- a/src/pattern/fuzzy_pattern.rs
+++ b/src/pattern/fuzzy_pattern.rs
@@ -186,8 +186,7 @@ impl FuzzyPattern {
         best_match
     }
 
-    /// compute the score of the best match, in a way mostly similar to `find` but
-    /// faster by not storing match positions
+    /// compute the score of the best match
     pub fn score_of(&self, candidate: &str) -> Option<i32> {
         self.find(candidate)
             .map(|nm| nm.score)
diff --git a/src/pattern/mod.rs b/src/pattern/mod.rs
index 43c2698..f867f7d 100644
--- a/src/pattern/mod.rs
+++ b/src/pattern/mod.rs
@@ -11,8 +11,10 @@ mod operator;
 mod pattern;
 mod pattern_object;
 mod pattern_parts;
+mod pos;
 mod regex_pattern;
 mod search_mode;
+mod tok_pattern;
 
 pub use {
     candidate::Candidate,
@@ -22,13 +24,15 @@ pub use {
     exact_pattern::ExactPattern,
     fuzzy_pattern::FuzzyPattern,
     input_pattern::InputPattern,
-    name_match::{NameMatch, Pos},
+    name_match::NameMatch,
     pattern::Pattern,
     pattern_object::PatternObject,
     pattern_parts::PatternParts,
+    pos::*,
     operator::PatternOperator,
     regex_pattern::RegexPattern,
     search_mode::*,
+    tok_pattern::*,
 };
 
 use crate::errors::PatternError;
diff --git a/src/pattern/name_match.rs b/src/pattern/name_match.rs
index 2459252..a4bffcf 100644
--- a/src/pattern/name_match.rs
+++ b/src/pattern/name_match.rs
@@ -1,10 +1,7 @@
 use {
-    smallvec::SmallVec,
+    super::Pos,
 };
 
-/// a vector of indexes of the matching characters (not bytes)
-pub type Pos = SmallVec<[usize; 8]>;
-
 /// A NameMatch is a positive result of pattern matching inside
 /// a filename or subpath
 #[derive(Debug, Clone)]
diff --git a/src/pattern/pattern.rs b/src/pattern/pattern.rs
index 6bf7c75..157c0ac 100644
--- a/src/pattern/pattern.rs
+++ b/src/pattern/pattern.rs
@@ -17,9 +17,11 @@ pub enum Pattern {
     NameExact(ExactPattern),
     NameFuzzy(FuzzyPattern),
     NameRegex(RegexPattern),
+    NameTokens(TokPattern),
     PathExact(ExactPattern),
     PathFuzzy(FuzzyPattern),
     PathRegex(RegexPattern),
+    PathTokens(TokPattern),
     ContentExact(ContentExactPattern),
     ContentRegex(ContentRegexPattern),
     Composite(CompositePattern),
@@ -51,6 +53,9 @@ impl Pattern {
                             SearchMode::NameRegex => Self::NameRegex(
                                 RegexPattern::from(core, flags.unwrap_or(""))?
                             ),
+                            SearchMode::NameTokens => Self::NameTokens(
+                                TokPattern::new(core, ',')
+                            ),
                             SearchMode::PathExact => Self::PathExact(
                                 ExactPattern::from(core)
                             ),
@@ -60,6 +65,9 @@ impl Pattern {
                             SearchMode::PathRegex => Self::PathRegex(
                                 RegexPattern::from(core, flags.unwrap_or(""))?
                             ),
+                            SearchMode::PathTokens => Self::PathTokens(
+                                TokPattern::new(core, ',')
+                            ),
                             SearchMode::ContentExact => Self::ContentExact(
                                 ContentExactPattern::from(core)
                             ),
@@ -83,10 +91,10 @@ impl Pattern {
         let mut object = PatternObject::default();
         match self {
             Self::None => {}
-            Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) => {
+            Self::NameExact(_) | Self::NameFuzzy(_) | Self::NameRegex(_) | Self::NameTokens(_) => {
                 object.name = true;
             }
-            Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) => {
+            Self::PathExact(_) | Self::PathFuzzy(_) | Self::PathRegex(_) | Self::PathTokens(_) => {
                 object.subpath = true;
             }
             Self::ContentExact(_) | Self::ContentRegex(_) => {
@@ -109,9 +117,11 @@ impl Pattern {
             Self::NameExact(ep) => ep.find(candidate),
             Self::NameFuzzy(fp) => fp.find(candidate),
             Self::NameRegex(rp) => rp.find(candidate),
+            Self::NameTokens(tp) => tp.find(candidate),
             Self::PathExact(ep) => ep.find(candidate),
             Self::PathFuzzy(fp) => fp.find(candidate),
             Self::PathRegex(rp) => rp.find(candidate),
+            Self::PathTokens(tp) => tp.find(candidate),
             Self::Composite(cp) => cp.search_string(candidate),
             _ => None,
         }
@@ -137,9 +147,11 @@ impl Pattern {
             Self::NameExact(ep) => ep.score_of(&candidate.name),
             Self::NameFuzzy(fp) => fp.score_of(&candidate.name),
             Self::NameRegex(rp) => rp.find(&candidate.name).map(|m| m.score),
+            Self::NameTokens(tp) => tp.score_of(&candidate.name),
             Self::PathExact(ep) => ep.score_of(&candidate.subpath),
             Self::PathFuzzy(fp) => fp.score_of(&candidate.subpath),
             Self::PathRegex(rp) => rp.find(&candidate.subpath).map(|m| m.score),
+            Self::PathTokens(tp) => tp.score_of(&candidate.subpath),
             Self::ContentExact(cp) => cp.score_of(candidate),
             Self::ContentRegex(cp) => cp.score_of(candidate),
             Self::Composite(cp) => cp.score_of(candidate),
@@ -152,9 +164,11 @@ impl Pattern {
             Self::NameExact(ep) => ep.score_of(&candidate),
             Self::NameFuzzy(fp) => fp.score_of(&candidate),
             Self::NameRegex(rp) => rp.find(&candidate).map(|m| m.score),
+            Self::NameTokens(tp) => tp.score_of(&candidate),
             Self::PathExact(ep) => ep.score_of(&candidate),
             Self::PathFuzzy(fp) => fp.score_of(&candidate),
             Self::PathRegex(rp) => rp.find(&candidate).map(|m| m.score),
+            Self::PathTokens(tp) => tp.score_of(&candidate),
             Self::ContentExact(_) => None, // this isn't suitable
             Self::ContentRegex(_) => None, // this isn't suitable
             Self::Composite(cp) => cp.score_of_string(candidate),
diff --git a/src/pattern/pos.rs b/src/pattern/pos.rs
new file mode 100644
index 0000000..cee14a5
--- /dev/null
+++ b/src/pattern/pos.rs
@@ -0,0 +1,7 @@
+use {
+    smallvec::SmallVec,
+};
+
+/// a vector of indexes of the matching characters (not bytes)
+pub type Pos = SmallVec<[usize; 8]>;
+
diff --git a/src/pattern/search_mode.rs b/src/pattern/search_mode.rs
index 1254c62..a3267fe 100644
--- a/src/pattern/search_mode.rs
+++ b/src/pattern/search_mode.rs
@@ -20,6 +20,7 @@ pub enum SearchKind {
     Exact,
     Fuzzy,
     Regex,
+    Tokens,
     Unspecified,
 }
 
@@ -30,9 +31,11 @@ pub enum SearchMode {
     NameExact,
     NameFuzzy,
     NameRegex,
+    NameTokens,
     PathExact,
     PathFuzzy,
     PathRegex,
+    PathTokens,
     ContentExact,
     ContentRegex,
 }
@@ -41,9 +44,11 @@ pub static SEARCH_MODES: &[SearchMode] = &[
     SearchMode::NameFuzzy,
     SearchMode::NameRegex,
     SearchMode::NameExact,
+    SearchMode::NameTokens,
+    SearchMode::PathExact,
     SearchMode::PathFuzzy,
     SearchMode::PathRegex,
-    SearchMode::PathExact,
+    SearchMode::PathTokens,
     SearchMode::ContentExact,
     SearchMode::ContentRegex,
 ];
@@ -59,22 +64,25 @@ impl SearchMode {
             (Name, Exact) => Some(Self::NameExact),
             (Name, Fuzzy) => Some(Self::NameFuzzy),
             (Name, Regex) => Some(Self::NameRegex),
+            (Name, Tokens) => Some(Self::NameTokens),
 
             (Path, Unspecified) => Some(Self::PathFuzzy),
             (Path, Exact) => Some(Self::PathExact),
             (Path, Fuzzy) => Some(Self::PathFuzzy),
             (Path, Regex) => Some(Self::PathRegex),
+            (Path, Tokens) => Some(Self::PathTokens),
 
             (Content, Unspecified) => Some(Self::ContentExact),
             (Content, Exact) => Some(Self::ContentExact),
             (Content, Fuzzy) => None, // unsupported for now - could be but why ?
             (Content, Regex) => Some(Self::ContentRegex),
+            (Content, Tokens) => None, // unsupported for now - could be but need bench
         }
     }
     pub fn object(&self) -> SearchObject {
         match self {
-            Self::NameExact | Self::NameFuzzy | Self::NameRegex => SearchObject::Name,
-            Self::PathExact | Self::PathFuzzy | Self::PathRegex => SearchObject::Path,
+            Self::NameExact | Self::NameFuzzy | Self::NameRegex | Self::NameTokens => SearchObject::Name,
+            Self::PathExact | Self::PathFuzzy | Self::PathRegex | Self::PathTokens => SearchObject::Path,
             Self::ContentExact | Self::ContentRegex => SearchObject::Content,
         }
     }
@@ -83,9 +91,11 @@ impl SearchMode {
             Self::NameExact => SearchKind::Exact,
             Self::NameFuzzy => SearchKind::Fuzzy,
             Self::NameRegex => SearchKind::Regex,
+            Self::NameTokens => SearchKind::Tokens,
             Self::PathExact => SearchKind::Exact,
             Self::PathFuzzy => SearchKind::Fuzzy,
             Self::PathRegex => SearchKind::Regex,
+            Self::PathTokens => SearchKind::Tokens,
             Self::ContentExact => SearchKind::Exact,
             Self::ContentRegex => SearchKind::Regex,
         }
@@ -130,14 +140,16 @@ impl SearchModeMapEntry {
         let exact = s.contains("exact");
         let fuzzy = s.contains("fuzzy");
         let regex = s.contains("regex");
-        let search_kind = match (exact, fuzzy, regex) {
-            (false, false, false) => SearchKind::Unspecified,
-            (true, false, false) => SearchKind::Exact,
-            (false, true, false) => SearchKind::Fuzzy,
-            (false, false, true) => SearchKind::Regex,
+        let tokens = s.contains("tokens");
+        let search_kind = match (exact, fuzzy, regex, tokens) {
+            (false, false, false, false) => SearchKind::Unspecified,
+            (true, false, false, false) => SearchKind::Exact,
+            (false, true, false, false) => SearchKind::Fuzzy,
+            (false, false, true, false) => SearchKind::Regex,
+            (false, false, false, true) => SearchKind::Tokens,
             _ => {
                 return Err(ConfError::InvalidSearchMode {
-                    details: "you may have at most one of \"exact\", \"fuzzy\" or \"regex\"".to_string()
+                    details: "you may have at most one of \"exact\", \"fuzzy\", \"regex\", or \"tokens\"".to_string()
                 });
             }
         };
@@ -181,6 +193,8 @@ impl Default for SearchModeMap {
         smm.setm(&["pr", "rp"], SearchMode::PathRegex);
         smm.setm(&["ce", "ec", "c"], SearchMode::ContentExact);
         smm.setm(&["rx", "cr"], SearchMode::ContentRegex);
+        smm.setm(&["pt", "tp", "t"], SearchMode::PathTokens);
+        smm.setm(&["pn", "np"], SearchMode::NameTokens);
         smm.set(SearchModeMapEntry { key: None, mode: SearchMode::NameFuzzy });
         smm
     }
diff --git a/src/pattern/tok_pattern.rs b/src/pattern/tok_pattern.rs
new file mode 100644
index 0000000..fc98d68
--- /dev/null
+++ b/src/pattern/tok_pattern.rs
@@ -0,0 +1,190 @@
+use {
+    super::NameMatch,
+    secular,
+    smallvec::{smallvec, SmallVec},
+    std::{
+        cmp::Reverse,
+        ops::Range,
+    },
+};
+
+type CandChars = SmallVec<[char; 32]>;
+
+/// a list of tokens we want to find, non overlapping
+/// and in any order, in strings
+#[derive(Debug, Clone)]
+pub struct TokPattern {
+    toks: Vec<Box<[char]>>,
+    sum_len: usize,
+}
+
+// optimizations to test:
+// - score_of doesn't need to compute pos, just the ranges
+// - try the first tok before creating the matching_ranges vec
+//   (the first one is also impler to test)
+// - if there's no room for any of the other toks before the
+//   first matching tok, start other loops after the first
+//   matching range
+// But until I add some scoring and multiple positions tests
+//  there should not be any perf concerns
+// scoring basis ?
+// - number of parts of the candidats (separated by / for example)
+//   that are touched by a tok ?
+// - malus for adjacent ranges
+// - bonus for ranges starting just after a separator
+// - bonus for order ?
+impl TokPattern {
+
+    pub fn new(pattern: &str, sep: char) -> Self {
+        let mut toks: Vec<Box<[char]>> = pattern.split(sep)
+            .filter(|s| s.len() > 0)
+            .map(|s| {
+                secular::normalized_lower_lay_string(s)
+                    .chars()
+                    .collect::<Vec<char>>()
+                    .into_boxed_slice()
+            })
+            .collect();
+        // we sort the tokens from biggest to smallest
+        // because the current algorithm stops at the
+        // first match for any tok. Thus it would fail
+        // to find "abc,b" in "abcdb" if it looked first
+        // at the "b" token
+        toks.sort_by_key(|t| Reverse(t.len()));
+        let sum_len = toks.iter().map(|s| s.len()).sum();
+        Self {
+            toks,
+            sum_len,
+        }
+    }
+
+    pub fn find(&self, candidate: &str) -> Option<NameMatch> {
+        if candidate.len() < self.sum_len {
+            return None;
+        }
+        let mut cand_chars: CandChars = SmallVec::with_capacity(candidate.len());
+        cand_chars.extend(candidate.chars().map(secular::lower_lay_char));
+        let mut matching_ranges: Vec<Range<usize>> = Vec::with_capacity(self.toks.len());
+        for tok in &self.toks {
+            let l = tok.len();
+            let matching_range = (0..cand_chars.len()+1-l)
+                .map(|idx| idx..idx+l)
+                .filter(|r| {
+                    &cand_chars[r.start..r.end] == tok.as_ref()
+                })
+                .filter(|r| {
+                    // check we're not intersecting a previous range
+                    for pr in &matching_ranges {
+                        if pr.contains(&r.start) || pr.contains(&(r.end-1)) {
+                            return false;
+                        }
+                    }
+                    true
+                })
+                .next();
+            if let Some(r) = matching_range {
+                matching_ranges.push(r);
+            } else {
+                return None;
+            }
+        }
+        let mut pos = smallvec![0; self.sum_len];
+        let mut i = 0;
+        for r in matching_ranges {
+            for p in r {
+                pos[i] = p;
+                i += 1;
+            }
+        }
+        pos.sort();
+        let score = 42; // maybe find a better scoring
+        Some(NameMatch { score, pos })
+    }
+
+    /// compute the score of the best match
+    pub fn score_of(&self, candidate: &str) -> Option<i32> {
+        self.find(candidate)
+            .map(|nm| nm.score)
+    }
+}
+
+#[cfg(test)]
+mod tok_pattern_tests {
+
+    use {
+        super::*,
+        crate::pattern::Pos,
+    };
+
+    /// check position of the match of the pattern in name
+    fn check_pos(pattern: &str, name: &str, pos: &str) {
+        let pat = TokPattern::new(pattern, ',');
+        let match_pos = pat.find(name).unwrap().pos;
+        let target_pos: Pos = pos.chars()
+            .enumerate()
+            .filter(|(_, c)| *c=='^')
+            .map(|(i, _)| i)
+            .collect();
+        assert_eq!(match_pos, target_pos);
+    }
+
+    #[test]
+    fn check_match_pos() {
+        check_pos(
+            "m,",
+            "miaou",
+            "^   ",
+        );
+        check_pos(
+            "bat",
+            "cabat",
+            "  ^^^",
+        );
+        check_pos(
+            "ba",
+            "babababaaa",
+            "^^        ",
+        );
+        check_pos(
+            "ba,ca",
+            "bababacaa",
+            "^^    ^^ ",
+        );
+        check_pos(
+            "sub,doc,2",
+            "/home/user/path2/subpath/Documents/",
+            "               ^ ^^^     ^^^",
+        );
+        check_pos(
+            "ab,abc",
+            "0123/abc/ab/cdg",
+            "     ^^^ ^^    ",
+        );
+    }
+
+    fn check_match(pattern: &str, name: &str, do_match: bool) {
+        assert_eq!(
+            TokPattern::new(pattern, ',').find(name).is_some(),
+            do_match,
+        );
+    }
+
+    #[test]
+    fn test_match() {
+        check_match("mia", "android/phonegap", false);
+        check_match("mi", "a", false);
+        check_match("mi", "π", false);
+        check_match("mi", "miaou/a", true);
+    }
+
+    #[test]
+    fn test_tok_repetitions() {
+        check_match("sub", "rasub", true);
+        check_match("sub,sub", "rasub", false);
+        check_match("sub,sub", "rasubandsub", true);
+        check_match("sub,sub,sub", "rasubandsub", false);
+        check_match("ghi,abc,def,ccc", "abccc/Defghi", false);
+        check_match("ghi,abc,def,ccc", "abcccc/Defghi", true);
+    }
+
+}
author	Canop <cano.petrole@gmail.com>	2021-05-04 19:50:53 +0200
committer	Canop <cano.petrole@gmail.com>	2021-05-04 19:50:53 +0200
commit	d97b05a9536b08cd48dec4d3e244e14d4ed080ef (patch)
tree	941b7f42b80421c06d4c2122916cb384ae25c760 /src/pattern
parent	57277840522e463546476c0bda34314b9e9b2c21 (diff)