summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-09-15 22:06:04 -0400
committerAndrew Gallant <jamslam@gmail.com>2016-09-15 22:06:04 -0400
commit0e46171e3b189b2bd89f45c3a492dea36361d8bc (patch)
tree3ae494e5fcf54c3a679ae42991050c78c7552dce
parentf5c85827cea05d11e1c0c50aa34e07361313f659 (diff)
Rework glob sets.
We try to reduce the pressure on regexes and offload some of it to Aho-Corasick or exact lookups.
-rw-r--r--Cargo.lock7
-rw-r--r--Cargo.toml1
-rwxr-xr-xbenchsuite26
-rw-r--r--src/gitignore.rs51
-rw-r--r--src/glob.rs419
-rw-r--r--src/ignore.rs3
-rw-r--r--src/main.rs7
-rw-r--r--src/pathutil.rs80
-rw-r--r--src/types.rs12
9 files changed, 517 insertions, 89 deletions
diff --git a/Cargo.lock b/Cargo.lock
index dc9cac03..96da64f0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5,6 +5,7 @@ dependencies = [
"deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
+ "fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.1.0",
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -58,6 +59,11 @@ dependencies = [
]
[[package]]
+name = "fnv"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "fs2"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -230,6 +236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
"checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
+"checksum fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8e8af7b5408ab0c4910cad114c8f9eb454bf75df7afe8964307eeafb68a13a5e"
"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
diff --git a/Cargo.toml b/Cargo.toml
index 28d941c0..b77d621e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ path = "tests/tests.rs"
deque = "0.3"
docopt = "0.6"
env_logger = "0.3"
+fnv = "1.0"
grep = { version = "0.1", path = "grep" }
lazy_static = "0.2"
libc = "0.2"
diff --git a/benchsuite b/benchsuite
index d6ad1aa5..82bb31df 100755
--- a/benchsuite
+++ b/benchsuite
@@ -64,7 +64,9 @@ def bench_linux_literal_default(suite_dir):
# doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]),
- mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}),
+ # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
+ # default, but I'd guess it to be on most desktop systems.
+ mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
mkcmd('pt', ['pt', pat]),
# sift reports an extra line here for a binary file matched.
mkcmd('sift', ['sift', pat]),
@@ -89,11 +91,10 @@ def bench_linux_literal(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
- mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
- mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
- mkcmd('ag', ['ag', '-s', pat]),
- mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
- mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
+ mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
+ mkcmd('ag (mmap)', ['ag', '-s', pat]),
+ mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
mkcmd('git grep', [
'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}),
@@ -121,13 +122,16 @@ def bench_linux_literal_casei(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
- mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
- mkcmd('rg-novcs-mmap', [
- 'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
+ mkcmd('rg (mmap)', ['rg', '-n', '-i', pat]),
+ mkcmd('rg (whitelist)', [
+ 'rg', '-n', '-i', '--no-ignore', '-tall', pat,
]),
- mkcmd('ag', ['ag', '-i', pat]),
- mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
+ mkcmd('ag (mmap)', ['ag', '-i', pat]),
mkcmd('ucg', ['ucg', '-i', pat]),
+ # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
+ # since that is certainly what ripgrep is doing, but this is for an
+ # ASCII literal, so we should give `git grep` all the opportunity to
+ # do its best.
mkcmd('git grep', [
'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}),
diff --git a/src/gitignore.rs b/src/gitignore.rs
index 31a16ea6..cac42a94 100644
--- a/src/gitignore.rs
+++ b/src/gitignore.rs
@@ -21,6 +21,7 @@ additional rules such as whitelists (prefix of `!`) or directory-only globs
// TODO(burntsushi): Implement something similar, but for Mercurial. We can't
// use this exact implementation because hgignore files are different.
+use std::cell::RefCell;
use std::error::Error as StdError;
use std::fmt;
use std::fs::File;
@@ -30,6 +31,7 @@ use std::path::{Path, PathBuf};
use regex;
use glob;
+use pathutil::strip_prefix;
/// Represents an error that can occur when parsing a gitignore file.
#[derive(Debug)]
@@ -110,37 +112,35 @@ impl Gitignore {
/// same directory as this gitignore file.
pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match {
let mut path = path.as_ref();
- if let Ok(p) = path.strip_prefix(&self.root) {
+ if let Some(p) = strip_prefix("./", path) {
path = p;
}
- self.matched_utf8(&*path.to_string_lossy(), is_dir)
+ if let Some(p) = strip_prefix(&self.root, path) {
+ path = p;
+ }
+ self.matched_stripped(path, is_dir)
}
- /// Like matched, but takes a path that has already been stripped and
- /// converted to UTF-8.
- pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match {
- // A single regex with a bunch of alternations of glob patterns is
- // unfortunately typically faster than a regex, so we use it as a
- // first pass filter. We still need to run the RegexSet to get the most
- // recently defined glob that matched.
- if !self.set.is_match(path) {
- return Match::None;
+ /// Like matched, but takes a path that has already been stripped.
+ pub fn matched_stripped(&self, path: &Path, is_dir: bool) -> Match {
+ thread_local! {
+ static MATCHES: RefCell<Vec<usize>> = RefCell::new(vec![]);
}
- // The regex set can't actually pick the right glob that matched all
- // on its own. In particular, some globs require that only directories
- // can match. Thus, only accept a match from the regex set if the given
- // path satisfies the corresponding glob's directory criteria.
- for i in self.set.matches(path).iter().rev() {
- let pat = &self.patterns[i];
- if !pat.only_dir || is_dir {
- return if pat.whitelist {
- Match::Whitelist(pat)
- } else {
- Match::Ignored(pat)
- };
+ MATCHES.with(|matches| {
+ let mut matches = matches.borrow_mut();
+ self.set.matches_into(path, &mut *matches);
+ for &i in matches.iter().rev() {
+ let pat = &self.patterns[i];
+ if !pat.only_dir || is_dir {
+ return if pat.whitelist {
+ Match::Whitelist(pat)
+ } else {
+ Match::Ignored(pat)
+ };
+ }
}
- }
- Match::None
+ Match::None
+ })
}
/// Returns the total number of ignore patterns.
@@ -390,6 +390,7 @@ mod tests {
ignored!(ig23, ROOT, "foo", "./foo");
ignored!(ig24, ROOT, "target", "grep/target");
ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock");
+ ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz");
not_ignored!(ignot1, ROOT, "amonths", "months");
not_ignored!(ignot2, ROOT, "monthsa", "months");
diff --git a/src/glob.rs b/src/glob.rs
index f16a75e4..c5b5d8d3 100644
--- a/src/glob.rs
+++ b/src/glob.rs
@@ -26,13 +26,22 @@ to make its way into `glob` proper.
// at the .gitignore for the chromium repo---just about every pattern satisfies
// that assumption.)
+use std::borrow::Cow;
+use std::collections::HashMap;
use std::error::Error as StdError;
+use std::ffi::{OsStr, OsString};
use std::fmt;
+use std::hash;
use std::iter;
+use std::path::Path;
use std::str;
+use fnv;
use regex;
-use regex::bytes::{Regex, RegexSet, SetMatches};
+use regex::bytes::Regex;
+use regex::bytes::RegexSet;
+
+use pathutil::file_name;
/// Represents an error that can occur when parsing a glob pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
@@ -71,33 +80,165 @@ impl fmt::Display for Error {
}
}
-/// Set represents a group of globs that can be matched together in a single
-/// pass.
+/// SetYesNo represents a group of globs that can be matched together in a
+/// single pass. SetYesNo can only determine whether a particular path matched
+/// any pattern in the set.
#[derive(Clone, Debug)]
-pub struct Set {
+pub struct SetYesNo {
re: Regex,
- set: RegexSet,
}
-impl Set {
+impl SetYesNo {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
- pub fn is_match<T: AsRef<[u8]>>(&self, path: T) -> bool {
- self.re.is_match(path.as_ref())
+ pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
+ self.re.is_match(&*path_bytes(path.as_ref()))
}
- /// Returns every glob pattern (by sequence number) that matches the given
- /// path.
- pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
- // TODO(burntsushi): If we split this out into a separate crate, don't
- // expose the regex::SetMatches type in the public API.
- self.set.matches(path.as_ref())
+ fn new(
+ pats: &[(Pattern, MatchOptions)],
+ ) -> Result<SetYesNo, regex::Error> {
+ let mut joined = String::new();
+ for &(ref p, ref o) in pats {
+ let part = format!("(?:{})", p.to_regex_with(o));
+ if !joined.is_empty() {
+ joined.push('|');
+ }
+ joined.push_str(&part);
+ }
+ Ok(SetYesNo { re: try!(Regex::new(&joined)) })
}
+}
+
+type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
+
+/// Set represents a group of globs that can be matched together in a single
+/// pass.
+#[derive(Clone, Debug)]
+pub struct Set {
+ yesno: SetYesNo,
+ exts: HashMap<OsString, Vec<usize>, Fnv>,
+ literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+ base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+ base_prefixes: Vec<Vec<u8>>,
+ base_prefixes_map: Vec<usize>,
+ base_suffixes: Vec<Vec<u8>>,
+ base_suffixes_map: Vec<usize>,
+ regexes: RegexSet,
+ regexes_map: Vec<usize>,
+}
- /// Returns the number of glob patterns in this set.
+impl Set {
+ /// Returns the sequence number of every glob pattern that matches the
+ /// given path.
#[allow(dead_code)]
- pub fn len(&self) -> usize {
- self.set.len()
+ pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
+ let mut into = vec![];
+ self.matches_into(path, &mut into);
+ into
+ }
+
+ /// Adds the sequence number of every glob pattern that matches the given
+ /// path to the vec given.
+ pub fn matches_into<T: AsRef<Path>>(
+ &self,
+ path: T,
+ into: &mut Vec<usize>,
+ ) {
+ into.clear();
+ let path = path.as_ref();
+ let path_bytes = &*path_bytes(path);
+ let basename = file_name(path).map(|b| os_str_bytes(b));
+ if !self.yesno.is_match(path) {
+ return;
+ }
+ if !self.exts.is_empty() {
+ if let Some(ext) = path.extension() {
+ if let Some(matches) = self.exts.get(ext) {
+ into.extend(matches.as_slice());
+ }
+ }
+ }
+ if !self.literals.is_empty() {
+ if let Some(matches) = self.literals.get(path_bytes) {
+ into.extend(matches.as_slice());
+ }
+ }
+ if !self.base_literals.is_empty() {
+ if let Some(ref basename) = basename {
+ if let Some(matches) = self.base_literals.get(&**basename) {
+ into.extend(matches.as_slice());
+ }
+ }
+ }
+ if !self.base_prefixes.is_empty() {
+ if let Some(ref basename) = basename {
+ let basename = &**basename;
+ for (i, pre) in self.base_prefixes.iter().enumerate() {
+ if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
+ into.push(self.base_prefixes_map[i]);
+ }
+ }
+ }
+ }
+ if !self.base_suffixes.is_empty() {
+ if let Some(ref basename) = basename {
+ let basename = &**basename;
+ for (i, suf) in self.base_suffixes.iter().enumerate() {
+ if suf.len() > basename.len() {
+ continue;
+ }
+ let (s, e) = (basename.len() - suf.len(), basename.len());
+ if &**suf == &basename[s..e] {
+ into.push(self.base_suffixes_map[i]);
+ }
+ }
+ }
+ }
+ into.extend(self.regexes.matches(path_bytes));
+ into.sort();
+ }
+
+ fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
+ let fnv = Fnv::default();
+ let mut exts = HashMap::with_hasher(fnv.clone());
+ let mut literals = HashMap::with_hasher(fnv.clone());
+ let mut base_literals = HashMap::with_hasher(fnv.clone());
+ let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
+ let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
+ let (mut regexes, mut regexes_map) = (vec![], vec![]);
+ for (i, &(ref p, ref o)) in pats.iter().enumerate() {
+ if let Some(ext) = p.ext() {
+ exts.entry(ext).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.literal() {
+ literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.base_literal() {
+ base_literals
+ .entry(literal.into_bytes()).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.base_literal_prefix() {
+ base_prefixes.push(literal.into_bytes());
+ base_prefixes_map.push(i);
+ } else if let Some(literal) = p.base_literal_suffix() {
+ base_suffixes.push(literal.into_bytes());
+ base_suffixes_map.push(i);
+ } else {
+ let part = format!("(?:{})", p.to_regex_with(o));
+ regexes.push(part);
+ regexes_map.push(i);
+ }
+ }
+ Ok(Set {
+ yesno: try!(SetYesNo::new(pats)),
+ exts: exts,
+ literals: literals,
+ base_literals: base_literals,
+ base_prefixes: base_prefixes,
+ base_prefixes_map: base_prefixes_map,
+ base_suffixes: base_suffixes,
+ base_suffixes_map: base_suffixes_map,
+ regexes: try!(RegexSet::new(regexes)),
+ regexes_map: regexes_map,
+ })
}
}
@@ -119,19 +260,12 @@ impl SetBuilder {
///
/// Once a matcher is built, no new patterns can be added to it.
pub fn build(&self) -> Result<Set, regex::Error> {
- let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o));
- let set = try!(RegexSet::new(it));
+ Set::new(&self.pats)
+ }
- let mut joined = String::new();
- for &(ref p, ref o) in &self.pats {
- let part = format!("(?:{})", p.to_regex_with(o));
- if !joined.is_empty() {
- joined.push('|');
- }
- joined.push_str(&part);
- }
- let re = try!(Regex::new(&joined));
- Ok(Set { re: re, set: set })
+ /// Like `build`, but returns a matcher that can only answer yes/no.
+ pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
+ SetYesNo::new(&self.pats)
}
/// Add a new pattern to this set.
@@ -149,8 +283,21 @@ impl SetBuilder {
pat: &str,
opts: &MatchOptions,
) -> Result<(), Error> {
- let pat = try!(Pattern::new(pat));
- self.pats.push((pat, opts.clone()));
+ let parsed = try!(Pattern::new(pat));
+ // if let Some(ext) = parsed.ext() {
+ // eprintln!("ext :: {:?} :: {:?}", ext, pat);
+ // } else if let Some(lit) = parsed.literal() {
+ // eprintln!("literal :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal() {
+ // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal_prefix() {
+ // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal_suffix() {
+ // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
+ // } else {
+ // eprintln!("regex :: {:?} :: {:?}", pat, parsed);
+ // }
+ self.pats.push((parsed, opts.clone()));
Ok(())
}
}
@@ -204,6 +351,114 @@ impl Pattern {
Ok(p.p)
}
+ /// Returns an extension if this pattern exclusively matches it.
+ pub fn ext(&self) -> Option<OsString> {
+ if self.tokens.len() <= 3 {
+ return None;
+ }
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.get(1) {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ match self.tokens.get(2) {
+ Some(&Token::Literal(c)) if c == '.' => {}
+ _ => return None,
+ }
+ let mut lit = OsString::new();
+ for t in self.tokens[3..].iter() {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
+ return None;
+ }
+ Token::Literal(c) => lit.push(c.to_string()),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns the pattern as a literal if and only if the pattern exclusively
+ /// matches the basename of a file path *and* is a literal.
+ ///
+ /// The basic format of these patterns is `**/{literal}`, where `{literal}`
+ /// does not contain a path separator.
+ pub fn base_literal(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[1..] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns the pattern as a literal if and only if the pattern must match
+ /// an entire path exactly.
+ ///
+ /// The basic format of these patterns is `{literal}`.
+ pub fn literal(&self) -> Option<String> {
+ let mut lit = String::new();
+ for t in &self.tokens {
+ match *t {
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns a basename literal prefix of this pattern.
+ pub fn base_literal_prefix(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.last() {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[1..self.tokens.len()-1] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns a basename literal suffix of this pattern.
+ pub fn base_literal_suffix(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.get(1) {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[2..] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
/// Convert this pattern to a string that is guaranteed to be a valid
/// regular expression and will represent the matching semantics of this
/// glob pattern. This uses a default set of options.
@@ -415,13 +670,34 @@ impl<'a> Parser<'a> {
}
}
+fn path_bytes(path: &Path) -> Cow<[u8]> {
+ os_str_bytes(path.as_os_str())
+}
+
+#[cfg(unix)]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+ use std::os::unix::ffi::OsStrExt;
+ Cow::Borrowed(s.as_bytes())
+}
+
+#[cfg(not(unix))]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+ // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
+ // if we could get at the raw bytes, they wouldn't be useful. We *must*
+ // convert to UTF-8 before doing path matching. Unfortunate, but necessary.
+ match s.to_string_lossy() {
+ Cow::Owned(s) => Cow::Owned(s.into_bytes()),
+ Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
+ }
+}
+
#[cfg(test)]
mod tests {
use std::path::Path;
use regex::bytes::Regex;
- use super::{Error, Pattern, MatchOptions, SetBuilder, Token};
+ use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
use super::Token::*;
macro_rules! syntax {
@@ -491,6 +767,37 @@ mod tests {
};
}
+ macro_rules! ext {
+ ($name:ident, $pat:expr, $ext:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
+ assert_eq!($ext, ext.as_ref().map(|s| &**s));
+ }
+ };
+ }
+
+ macro_rules! baseliteral {
+ ($name:ident, $pat:expr, $yes:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($yes, pat.base_literal().is_some());
+ }
+ };
+ }
+
+ macro_rules! basesuffix {
+ ($name:ident, $pat:expr, $yes:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($yes, pat.is_literal_suffix());
+ }
+ };
+ }
+
fn class(s: char, e: char) -> Token {
Class { negated: false, ranges: vec![(s, e)] }
}
@@ -585,6 +892,26 @@ mod tests {
toregex!(re10, "+", r"^\+$");
toregex!(re11, "**", r"^.*$");
+ ext!(ext1, "**/*.rs", Some("rs"));
+
+ baseliteral!(lit1, "**", true);
+ baseliteral!(lit2, "**/a", true);
+ baseliteral!(lit3, "**/ab", true);
+ baseliteral!(lit4, "**/a*b", false);
+ baseliteral!(lit5, "z/**/a*b", false);
+ baseliteral!(lit6, "[ab]", false);
+ baseliteral!(lit7, "?", false);
+
+ /*
+ issuffix!(suf1, "", false);
+ issuffix!(suf2, "a", true);
+ issuffix!(suf3, "ab", true);
+ issuffix!(suf4, "*ab", true);
+ issuffix!(suf5, "*.ab", true);
+ issuffix!(suf6, "?.ab", true);
+ issuffix!(suf7, "ab*", false);
+ */
+
matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b");
matches!(match3, "a*b*c", "abc");
@@ -681,16 +1008,22 @@ mod tests {
builder.add("src/lib.rs").unwrap();
let set = builder.build().unwrap();
- assert!(set.is_match("foo.c"));
- assert!(set.is_match("src/foo.c"));
- assert!(!set.is_match("foo.rs"));
- assert!(!set.is_match("tests/foo.rs"));
- assert!(set.is_match("src/foo.rs"));
- assert!(set.is_match("src/grep/src/main.rs"));
-
- assert_eq!(2, set.matches("src/lib.rs").iter().count());
- assert!(set.matches("src/lib.rs").matched(0));
- assert!(!set.matches("src/lib.rs").matched(1));
- assert!(set.matches("src/lib.rs").matched(2));
+ fn is_match(set: &Set, s: &str) -> bool {
+ let mut matches = vec![];
+ set.matches_into(s, &mut matches);
+ !matches.is_empty()
+ }
+
+ assert!(is_match(&set, "foo.c"));
+ assert!(is_match(&set, "src/foo.c"));
+ assert!(!is_match(&set, "foo.rs"));
+ assert!(!is_match(&set, "tests/foo.rs"));
+ assert!(is_match(&set, "src/foo.rs"));
+ assert!(is_match(&set, "src/grep/src/main.rs"));
+
+ let matches = set.matches("src/lib.rs");
+ assert_eq!(2, matches.len());
+ assert_eq!(0, matches[0]);
+ assert_eq!(2, matches[1]);
}
}
diff --git a/src/ignore.rs b/src/ignore.rs
index 09c0b302..10a81886 100644
--- a/src/ignore.rs
+++ b/src/ignore.rs
@@ -365,8 +365,7 @@ impl Overrides {
let path = path.as_ref();
self.gi.as_ref()
.map(|gi| {
- let path = &*path.to_string_lossy();
- let mat = gi.matched_utf8(path, is_dir).invert();
+ let mat = gi.matched_stripped(path, is_dir).invert();
if mat.is_none() && !is_dir {
if gi.num_ignores() > 0 {
return Match::Ignored(&self.unmatched_pat);
diff --git a/src/main.rs b/src/main.rs
index aa51ec79..936e4965 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,7 @@
extern crate deque;
extern crate docopt;
extern crate env_logger;
+extern crate fnv;
extern crate grep;
#[cfg(windows)]
extern crate kernel32;
@@ -36,6 +37,7 @@ use walkdir::DirEntry;
use args::Args;
use out::{ColoredTerminal, Out};
+use pathutil::strip_prefix;
use printer::Printer;
use search_stream::InputBuffer;
#[cfg(windows)]
@@ -60,6 +62,7 @@ mod gitignore;
mod glob;
mod ignore;
mod out;
+mod pathutil;
mod printer;
mod search_buffer;
mod search_stream;
@@ -257,7 +260,7 @@ impl Worker {
}
WorkReady::DirFile(ent, file) => {
let mut path = ent.path();
- if let Ok(p) = path.strip_prefix("./") {
+ if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {
@@ -268,7 +271,7 @@ impl Worker {
}
WorkReady::PathFile(path, file) => {
let mut path = &*path;
- if let Ok(p) = path.strip_prefix("./") {
+ if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {
diff --git a/src/pathutil.rs b/src/pathutil.rs
new file mode 100644
index 00000000..e92416e5
--- /dev/null
+++ b/src/pathutil.rs
@@ -0,0 +1,80 @@
+/*!
+The pathutil module provides platform specific operations on paths that are
+typically faster than the same operations as provided in std::path. In
+particular, we really want to avoid the costly operation of parsing the path
+into its constituent components. We give up on Windows, but on Unix, we deal
+with the raw bytes directly.
+
+On large repositories (like chromium), this can have a ~25% performance
+improvement on just listing the files to search (!).
+*/
+use std::ffi::OsStr;
+use std::path::Path;
+
+/// Strip `prefix` from the `path` and return the remainder.
+///
+/// If `path` doesn't have a prefix `prefix`, then return `None`.
+#[cfg(unix)]
+pub fn strip_prefix<'a, P: AsRef<Path>>(
+ prefix: P,
+ path: &'a Path,
+) -> Option<&'a Path> {
+ use std::os::unix::ffi::OsStrExt;
+
+ let prefix = prefix.as_ref().as_os_str().as_bytes();
+ let path = path.as_os_str().as_bytes();
+ if prefix.len() > path.len() || prefix != &path[0..prefix.len()] {
+ None
+ } else {
+ Some(&Path::new(OsStr::from_bytes(&path[prefix.len()..])))
+ }
+}
+
+/// Strip `prefix` from the `path` and return the remainder.
+///
+/// If `path` doesn't have a prefix `prefix`, then return `None`.
+#[cfg(not(unix))]
+pub fn strip_prefix<'a>(prefix: &Path, path: &'a Path) -> Option<&'a Path> {
+ path.strip_prefix(prefix).ok()
+}
+
+/// The final component of the path, if it is a normal file.
+///
+/// If the path terminates in ., .., or consists solely of a root or prefix,
+/// file_name will return None.
+#[cfg(unix)]
+pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
+ path: &'a P,
+) -> Option<&'a OsStr> {
+ use std::os::unix::ffi::OsStrExt;
+
+ let path = path.as_ref().as_os_str().as_bytes();
+ if path.is_empty() {
+ return None;
+ } else if path.len() == 1 && path[0] == b'.' {
+ return None;
+ } else if path.last() == Some(&b'.') {
+ return None;
+ } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
+ return None;
+ }
+ let mut last_slash = 0;
+ for (i, &b) in path.iter().enumerate().rev() {
+ if b == b'/' {
+ last_slash = i;
+ break;
+ }
+ }
+ Some(OsStr::from_bytes(&path[last_slash + 1..]))
+}
+
+/// The final component of the path, if it is a normal file.
+///
+/// If the path terminates in ., .., or consists solely of a root or prefix,
+/// file_name will return None.
+#[cfg(not(unix))]
+pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
+ path: &'a P,
+) -> Option<&'a OsStr> {
+ path.as_ref().file_name()
+}
diff --git a/src/types.rs b/src/types.rs
index 6b9aea91..910f8132 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -151,8 +151,8 @@ impl FileTypeDef {
/// Types is a file type matcher.
#[derive(Clone, Debug)]
pub struct Types {
- selected: Option<glob::Set>,
- negated: Option<glob::Set>,
+ selected: Option<glob::SetYesNo>,
+ negated: Option<glob::SetYesNo>,
has_selected: bool,
unmatched_pat: Pattern,
}
@@ -165,8 +165,8 @@ impl Types {
/// If has_selected is true, then at least one file type was selected.
/// Therefore, any non-matches should be ignored.
fn new(
- selected: Option<glob::Set>,
- negated: Option<glob::Set>,
+ selected: Option<glob::SetYesNo>,
+ negated: Option<glob::SetYesNo>,
has_selected: bool,
) -> Types {
Types {
@@ -268,7 +268,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
- Some(try!(bset.build()))
+ Some(try!(bset.build_yesno()))
};
let negated_globs =
if self.negated.is_empty() {
@@ -287,7 +287,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
- Some(try!(bset.build()))
+ Some(try!(bset.build_yesno()))
};
Ok(Types::new(
selected_globs, negated_globs, !self.selected.is_empty()))