Move all gitignore matching to separate crate.

This PR introduces a new sub-crate, `ignore`, which primarily provides a fast recursive directory iterator that respects ignore files like gitignore and other configurable filtering rules based on globs or even file types. This moves a substantial source of complexity out of ripgrep's core and into a reusable component that others can now (hopefully) benefit from. While much of the ignore code was carried over from ripgrep's core, a substantial portion of it was rewritten with the following goals in mind: 1. Reuse matchers built from gitignore files across directory iteration. 2. Design the matcher data structure to be amenable to parallelizing directory iteration. (Indeed, writing the parallel iterator is the next step.) Fixes #9, #44, #45
author: Andrew Gallant <jamslam@gmail.com> 2016-10-11 19:57:09 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2016-10-29 20:48:59 -0400
commit: d79add341ba4be10bb3459877318b9c5a30f5db3 (patch)
tree: a6c5222c63d53522635bc847c6ac2cf2e000ff7f /globset
parent: 12b2b1f6242e0c9082e93111ffef24a93fea5f6e (diff)
4 files changed, 56 insertions, 7 deletions
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
index a885ea18..b302d9cd 100644
--- a/globset/Cargo.toml
+++ b/globset/Cargo.toml
@@ -28,3 +28,6 @@ regex = "0.1.77"
 
 [dev-dependencies]
 glob = "0.2"
+
+[features]
+simd-accel = ["regex/simd-accel"]
diff --git a/globset/benches/bench.rs b/globset/benches/bench.rs
index a151645d..e142ed72 100644
--- a/globset/benches/bench.rs
+++ b/globset/benches/bench.rs
@@ -11,6 +11,9 @@ extern crate lazy_static;
 extern crate regex;
 extern crate test;
 
+use std::ffi::OsStr;
+use std::path::Path;
+
 use globset::{Candidate, Glob, GlobMatcher, GlobSet, GlobSetBuilder};
 
 const EXT: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt";
diff --git a/globset/src/lib.rs b/globset/src/lib.rs
index 056118a3..b9a36d3a 100644
--- a/globset/src/lib.rs
+++ b/globset/src/lib.rs
@@ -226,10 +226,21 @@ type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
 /// single pass.
 #[derive(Clone, Debug)]
 pub struct GlobSet {
+    len: usize,
     strats: Vec<GlobSetMatchStrategy>,
 }
 
 impl GlobSet {
+    /// Returns true if this set is empty, and therefore matches nothing.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Returns the number of globs in this set.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
     /// Returns true if any glob in this set matches the path given.
     pub fn is_match<P: AsRef<Path>>(&self, path: P) -> bool {
         self.is_match_candidate(&Candidate::new(path.as_ref()))
@@ -240,6 +251,9 @@ impl GlobSet {
     /// This takes a Candidate as input, which can be used to amortize the
     /// cost of preparing a path for matching.
     pub fn is_match_candidate(&self, path: &Candidate) -> bool {
+        if self.is_empty() {
+            return false;
+        }
         for strat in &self.strats {
             if strat.is_match(path) {
                 return true;
@@ -250,9 +264,6 @@ impl GlobSet {
 
     /// Returns the sequence number of every glob pattern that matches the
     /// given path.
-    ///
-    /// This takes a Candidate as input, which can be used to amortize the
-    /// cost of preparing a path for matching.
     pub fn matches<P: AsRef<Path>>(&self, path: P) -> Vec<usize> {
         self.matches_candidate(&Candidate::new(path.as_ref()))
     }
@@ -264,6 +275,9 @@ impl GlobSet {
     /// cost of preparing a path for matching.
     pub fn matches_candidate(&self, path: &Candidate) -> Vec<usize> {
         let mut into = vec![];
+        if self.is_empty() {
+            return into;
+        }
         self.matches_candidate_into(path, &mut into);
         into
     }
@@ -274,12 +288,32 @@ impl GlobSet {
     /// `into` is is cleared before matching begins, and contains the set of
     /// sequence numbers (in ascending order) after matching ends. If no globs
     /// were matched, then `into` will be empty.
+    pub fn matches_into<P: AsRef<Path>>(
+        &self,
+        path: P,
+        into: &mut Vec<usize>,
+    ) {
+        self.matches_candidate_into(&Candidate::new(path.as_ref()), into);
+    }
+
+    /// Adds the sequence number of every glob pattern that matches the given
+    /// path to the vec given.
+    ///
+    /// `into` is cleared before matching begins, and contains the set of
+    /// sequence numbers (in ascending order) after matching ends. If no globs
+    /// were matched, then `into` will be empty.
+    ///
+    /// This takes a Candidate as input, which can be used to amortize the
+    /// cost of preparing a path for matching.
     pub fn matches_candidate_into(
         &self,
         path: &Candidate,
         into: &mut Vec<usize>,
     ) {
         into.clear();
+        if self.is_empty() {
+            return;
+        }
         for strat in &self.strats {
             strat.matches_into(path, into);
         }
@@ -288,6 +322,9 @@ impl GlobSet {
     }
 
     fn new(pats: &[Glob]) -> Result<GlobSet, Error> {
+        if pats.is_empty() {
+            return Ok(GlobSet { len: 0, strats: vec![] });
+        }
         let mut lits = LiteralStrategy::new();
         let mut base_lits = BasenameLiteralStrategy::new();
         let mut exts = ExtensionStrategy::new();
@@ -330,6 +367,7 @@ impl GlobSet {
                 prefixes.literals.len(), suffixes.literals.len(),
                 required_exts.0.len(), regexes.literals.len());
         Ok(GlobSet {
+            len: pats.len(),
             strats: vec![
                 GlobSetMatchStrategy::Extension(exts),
                 GlobSetMatchStrategy::BasenameLiteral(base_lits),
@@ -750,4 +788,11 @@ mod tests {
         assert_eq!(0, matches[0]);
         assert_eq!(2, matches[1]);
     }
+
+    #[test]
+    fn empty_set_works() {
+        let set = GlobSetBuilder::new().build().unwrap();
+        assert!(!set.is_match(""));
+        assert!(!set.is_match("a"));
+    }
 }
diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs
index 15a3283b..16bd16fc 100644
--- a/globset/src/pathutil.rs
+++ b/globset/src/pathutil.rs
@@ -89,16 +89,14 @@ pub fn path_bytes(path: &Path) -> Cow<[u8]> {
     os_str_bytes(path.as_os_str())
 }
 
-/// Return the raw bytes of the given OS string, transcoded to UTF-8 if
-/// necessary.
+/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
 #[cfg(unix)]
 pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
     use std::os::unix::ffi::OsStrExt;
     Cow::Borrowed(s.as_bytes())
 }
 
-/// Return the raw bytes of the given OS string, transcoded to UTF-8 if
-/// necessary.
+/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
 #[cfg(not(unix))]
 pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
     // TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset
author	Andrew Gallant <jamslam@gmail.com>	2016-10-11 19:57:09 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2016-10-29 20:48:59 -0400
commit	d79add341ba4be10bb3459877318b9c5a30f5db3 (patch)
tree	a6c5222c63d53522635bc847c6ac2cf2e000ff7f /globset
parent	12b2b1f6242e0c9082e93111ffef24a93fea5f6e (diff)