From d79add341ba4be10bb3459877318b9c5a30f5db3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 11 Oct 2016 19:57:09 -0400 Subject: Move all gitignore matching to separate crate. This PR introduces a new sub-crate, `ignore`, which primarily provides a fast recursive directory iterator that respects ignore files like gitignore and other configurable filtering rules based on globs or even file types. This results in a substantial source of complexity moved out of ripgrep's core and into a reusable component that others can now (hopefully) benefit from. While much of the ignore code carried over from ripgrep's core, a substantial portion of it was rewritten with the following goals in mind: 1. Reuse matchers built from gitignore files across directory iteration. 2. Design the matcher data structure to be amenable for parallelizing directory iteration. (Indeed, writing the parallel iterator is the next step.) Fixes #9, #44, #45 --- Cargo.lock | 57 ++-- Cargo.toml | 6 +- appveyor.yml | 8 +- ci/script.sh | 2 + globset/Cargo.toml | 3 + globset/benches/bench.rs | 3 + globset/src/lib.rs | 51 ++- globset/src/pathutil.rs | 6 +- grep/Cargo.toml | 2 +- ignore/Cargo.lock | 170 ++++++++++ ignore/Cargo.toml | 36 +++ ignore/README.md | 66 ++++ ignore/examples/walk.rs | 28 ++ ignore/src/dir.rs | 803 +++++++++++++++++++++++++++++++++++++++++++++++ ignore/src/gitignore.rs | 607 +++++++++++++++++++++++++++++++++++ ignore/src/lib.rs | 300 ++++++++++++++++++ ignore/src/overrides.rs | 202 ++++++++++++ ignore/src/pathutil.rs | 108 +++++++ ignore/src/types.rs | 568 +++++++++++++++++++++++++++++++++ ignore/src/walk.rs | 592 ++++++++++++++++++++++++++++++++++ src/args.rs | 105 +++++-- src/gitignore.rs | 455 --------------------------- src/ignore.rs | 493 ----------------------------- src/main.rs | 90 ++---- src/pathutil.rs | 78 +---- src/printer.rs | 6 +- src/terminal.rs | 0 src/types.rs | 458 --------------------------- src/walk.rs | 140 --------- tests/tests.rs | 82 +++++ 30 files changed, 3765 insertions(+), 1760 deletions(-) create mode 100644 ignore/Cargo.lock create mode 100644 ignore/Cargo.toml create mode 100644 ignore/README.md create mode 100644 ignore/examples/walk.rs create mode 100644 ignore/src/dir.rs create mode 100644 ignore/src/gitignore.rs create mode 100644 ignore/src/lib.rs create mode 100644 ignore/src/overrides.rs create mode 100644 ignore/src/pathutil.rs create mode 100644 ignore/src/types.rs create mode 100644 ignore/src/walk.rs delete mode 100644 src/gitignore.rs delete mode 100644 src/ignore.rs delete mode 100644 src/terminal.rs delete mode 100644 src/types.rs delete mode 100644 src/walk.rs diff --git a/Cargo.lock b/Cargo.lock index ba88e2cb..b10e0602 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,20 +5,18 @@ dependencies = [ "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "globset 0.1.0", "grep 0.1.3", + "ignore 0.1.0", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", "term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -44,7 +42,7 @@ version = "0.6.86" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", "strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -55,7 +53,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -65,7 +63,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "fs2" -version = "0.2.5" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -82,7 +80,7 @@ dependencies = [ "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -91,9 +89,22 @@ version = "0.1.3" dependencies = [ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ignore" +version = "0.1.0" +dependencies = [ + "globset 0.1.0", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -130,10 +141,10 @@ dependencies = [ [[package]] name = "memmap" -version = "0.2.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", + "fs2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", @@ -157,12 +168,12 @@ dependencies = [ [[package]] name = "regex" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", "simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", @@ -170,7 +181,7 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -221,7 +232,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "walkdir" -version = "0.1.8" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -244,17 +255,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)" = "4a7ef30445607f6fc8720f0a0a2c7442284b629cf0d049286860fae23e71c4d9" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344" -"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" +"checksum fs2 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "640001e1bd865c7c32806292822445af576a6866175b5225aa2087ca5e3de551" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" -"checksum memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "f20f72ed93291a72e22e8b16bb18762183bb4943f0f483da5b8be1a9e8192752" +"checksum memmap 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "065ce59af31c18ea2c419100bda6247dd4ec3099423202b12f0bd32e529fabd2" "checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad" "checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5" -"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" -"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" +"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b" "checksum simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63b5847c2d766ca7ce7227672850955802fabd779ba616aeabead4c2c3877023" "checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e" @@ -262,6 +273,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" -"checksum walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c66c0b9792f0a765345452775f3adbd28dde9d33f30d13e5dcc5ae17cf6f3780" +"checksum walkdir 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "98da26f00240118fbb7a06fa29579d1b39d34cd6e0505ea5c125b26d5260a967" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/Cargo.toml b/Cargo.toml index e0480c54..60db7c4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,19 +27,17 @@ path = "tests/tests.rs" deque = "0.3" docopt = "0.6" env_logger = "0.3" -globset = { version = "0.1.0", path = "globset" } grep = { version = "0.1.3", path = "grep" } +ignore = { version = "0.1.0", path = "ignore" } lazy_static = "0.2" libc = "0.2" log = "0.3" memchr = "0.1" -memmap = "0.2" +memmap = "0.5" num_cpus = "1" regex = "0.1.77" rustc-serialize = "0.3" term = "0.4" -thread_local = "0.2.7" -walkdir = "0.1" [target.'cfg(windows)'.dependencies] kernel32-sys = "0.2" diff --git a/appveyor.yml b/appveyor.yml index 266812db..645a525d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,6 +30,7 @@ test_script: - cargo test --verbose - cargo test --verbose --manifest-path grep/Cargo.toml - cargo test --verbose --manifest-path globset/Cargo.toml + - cargo test --verbose --manifest-path ignore/Cargo.toml before_deploy: # Generate artifacts for release @@ -59,7 +60,8 @@ deploy: branches: only: - - appveyor - - /\d+\.\d+\.\d+/ - except: - master + # - appveyor + # - /\d+\.\d+\.\d+/ + # except: + # - master diff --git a/ci/script.sh b/ci/script.sh index eca6c0f6..bf0731a2 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -23,6 +23,8 @@ run_test_suite() { cargo test --target $TARGET --verbose --manifest-path grep/Cargo.toml cargo build --target $TARGET --verbose --manifest-path globset/Cargo.toml cargo test --target $TARGET --verbose --manifest-path globset/Cargo.toml + cargo build --target $TARGET --verbose --manifest-path ignore/Cargo.toml + cargo test --target $TARGET --verbose --manifest-path ignore/Cargo.toml # sanity check the file type file target/$TARGET/debug/rg diff --git a/globset/Cargo.toml b/globset/Cargo.toml index a885ea18..b302d9cd 100644 --- a/globset/Cargo.toml +++ b/globset/Cargo.toml @@ -28,3 +28,6 @@ regex = "0.1.77" [dev-dependencies] glob = "0.2" + +[features] +simd-accel = ["regex/simd-accel"] diff --git a/globset/benches/bench.rs b/globset/benches/bench.rs index a151645d..e142ed72 100644 --- a/globset/benches/bench.rs +++ b/globset/benches/bench.rs @@ -11,6 +11,9 @@ extern crate lazy_static; extern crate regex; extern crate test; +use std::ffi::OsStr; +use std::path::Path; + use globset::{Candidate, Glob, GlobMatcher, GlobSet, GlobSetBuilder}; const EXT: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; diff --git a/globset/src/lib.rs b/globset/src/lib.rs index 056118a3..b9a36d3a 100644 --- a/globset/src/lib.rs +++ b/globset/src/lib.rs @@ -226,10 +226,21 @@ type Fnv = hash::BuildHasherDefault; /// single pass. #[derive(Clone, Debug)] pub struct GlobSet { + len: usize, strats: Vec, } impl GlobSet { + /// Returns true if this set is empty, and therefore matches nothing. + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the number of globs in this set. + pub fn len(&self) -> usize { + self.len + } + /// Returns true if any glob in this set matches the path given. pub fn is_match>(&self, path: P) -> bool { self.is_match_candidate(&Candidate::new(path.as_ref())) @@ -240,6 +251,9 @@ impl GlobSet { /// This takes a Candidate as input, which can be used to amortize the /// cost of preparing a path for matching. pub fn is_match_candidate(&self, path: &Candidate) -> bool { + if self.is_empty() { + return false; + } for strat in &self.strats { if strat.is_match(path) { return true; @@ -250,9 +264,6 @@ impl GlobSet { /// Returns the sequence number of every glob pattern that matches the /// given path. - /// - /// This takes a Candidate as input, which can be used to amortize the - /// cost of preparing a path for matching. pub fn matches>(&self, path: P) -> Vec { self.matches_candidate(&Candidate::new(path.as_ref())) } @@ -264,6 +275,9 @@ impl GlobSet { /// cost of preparing a path for matching. pub fn matches_candidate(&self, path: &Candidate) -> Vec { let mut into = vec![]; + if self.is_empty() { + return into; + } self.matches_candidate_into(path, &mut into); into } @@ -274,12 +288,32 @@ impl GlobSet { /// `into` is is cleared before matching begins, and contains the set of /// sequence numbers (in ascending order) after matching ends. If no globs /// were matched, then `into` will be empty. + pub fn matches_into>( + &self, + path: P, + into: &mut Vec, + ) { + self.matches_candidate_into(&Candidate::new(path.as_ref()), into); + } + + /// Adds the sequence number of every glob pattern that matches the given + /// path to the vec given. + /// + /// `into` is is cleared before matching begins, and contains the set of + /// sequence numbers (in ascending order) after matching ends. If no globs + /// were matched, then `into` will be empty. + /// + /// This takes a Candidate as input, which can be used to amortize the + /// cost of preparing a path for matching. pub fn matches_candidate_into( &self, path: &Candidate, into: &mut Vec, ) { into.clear(); + if self.is_empty() { + return; + } for strat in &self.strats { strat.matches_into(path, into); } @@ -288,6 +322,9 @@ impl GlobSet { } fn new(pats: &[Glob]) -> Result { + if pats.is_empty() { + return Ok(GlobSet { len: 0, strats: vec![] }); + } let mut lits = LiteralStrategy::new(); let mut base_lits = BasenameLiteralStrategy::new(); let mut exts = ExtensionStrategy::new(); @@ -330,6 +367,7 @@ impl GlobSet { prefixes.literals.len(), suffixes.literals.len(), required_exts.0.len(), regexes.literals.len()); Ok(GlobSet { + len: pats.len(), strats: vec![ GlobSetMatchStrategy::Extension(exts), GlobSetMatchStrategy::BasenameLiteral(base_lits), @@ -750,4 +788,11 @@ mod tests { assert_eq!(0, matches[0]); assert_eq!(2, matches[1]); } + + #[test] + fn empty_set_works() { + let set = GlobSetBuilder::new().build().unwrap(); + assert!(!set.is_match("")); + assert!(!set.is_match("a")); + } } diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs index 15a3283b..16bd16fc 100644 --- a/globset/src/pathutil.rs +++ b/globset/src/pathutil.rs @@ -89,16 +89,14 @@ pub fn path_bytes(path: &Path) -> Cow<[u8]> { os_str_bytes(path.as_os_str()) } -/// Return the raw bytes of the given OS string, transcoded to UTF-8 if -/// necessary. +/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8. #[cfg(unix)] pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { use std::os::unix::ffi::OsStrExt; Cow::Borrowed(s.as_bytes()) } -/// Return the raw bytes of the given OS string, transcoded to UTF-8 if -/// necessary. +/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8. #[cfg(not(unix))] pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { // TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset diff --git a/grep/Cargo.toml b/grep/Cargo.toml index d14ba886..8637f16b 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -15,6 +15,6 @@ license = "Unlicense/MIT" [dependencies] log = "0.3" memchr = "0.1" -memmap = "0.2" +memmap = "0.5" regex = "0.1.77" regex-syntax = "0.3.5" diff --git a/ignore/Cargo.lock b/ignore/Cargo.lock new file mode 100644 index 00000000..7046ecdd --- /dev/null +++ b/ignore/Cargo.lock @@ -0,0 +1,170 @@ +[root] +name = "ignore" +version = "0.1.0" +dependencies = [ + "globset 0.1.0", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "walkdir 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "aho-corasick" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fnv" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "globset" +version = "0.1.0" +dependencies = [ + "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lazy_static" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "log" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "simd" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "tempdir" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread-id" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "utf8-ranges" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "walkdir" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[metadata] +"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" +"checksum fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6cc484842f1e2884faf56f529f960cc12ad8c71ce96cc7abba0a067c98fee344" +"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" +"checksum libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)" = "408014cace30ee0f767b1c4517980646a573ec61a57957aeeabcac8ac0a02e8d" +"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" +"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5" +"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" +"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63b5847c2d766ca7ce7227672850955802fabd779ba616aeabead4c2c3877023" +"checksum tempdir 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "87974a6f5c1dfb344d733055601650059a3363de2a6104819293baff662132d6" +"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" +"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum walkdir 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "98da26f00240118fbb7a06fa29579d1b39d34cd6e0505ea5c125b26d5260a967" +"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" +"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/ignore/Cargo.toml b/ignore/Cargo.toml new file mode 100644 index 00000000..520f9cf4 --- /dev/null +++ b/ignore/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "ignore" +version = "0.1.0" #:version +authors = ["Andrew Gallant "] +description = """ +A fast library for efficiently matching ignore files such as `.gitignore` +against file paths. +""" +documentation = "https://docs.rs/ignore" +homepage = "https://github.com/BurntSushi/ripgrep/tree/master/ignore" +repository = "https://github.com/BurntSushi/ripgrep/tree/master/ignore" +readme = "README.md" +keywords = ["glob", "ignore", "gitignore", "pattern", "file"] +license = "Unlicense/MIT" + +[lib] +name = "ignore" +bench = false + +[dependencies] +globset = { version = "0.1.0", path = "../globset" } +lazy_static = "0.2" +log = "0.3" +memchr = "0.1" +regex = "0.1.77" +thread_local = "0.2.7" +walkdir = "1" + +[dev-dependencies] +tempdir = "0.3.5" + +[features] +simd-accel = ["globset/simd-accel"] + +[profile.release] +debug = true diff --git a/ignore/README.md b/ignore/README.md new file mode 100644 index 00000000..2d2907c8 --- /dev/null +++ b/ignore/README.md @@ -0,0 +1,66 @@ +ignore +====== +The ignore crate provides a fast recursive directory iterator that respects +various filters such as globs, file types and `.gitignore` files. This crate +also provides lower level direct access to gitignore and file type matchers. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.png)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/ignore.svg)](https://crates.io/crates/ignore) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/ignore](https://docs.rs/ignore) + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +ignore = "0.1" +``` + +and this to your crate root: + +```rust +extern crate ignore; +``` + +### Example + +This example shows the most basic usage of this crate. This code will +recursively traverse the current directory while automatically filtering out +files and directories according to ignore globs found in files like +`.ignore` and `.gitignore`: + + +```rust,no_run +use ignore::Walk; + +for result in Walk::new("./") { + // Each item yielded by the iterator is either a directory entry or an + // error, so either print the path or the error. + match result { + Ok(entry) => println!("{}", entry.path().display()), + Err(err) => println!("ERROR: {}", err), + } +} +``` + +### Example: advanced + +By default, the recursive directory iterator will ignore hidden files and +directories. This can be disabled by building the iterator with `WalkBuilder`: + +```rust,no_run +use ignore::WalkBuilder; + +for result in WalkBuilder::new("./").hidden(false).build() { + println!("{:?}", result); +} +``` + +See the documentation for `WalkBuilder` for many other options. diff --git a/ignore/examples/walk.rs b/ignore/examples/walk.rs new file mode 100644 index 00000000..0ce0a086 --- /dev/null +++ b/ignore/examples/walk.rs @@ -0,0 +1,28 @@ +/* +extern crate ignore; +extern crate walkdir; + +use std::env; +use std::io::{self, Write}; +use std::os::unix::ffi::OsStrExt; + +use ignore::ignore::IgnoreBuilder; +use walkdir::WalkDir; + +fn main() { + let path = env::args().nth(1).unwrap(); + let ig = IgnoreBuilder::new().build(); + let wd = WalkDir::new(path); + let walker = ignore::walk::Iter::new(ig, wd); + + let mut stdout = io::BufWriter::new(io::stdout()); + // let mut count = 0; + for dirent in walker { + // count += 1; + stdout.write(dirent.path().as_os_str().as_bytes()).unwrap(); + stdout.write(b"\n").unwrap(); + } + // println!("{}", count); +} +*/ +fn main() {} diff --git a/ignore/src/dir.rs b/ignore/src/dir.rs new file mode 100644 index 00000000..6ac00627 --- /dev/null +++ b/ignore/src/dir.rs @@ -0,0 +1,803 @@ +// This module provides a data structure, `Ignore`, that connects "directory +// traversal" with "ignore matchers." Specifically, it knows about gitignore +// semantics and precedence, and is organized based on directory hierarchy. +// Namely, every matcher logically corresponds to ignore rules from a single +// directory, and points to the matcher for its corresponding parent directory. +// In this sense, `Ignore` is a *persistent* data structure. +// +// This design was specifically chosen to make it possible to use this data +// structure in a parallel directory iterator. +// +// My initial intention was to expose this module as part of this crate's +// public API, but I think the data structure's public API is too complicated +// with non-obvious failure modes. Alas, such things haven't been documented +// well. + +use std::collections::HashMap; +use std::ffi::OsString; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, RwLock}; + +use gitignore::{self, Gitignore, GitignoreBuilder}; +use pathutil::{is_hidden, strip_prefix}; +use overrides::{self, Override}; +use types::{self, Types}; +use {Error, Match, PartialErrorBuilder}; + +/// IgnoreMatch represents information about where a match came from when using +/// the `Ignore` matcher. +#[derive(Clone, Debug)] +pub struct IgnoreMatch<'a>(IgnoreMatchInner<'a>); + +/// IgnoreMatchInner describes precisely where the match information came from. +/// This is private to allow expansion to more matchers in the future. +#[derive(Clone, Debug)] +enum IgnoreMatchInner<'a> { + Override(overrides::Glob<'a>), + Gitignore(&'a gitignore::Glob), + Types(types::Glob<'a>), + Hidden, +} + +impl<'a> IgnoreMatch<'a> { + fn overrides(x: overrides::Glob<'a>) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Override(x)) + } + + fn gitignore(x: &'a gitignore::Glob) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Gitignore(x)) + } + + fn types(x: types::Glob<'a>) -> IgnoreMatch<'a> { + IgnoreMatch(IgnoreMatchInner::Types(x)) + } + + fn hidden() -> IgnoreMatch<'static> { + IgnoreMatch(IgnoreMatchInner::Hidden) + } +} + +/// Options for the ignore matcher, shared between the matcher itself and the +/// builder. +#[derive(Clone, Copy, Debug)] +struct IgnoreOptions { + /// Whether to ignore hidden file paths or not. + hidden: bool, + /// Whether to read .ignore files. + ignore: bool, + /// Whether to read git's global gitignore file. + git_global: bool, + /// Whether to read .gitignore files. + git_ignore: bool, + /// Whether to read .git/info/exclude files. + git_exclude: bool, +} + +impl IgnoreOptions { + /// Returns true if at least one type of ignore rules should be matched. + fn should_ignores(&self) -> bool { + self.ignore || self.git_global || self.git_ignore || self.git_exclude + } +} + +/// Ignore is a matcher useful for recursively walking one or more directories. +#[derive(Clone, Debug)] +pub struct Ignore(Arc); + +#[derive(Clone, Debug)] +struct IgnoreInner { + /// A map of all existing directories that have already been + /// compiled into matchers. + /// + /// Note that this is never used during matching, only when adding new + /// parent directory matchers. This avoids needing to rebuild glob sets for + /// parent directories if many paths are being searched. + compiled: Arc>>, + /// The path to the directory that this matcher was built from. + dir: PathBuf, + /// An override matcher (default is empty). + overrides: Arc, + /// A file type matcher. + types: Arc, + /// The parent directory to match next. + /// + /// If this is the root directory or there are otherwise no more + /// directories to match, then `parent` is `None`. + parent: Option, + /// Whether this is an absolute parent matcher, as added by add_parent. + is_absolute_parent: bool, + /// The absolute base path of this matcher. Populated only if parent + /// directories are added. + absolute_base: Option>, + /// Explicit ignore matchers specified by the caller. + explicit_ignores: Arc>, + /// The matcher for .ignore files. + ignore_matcher: Gitignore, + /// A global gitignore matcher, usually from $XDG_CONFIG_HOME/git/ignore. + git_global_matcher: Arc, + /// The matcher for .gitignore files. + git_ignore_matcher: Gitignore, + /// Special matcher for `.git/info/exclude` files. + git_exclude_matcher: Gitignore, + /// Whether this directory contains a .git sub-directory. + has_git: bool, + /// Ignore config. + opts: IgnoreOptions, +} + +impl Ignore { + /// Return the directory path of this matcher. + #[allow(dead_code)] + pub fn path(&self) -> &Path { + &self.0.dir + } + + /// Return true if this matcher has no parent. + pub fn is_root(&self) -> bool { + self.0.parent.is_none() + } + + /// Return this matcher's parent, if one exists. + pub fn parent(&self) -> Option { + self.0.parent.clone() + } + + /// Create a new `Ignore` matcher with the parent directories of `dir`. + /// + /// Note that this can only be called on an `Ignore` matcher with no + /// parents (i.e., `is_root` returns `true`). This will panic otherwise. + pub fn add_parents>( + &self, + path: P, + ) -> (Ignore, Option) { + if !self.is_root() { + panic!("Ignore::add_parents called on non-root matcher"); + } + let absolute_base = match path.as_ref().canonicalize() { + Ok(path) => Arc::new(path), + Err(_) => { + // There's not much we can do here, so just return our + // existing matcher. We drop the error to be consistent + // with our general pattern of ignoring I/O errors when + // processing ignore files. + return (self.clone(), None); + } + }; + // List of parents, from child to root. + let mut parents = vec![]; + let mut path = &**absolute_base; + while let Some(parent) = path.parent() { + parents.push(parent); + path = parent; + } + let mut errs = PartialErrorBuilder::default(); + let mut ig = self.clone(); + for parent in parents.into_iter().rev() { + let mut compiled = self.0.compiled.write().unwrap(); + if let Some(prebuilt) = compiled.get(parent.as_os_str()) { + ig = prebuilt.clone(); + continue; + } + let (mut igtmp, err) = ig.add_child_path(parent); + errs.maybe_push(err); + igtmp.is_absolute_parent = true; + igtmp.absolute_base = Some(absolute_base.clone()); + ig = Ignore(Arc::new(igtmp)); + compiled.insert(parent.as_os_str().to_os_string(), ig.clone()); + } + (ig, errs.into_error_option()) + } + + /// Create a new `Ignore` matcher for the given child directory. + /// + /// Since building the matcher may require reading from multiple + /// files, it's possible that this method partially succeeds. Therefore, + /// a matcher is always returned (which may match nothing) and an error is + /// returned if it exists. + /// + /// Note that all I/O errors are completely ignored. + pub fn add_child>( + &self, + dir: P, + ) -> (Ignore, Option) { + let (ig, err) = self.add_child_path(dir.as_ref()); + (Ignore(Arc::new(ig)), err) + } + + /// Like add_child, but takes a full path and returns an IgnoreInner. + fn add_child_path(&self, dir: &Path) -> (IgnoreInner, Option) { + static IG_NAMES: &'static [&'static str] = &[".rgignore", ".ignore"]; + + let mut errs = PartialErrorBuilder::default(); + let ig_matcher = + if !self.0.opts.ignore { + Gitignore::empty() + } else { + let (m, err) = create_gitignore(&dir, IG_NAMES); + errs.maybe_push(err); + m + }; + let gi_matcher = + if !self.0.opts.git_ignore { + Gitignore::empty() + } else { + let (m, err) = create_gitignore(&dir, &[".gitignore"]); + errs.maybe_push(err); + m + }; + let gi_exclude_matcher = + if !self.0.opts.git_exclude { + Gitignore::empty() + } else { + let (m, err) = create_gitignore(&dir, &[".git/info/exclude"]); + errs.maybe_push(err); + m + }; + let ig = IgnoreInner { + compiled: self.0.compiled.clone(), + dir: dir.to_path_buf(), + overrides: self.0.overrides.clone(), + types: self.0.types.clone(), + parent: Some(self.clone()), + is_absolute_parent: false, + absolute_base: self.0.absolute_base.clone(), + explicit_ignores: self.0.explicit_ignores.clone(), + ignore_matcher: ig_matcher, + git_global_matcher: self.0.git_global_matcher.clone(), + git_ignore_matcher: gi_matcher, + git_exclude_matcher: gi_exclude_matcher, + has_git: dir.join(".git").is_dir(), + opts: self.0.opts, + }; + (ig, errs.into_error_option()) + } + + /// Returns a match indicating whether the given file path should be + /// ignored or not. + /// + /// The match contains information about its origin. + pub fn matched<'a, P: AsRef>( + &'a self, + path: P, + is_dir: bool, + ) -> Match> { + // We need to be careful with our path. If it has a leading ./, then + // strip it because it causes nothing but trouble. + let mut path = path.as_ref(); + if let Some(p) = strip_prefix("./", path) { + path = p; + } + // Match against the override patterns. If an override matches + // regardless of whether it's whitelist/ignore, then we quit and + // return that result immediately. Overrides have the highest + // precedence. + if !self.0.overrides.is_empty() { + let mat = + self.0.overrides.matched(path, is_dir) + .map(IgnoreMatch::overrides); + if !mat.is_none() { + return mat; + } + } + let mut whitelisted = Match::None; + if self.0.opts.should_ignores() { + let mat = self.matched_ignore(path, is_dir); + if mat.is_ignore() { + return mat; + } else if mat.is_whitelist() { + whitelisted = mat; + } + } + if !self.0.types.is_empty() { + let mat = + self.0.types.matched(path, is_dir).map(IgnoreMatch::types); + if mat.is_ignore() { + return mat; + } else if mat.is_whitelist() { + whitelisted = mat; + } + } + if whitelisted.is_none() && self.0.opts.hidden && is_hidden(path) { + return Match::Ignore(IgnoreMatch::hidden()); + } + whitelisted + } + + /// Performs matching only on the ignore files for this directory and + /// all parent directories. + fn matched_ignore<'a>( + &'a self, + path: &Path, + is_dir: bool, + ) -> Match> { + let (mut m_ignore, mut m_gi, mut m_gi_exclude, mut m_explicit) = + (Match::None, Match::None, Match::None, Match::None); + let mut saw_git = false; + for ig in self.parents().take_while(|ig| !ig.0.is_absolute_parent) { + if m_ignore.is_none() { + m_ignore = + ig.0.ignore_matcher.matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + if !saw_git && m_gi.is_none() { + m_gi = + ig.0.git_ignore_matcher.matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + if !saw_git && m_gi_exclude.is_none() { + m_gi_exclude = + ig.0.git_exclude_matcher.matched(path, is_dir) + .map(IgnoreMatch::gitignore); + } + saw_git = saw_git || ig.0.has_git; + } + if let Some(abs_parent_path) = self.absolute_base() { + let path = abs_parent_path.join(path); + for ig in self.parents().skip_while(|ig|!ig.0.is_absolute_parent) { + if m_ignore.is_none() { + m_ignore = + ig.0.ignore_matcher.matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if !saw_git && m_gi.is_none() { + m_gi = + ig.0.git_ignore_matcher.matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + if !saw_git && m_gi_exclude.is_none() { + m_gi_exclude = + ig.0.git_exclude_matcher.matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + } + saw_git = saw_git || ig.0.has_git; + } + } + for gi in self.0.explicit_ignores.iter().rev() { + if !m_explicit.is_none() { + break; + } + m_explicit = gi.matched(&path, is_dir).map(IgnoreMatch::gitignore); + } + let m_global = self.0.git_global_matcher.matched(&path, is_dir) + .map(IgnoreMatch::gitignore); + if !m_ignore.is_none() { + m_ignore + } else if !m_gi.is_none() { + m_gi + } else if !m_gi_exclude.is_none() { + m_gi_exclude + } else if !m_global.is_none() { + m_global + } else if !m_explicit.is_none() { + m_explicit + } else { + Match::None + } + } + + /// Returns an iterator over parent ignore matchers, including this one. + fn parents(&self) -> Parents { + Parents(Some(self)) + } + + /// Returns the first absolute path of the first absolute parent, if + /// one exists. + fn absolute_base(&self) -> Option<&Path> { + self.0.absolute_base.as_ref().map(|p| &***p) + } +} + +struct Parents<'a>(Option<&'a Ignore>); + +impl<'a> Iterator for Parents<'a> { + type Item = &'a Ignore; + + fn next(&mut self) -> Option<&'a Ignore> { + match self.0.take() { + None => None, + Some(ig) => { + self.0 = ig.0.parent.as_ref(); + Some(ig) + } + } + } +} + +/// A builder for creating an Ignore matcher. +#[derive(Clone, Debug)] +pub struct IgnoreBuilder { + /// The root directory path for this ignore matcher. + dir: PathBuf, + /// An override matcher (default is empty). + overrides: Arc, + /// A type matcher (default is empty). + types: Arc, + /// Explicit ignore matchers. + explicit_ignores: Vec, + /// Ignore config. + opts: IgnoreOptions, +} + +impl IgnoreBuilder { + /// Create a new builder for an `Ignore` matcher. + /// + /// All relative file paths are resolved with respect to the current + /// working directory. + pub fn new() -> IgnoreBuilder { + IgnoreBuilder { + dir: Path::new("").to_path_buf(), + overrides: Arc::new(Override::empty()), + types: Arc::new(Types::empty()), + explicit_ignores: vec![], + opts: IgnoreOptions { + hidden: true, + ignore: true, + git_global: true, + git_ignore: true, + git_exclude: true, + }, + } + } + + /// Builds a new `Ignore` matcher. + /// + /// The matcher returned won't match anything until ignore rules from + /// directories are added to it. + pub fn build(&self) -> Ignore { + let git_global_matcher = + if !self.opts.git_global { + Gitignore::empty() + } else { + let (gi, err) = Gitignore::global(); + if let Some(err) = err { + debug!("{}", err); + } + gi + }; + Ignore(Arc::new(IgnoreInner { + compiled: Arc::new(RwLock::new(HashMap::new())), + dir: self.dir.clone(), + overrides: self.overrides.clone(), + types: self.types.clone(), + parent: None, + is_absolute_parent: true, + absolute_base: None, + explicit_ignores: Arc::new(self.explicit_ignores.clone()), + ignore_matcher: Gitignore::empty(), + git_global_matcher: Arc::new(git_global_matcher), + git_ignore_matcher: Gitignore::empty(), + git_exclude_matcher: Gitignore::empty(), + has_git: false, + opts: self.opts, + })) + } + + /// Add an override matcher. + /// + /// By default, no override matcher is used. + /// + /// This overrides any previous setting. + pub fn overrides(&mut self, overrides: Override) -> &mut IgnoreBuilder { + self.overrides = Arc::new(overrides); + self + } + + /// Add a file type matcher. + /// + /// By default, no file type matcher is used. + /// + /// This overrides any previous setting. + pub fn types(&mut self, types: Types) -> &mut IgnoreBuilder { + self.types = Arc::new(types); + self + } + + /// Adds a new global ignore matcher from the ignore file path given. + pub fn add_ignore(&mut self, ig: Gitignore) -> &mut IgnoreBuilder { + self.explicit_ignores.push(ig); + self + } + + /// Enables ignoring hidden files. + /// + /// This is enabled by default. + pub fn hidden(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.hidden = yes; + self + } + + /// Enables reading `.ignore` files. + /// + /// `.ignore` files have the same semantics as `gitignore` files and are + /// supported by search tools such as ripgrep and The Silver Searcher. + /// + /// This is enabled by default. + pub fn ignore(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.ignore = yes; + self + } + + /// Add a global gitignore matcher. + /// + /// Its precedence is lower than both normal `.gitignore` files and + /// `.git/info/exclude` files. + /// + /// This overwrites any previous global gitignore setting. + /// + /// This is enabled by default. + pub fn git_global(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_global = yes; + self + } + + /// Enables reading `.gitignore` files. + /// + /// `.gitignore` files have match semantics as described in the `gitignore` + /// man page. + /// + /// This is enabled by default. + pub fn git_ignore(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_ignore = yes; + self + } + + /// Enables reading `.git/info/exclude` files. + /// + /// `.git/info/exclude` files have match semantics as described in the + /// `gitignore` man page. + /// + /// This is enabled by default. + pub fn git_exclude(&mut self, yes: bool) -> &mut IgnoreBuilder { + self.opts.git_exclude = yes; + self + } +} + +/// Creates a new gitignore matcher for the directory given. +/// +/// Ignore globs are extracted from each of the file names in `dir` in the +/// order given (earlier names have lower precedence than later names). +/// +/// I/O errors are ignored. +pub fn create_gitignore( + dir: &Path, + names: &[&str], +) -> (Gitignore, Option) { + let mut builder = GitignoreBuilder::new(dir); + let mut errs = PartialErrorBuilder::default(); + for name in names { + let gipath = dir.join(name); + errs.maybe_push_ignore_io(builder.add(gipath)); + } + let gi = match builder.build() { + Ok(gi) => gi, + Err(err) => { + errs.push(err); + GitignoreBuilder::new(dir).build().unwrap() + } + }; + (gi, errs.into_error_option()) +} + +#[cfg(test)] +mod tests { + use std::fs::{self, File}; + use std::io::Write; + use std::path::Path; + + use tempdir::TempDir; + + use dir::IgnoreBuilder; + use gitignore::Gitignore; + use Error; + + fn wfile>(path: P, contents: &str) { + let mut file = File::create(path).unwrap(); + file.write_all(contents.as_bytes()).unwrap(); + } + + fn mkdirp>(path: P) { + fs::create_dir_all(path).unwrap(); + } + + fn partial(err: Error) -> Vec { + match err { + Error::Partial(errs) => errs, + _ => panic!("expected partial error but got {:?}", err), + } + } + + #[test] + fn explicit_ignore() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join("not-an-ignore"), "foo\n!bar"); + + let (gi, err) = Gitignore::new(td.path().join("not-an-ignore")); + assert!(err.is_none()); + let (ig, err) = IgnoreBuilder::new() + .add_ignore(gi).build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn git_exclude() { + let td = TempDir::new("ignore-test-").unwrap(); + mkdirp(td.path().join(".git/info")); + wfile(td.path().join(".git/info/exclude"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn gitignore() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + #[test] + fn ignore() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".ignore"), "foo\n!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_ignore()); + assert!(ig.matched("bar", false).is_whitelist()); + assert!(ig.matched("baz", false).is_none()); + } + + // Tests that an .ignore will override a .gitignore. + #[test] + fn ignore_over_gitignore() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "foo"); + wfile(td.path().join(".ignore"), "!foo"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("foo", false).is_whitelist()); + } + + // Tests that exclude has lower precedent than both .ignore and .gitignore. + #[test] + fn exclude_lowest() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "!foo"); + wfile(td.path().join(".ignore"), "!bar"); + mkdirp(td.path().join(".git/info")); + wfile(td.path().join(".git/info/exclude"), "foo\nbar\nbaz"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + assert!(ig.matched("baz", false).is_ignore()); + assert!(ig.matched("foo", false).is_whitelist()); + assert!(ig.matched("bar", false).is_whitelist()); + } + + #[test] + fn errored() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "f**oo"); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + } + + #[test] + fn errored_both() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "f**oo"); + wfile(td.path().join(".ignore"), "fo**o"); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert_eq!(2, partial(err.expect("an error")).len()); + } + + #[test] + fn errored_partial() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "f**oo\nbar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + assert!(ig.matched("bar", false).is_ignore()); + } + + #[test] + fn errored_partial_and_ignore() { + let td = TempDir::new("ignore-test-").unwrap(); + wfile(td.path().join(".gitignore"), "f**oo\nbar"); + wfile(td.path().join(".ignore"), "!bar"); + + let (ig, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_some()); + assert!(ig.matched("bar", false).is_whitelist()); + } + + #[test] + fn not_present_empty() { + let td = TempDir::new("ignore-test-").unwrap(); + + let (_, err) = IgnoreBuilder::new().build().add_child(td.path()); + assert!(err.is_none()); + } + + #[test] + fn stops_at_git_dir() { + // This tests that .gitignore files beyond a .git barrier aren't + // matched, but .ignore files are. + let td = TempDir::new("ignore-test-").unwrap(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("foo/.git")); + wfile(td.path().join(".gitignore"), "foo"); + wfile(td.path().join(".ignore"), "bar"); + + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_child(td.path()); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child(ig1.path().join("foo")); + assert!(err.is_none()); + + assert!(ig1.matched("foo", false).is_ignore()); + assert!(ig2.matched("foo", false).is_none()); + + assert!(ig1.matched("bar", false).is_ignore()); + assert!(ig2.matched("bar", false).is_ignore()); + } + + #[test] + fn absolute_parent() { + let td = TempDir::new("ignore-test-").unwrap(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("foo")); + wfile(td.path().join(".gitignore"), "bar"); + + // First, check that the parent gitignore file isn't detected if the + // parent isn't added. This establishes a baseline. + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_child(td.path().join("foo")); + assert!(err.is_none()); + assert!(ig1.matched("bar", false).is_none()); + + // Second, check that adding a parent directory actually works. + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_parents(td.path().join("foo")); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child(td.path().join("foo")); + assert!(err.is_none()); + assert!(ig2.matched("bar", false).is_ignore()); + } + + #[test] + fn absolute_parent_anchored() { + let td = TempDir::new("ignore-test-").unwrap(); + mkdirp(td.path().join(".git")); + mkdirp(td.path().join("src/llvm")); + wfile(td.path().join(".gitignore"), "/llvm/\nfoo"); + + let ig0 = IgnoreBuilder::new().build(); + let (ig1, err) = ig0.add_parents(td.path().join("src")); + assert!(err.is_none()); + let (ig2, err) = ig1.add_child("src"); + assert!(err.is_none()); + + assert!(ig1.matched("llvm", true).is_none()); + assert!(ig2.matched("llvm", true).is_none()); + assert!(ig2.matched("src/llvm", true).is_none()); + assert!(ig2.matched("foo", false).is_ignore()); + assert!(ig2.matched("src/foo", false).is_ignore()); + } +} diff --git a/ignore/src/gitignore.rs b/ignore/src/gitignore.rs new file mode 100644 index 00000000..c44910ff --- /dev/null +++ b/ignore/src/gitignore.rs @@ -0,0 +1,607 @@ +/*! +The gitignore module provides a way to match globs from a gitignore file +against file paths. + +Note that this module implements the specification as described in the +`gitignore` man page from scratch. That is, this module does *not* shell out to +the `git` command line tool. +*/ + +use std::cell::RefCell; +use std::env; +use std::fs::File; +use std::io::{self, BufRead, Read}; +use std::path::{Path, PathBuf}; +use std::str; +use std::sync::Arc; + +use globset::{Candidate, GlobBuilder, GlobSet, GlobSetBuilder}; +use regex::bytes::Regex; +use thread_local::ThreadLocal; + +use pathutil::{is_file_name, strip_prefix}; +use {Error, Match, PartialErrorBuilder}; + +/// Glob represents a single glob in a gitignore file. +/// +/// This is used to report information about the highest precedent glob that +/// matched in one or more gitignore files. +#[derive(Clone, Debug)] +pub struct Glob { + /// The file path that this glob was extracted from. + from: Option, + /// The original glob string. + original: String, + /// The actual glob string used to convert to a regex. + actual: String, + /// Whether this is a whitelisted glob or not. + is_whitelist: bool, + /// Whether this glob should only match directories or not. + is_only_dir: bool, +} + +impl Glob { + /// Returns the file path that defined this glob. + pub fn from(&self) -> Option<&Path> { + self.from.as_ref().map(|p| &**p) + } + + /// The original glob as it was defined in a gitignore file. + pub fn original(&self) -> &str { + &self.original + } + + /// The actual glob that was compiled to respect gitignore + /// semantics. + pub fn actual(&self) -> &str { + &self.actual + } + + /// Whether this was a whitelisted glob or not. + pub fn is_whitelist(&self) -> bool { + self.is_whitelist + } + + /// Whether this glob must match a directory or not. + pub fn is_only_dir(&self) -> bool { + self.is_only_dir + } +} + +/// Gitignore is a matcher for the globs in one or more gitignore files +/// in the same directory. +#[derive(Clone, Debug)] +pub struct Gitignore { + set: GlobSet, + root: PathBuf, + globs: Vec, + num_ignores: u64, + num_whitelists: u64, + matches: Arc>>>, +} + +impl Gitignore { + /// Creates a new gitignore matcher from the gitignore file path given. + /// + /// If it's desirable to include multiple gitignore files in a single + /// matcher, or read gitignore globs from a different source, then + /// use `GitignoreBuilder`. + /// + /// This always returns a valid matcher, even if it's empty. In particular, + /// a Gitignore file can be partially valid, e.g., when one glob is invalid + /// but the rest aren't. + /// + /// Note that I/O errors are ignored. For more granular control over + /// errors, use `GitignoreBuilder`. + pub fn new>( + gitignore_path: P, + ) -> (Gitignore, Option) { + let path = gitignore_path.as_ref(); + let parent = path.parent().unwrap_or(Path::new("/")); + let mut builder = GitignoreBuilder::new(paren