summaryrefslogtreecommitdiffstats
path: root/ignore/src/gitignore.rs
diff options
context:
space:
mode:
author Andrew Gallant <jamslam@gmail.com> 2016-10-11 19:57:09 -0400
committer Andrew Gallant <jamslam@gmail.com> 2016-10-29 20:48:59 -0400
commit d79add341ba4be10bb3459877318b9c5a30f5db3 (patch)
tree a6c5222c63d53522635bc847c6ac2cf2e000ff7f /ignore/src/gitignore.rs
parent 12b2b1f6242e0c9082e93111ffef24a93fea5f6e (diff)
Move all gitignore matching to separate crate.
This PR introduces a new sub-crate, `ignore`, which primarily provides a fast recursive directory iterator that respects ignore files like gitignore and other configurable filtering rules based on globs or even file types. This results in a substantial source of complexity moved out of ripgrep's core and into a reusable component that others can now (hopefully) benefit from. While much of the ignore code carried over from ripgrep's core, a substantial portion of it was rewritten with the following goals in mind: 1. Reuse matchers built from gitignore files across directory iteration. 2. Design the matcher data structure to be amenable for parallelizing directory iteration. (Indeed, writing the parallel iterator is the next step.) Fixes #9, #44, #45
Diffstat (limited to 'ignore/src/gitignore.rs')
-rw-r--r-- ignore/src/gitignore.rs 607
1 files changed, 607 insertions, 0 deletions
diff --git a/ignore/src/gitignore.rs b/ignore/src/gitignore.rs
new file mode 100644
index 00000000..c44910ff
--- /dev/null
+++ b/ignore/src/gitignore.rs
@@ -0,0 +1,607 @@
+/*!
+The gitignore module provides a way to match globs from a gitignore file
+against file paths.
+
+Note that this module implements the specification as described in the
+`gitignore` man page from scratch. That is, this module does *not* shell out to
+the `git` command line tool.
+*/
+
+use std::cell::RefCell;
+use std::env;
+use std::fs::File;
+use std::io::{self, BufRead, Read};
+use std::path::{Path, PathBuf};
+use std::str;
+use std::sync::Arc;
+
+use globset::{Candidate, GlobBuilder, GlobSet, GlobSetBuilder};
+use regex::bytes::Regex;
+use thread_local::ThreadLocal;
+
+use pathutil::{is_file_name, strip_prefix};
+use {Error, Match, PartialErrorBuilder};
+
+/// Glob represents a single glob in a gitignore file.
+///
+/// This is used to report information about the highest-precedence glob
+/// that matched in one or more gitignore files.
+#[derive(Clone, Debug)]
+pub struct Glob {
+ /// The file path that this glob was extracted from, or `None` when the
+ /// caller that added the line did not supply a source path.
+ from: Option<PathBuf>,
+ /// The original glob string, as it appeared on its line after trailing
+ /// whitespace was trimmed.
+ original: String,
+ /// The actual glob string handed to the glob compiler, i.e. `original`
+ /// after gitignore-specific rewriting (marker removal, `**/` prefixing).
+ actual: String,
+ /// Whether this is a whitelisted glob or not (the line began with `!`).
+ is_whitelist: bool,
+ /// Whether this glob should only match directories or not (the line
+ /// ended with `/`).
+ is_only_dir: bool,
+}
+
+impl Glob {
+    /// The path of the gitignore file this glob came from, if one was
+    /// recorded when the glob was added.
+    pub fn from(&self) -> Option<&Path> {
+        match self.from {
+            Some(ref source) => Some(source.as_path()),
+            None => None,
+        }
+    }
+
+    /// The glob text exactly as it was written in the gitignore file.
+    pub fn original(&self) -> &str {
+        self.original.as_str()
+    }
+
+    /// The rewritten glob text that was actually compiled in order to
+    /// implement gitignore semantics.
+    pub fn actual(&self) -> &str {
+        self.actual.as_str()
+    }
+
+    /// True when this glob re-includes (whitelists) paths rather than
+    /// ignoring them.
+    pub fn is_whitelist(&self) -> bool {
+        self.is_whitelist
+    }
+
+    /// True when this glob is only allowed to match directories.
+    pub fn is_only_dir(&self) -> bool {
+        self.is_only_dir
+    }
+}
+
+/// Gitignore is a matcher for the globs in one or more gitignore files
+/// in the same directory.
+#[derive(Clone, Debug)]
+pub struct Gitignore {
+ // The compiled set of every glob; candidate paths are matched against it.
+ set: GlobSet,
+ // The directory that paths are matched relative to. It is returned by
+ // `path()` and stripped from candidate paths before matching.
+ root: PathBuf,
+ // Metadata for each glob, indexed by the match indices that `set` reports.
+ globs: Vec<Glob>,
+ // Number of globs that are not whitelist globs.
+ num_ignores: u64,
+ // Number of whitelist (`!`) globs.
+ num_whitelists: u64,
+ // Scratch buffer that receives match indices during a lookup, fetched
+ // through `ThreadLocal::get_default` (presumably one buffer per thread).
+ matches: Arc<ThreadLocal<RefCell<Vec<usize>>>>,
+}
+
+impl Gitignore {
+    /// Builds a matcher from the single gitignore file at `gitignore_path`.
+    ///
+    /// The matcher's root is the parent directory of the given path (or `/`
+    /// when the path has no parent). To combine several gitignore files in
+    /// one matcher, or to feed globs from another source, use
+    /// `GitignoreBuilder` instead.
+    ///
+    /// A matcher is always returned, even if it ends up empty: a gitignore
+    /// file may be partially valid, in which case the valid globs are kept
+    /// and the problems are reported through the second tuple element.
+    ///
+    /// Note that I/O errors are ignored. For more granular control over
+    /// errors, use `GitignoreBuilder`.
+    pub fn new<P: AsRef<Path>>(
+        gitignore_path: P,
+    ) -> (Gitignore, Option<Error>) {
+        let file_path = gitignore_path.as_ref();
+        let root = match file_path.parent() {
+            Some(dir) => dir,
+            None => Path::new("/"),
+        };
+        let mut builder = GitignoreBuilder::new(root);
+        let mut errs = PartialErrorBuilder::default();
+        errs.maybe_push_ignore_io(builder.add(file_path));
+        let matcher = match builder.build() {
+            Ok(gi) => gi,
+            Err(err) => {
+                // The glob set itself could not be built: report why and
+                // fall back to a matcher that never matches.
+                errs.push(err);
+                Gitignore::empty()
+            }
+        };
+        (matcher, errs.into_error_option())
+    }
+
+    /// Builds a matcher from the user's global ignore file, if one exists.
+    ///
+    /// The global ignore file is the one named by git's `core.excludesFile`
+    /// setting in `$HOME/.gitconfig`. When that config file is missing or
+    /// does not set `core.excludesFile`, `$XDG_CONFIG_HOME/git/ignore` is
+    /// used; when `$XDG_CONFIG_HOME` is unset or empty, the location is
+    /// derived from `$HOME` instead.
+    ///
+    /// If no location can be determined, or the location is not a regular
+    /// file, an empty matcher and no error are returned.
+    pub fn global() -> (Gitignore, Option<Error>) {
+        let path = match gitconfig_excludes_path() {
+            Some(path) => path,
+            None => return (Gitignore::empty(), None),
+        };
+        if path.is_file() {
+            Gitignore::new(path)
+        } else {
+            (Gitignore::empty(), None)
+        }
+    }
+
+    /// Returns a matcher with no globs, which therefore never matches.
+    ///
+    /// Its root path is empty.
+    pub fn empty() -> Gitignore {
+        let builder = GitignoreBuilder::new("");
+        builder.build().unwrap()
+    }
+
+    /// The directory this matcher is rooted at.
+    ///
+    /// All matches are done relative to this path.
+    pub fn path(&self) -> &Path {
+        self.root.as_path()
+    }
+
+    /// True if and only if this matcher holds zero globs and so can never
+    /// match any file path.
+    pub fn is_empty(&self) -> bool {
+        self.set.is_empty()
+    }
+
+    /// The total number of globs, which should be equivalent to
+    /// `num_ignores + num_whitelists`.
+    pub fn len(&self) -> usize {
+        self.set.len()
+    }
+
+    /// The number of ignore (non-whitelist) globs.
+    pub fn num_ignores(&self) -> u64 {
+        self.num_ignores
+    }
+
+    /// The number of whitelist globs.
+    pub fn num_whitelists(&self) -> u64 {
+        self.num_whitelists
+    }
+
+    /// Reports whether `path` matches a glob in this matcher, and if so
+    /// which glob and whether it ignores or whitelists the path.
+    ///
+    /// `is_dir` should be true if the path refers to a directory and false
+    /// otherwise.
+    ///
+    /// The path is matched relative to the root this matcher was built
+    /// with: a leading `./` is dropped and, unless the path is a bare file
+    /// name, the matcher's root is removed from the front of the path when
+    /// it is a prefix. A path that does not start with the root is assumed
+    /// to already be relative to this matcher.
+    pub fn matched<P: AsRef<Path>>(
+        &self,
+        path: P,
+        is_dir: bool,
+    ) -> Match<&Glob> {
+        if self.is_empty() {
+            Match::None
+        } else {
+            let stripped = self.strip(path.as_ref());
+            self.matched_stripped(stripped, is_dir)
+        }
+    }
+
+    /// Like `matched`, but for a path that has already been made relative
+    /// to this matcher's root.
+    fn matched_stripped<P: AsRef<Path>>(
+        &self,
+        path: P,
+        is_dir: bool,
+    ) -> Match<&Glob> {
+        if self.is_empty() {
+            return Match::None;
+        }
+        let scratch = self.matches.get_default();
+        let mut hits = scratch.borrow_mut();
+        let candidate = Candidate::new(path.as_ref());
+        self.set.matches_candidate_into(&candidate, &mut *hits);
+        // Walk the matching globs from the last index to the first; the
+        // first one whose directory-only restriction is satisfied decides
+        // the outcome.
+        let winner = hits
+            .iter()
+            .rev()
+            .map(|&i| &self.globs[i])
+            .find(|glob| is_dir || !glob.is_only_dir());
+        match winner {
+            None => Match::None,
+            Some(glob) if glob.is_whitelist() => Match::Whitelist(glob),
+            Some(glob) => Match::Ignore(glob),
+        }
+    }
+
+    /// Makes `path` relative to this matcher's root so that it is suitable
+    /// for matching against the compiled globs.
+    fn strip<'a, P: 'a + AsRef<Path> + ?Sized>(
+        &'a self,
+        path: &'a P,
+    ) -> &'a Path {
+        let candidate = path.as_ref();
+        // A leading ./ carries no information. It is also removed from the
+        // matcher's root when the builder is created, so it has to go from
+        // the candidate as well.
+        let candidate = strip_prefix("./", candidate).unwrap_or(candidate);
+        // A bare file name has no directory components, so there is no
+        // root to remove and part of the name must not be stripped by
+        // accident.
+        if is_file_name(candidate) {
+            return candidate;
+        }
+        match strip_prefix(&self.root, candidate) {
+            None => candidate,
+            // Removing the root can leave a leading slash; drop it too.
+            Some(rest) => strip_prefix("/", rest).unwrap_or(rest),
+        }
+    }
+}
+
+/// Builds a matcher for a single set of globs from a .gitignore file.
+pub struct GitignoreBuilder {
+ // Accumulates the compiled form of every glob added so far.
+ builder: GlobSetBuilder,
+ // The directory that globs are matched relative to (leading `./` removed).
+ root: PathBuf,
+ // Metadata for every glob added so far, pushed in the same order as the
+ // corresponding compiled glob is added to `builder`.
+ globs: Vec<Glob>,
+}
+
+impl GitignoreBuilder {
+    /// Create a new builder for a gitignore file.
+    ///
+    /// The path given should be the path at which the globs for this
+    /// gitignore file should be matched. Note that paths are always matched
+    /// relative to the root path given here. Generally, the root path
+    /// should correspond to the *directory* containing a `.gitignore` file.
+    ///
+    /// A leading `./` on the root is removed.
+    pub fn new<P: AsRef<Path>>(root: P) -> GitignoreBuilder {
+        let root = root.as_ref();
+        GitignoreBuilder {
+            builder: GlobSetBuilder::new(),
+            root: strip_prefix("./", root).unwrap_or(root).to_path_buf(),
+            globs: vec![],
+        }
+    }
+
+    /// Builds a new matcher from the globs added so far.
+    ///
+    /// Once a matcher is built, no new globs can be added to it.
+    ///
+    /// An error is returned if the underlying glob set cannot be compiled.
+    pub fn build(&self) -> Result<Gitignore, Error> {
+        let nignore = self.globs.iter().filter(|g| !g.is_whitelist()).count();
+        let nwhite = self.globs.iter().filter(|g| g.is_whitelist()).count();
+        let set = try!(
+            self.builder.build().map_err(|err| Error::Glob(err.to_string())));
+        Ok(Gitignore {
+            set: set,
+            root: self.root.clone(),
+            globs: self.globs.clone(),
+            num_ignores: nignore as u64,
+            num_whitelists: nwhite as u64,
+            matches: Arc::new(ThreadLocal::default()),
+        })
+    }
+
+    /// Add each glob from the file path given.
+    ///
+    /// The file given should be formatted as a `gitignore` file.
+    ///
+    /// Note that partial errors can be returned. For example, if there was
+    /// a problem adding one glob, an error for that will be returned, but
+    /// all other valid globs will still be added.
+    ///
+    /// A line that is not valid UTF-8 is reported and skipped. Any other
+    /// read error is reported and stops reading the file, since such an
+    /// error may be returned again on every subsequent read.
+    pub fn add<P: AsRef<Path>>(&mut self, path: P) -> Option<Error> {
+        let path = path.as_ref();
+        let file = match File::open(path) {
+            Err(err) => return Some(Error::Io(err).with_path(path)),
+            Ok(file) => file,
+        };
+        let rdr = io::BufReader::new(file);
+        let mut errs = PartialErrorBuilder::default();
+        for (i, line) in rdr.lines().enumerate() {
+            let lineno = (i + 1) as u64;
+            let line = match line {
+                Ok(line) => line,
+                Err(err) => {
+                    // Invalid UTF-8 is reported as `InvalidData` after the
+                    // offending line has been consumed, so the next line
+                    // can still be read. Other errors (for example, the
+                    // path being a directory) may repeat forever, so
+                    // continuing would never terminate.
+                    let skip_line = err.kind() == io::ErrorKind::InvalidData;
+                    errs.push(Error::Io(err).tagged(path, lineno));
+                    if skip_line {
+                        continue;
+                    }
+                    break;
+                }
+            };
+            if let Err(err) = self.add_line(Some(path.to_path_buf()), &line) {
+                errs.push(err.tagged(path, lineno));
+            }
+        }
+        errs.into_error_option()
+    }
+
+    /// Add each glob line from the string given.
+    ///
+    /// If this string came from a particular `gitignore` file, then its path
+    /// should be provided here.
+    ///
+    /// The string given should be formatted as a `gitignore` file.
+    #[cfg(test)]
+    fn add_str(
+        &mut self,
+        from: Option<PathBuf>,
+        gitignore: &str,
+    ) -> Result<&mut GitignoreBuilder, Error> {
+        for line in gitignore.lines() {
+            try!(self.add_line(from.clone(), line));
+        }
+        Ok(self)
+    }
+
+    /// Add a line from a gitignore file to this builder.
+    ///
+    /// If this line came from a particular `gitignore` file, then its path
+    /// should be provided here.
+    ///
+    /// Comment lines (starting with `#`) and lines that are empty after
+    /// trimming trailing whitespace are accepted and add nothing.
+    ///
+    /// If the line could not be parsed as a glob, then an error is returned.
+    pub fn add_line(
+        &mut self,
+        from: Option<PathBuf>,
+        mut line: &str,
+    ) -> Result<&mut GitignoreBuilder, Error> {
+        if line.starts_with("#") {
+            return Ok(self);
+        }
+        // Trailing whitespace is insignificant unless the final space is
+        // escaped with a backslash.
+        if !line.ends_with("\\ ") {
+            line = line.trim_right();
+        }
+        if line.is_empty() {
+            return Ok(self);
+        }
+        let mut glob = Glob {
+            from: from,
+            original: line.to_string(),
+            actual: String::new(),
+            is_whitelist: false,
+            is_only_dir: false,
+        };
+        let mut literal_separator = false;
+        // Set only when an anchoring slash is actually consumed below, so
+        // that a whitelisted anchored glob such as `!/foo` is treated as
+        // absolute as well.
+        let mut is_absolute = false;
+        let has_slash = line.chars().any(|c| c == '/');
+        if line.starts_with("\\!") || line.starts_with("\\#") {
+            // An escaped `!` or `#` is a literal character: drop only the
+            // backslash.
+            line = &line[1..];
+        } else {
+            if line.starts_with("!") {
+                glob.is_whitelist = true;
+                line = &line[1..];
+            }
+            if line.starts_with("/") {
+                // `man gitignore` says that if a glob starts with a slash,
+                // then the glob can only match the beginning of a path
+                // (relative to the location of gitignore). We achieve this
+                // by simply banning wildcards from matching /.
+                literal_separator = true;
+                is_absolute = true;
+                line = &line[1..];
+            }
+        }
+        // If it ends with a slash, then this should only match directories,
+        // but the slash should otherwise not be used while globbing.
+        if let Some((i, c)) = line.char_indices().rev().nth(0) {
+            if c == '/' {
+                glob.is_only_dir = true;
+                line = &line[..i];
+            }
+        }
+        // If there is a literal slash, then we note that so that globbing
+        // doesn't let wildcards match slashes.
+        glob.actual = line.to_string();
+        if has_slash {
+            literal_separator = true;
+        }
+        // If there was a leading slash, then this is a glob that must
+        // match the entire path name. Otherwise, we should let it match
+        // anywhere, so use a **/ prefix.
+        if !is_absolute {
+            // ... but only if we don't already have a **/ prefix.
+            if !glob.actual.starts_with("**/") {
+                glob.actual = format!("**/{}", glob.actual);
+            }
+        }
+        // If the glob ends with `/**`, then we should only match everything
+        // inside a directory, but not the directory itself. Standard globs
+        // will match the directory. So we add `/*` to force the issue.
+        if glob.actual.ends_with("/**") {
+            glob.actual = format!("{}/*", glob.actual);
+        }
+        let parsed = try!(
+            GlobBuilder::new(&glob.actual)
+                .literal_separator(literal_separator)
+                .build()
+                .map_err(|err| Error::Glob(err.to_string())));
+        self.builder.add(parsed);
+        self.globs.push(glob);
+        Ok(self)
+    }
+}
+
+/// Determine where the current environment's global gitignore file lives.
+///
+/// The location configured in git's global config file wins; when there is
+/// none, the default location is used.
+///
+/// Note that the file path returned may not exist.
+fn gitconfig_excludes_path() -> Option<PathBuf> {
+    let configured = match gitconfig_contents() {
+        Some(data) => parse_excludes_file(&data),
+        None => None,
+    };
+    match configured {
+        Some(path) => Some(path),
+        None => excludes_file_default(),
+    }
+}
+
+/// Reads the raw bytes of git's global config file, `$HOME/.gitconfig`.
+///
+/// Returns `None` when `$HOME` is not set, or when the file cannot be
+/// opened or read.
+fn gitconfig_contents() -> Option<Vec<u8>> {
+    let config_path = match env::var_os("HOME") {
+        Some(home) => PathBuf::from(home).join(".gitconfig"),
+        None => return None,
+    };
+    File::open(config_path).ok().and_then(|file| {
+        let mut rdr = io::BufReader::new(file);
+        let mut contents = vec![];
+        rdr.read_to_end(&mut contents).ok().map(|_| contents)
+    })
+}
+
+/// Returns the default file path for a global .gitignore file.
+///
+/// This is `$XDG_CONFIG_HOME/git/ignore` when `XDG_CONFIG_HOME` is set and
+/// non-empty, and `$HOME/.config/git/ignore` otherwise. Returns `None` when
+/// neither variable yields a base directory.
+fn excludes_file_default() -> Option<PathBuf> {
+    env::var_os("XDG_CONFIG_HOME")
+        .and_then(|x| if x.is_empty() { None } else { Some(PathBuf::from(x)) })
+        // Without XDG_CONFIG_HOME the config directory is `$HOME/.config`,
+        // not `$HOME` itself.
+        .or_else(|| {
+            env::var_os("HOME").map(|home| PathBuf::from(home).join(".config"))
+        })
+        .map(|config_dir| config_dir.join("git/ignore"))
+}
+
+/// Extract git's `core.excludesfile` config setting from the raw file
+/// contents given.
+///
+/// The first line of the form `excludesfile = <value>` (matched
+/// case-insensitively) is used. Trailing whitespace on the value, including
+/// the `\r` of a CRLF line ending, is discarded, and the result is passed
+/// through `expand_tilde`. Returns `None` when there is no such line or the
+/// value is not valid UTF-8.
+fn parse_excludes_file(data: &[u8]) -> Option<PathBuf> {
+    // N.B. This is the lazy approach, and isn't technically correct, but
+    // probably works in more circumstances. I guess we would ideally have
+    // a full INI parser. Yuck.
+    lazy_static! {
+        static ref RE: Regex = Regex::new(
+            r"(?ium)^\s*excludesfile\s*=\s*(.+)\s*$").unwrap();
+    };
+    let caps = match RE.captures(data) {
+        None => return None,
+        Some(caps) => caps,
+    };
+    // The greedy `(.+)` group swallows everything up to the end of the
+    // line, so trailing whitespace has to be removed here.
+    str::from_utf8(&caps[1])
+        .ok()
+        .map(|s| PathBuf::from(expand_tilde(s.trim_right())))
+}
+
+/// Expands a leading `~` in a file path to the value of `$HOME`.
+///
+/// Only a path that is exactly `~` or that starts with `~/` is expanded. A
+/// tilde anywhere else (for example in `backup~` or `~user/x`) is left
+/// untouched. If `$HOME` is unset or not valid Unicode, the path is
+/// returned unchanged.
+fn expand_tilde(path: &str) -> String {
+    let home = match env::var("HOME") {
+        Err(_) => return path.to_string(),
+        Ok(home) => home,
+    };
+    if path == "~" {
+        home
+    } else if path.starts_with("~/") {
+        // Keep the slash and everything after it.
+        format!("{}{}", home, &path[1..])
+    } else {
+        path.to_string()
+    }
+}
+
+// Unit tests for gitignore glob matching and for parsing the
+// `excludesfile` setting out of git config data.
+#[cfg(test)]
+mod tests {
+ use std::path::Path;
+ use super::{Gitignore, GitignoreBuilder};
+
+ // Builds a matcher rooted at `root` from gitignore-formatted text `s`,
+ // panicking if any line fails to parse or the matcher fails to build.
+ fn gi_from_str<P: AsRef<Path>>(root: P, s: &str) -> Gitignore {
+ let mut builder = GitignoreBuilder::new(root);
+ builder.add_str(None, s).unwrap();
+ builder.build().unwrap()
+ }
+
+ // Defines a test named `$name` asserting that the gitignore text `$gi`,
+ // rooted at `$root`, ignores `$path`. `$is_dir` defaults to false.
+ macro_rules! ignored {
+ ($name:ident, $root:expr, $gi:expr, $path:expr) => {
+ ignored!($name, $root, $gi, $path, false);
+ };
+ ($name:ident, $root:expr, $gi:expr, $path:expr, $is_dir:expr) => {
+ #[test]
+ fn $name() {
+ let gi = gi_from_str($root, $gi);
+ assert!(gi.matched($path, $is_dir).is_ignore());
+ }
+ };
+ }
+
+ // Defines a test named `$name` asserting that the gitignore text `$gi`,
+ // rooted at `$root`, does NOT report `$path` as ignored (the result may be
+ // no match or a whitelist match). `$is_dir` defaults to false.
+ macro_rules! not_ignored {
+ ($name:ident, $root:expr, $gi:expr, $path:expr) => {
+ not_ignored!($name, $root, $gi, $path, false);
+ };
+ ($name:ident, $root:expr, $gi:expr, $path:expr, $is_dir:expr) => {
+ #[test]
+ fn $name() {
+ let gi = gi_from_str($root, $gi);
+ assert!(!gi.matched($path, $is_dir).is_ignore());
+ }
+ };
+ }
+
+ // Root directory used by most of the cases below.
+ const ROOT: &'static str = "/home/foobar/rust/rg";
+
+ // Cases where the path must be reported as ignored.
+ ignored!(ig1, ROOT, "months", "months");
+ ignored!(ig2, ROOT, "*.lock", "Cargo.lock");
+ ignored!(ig3, ROOT, "*.rs", "src/main.rs");
+ ignored!(ig4, ROOT, "src/*.rs", "src/main.rs");
+ ignored!(ig5, ROOT, "/*.c", "cat-file.c");
+ ignored!(ig6, ROOT, "/src/*.rs", "src/main.rs");
+ ignored!(ig7, ROOT, "!src/main.rs\n*.rs", "src/main.rs");
+ ignored!(ig8, ROOT, "foo/", "foo", true);
+ ignored!(ig9, ROOT, "**/foo", "foo");
+ ignored!(ig10, ROOT, "**/foo", "src/foo");
+ ignored!(ig11, ROOT, "**/foo/**", "src/foo/bar");
+ ignored!(ig12, ROOT, "**/foo/**", "wat/src/foo/bar/baz");
+ ignored!(ig13, ROOT, "**/foo/bar", "foo/bar");
+ ignored!(ig14, ROOT, "**/foo/bar", "src/foo/bar");
+ ignored!(ig15, ROOT, "abc/**", "abc/x");
+ ignored!(ig16, ROOT, "abc/**", "abc/x/y");
+ ignored!(ig17, ROOT, "abc/**", "abc/x/y/z");
+ ignored!(ig18, ROOT, "a/**/b", "a/b");
+ ignored!(ig19, ROOT, "a/**/b", "a/x/b");
+ ignored!(ig20, ROOT, "a/**/b", "a/x/y/b");
+ ignored!(ig21, ROOT, r"\!xy", "!xy");
+ ignored!(ig22, ROOT, r"\#foo", "#foo");
+ ignored!(ig23, ROOT, "foo", "./foo");
+ ignored!(ig24, ROOT, "target", "grep/target");
+ ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock");
+ ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz");
+ ignored!(ig27, ROOT, "foo/", "xyz/foo", true);
+ ignored!(ig28, ROOT, "src/*.rs", "src/grep/src/main.rs");
+ ignored!(ig29, "./src", "/llvm/", "./src/llvm", true);
+ ignored!(ig30, ROOT, "node_modules/ ", "node_modules", true);
+
+ // Cases where the path must not be reported as ignored.
+ not_ignored!(ignot1, ROOT, "amonths", "months");
+ not_ignored!(ignot2, ROOT, "monthsa", "months");
+ not_ignored!(ignot3, ROOT, "/src/*.rs", "src/grep/src/main.rs");
+ not_ignored!(ignot4, ROOT, "/*.c", "mozilla-sha1/sha1.c");
+ not_ignored!(ignot5, ROOT, "/src/*.rs", "src/grep/src/main.rs");
+ not_ignored!(ignot6, ROOT, "*.rs\n!src/main.rs", "src/main.rs");
+ not_ignored!(ignot7, ROOT, "foo/", "foo", false);
+ not_ignored!(ignot8, ROOT, "**/foo/**", "wat/src/afoo/bar/baz");
+ not_ignored!(ignot9, ROOT, "**/foo/**", "wat/src/fooa/bar/baz");
+ not_ignored!(ignot10, ROOT, "**/foo/bar", "foo/src/bar");
+ not_ignored!(ignot11, ROOT, "#foo", "#foo");
+ not_ignored!(ignot12, ROOT, "\n\n\n", "foo");
+ not_ignored!(ignot13, ROOT, "foo/**", "foo", true);
+ not_ignored!(
+ ignot14, "./third_party/protobuf", "m4/ltoptions.m4",
+ "./third_party/protobuf/csharp/src/packages/repositories.config");
+
+ // Converts a string into the raw bytes `parse_excludes_file` expects.
+ fn bytes(s: &str) -> Vec<u8> {
+ s.to_string().into_bytes()
+ }
+
+ // Renders a path as a `String`, panicking if it is not valid UTF-8.
+ fn path_string<P: AsRef<Path>>(path: P) -> String {
+ path.as_ref().to_str().unwrap().to_string()
+ }
+
+ // An absolute `excludesFile` value is returned as-is.
+ #[test]
+ fn parse_excludes_file1() {
+ let data = bytes("[core]\nexcludesFile = /foo/bar");
+ let got = super::parse_excludes_file(&data).unwrap();
+ assert_eq!(path_string(got), "/foo/bar");
+ }
+
+ // A value with a leading `~` goes through `expand_tilde`.
+ #[test]
+ fn parse_excludes_file2() {
+ let data = bytes("[core]\nexcludesFile = ~/foo/bar");
+ let got = super::parse_excludes_file(&data).unwrap();
+ assert_eq!(path_string(got), super::expand_tilde("~/foo/bar"));
+ }
+
+ // A misspelled key (`excludeFile`) must not be picked up.
+ #[test]
+ fn parse_excludes_file3() {
+ let data = bytes("[core]\nexcludeFile = /foo/bar");
+ assert!(super::parse_excludes_file(&data).is_none());
+ }
+
+ // A whitespace-only line must be accepted without panicking.
+ // See: https://github.com/BurntSushi/ripgrep/issues/106
+ #[test]
+ fn regression_106() {
+ gi_from_str("/", " ");
+ }
+}