From 0e46171e3b189b2bd89f45c3a492dea36361d8bc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2016 22:06:04 -0400 Subject: Rework glob sets. We try to reduce the pressure on regexes and offload some of it to Aho-Corasick or exact lookups. --- Cargo.lock | 7 + Cargo.toml | 1 + benchsuite | 26 ++-- src/gitignore.rs | 51 +++---- src/glob.rs | 419 +++++++++++++++++++++++++++++++++++++++++++++++++------ src/ignore.rs | 3 +- src/main.rs | 7 +- src/pathutil.rs | 80 +++++++++++ src/types.rs | 12 +- 9 files changed, 517 insertions(+), 89 deletions(-) create mode 100644 src/pathutil.rs diff --git a/Cargo.lock b/Cargo.lock index dc9cac03..96da64f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,6 +5,7 @@ dependencies = [ "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "grep 0.1.0", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -57,6 +58,11 @@ dependencies = [ "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "fnv" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "fs2" version = "0.2.5" @@ -230,6 +236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf" "checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" 
+"checksum fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8e8af7b5408ab0c4910cad114c8f9eb454bf75df7afe8964307eeafb68a13a5e" "checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" diff --git a/Cargo.toml b/Cargo.toml index 28d941c0..b77d621e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ path = "tests/tests.rs" deque = "0.3" docopt = "0.6" env_logger = "0.3" +fnv = "1.0" grep = { version = "0.1", path = "grep" } lazy_static = "0.2" libc = "0.2" diff --git a/benchsuite b/benchsuite index d6ad1aa5..82bb31df 100755 --- a/benchsuite +++ b/benchsuite @@ -64,7 +64,9 @@ def bench_linux_literal_default(suite_dir): # doesn't read gitignore files. Instead, it has a file whitelist # that happens to match up exactly with the gitignores for this search. mkcmd('ucg', ['ucg', pat]), - mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}), + # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the + # default, but I'd guess it to be on most desktop systems. + mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}), mkcmd('pt', ['pt', pat]), # sift reports an extra line here for a binary file matched. 
mkcmd('sift', ['sift', pat]), @@ -89,11 +91,10 @@ def bench_linux_literal(suite_dir): return Benchmark(pattern=pat, commands=[ mkcmd('rg', ['rg', '-n', pat]), - mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), - mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]), - mkcmd('ag', ['ag', '-s', pat]), - mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]), - mkcmd('ucg', ['ucg', '--nosmart-case', pat]), + mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]), + mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), + mkcmd('ag (mmap)', ['ag', '-s', pat]), + mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), mkcmd('git grep', [ 'git', 'grep', '-I', '-n', pat, ], env={'LC_ALL': 'C'}), @@ -121,13 +122,16 @@ def bench_linux_literal_casei(suite_dir): return Benchmark(pattern=pat, commands=[ mkcmd('rg', ['rg', '-n', '-i', pat]), - mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]), - mkcmd('rg-novcs-mmap', [ - 'rg', '--mmap', '--no-ignore', '-n', '-i', pat, + mkcmd('rg (mmap)', ['rg', '-n', '-i', pat]), + mkcmd('rg (whitelist)', [ + 'rg', '-n', '-i', '--no-ignore', '-tall', pat, ]), - mkcmd('ag', ['ag', '-i', pat]), - mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]), + mkcmd('ag (mmap)', ['ag', '-i', pat]), mkcmd('ucg', ['ucg', '-i', pat]), + # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here, + # since that is certainly what ripgrep is doing, but this is for an + # ASCII literal, so we should give `git grep` all the opportunity to + # do its best. mkcmd('git grep', [ 'git', 'grep', '-I', '-n', '-i', pat, ], env={'LC_ALL': 'C'}), diff --git a/src/gitignore.rs b/src/gitignore.rs index 31a16ea6..cac42a94 100644 --- a/src/gitignore.rs +++ b/src/gitignore.rs @@ -21,6 +21,7 @@ additional rules such as whitelists (prefix of `!`) or directory-only globs // TODO(burntsushi): Implement something similar, but for Mercurial. We can't // use this exact implementation because hgignore files are different. 
+use std::cell::RefCell; use std::error::Error as StdError; use std::fmt; use std::fs::File; @@ -30,6 +31,7 @@ use std::path::{Path, PathBuf}; use regex; use glob; +use pathutil::strip_prefix; /// Represents an error that can occur when parsing a gitignore file. #[derive(Debug)] @@ -110,37 +112,35 @@ impl Gitignore { /// same directory as this gitignore file. pub fn matched>(&self, path: P, is_dir: bool) -> Match { let mut path = path.as_ref(); - if let Ok(p) = path.strip_prefix(&self.root) { + if let Some(p) = strip_prefix("./", path) { path = p; } - self.matched_utf8(&*path.to_string_lossy(), is_dir) + if let Some(p) = strip_prefix(&self.root, path) { + path = p; + } + self.matched_stripped(path, is_dir) } - /// Like matched, but takes a path that has already been stripped and - /// converted to UTF-8. - pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match { - // A single regex with a bunch of alternations of glob patterns is - // unfortunately typically faster than a regex, so we use it as a - // first pass filter. We still need to run the RegexSet to get the most - // recently defined glob that matched. - if !self.set.is_match(path) { - return Match::None; + /// Like matched, but takes a path that has already been stripped. + pub fn matched_stripped(&self, path: &Path, is_dir: bool) -> Match { + thread_local! { + static MATCHES: RefCell> = RefCell::new(vec![]); } - // The regex set can't actually pick the right glob that matched all - // on its own. In particular, some globs require that only directories - // can match. Thus, only accept a match from the regex set if the given - // path satisfies the corresponding glob's directory criteria. 
- for i in self.set.matches(path).iter().rev() { - let pat = &self.patterns[i]; - if !pat.only_dir || is_dir { - return if pat.whitelist { - Match::Whitelist(pat) - } else { - Match::Ignored(pat) - }; + MATCHES.with(|matches| { + let mut matches = matches.borrow_mut(); + self.set.matches_into(path, &mut *matches); + for &i in matches.iter().rev() { + let pat = &self.patterns[i]; + if !pat.only_dir || is_dir { + return if pat.whitelist { + Match::Whitelist(pat) + } else { + Match::Ignored(pat) + }; + } } - } - Match::None + Match::None + }) } /// Returns the total number of ignore patterns. @@ -390,6 +390,7 @@ mod tests { ignored!(ig23, ROOT, "foo", "./foo"); ignored!(ig24, ROOT, "target", "grep/target"); ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock"); + ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz"); not_ignored!(ignot1, ROOT, "amonths", "months"); not_ignored!(ignot2, ROOT, "monthsa", "months"); diff --git a/src/glob.rs b/src/glob.rs index f16a75e4..c5b5d8d3 100644 --- a/src/glob.rs +++ b/src/glob.rs @@ -26,13 +26,22 @@ to make its way into `glob` proper. // at the .gitignore for the chromium repo---just about every pattern satisfies // that assumption.) +use std::borrow::Cow; +use std::collections::HashMap; use std::error::Error as StdError; +use std::ffi::{OsStr, OsString}; use std::fmt; +use std::hash; use std::iter; +use std::path::Path; use std::str; +use fnv; use regex; -use regex::bytes::{Regex, RegexSet, SetMatches}; +use regex::bytes::Regex; +use regex::bytes::RegexSet; + +use pathutil::file_name; /// Represents an error that can occur when parsing a glob pattern. #[derive(Clone, Debug, Eq, PartialEq)] @@ -71,33 +80,165 @@ impl fmt::Display for Error { } } -/// Set represents a group of globs that can be matched together in a single -/// pass. +/// SetYesNo represents a group of globs that can be matched together in a +/// single pass. SetYesNo can only determine whether a particular path matched +/// any pattern in the set. 
#[derive(Clone, Debug)] -pub struct Set { +pub struct SetYesNo { re: Regex, - set: RegexSet, } -impl Set { +impl SetYesNo { /// Returns true if and only if the given path matches at least one glob /// in this set. - pub fn is_match>(&self, path: T) -> bool { - self.re.is_match(path.as_ref()) + pub fn is_match>(&self, path: T) -> bool { + self.re.is_match(&*path_bytes(path.as_ref())) } - /// Returns every glob pattern (by sequence number) that matches the given - /// path. - pub fn matches>(&self, path: T) -> SetMatches { - // TODO(burntsushi): If we split this out into a separate crate, don't - // expose the regex::SetMatches type in the public API. - self.set.matches(path.as_ref()) + fn new( + pats: &[(Pattern, MatchOptions)], + ) -> Result { + let mut joined = String::new(); + for &(ref p, ref o) in pats { + let part = format!("(?:{})", p.to_regex_with(o)); + if !joined.is_empty() { + joined.push('|'); + } + joined.push_str(&part); + } + Ok(SetYesNo { re: try!(Regex::new(&joined)) }) } +} + +type Fnv = hash::BuildHasherDefault; + +/// Set represents a group of globs that can be matched together in a single +/// pass. +#[derive(Clone, Debug)] +pub struct Set { + yesno: SetYesNo, + exts: HashMap, Fnv>, + literals: HashMap, Vec, Fnv>, + base_literals: HashMap, Vec, Fnv>, + base_prefixes: Vec>, + base_prefixes_map: Vec, + base_suffixes: Vec>, + base_suffixes_map: Vec, + regexes: RegexSet, + regexes_map: Vec, +} - /// Returns the number of glob patterns in this set. +impl Set { + /// Returns the sequence number of every glob pattern that matches the + /// given path. #[allow(dead_code)] - pub fn len(&self) -> usize { - self.set.len() + pub fn matches>(&self, path: T) -> Vec { + let mut into = vec![]; + self.matches_into(path, &mut into); + into + } + + /// Adds the sequence number of every glob pattern that matches the given + /// path to the vec given. 
+ pub fn matches_into>( + &self, + path: T, + into: &mut Vec, + ) { + into.clear(); + let path = path.as_ref(); + let path_bytes = &*path_bytes(path); + let basename = file_name(path).map(|b| os_str_bytes(b)); + if !self.yesno.is_match(path) { + return; + } + if !self.exts.is_empty() { + if let Some(ext) = path.extension() { + if let Some(matches) = self.exts.get(ext) { + into.extend(matches.as_slice()); + } + } + } + if !self.literals.is_empty() { + if let Some(matches) = self.literals.get(path_bytes) { + into.extend(matches.as_slice()); + } + } + if !self.base_literals.is_empty() { + if let Some(ref basename) = basename { + if let Some(matches) = self.base_literals.get(&**basename) { + into.extend(matches.as_slice()); + } + } + } + if !self.base_prefixes.is_empty() { + if let Some(ref basename) = basename { + let basename = &**basename; + for (i, pre) in self.base_prefixes.iter().enumerate() { + if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] { + into.push(self.base_prefixes_map[i]); + } + } + } + } + if !self.base_suffixes.is_empty() { + if let Some(ref basename) = basename { + let basename = &**basename; + for (i, suf) in self.base_suffixes.iter().enumerate() { + if suf.len() > basename.len() { + continue; + } + let (s, e) = (basename.len() - suf.len(), basename.len()); + if &**suf == &basename[s..e] { + into.push(self.base_suffixes_map[i]); + } + } + } + } + into.extend(self.regexes.matches(path_bytes)); + into.sort(); + } + + fn new(pats: &[(Pattern, MatchOptions)]) -> Result { + let fnv = Fnv::default(); + let mut exts = HashMap::with_hasher(fnv.clone()); + let mut literals = HashMap::with_hasher(fnv.clone()); + let mut base_literals = HashMap::with_hasher(fnv.clone()); + let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]); + let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]); + let (mut regexes, mut regexes_map) = (vec![], vec![]); + for (i, &(ref p, ref o)) in pats.iter().enumerate() { + if let Some(ext) = 
p.ext() { + exts.entry(ext).or_insert(vec![]).push(i); + } else if let Some(literal) = p.literal() { + literals.entry(literal.into_bytes()).or_insert(vec![]).push(i); + } else if let Some(literal) = p.base_literal() { + base_literals + .entry(literal.into_bytes()).or_insert(vec![]).push(i); + } else if let Some(literal) = p.base_literal_prefix() { + base_prefixes.push(literal.into_bytes()); + base_prefixes_map.push(i); + } else if let Some(literal) = p.base_literal_suffix() { + base_suffixes.push(literal.into_bytes()); + base_suffixes_map.push(i); + } else { + let part = format!("(?:{})", p.to_regex_with(o)); + regexes.push(part); + regexes_map.push(i); + } + } + Ok(Set { + yesno: try!(SetYesNo::new(pats)), + exts: exts, + literals: literals, + base_literals: base_literals, + base_prefixes: base_prefixes, + base_prefixes_map: base_prefixes_map, + base_suffixes: base_suffixes, + base_suffixes_map: base_suffixes_map, + regexes: try!(RegexSet::new(regexes)), + regexes_map: regexes_map, + }) } } @@ -119,19 +260,12 @@ impl SetBuilder { /// /// Once a matcher is built, no new patterns can be added to it. pub fn build(&self) -> Result { - let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o)); - let set = try!(RegexSet::new(it)); + Set::new(&self.pats) + } - let mut joined = String::new(); - for &(ref p, ref o) in &self.pats { - let part = format!("(?:{})", p.to_regex_with(o)); - if !joined.is_empty() { - joined.push('|'); - } - joined.push_str(&part); - } - let re = try!(Regex::new(&joined)); - Ok(Set { re: re, set: set }) + /// Like `build`, but returns a matcher that can only answer yes/no. + pub fn build_yesno(&self) -> Result { + SetYesNo::new(&self.pats) } /// Add a new pattern to this set. 
@@ -149,8 +283,21 @@ impl SetBuilder { pat: &str, opts: &MatchOptions, ) -> Result<(), Error> { - let pat = try!(Pattern::new(pat)); - self.pats.push((pat, opts.clone())); + let parsed = try!(Pattern::new(pat)); + // if let Some(ext) = parsed.ext() { + // eprintln!("ext :: {:?} :: {:?}", ext, pat); + // } else if let Some(lit) = parsed.literal() { + // eprintln!("literal :: {:?} :: {:?}", lit, pat); + // } else if let Some(lit) = parsed.base_literal() { + // eprintln!("base_literal :: {:?} :: {:?}", lit, pat); + // } else if let Some(lit) = parsed.base_literal_prefix() { + // eprintln!("base_literal :: {:?} :: {:?}", lit, pat); + // } else if let Some(lit) = parsed.base_literal_suffix() { + // eprintln!("base_literal :: {:?} :: {:?}", lit, pat); + // } else { + // eprintln!("regex :: {:?} :: {:?}", pat, parsed); + // } + self.pats.push((parsed, opts.clone())); Ok(()) } } @@ -204,6 +351,114 @@ impl Pattern { Ok(p.p) } + /// Returns an extension if this pattern exclusively matches it. + pub fn ext(&self) -> Option { + if self.tokens.len() <= 3 { + return None; + } + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + match self.tokens.get(1) { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + match self.tokens.get(2) { + Some(&Token::Literal(c)) if c == '.' => {} + _ => return None, + } + let mut lit = OsString::new(); + for t in self.tokens[3..].iter() { + match *t { + Token::Literal(c) if c == '/' || c == '\\' || c == '.' => { + return None; + } + Token::Literal(c) => lit.push(c.to_string()), + _ => return None, + } + } + Some(lit) + } + + /// Returns the pattern as a literal if and only if the pattern exclusively + /// matches the basename of a file path *and* is a literal. + /// + /// The basic format of these patterns is `**/{literal}`, where `{literal}` + /// does not contain a path separator. 
+ pub fn base_literal(&self) -> Option { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[1..] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return None, + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns the pattern as a literal if and only if the pattern must match + /// an entire path exactly. + /// + /// The basic format of these patterns is `{literal}`. + pub fn literal(&self) -> Option { + let mut lit = String::new(); + for t in &self.tokens { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a basename literal prefix of this pattern. + pub fn base_literal_prefix(&self) -> Option { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + match self.tokens.last() { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[1..self.tokens.len()-1] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return None, + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a basename literal suffix of this pattern. + pub fn base_literal_suffix(&self) -> Option { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + match self.tokens.get(1) { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[2..] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return None, + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + /// Convert this pattern to a string that is guaranteed to be a valid /// regular expression and will represent the matching semantics of this /// glob pattern. This uses a default set of options. 
@@ -415,13 +670,34 @@ impl<'a> Parser<'a> { } } +fn path_bytes(path: &Path) -> Cow<[u8]> { + os_str_bytes(path.as_os_str()) +} + +#[cfg(unix)] +fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { + use std::os::unix::ffi::OsStrExt; + Cow::Borrowed(s.as_bytes()) +} + +#[cfg(not(unix))] +fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { + // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even + // if we could get at the raw bytes, they wouldn't be useful. We *must* + // convert to UTF-8 before doing path matching. Unfortunate, but necessary. + match s.to_string_lossy() { + Cow::Owned(s) => Cow::Owned(s.into_bytes()), + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + } +} + #[cfg(test)] mod tests { use std::path::Path; use regex::bytes::Regex; - use super::{Error, Pattern, MatchOptions, SetBuilder, Token}; + use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token}; use super::Token::*; macro_rules! syntax { @@ -491,6 +767,37 @@ mod tests { }; } + macro_rules! ext { + ($name:ident, $pat:expr, $ext:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + let ext = pat.ext().map(|e| e.to_string_lossy().into_owned()); + assert_eq!($ext, ext.as_ref().map(|s| &**s)); + } + }; + } + + macro_rules! baseliteral { + ($name:ident, $pat:expr, $yes:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + assert_eq!($yes, pat.base_literal().is_some()); + } + }; + } + + macro_rules! 
basesuffix { + ($name:ident, $pat:expr, $yes:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + assert_eq!($yes, pat.is_literal_suffix()); + } + }; + } + fn class(s: char, e: char) -> Token { Class { negated: false, ranges: vec![(s, e)] } } @@ -585,6 +892,26 @@ mod tests { toregex!(re10, "+", r"^\+$"); toregex!(re11, "**", r"^.*$"); + ext!(ext1, "**/*.rs", Some("rs")); + + baseliteral!(lit1, "**", true); + baseliteral!(lit2, "**/a", true); + baseliteral!(lit3, "**/ab", true); + baseliteral!(lit4, "**/a*b", false); + baseliteral!(lit5, "z/**/a*b", false); + baseliteral!(lit6, "[ab]", false); + baseliteral!(lit7, "?", false); + + /* + issuffix!(suf1, "", false); + issuffix!(suf2, "a", true); + issuffix!(suf3, "ab", true); + issuffix!(suf4, "*ab", true); + issuffix!(suf5, "*.ab", true); + issuffix!(suf6, "?.ab", true); + issuffix!(suf7, "ab*", false); + */ + matches!(match1, "a", "a"); matches!(match2, "a*b", "a_b"); matches!(match3, "a*b*c", "abc"); @@ -681,16 +1008,22 @@ mod tests { builder.add("src/lib.rs").unwrap(); let set = builder.build().unwrap(); - assert!(set.is_match("foo.c")); - assert!(set.is_match("src/foo.c")); - assert!(!set.is_match("foo.rs")); - assert!(!set.is_match("tests/foo.rs")); - assert!(set.is_match("src/foo.rs")); - assert!(set.is_match("src/grep/src/main.rs")); - - assert_eq!(2, set.matches("src/lib.rs").iter().count()); - assert!(set.matches("src/lib.rs").matched(0)); - assert!(!set.matches("src/lib.rs").matched(1)); - assert!(set.matches("src/lib.rs").matched(2)); + fn is_match(set: &Set, s: &str) -> bool { + let mut matches = vec![]; + set.matches_into(s, &mut matches); + !matches.is_empty() + } + + assert!(is_match(&set, "foo.c")); + assert!(is_match(&set, "src/foo.c")); + assert!(!is_match(&set, "foo.rs")); + assert!(!is_match(&set, "tests/foo.rs")); + assert!(is_match(&set, "src/foo.rs")); + assert!(is_match(&set, "src/grep/src/main.rs")); + + let matches = set.matches("src/lib.rs"); + assert_eq!(2, 
matches.len()); + assert_eq!(0, matches[0]); + assert_eq!(2, matches[1]); } } diff --git a/src/ignore.rs b/src/ignore.rs index 09c0b302..10a81886 100644 --- a/src/ignore.rs +++ b/src/ignore.rs @@ -365,8 +365,7 @@ impl Overrides { let path = path.as_ref(); self.gi.as_ref() .map(|gi| { - let path = &*path.to_string_lossy(); - let mat = gi.matched_utf8(path, is_dir).invert(); + let mat = gi.matched_stripped(path, is_dir).invert(); if mat.is_none() && !is_dir { if gi.num_ignores() > 0 { return Match::Ignored(&self.unmatched_pat); diff --git a/src/main.rs b/src/main.rs index aa51ec79..936e4965 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ extern crate deque; extern crate docopt; extern crate env_logger; +extern crate fnv; extern crate grep; #[cfg(windows)] extern crate kernel32; @@ -36,6 +37,7 @@ use walkdir::DirEntry; use args::Args; use out::{ColoredTerminal, Out}; +use pathutil::strip_prefix; use printer::Printer; use search_stream::InputBuffer; #[cfg(windows)] @@ -60,6 +62,7 @@ mod gitignore; mod glob; mod ignore; mod out; +mod pathutil; mod printer; mod search_buffer; mod search_stream; @@ -257,7 +260,7 @@ impl Worker { } WorkReady::DirFile(ent, file) => { let mut path = ent.path(); - if let Ok(p) = path.strip_prefix("./") { + if let Some(p) = strip_prefix("./", path) { path = p; } if self.args.mmap() { @@ -268,7 +271,7 @@ impl Worker { } WorkReady::PathFile(path, file) => { let mut path = &*path; - if let Ok(p) = path.strip_prefix("./") { + if let Some(p) = strip_prefix("./", path) { path = p; } if self.args.mmap() { diff --git a/src/pathutil.rs b/src/pathutil.rs new file mode 100644 index 00000000..e92416e5 --- /dev/null +++ b/src/pathutil.rs @@ -0,0 +1,80 @@ +/*! +The pathutil module provides platform specific operations on paths that are +typically faster than the same operations as provided in std::path. In +particular, we really want to avoid the costly operation of parsing the path +into its constituent components. 
We give up on Windows, but on Unix, we deal +with the raw bytes directly. + +On large repositories (like chromium), this can have a ~25% performance +improvement on just listing the files to search (!). +*/ +use std::ffi::OsStr; +use std::path::Path; + +/// Strip `prefix` from the `path` and return the remainder. +/// +/// If `path` doesn't have a prefix `prefix`, then return `None`. +#[cfg(unix)] +pub fn strip_prefix<'a, P: AsRef>( + prefix: P, + path: &'a Path, +) -> Option<&'a Path> { + use std::os::unix::ffi::OsStrExt; + + let prefix = prefix.as_ref().as_os_str().as_bytes(); + let path = path.as_os_str().as_bytes(); + if prefix.len() > path.len() || prefix != &path[0..prefix.len()] { + None + } else { + Some(&Path::new(OsStr::from_bytes(&path[prefix.len()..]))) + } +} + +/// Strip `prefix` from the `path` and return the remainder. +/// +/// If `path` doesn't have a prefix `prefix`, then return `None`. +#[cfg(not(unix))] +pub fn strip_prefix<'a>(prefix: &Path, path: &'a Path) -> Option<&'a Path> { + path.strip_prefix(prefix).ok() +} + +/// The final component of the path, if it is a normal file. +/// +/// If the path terminates in ., .., or consists solely of a root or prefix, +/// file_name will return None. +#[cfg(unix)] +pub fn file_name<'a, P: AsRef + ?Sized>( + path: &'a P, +) -> Option<&'a OsStr> { + use std::os::unix::ffi::OsStrExt; + + let path = path.as_ref().as_os_str().as_bytes(); + if path.is_empty() { + return None; + } else if path.len() == 1 && path[0] == b'.' { + return None; + } else if path.last() == Some(&b'.') { + return None; + } else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] { + return None; + } + let mut last_slash = 0; + for (i, &b) in path.iter().enumerate().rev() { + if b == b'/' { + last_slash = i; + break; + } + } + Some(OsStr::from_bytes(&path[last_slash + 1..])) +} + +/// The final component of the path, if it is a normal file. 
+/// +/// If the path terminates in ., .., or consists solely of a root or prefix, +/// file_name will return None. +#[cfg(not(unix))] +pub fn file_name<'a, P: AsRef + ?Sized>( + path: &'a P, +) -> Option<&'a OsStr> { + path.as_ref().file_name() +} diff --git a/src/types.rs b/src/types.rs index 6b9aea91..910f8132 100644 --- a/src/types.rs +++ b/src/types.rs @@ -151,8 +151,8 @@ impl FileTypeDef { /// Types is a file type matcher. #[derive(Clone, Debug)] pub struct Types { - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, unmatched_pat: Pattern, } @@ -165,8 +165,8 @@ impl Types { /// If has_selected is true, then at least one file type was selected. /// Therefore, any non-matches should be ignored. fn new( - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, ) -> Types { Types { @@ -268,7 +268,7 @@ impl TypesBuilder { try!(bset.add_with(glob, &opts)); } } - Some(try!(bset.build())) + Some(try!(bset.build_yesno())) }; let negated_globs = if self.negated.is_empty() { @@ -287,7 +287,7 @@ impl TypesBuilder { try!(bset.add_with(glob, &opts)); } } - Some(try!(bset.build())) + Some(try!(bset.build_yesno())) }; Ok(Types::new( selected_globs, negated_globs, !self.selected.is_empty())) -- cgit v1.2.3