summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-10-04 20:22:13 -0400
committerAndrew Gallant <jamslam@gmail.com>2016-10-04 20:28:56 -0400
commit175406df01c704d557715b6f558f1624f9e8aaf9 (patch)
tree83b4edfbf25becc598a4e6aeafece2ed362b931e
parent89811d43d4239053bafef15f9f13acd9e8986bab (diff)
Refactor and test glob sets.
This commit goes a long way toward refactoring glob sets so that the code is easier to maintain going forward. In particular, it makes the literal optimizations that glob sets used a lot more structured and much easier to extend. Tests have also been modified to include glob sets. There's still a bit of polish work left to do before a release. This also fixes the immediate issue where large gitignore files were causing ripgrep to slow way down. While we don't technically fix it for good, we're a lot better about reducing the number of regexes we compile. In particular, if a gitignore file contains thousands of patterns that can't be matched more simply using literals, then ripgrep will slow down again. We could fix this for good by avoiding RegexSet if the number of regexes grows too large. Fixes #134.
-rw-r--r--Cargo.lock2
-rw-r--r--globset/Cargo.toml2
-rw-r--r--globset/src/lib.rs1270
-rw-r--r--globset/src/pathutil.rs96
-rw-r--r--globset/src/pattern.rs1379
-rw-r--r--src/gitignore.rs24
-rw-r--r--src/types.rs31
-rw-r--r--tests/tests.rs1
8 files changed, 1884 insertions, 921 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 21640763..cce72a7b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -82,8 +82,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
name = "globset"
version = "0.1.0"
dependencies = [
+ "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
]
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
index 48e375fb..cf63f397 100644
--- a/globset/Cargo.toml
+++ b/globset/Cargo.toml
@@ -4,7 +4,9 @@ version = "0.1.0"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
[dependencies]
+aho-corasick = "0.5.3"
fnv = "1.0"
lazy_static = "0.2"
+log = "0.3"
memchr = "0.1"
regex = "0.1.77"
diff --git a/globset/src/lib.rs b/globset/src/lib.rs
index b5cbb5be..f608a74a 100644
--- a/globset/src/lib.rs
+++ b/globset/src/lib.rs
@@ -13,40 +13,42 @@ that rigamorole when I wrote this. In particular, it could be fast/good enough
to make its way into `glob` proper.
*/
-// TODO(burntsushi): I'm pretty dismayed by the performance of regex sets
-// here. For example, we do a first pass single-regex-of-all-globs filter
-// before actually running the regex set. This turns out to be faster,
-// especially in fresh checkouts of repos that don't have a lot of ignored
-// files. It's not clear how hard it is to make the regex set faster.
-//
-// An alternative avenue is to stop doing "regex all the things." (Which, to
-// be fair, is pretty fast---I just expected it to be faster.) We could do
-// something clever using assumptions along the lines of "oh, most ignore
-// patterns are either literals or are for ignoring file extensions." (Look
-// at the .gitignore for the chromium repo---just about every pattern satisfies
-// that assumption.)
+#![deny(missing_docs)]
+extern crate aho_corasick;
extern crate fnv;
#[macro_use]
extern crate lazy_static;
+#[macro_use]
+extern crate log;
extern crate memchr;
extern crate regex;
use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError;
use std::ffi::{OsStr, OsString};
use std::fmt;
use std::hash;
-use std::iter;
use std::path::Path;
use std::str;
-use regex::bytes::Regex;
+use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
+use regex::bytes::{Regex, RegexBuilder, RegexSet};
-use pathutil::file_name;
+use pathutil::{file_name, file_name_ext, os_str_bytes, path_bytes};
+use pattern::MatchStrategy;
+pub use pattern::{Pattern, PatternBuilder, PatternMatcher};
mod pathutil;
+mod pattern;
+
+macro_rules! eprintln {
+ ($($tt:tt)*) => {{
+ use std::io::Write;
+ let _ = writeln!(&mut ::std::io::stderr(), $($tt)*);
+ }}
+}
lazy_static! {
static ref FILE_SEPARATORS: String = regex::quote(r"/\");
@@ -55,12 +57,24 @@ lazy_static! {
/// Represents an error that can occur when parsing a glob pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Error {
+ /// Occurs when a use of `**` is invalid. Namely, `**` can only appear
+ /// adjacent to a path separator, or the beginning/end of a glob.
InvalidRecursive,
+ /// Occurs when a character class (e.g., `[abc]`) is not closed.
UnclosedClass,
+ /// Occurs when a range in a character (e.g., `[a-z]`) is invalid. For
+ /// example, if the range starts with a lexicographically larger character
+ /// than it ends with.
InvalidRange(char, char),
+ /// Occurs when a `}` is found without a matching `{`.
UnopenedAlternates,
+ /// Occurs when a `{` is found without a matching `}`.
UnclosedAlternates,
+ /// Occurs when an alternating group is nested inside another alternating
+ /// group, e.g., `{{a,b},{c,d}}`.
NestedAlternates,
+ /// An error associated with parsing or compiling a regex.
+ Regex(String),
}
impl StdError for Error {
@@ -86,6 +100,7 @@ impl StdError for Error {
Error::NestedAlternates => {
"nested alternate groups are not allowed"
}
+ Error::Regex(ref err) => err,
}
}
}
@@ -97,7 +112,8 @@ impl fmt::Display for Error {
| Error::UnclosedClass
| Error::UnopenedAlternates
| Error::UnclosedAlternates
- | Error::NestedAlternates => {
+ | Error::NestedAlternates
+ | Error::Regex(_) => {
write!(f, "{}", self.description())
}
Error::InvalidRange(s, e) => {
@@ -107,34 +123,18 @@ impl fmt::Display for Error {
}
}
-/// SetYesNo represents a group of globs that can be matched together in a
-/// single pass. SetYesNo can only determine whether a particular path matched
-/// any pattern in the set.
-#[derive(Clone, Debug)]
-pub struct SetYesNo {
- re: Regex,
+fn new_regex(pat: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(pat)
+ .dot_matches_new_line(true)
+ .size_limit(10 * (1 << 20))
+ .dfa_size_limit(10 * (1 << 20))
+ .compile()
+ .map_err(|err| Error::Regex(err.to_string()))
}
-impl SetYesNo {
- /// Returns true if and only if the given path matches at least one glob
- /// in this set.
- pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
- self.re.is_match(&*path_bytes(path.as_ref()))
- }
-
- fn new(
- pats: &[(Pattern, MatchOptions)],
- ) -> Result<SetYesNo, regex::Error> {
- let mut joined = String::new();
- for &(ref p, ref o) in pats {
- let part = format!("(?:{})", p.to_regex_with(o));
- if !joined.is_empty() {
- joined.push('|');
- }
- joined.push_str(&part);
- }
- Ok(SetYesNo { re: try!(Regex::new(&joined)) })
- }
+fn new_regex_set<I, S>(pats: I) -> Result<RegexSet, Error>
+ where S: AsRef<str>, I: IntoIterator<Item=S> {
+ RegexSet::new(pats).map_err(|err| Error::Regex(err.to_string()))
}
type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
@@ -143,20 +143,21 @@ type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
/// pass.
#[derive(Clone, Debug)]
pub struct Set {
- exts: HashMap<OsString, Vec<usize>, Fnv>,
- literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
- base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
- base_prefixes: Vec<Vec<u8>>,
- base_prefixes_map: Vec<usize>,
- base_suffixes: Vec<Vec<u8>>,
- base_suffixes_map: Vec<usize>,
- base_regexes: Vec<Regex>,
- base_regexes_map: Vec<usize>,
- regexes: Vec<Regex>,
- regexes_map: Vec<usize>,
+ strats: Vec<SetMatchStrategy>,
}
impl Set {
+ /// Returns true if any glob in this set matches the path given.
+ pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
+ let candidate = Candidate::new(path.as_ref());
+ for strat in &self.strats {
+ if strat.is_match(&candidate) {
+ return true;
+ }
+ }
+ false
+ }
+
/// Returns the sequence number of every glob pattern that matches the
/// given path.
#[allow(dead_code)]
@@ -174,110 +175,67 @@ impl Set {
into: &mut Vec<usize>,
) {
into.clear();
- let path = path.as_ref();
- let path_bytes = &*path_bytes(path);
- let basename = file_name(path).map(|b| os_str_bytes(b));
- if !self.exts.is_empty() {
- if let Some(ext) = path.extension() {
- if let Some(matches) = self.exts.get(ext) {
- into.extend(matches.as_slice());
- }
- }
+ let candidate = Candidate::new(path.as_ref());
+ for strat in &self.strats {
+ strat.matches_into(&candidate, into);
}
- if !self.literals.is_empty() {
- if let Some(matches) = self.literals.get(path_bytes) {
- into.extend(matches.as_slice());
- }
- }
- if !self.base_literals.is_empty() {
- if let Some(ref basename) = basename {
- if let Some(matches) = self.base_literals.get(&**basename) {
- into.extend(matches.as_slice());
+ into.sort();
+ into.dedup();
+ }
+
+ fn new(pats: &[Pattern]) -> Result<Set, Error> {
+ let mut lits = LiteralStrategy::new();
+ let mut base_lits = BasenameLiteralStrategy::new();
+ let mut exts = ExtensionStrategy::new();
+ let mut prefixes = MultiStrategyBuilder::new();
+ let mut suffixes = MultiStrategyBuilder::new();
+ let mut required_exts = RequiredExtensionStrategyBuilder::new();
+ let mut regexes = MultiStrategyBuilder::new();
+ for (i, p) in pats.iter().enumerate() {
+ match MatchStrategy::new(p) {
+ MatchStrategy::Literal(lit) => {
+ lits.add(i, lit);
}
- }
- }
- if !self.base_prefixes.is_empty() {
- if let Some(ref basename) = basename {
- let basename = &**basename;
- for (i, pre) in self.base_prefixes.iter().enumerate() {
- if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
- into.push(self.base_prefixes_map[i]);
- }
+ MatchStrategy::BasenameLiteral(lit) => {
+ base_lits.add(i, lit);
}
- }
- }
- if !self.base_suffixes.is_empty() {
- if let Some(ref basename) = basename {
- let basename = &**basename;
- for (i, suf) in self.base_suffixes.iter().enumerate() {
- if suf.len() > basename.len() {
- continue;
- }
- let (s, e) = (basename.len() - suf.len(), basename.len());
- if &**suf == &basename[s..e] {
- into.push(self.base_suffixes_map[i]);
+ MatchStrategy::Extension(ext) => {
+ exts.add(i, ext);
+ }
+ MatchStrategy::Prefix(prefix) => {
+ prefixes.add(i, prefix);
+ }
+ MatchStrategy::Suffix { suffix, component } => {
+ if component {
+ lits.add(i, suffix[1..].to_string());
}
+ suffixes.add(i, suffix);
}
- }
- }
- if let Some(ref basename) = basename {
- for (i, re) in self.base_regexes.iter().enumerate() {
- if re.is_match(&**basename) {
- into.push(self.base_regexes_map[i]);
+ MatchStrategy::RequiredExtension(ext) => {
+ required_exts.add(i, ext, p.regex().to_owned());
+ }
+ MatchStrategy::Regex => {
+ debug!("glob converted to regex: {:?}", p);
+ regexes.add(i, p.regex().to_owned());
}
}
}
- for (i, re) in self.regexes.iter().enumerate() {
- if re.is_match(path_bytes) {
- into.push(self.regexes_map[i]);
- }
- }
- into.sort();
- }
-
- fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
- let fnv = Fnv::default();
- let mut exts = HashMap::with_hasher(fnv.clone());
- let mut literals = HashMap::with_hasher(fnv.clone());
- let mut base_literals = HashMap::with_hasher(fnv.clone());
- let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
- let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
- let (mut regexes, mut regexes_map) = (vec![], vec![]);
- let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]);
- for (i, &(ref p, ref o)) in pats.iter().enumerate() {
- if let Some(ext) = p.ext() {
- exts.entry(ext).or_insert(vec![]).push(i);
- } else if let Some(literal) = p.literal() {
- literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
- } else if let Some(literal) = p.base_literal() {
- base_literals
- .entry(literal.into_bytes()).or_insert(vec![]).push(i);
- } else if let Some(literal) = p.base_literal_prefix() {
- base_prefixes.push(literal.into_bytes());
- base_prefixes_map.push(i);
- } else if let Some(literal) = p.base_literal_suffix() {
- base_suffixes.push(literal.into_bytes());
- base_suffixes_map.push(i);
- } else if p.is_only_basename() {
- base_regexes.push(try!(Regex::new(&p.to_regex_with(o))));
- base_regexes_map.push(i);
- } else {
- regexes.push(try!(Regex::new(&p.to_regex_with(o))));
- regexes_map.push(i);
- }
- }
+ debug!("built glob set; {} literals, {} basenames, {} extensions, \
+ {} prefixes, {} suffixes, {} required extensions, {} regexes",
+ lits.0.len(), base_lits.0.len(), exts.0.len(),
+ prefixes.literals.len(), suffixes.literals.len(),
+ required_exts.0.len(), regexes.literals.len());
Ok(Set {
- exts: exts,
- literals: literals,
- base_literals: base_literals,
- base_prefixes: base_prefixes,
- base_prefixes_map: base_prefixes_map,
- base_suffixes: base_suffixes,
- base_suffixes_map: base_suffixes_map,
- base_regexes: base_regexes,
- base_regexes_map: base_regexes_map,
- regexes: regexes,
- regexes_map: regexes_map,
+ strats: vec![
+ SetMatchStrategy::Extension(exts),
+ SetMatchStrategy::BasenameLiteral(base_lits),
+ SetMatchStrategy::Literal(lits),
+ SetMatchStrategy::Suffix(suffixes.suffix()),
+ SetMatchStrategy::Prefix(prefixes.prefix()),
+ SetMatchStrategy::RequiredExtension(
+ try!(required_exts.build())),
+ SetMatchStrategy::Regex(try!(regexes.regex_set())),
+ ],
})
}
}
@@ -285,7 +243,7 @@ impl Set {
/// SetBuilder builds a group of patterns that can be used to simultaneously
/// match a file path.
pub struct SetBuilder {
- pats: Vec<(Pattern, MatchOptions)>,
+ pats: Vec<Pattern>,
}
impl SetBuilder {
@@ -299,858 +257,374 @@ impl SetBuilder {
/// Builds a new matcher from all of the glob patterns added so far.
///
/// Once a matcher is built, no new patterns can be added to it.
- pub fn build(&self) -> Result<Set, regex::Error> {
+ pub fn build(&self) -> Result<Set, Error> {
Set::new(&self.pats)
}
- /// Like `build`, but returns a matcher that can only answer yes/no.
- pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
- SetYesNo::new(&self.pats)
- }
-
/// Add a new pattern to this set.
- ///
- /// If the pattern could not be parsed as a glob, then an error is
- /// returned.
#[allow(dead_code)]
- pub fn add(&mut self, pat: &str) -> Result<(), Error> {
- self.add_with(pat, &MatchOptions::default())
- }
-
- /// Like add, but sets the match options for this particular pattern.
- pub fn add_with(
- &mut self,
- pat: &str,
- opts: &MatchOptions,
- ) -> Result<(), Error> {
- let parsed = try!(Pattern::new(pat));
- // if let Some(ext) = parsed.ext() {
- // eprintln!("ext :: {:?} :: {:?}", ext, pat);
- // } else if let Some(lit) = parsed.literal() {
- // eprintln!("literal :: {:?} :: {:?}", lit, pat);
- // } else if let Some(lit) = parsed.base_literal() {
- // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
- // } else if let Some(lit) = parsed.base_literal_prefix() {
- // eprintln!("base_literal_prefix :: {:?} :: {:?}", lit, pat);
- // } else if let Some(lit) = parsed.base_literal_suffix() {
- // eprintln!("base_literal_suffix :: {:?} :: {:?}", lit, pat);
- // } else if parsed.is_only_basename() {
- // eprintln!("basename-regex :: {:?} :: {:?}", pat, parsed);
- // } else {
- // eprintln!("regex :: {:?} :: {:?}", pat, parsed);
- // }
- self.pats.push((parsed, opts.clone()));
- Ok(())
+ pub fn add(&mut self, pat: Pattern) -> &mut SetBuilder {
+ self.pats.push(pat);
+ self
}
}
-/// Pattern represents a successfully parsed shell glob pattern.
-///
-/// It cannot be used directly to match file paths, but it can be converted
-/// to a regular expression string.
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
-pub struct Pattern {
- tokens: Vec<Token>,
-}
-
-/// Options to control the matching semantics of a glob. The default value
-/// has all options disabled.
-#[derive(Clone, Debug, Default)]
-pub struct MatchOptions {
- /// When true, matching is done case insensitively.
- pub case_insensitive: bool,
- /// When true, neither `*` nor `?` match the current system's path
- /// separator.
- pub require_literal_separator: bool,
-}
-
-#[derive(Clone, Debug, Eq, PartialEq)]
-enum Token {
- Literal(char),
- Any,
- ZeroOrMore,
- RecursivePrefix,
- RecursiveSuffix,
- RecursiveZeroOrMore,
- Class {
- negated: bool,
- ranges: Vec<(char, char)>,
- },
- Alternates(Vec<Pattern>),
+#[derive(Clone, Debug)]
+struct Candidate<'a> {
+ path: Cow<'a, [u8]>,
+ basename: Cow<'a, [u8]>,
+ ext: &'a OsStr,
}
-impl Pattern {
- /// Parse a shell glob pattern.
- ///
- /// If the pattern is not a valid glob, then an error is returned.
- pub fn new(pat: &str) -> Result<Pattern, Error> {
- let mut p = Parser {
- stack: vec![Pattern::default()],
- chars: pat.chars().peekable(),
- prev: None,
- cur: None,
- };
- try!(p.parse());
- if p.stack.is_empty() {
- Err(Error::UnopenedAlternates)
- } else if p.stack.len() > 1 {
- Err(Error::UnclosedAlternates)
- } else {
- Ok(p.stack.pop().unwrap())
+impl<'a> Candidate<'a> {
+ fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
+ let path = path.as_ref();
+ let basename = file_name(path).unwrap_or(OsStr::new(""));
+ Candidate {
+ path: path_bytes(path),
+ basename: os_str_bytes(basename),
+ ext: file_name_ext(basename).unwrap_or(OsStr::new("")),
}
}
- /// Returns an extension if this pattern exclusively matches it.
- pub fn ext(&self) -> Option<OsString> {
- if self.tokens.len() <= 3 {
- return None;
- }
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- match self.tokens.get(1) {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- match self.tokens.get(2) {
- Some(&Token::Literal(c)) if c == '.' => {}
- _ => return None,
- }
- let mut lit = OsString::new();
- for t in self.tokens[3..].iter() {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
- return None;
- }
- Token::Literal(c) => lit.push(c.to_string()),
- _ => return None,
- }
+ fn path_prefix(&self, max: usize) -> &[u8] {
+ if self.path.len() <= max {
+ &*self.path
+ } else {
+ &self.path[..max]
}
- Some(lit)
}
- /// Returns the pattern as a literal if and only if the pattern exclusiely
- /// matches the basename of a file path *and* is a literal.
- ///
- /// The basic format of these patterns is `**/{literal}`, where `{literal}`
- /// does not contain a path separator.
- pub fn base_literal(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[1..] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return None,
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
+ fn path_suffix(&self, max: usize) -> &[u8] {
+ if self.path.len() <= max {
+ &*self.path
+ } else {
+ &self.path[self.path.len() - max..]
}
- Some(lit)
}
+}
- /// Returns true if and only if this pattern only inspects the basename
- /// of a path.
- pub fn is_only_basename(&self) -> bool {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return false,
- }
- for t in &self.tokens[1..] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return false,
- Token::RecursivePrefix
- | Token::RecursiveSuffix
- | Token::RecursiveZeroOrMore => return false,
- _ => {}
- }
- }
- true
- }
+#[derive(Clone, Debug)]
+enum SetMatchStrategy {
+ Literal(LiteralStrategy),
+ BasenameLiteral(BasenameLiteralStrategy),
+ Extension(ExtensionStrategy),
+ Prefix(PrefixStrategy),
+ Suffix(SuffixStrategy),
+ RequiredExtension(RequiredExtensionStrategy),
+ Regex(RegexSetStrategy),
+}
- /// Returns the pattern as a literal if and only if the pattern must match
- /// an entire path exactly.
- ///
- /// The basic format of these patterns is `{literal}`.
- pub fn literal(&self) -> Option<String> {
- let mut lit = String::new();
- for t in &self.tokens {
- match *t {
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
+impl SetMatchStrategy {
+ fn is_match(&self, candidate: &Candidate) -> bool {
+ use self::SetMatchStrategy::*;
+ match *self {
+ Literal(ref s) => s.is_match(candidate),
+ BasenameLiteral(ref s) => s.is_match(candidate),
+ Extension(ref s) => s.is_match(candidate),
+ Prefix(ref s) => s.is_match(candidate),
+ Suffix(ref s) => s.is_match(candidate),
+ RequiredExtension(ref s) => s.is_match(candidate),
+ Regex(ref s) => s.is_match(candidate),
}
- Some(lit)
}
- /// Returns a basename literal prefix of this pattern.
- pub fn base_literal_prefix(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- match self.tokens.last() {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[1..self.tokens.len()-1] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return None,
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
+ fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
+ use self::SetMatchStrategy::*;
+ match *self {
+ Literal(ref s) => s.matches_into(candidate, matches),
+ BasenameLiteral(ref s) => s.matches_into(candidate, matches),
+ Extension(ref s) => s.matches_into(candidate, matches),
+ Prefix(ref s) => s.matches_into(candidate, matches),
+ Suffix(ref s) => s.matches_into(candidate, matches),
+ RequiredExtension(ref s) => s.matches_into(candidate, matches),
+ Regex(ref s) => s.matches_into(candidate, matches),
}
- Some(lit)
}
+}
- /// Returns a basename literal suffix of this pattern.
- pub fn base_literal_suffix(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- match self.tokens.get(1) {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[2..] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return None,
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
- }
- Some(lit)
- }
+#[derive(Clone, Debug)]
+struct LiteralStrategy(BTreeMap<Vec<u8>, Vec<usize>>);
- /// Convert this pattern to a string that is guaranteed to be a valid
- /// regular expression and will represent the matching semantics of this
- /// glob pattern. This uses a default set of options.
- #[allow(dead_code)]
- pub fn to_regex(&self) -> String {
- self.to_regex_with(&MatchOptions::default())
+impl LiteralStrategy {
+ fn new() -> LiteralStrategy {
+ LiteralStrategy(BTreeMap::new())
}
- /// Convert this pattern to a string that is guaranteed to be a valid
- /// regular expression and will represent the matching semantics of this
- /// glob pattern and the options given.
- pub fn to_regex_with(&self, options: &MatchOptions) -> String {
- let mut re = String::new();
- re.push_str("(?-u)");
- if options.case_insensitive {
- re.push_str("(?i)");
- }
- re.push('^');
- // Special case. If the entire glob is just `**`, then it should match
- // everything.
- if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix {
- re.push_str(".*");
- re.push('$');
- return re;
- }
- self.tokens_to_regex(options, &self.tokens, &mut re);
- re.push('$');
- re
+ fn add(&mut self, global_index: usize, lit: String) {
+ self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index);
}
- fn tokens_to_regex(
- &self,
- options: &MatchOptions,
- tokens: &[Token],
- re: &mut String,
- ) {
- let seps = &*FILE_SEPARATORS;
+ fn is_match(&self, candidate: &Candidate) -> bool {
+ self.0.contains_key(&*candidate.path)
+ }
- for tok in tokens {
- match *tok {
- Token::Literal(c) => {
- re.push_str(&regex::quote(&c.to_string()));
- }
- Token::Any => {
- if options.require_literal_separator {
- re.push_str(&format!("[^{}]", seps));
- } else {
- re.push_str(".");
- }
- }
- Token::ZeroOrMore => {
- if options.require_literal_separator {
- re.push_str(&format!("[^{}]*", seps));
- } else {
- re.push_str(".*");
- }
- }
- Token::RecursivePrefix => {
- re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps));
- }
- Token::RecursiveSuffix => {
- re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps));
- }
- Token::RecursiveZeroOrMore => {
- re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])",
- sep=seps));
- }
- Token::Class { negated, ref ranges } => {
- re.push('[');
- if negated {
- re.push('^');
- }
- for r in ranges {
- if r.0 == r.1 {
- // Not strictly necessary, but nicer to look at.
- re.push_str(&regex::quote(&r.0.to_string()));
- } else {
- re.push_str(&regex::quote(&r.0.to_string()));
- re.push('-');
- re.push_str(&regex::quote(&r.1.to_string()));
- }
- }
- re.push(']');
- }
- Token::Alternates(ref patterns) => {
- let mut parts = vec![];
- for pat in patterns {
- let mut altre = String::new();
- self.tokens_to_regex(options, &pat.tokens, &mut altre);
- parts.push(altre);
- }
- re.push_str(&parts.join("|"));
- }
- }
+ #[inline(never)]
+ fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
+ if let Some(hits) = self.0.get(&*candidate.path) {
+ matches.extend(hits);
}
}
}
-struct Parser<'a> {
- stack: Vec<Pattern>,
- chars: iter::Peekable<str::Chars<'a>>,
- prev: Option<char>,
- cur: Option<char>,
-}
+#[derive(Clone, Debug)]
+struct BasenameLiteralStrategy(BTreeMap<Vec<u8>, Vec<usize>>);
-impl<'a> Parser<'a> {
- fn parse(&mut self) -> Result<(), Error> {
- while let Some(c) = self.bump() {
- match c {
- '?' => try!(self.push_token(Token::Any)),
- '*' => try!(self.parse_star()),
- '[' => try!(self.parse_class()),
- '{' => try!(self.push_alternate()),
- '}' => try!(self.pop_alternate()),
- ',' => try!(self.parse_comma()),
- c => try!(self.push_token(Token::Literal(c))),
- }
- }
- Ok(())
+impl BasenameLiteralStrategy {
+ fn new() -> BasenameLiteralStrategy {
+ BasenameLiteralStrategy(BTreeMap::new())
}
- fn push_alternate(&mut self) -> Result<(), Error> {
- if self.stack.len() > 1 {
- return Err(Error::NestedAlternates);
- }
- Ok(self.stack.push(Pattern::default()))
+ fn add(&mut self, global_index: usize, lit: String) {
+ self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index);
}
- fn pop_alternate(&mut self) -> Result<(), Error> {
- let mut alts = vec![];
- while self.stack.len() >= 2 {
- alts.push(self.stack.pop().unwrap());
+ fn is_match(&self, candidate: &Candidate) -> bool {
+ if candidate.basename.is_empty() {
+ return false;
}
- self.push_token(Token::Alternates(alts))
+ self.0.contains_key(&*candidate.basename)
}
- fn push_token(&mut self, tok: Token) -> Result<(), Error> {
- match self.stack.last_mut() {
- None => Err(Error::UnopenedAlternates),
- Some(ref mut pat) => Ok(pat.tokens.push(tok)),
+ #[inline(never)]
+ fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
+ if candidate.basename.is_empty() {
+ return;
+ }
+ if let Some(hits) = self.0.get(&*candidate.basename) {
+ matches.extend(hits);
}
}
+}
- fn pop_token(&mut self) -> Result<Token, Error> {
- match self.stack.last_mut() {
- None => Err(Error::UnopenedAlternates),
- Some(ref mut pat) => Ok(pat.tokens.pop().unwrap()),
- }
+#[derive(Clone, Debug)]
+struct ExtensionStrategy(HashMap<OsString, Vec<usize>, Fnv>);
+
+impl ExtensionStrategy {
+ fn new() -> ExtensionStrategy {
+ ExtensionStrategy(HashMap::with_hasher(Fnv::default()))
}
- fn have_tokens(&self) -> Result<bool, Error> {
- match self.stack.last() {
- None => Err(Error::UnopenedAlternates),
- Some(ref pat) => Ok(!pat.tokens.is_empty()),
- }
+ fn add(&mut self, global_index: usize, ext: OsString) {
+ self.0.entry(ext).or_insert(vec![]).push(global_index);
}
- fn parse_comma(&mut self) -> Result<(), Error> {
- // If we aren't inside a group alternation, then don't
- // treat commas specially. Otherwise, we need to start
- // a new alternate.
- if self.stack.len() <= 1 {
- self.push_token(Token::Literal(','))
- } else {
- Ok(self.stack.push(Pattern::default()))
+ fn is_match(&self, candidate: &Candidate) -> bool {
+ if candidate.ext.is_empty() {
+ return false;
}
+ self.0.contains_key(candidate.ext)
}
- fn parse_star(&mut self) -> Result<(), Error> {
- let prev = self.prev;
- if self.chars.peek() != Some(&'*') {
- try!(self.push_token(Token::ZeroOrMore));
- return Ok(());
+ #[inline(never)]
+ fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<