summaryrefslogtreecommitdiffstats
path: root/globset
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-09-30 19:42:41 -0400
committerAndrew Gallant <jamslam@gmail.com>2016-09-30 19:42:41 -0400
commitfdf24317ac4c34fc0a21efa0a775bc7d3663d498 (patch)
treea6e94a61439b0568bfb7bdb5d80e9a0138c48317 /globset
parentb9d5f22a4d20862dfbcbdfc81a07284719cc71c4 (diff)
Move glob implementation to new crate.
It is isolated and complex enough that it deserves attention all on its own. It's also eminently reusable.
Diffstat (limited to 'globset')
-rw-r--r--globset/Cargo.toml10
-rw-r--r--globset/src/lib.rs1174
-rw-r--r--globset/src/pathutil.rs38
3 files changed, 1222 insertions, 0 deletions
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
new file mode 100644
index 00000000..48e375fb
--- /dev/null
+++ b/globset/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "globset"
+version = "0.1.0"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+
+[dependencies]
+fnv = "1.0"
+lazy_static = "0.2"
+memchr = "0.1"
+regex = "0.1.77"
diff --git a/globset/src/lib.rs b/globset/src/lib.rs
new file mode 100644
index 00000000..b5cbb5be
--- /dev/null
+++ b/globset/src/lib.rs
@@ -0,0 +1,1174 @@
+/*!
+The glob module provides standard shell globbing, but is specifically
+implemented by converting glob syntax to regular expressions. The reasoning is
+twofold:
+
+1. The regex library is *really* fast. Regaining performance in a distinct
+ implementation of globbing is non-trivial.
+2. Most crucially, a `RegexSet` can be used to match many globs simultaneously.
+
+This module is written with some amount of intention of eventually splitting it
+out into its own separate crate, but I didn't quite have the energy for all
+that rigmarole when I wrote this. In particular, it could be fast/good enough
+to make its way into `glob` proper.
+*/
+
+// TODO(burntsushi): I'm pretty dismayed by the performance of regex sets
+// here. For example, we do a first pass single-regex-of-all-globs filter
+// before actually running the regex set. This turns out to be faster,
+// especially in fresh checkouts of repos that don't have a lot of ignored
+// files. It's not clear how hard it is to make the regex set faster.
+//
+// An alternative avenue is to stop doing "regex all the things." (Which, to
+// be fair, is pretty fast---I just expected it to be faster.) We could do
+// something clever using assumptions along the lines of "oh, most ignore
+// patterns are either literals or are for ignoring file extensions." (Look
+// at the .gitignore for the chromium repo---just about every pattern satisfies
+// that assumption.)
+
+extern crate fnv;
+#[macro_use]
+extern crate lazy_static;
+extern crate memchr;
+extern crate regex;
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::error::Error as StdError;
+use std::ffi::{OsStr, OsString};
+use std::fmt;
+use std::hash;
+use std::iter;
+use std::path::Path;
+use std::str;
+
+use regex::bytes::Regex;
+
+use pathutil::file_name;
+
+mod pathutil;
+
+lazy_static! {
+ static ref FILE_SEPARATORS: String = regex::quote(r"/\");
+}
+
+/// Represents an error that can occur when parsing a glob pattern.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Error {
+ InvalidRecursive,
+ UnclosedClass,
+ InvalidRange(char, char),
+ UnopenedAlternates,
+ UnclosedAlternates,
+ NestedAlternates,
+}
+
+impl StdError for Error {
+ fn description(&self) -> &str {
+ match *self {
+ Error::InvalidRecursive => {
+ "invalid use of **; must be one path component"
+ }
+ Error::UnclosedClass => {
+ "unclosed character class; missing ']'"
+ }
+ Error::InvalidRange(_, _) => {
+ "invalid character range"
+ }
+ Error::UnopenedAlternates => {
+ "unopened alternate group; missing '{' \
+ (maybe escape '}' with '[}]'?)"
+ }
+ Error::UnclosedAlternates => {
+ "unclosed alternate group; missing '}' \
+ (maybe escape '{' with '[{]'?)"
+ }
+ Error::NestedAlternates => {
+ "nested alternate groups are not allowed"
+ }
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ Error::InvalidRecursive
+ | Error::UnclosedClass
+ | Error::UnopenedAlternates
+ | Error::UnclosedAlternates
+ | Error::NestedAlternates => {
+ write!(f, "{}", self.description())
+ }
+ Error::InvalidRange(s, e) => {
+ write!(f, "invalid range; '{}' > '{}'", s, e)
+ }
+ }
+ }
+}
+
+/// SetYesNo represents a group of globs that can be matched together in a
+/// single pass. SetYesNo can only determine whether a particular path matched
+/// any pattern in the set.
+#[derive(Clone, Debug)]
+pub struct SetYesNo {
+ re: Regex,
+}
+
+impl SetYesNo {
+ /// Returns true if and only if the given path matches at least one glob
+ /// in this set.
+ pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
+ self.re.is_match(&*path_bytes(path.as_ref()))
+ }
+
+ fn new(
+ pats: &[(Pattern, MatchOptions)],
+ ) -> Result<SetYesNo, regex::Error> {
+ let mut joined = String::new();
+ for &(ref p, ref o) in pats {
+ let part = format!("(?:{})", p.to_regex_with(o));
+ if !joined.is_empty() {
+ joined.push('|');
+ }
+ joined.push_str(&part);
+ }
+ Ok(SetYesNo { re: try!(Regex::new(&joined)) })
+ }
+}
+
+type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
+
+/// Set represents a group of globs that can be matched together in a single
+/// pass.
+#[derive(Clone, Debug)]
+pub struct Set {
+ exts: HashMap<OsString, Vec<usize>, Fnv>,
+ literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+ base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+ base_prefixes: Vec<Vec<u8>>,
+ base_prefixes_map: Vec<usize>,
+ base_suffixes: Vec<Vec<u8>>,
+ base_suffixes_map: Vec<usize>,
+ base_regexes: Vec<Regex>,
+ base_regexes_map: Vec<usize>,
+ regexes: Vec<Regex>,
+ regexes_map: Vec<usize>,
+}
+
+impl Set {
+ /// Returns the sequence number of every glob pattern that matches the
+ /// given path.
+ #[allow(dead_code)]
+ pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
+ let mut into = vec![];
+ self.matches_into(path, &mut into);
+ into
+ }
+
+ /// Adds the sequence number of every glob pattern that matches the given
+ /// path to the vec given.
+ pub fn matches_into<T: AsRef<Path>>(
+ &self,
+ path: T,
+ into: &mut Vec<usize>,
+ ) {
+ into.clear();
+ let path = path.as_ref();
+ let path_bytes = &*path_bytes(path);
+ let basename = file_name(path).map(|b| os_str_bytes(b));
+ if !self.exts.is_empty() {
+ if let Some(ext) = path.extension() {
+ if let Some(matches) = self.exts.get(ext) {
+ into.extend(matches.as_slice());
+ }
+ }
+ }
+ if !self.literals.is_empty() {
+ if let Some(matches) = self.literals.get(path_bytes) {
+ into.extend(matches.as_slice());
+ }
+ }
+ if !self.base_literals.is_empty() {
+ if let Some(ref basename) = basename {
+ if let Some(matches) = self.base_literals.get(&**basename) {
+ into.extend(matches.as_slice());
+ }
+ }
+ }
+ if !self.base_prefixes.is_empty() {
+ if let Some(ref basename) = basename {
+ let basename = &**basename;
+ for (i, pre) in self.base_prefixes.iter().enumerate() {
+ if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
+ into.push(self.base_prefixes_map[i]);
+ }
+ }
+ }
+ }
+ if !self.base_suffixes.is_empty() {
+ if let Some(ref basename) = basename {
+ let basename = &**basename;
+ for (i, suf) in self.base_suffixes.iter().enumerate() {
+ if suf.len() > basename.len() {
+ continue;
+ }
+ let (s, e) = (basename.len() - suf.len(), basename.len());
+ if &**suf == &basename[s..e] {
+ into.push(self.base_suffixes_map[i]);
+ }
+ }
+ }
+ }
+ if let Some(ref basename) = basename {
+ for (i, re) in self.base_regexes.iter().enumerate() {
+ if re.is_match(&**basename) {
+ into.push(self.base_regexes_map[i]);
+ }
+ }
+ }
+ for (i, re) in self.regexes.iter().enumerate() {
+ if re.is_match(path_bytes) {
+ into.push(self.regexes_map[i]);
+ }
+ }
+ into.sort();
+ }
+
+ fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
+ let fnv = Fnv::default();
+ let mut exts = HashMap::with_hasher(fnv.clone());
+ let mut literals = HashMap::with_hasher(fnv.clone());
+ let mut base_literals = HashMap::with_hasher(fnv.clone());
+ let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
+ let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
+ let (mut regexes, mut regexes_map) = (vec![], vec![]);
+ let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]);
+ for (i, &(ref p, ref o)) in pats.iter().enumerate() {
+ if let Some(ext) = p.ext() {
+ exts.entry(ext).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.literal() {
+ literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.base_literal() {
+ base_literals
+ .entry(literal.into_bytes()).or_insert(vec![]).push(i);
+ } else if let Some(literal) = p.base_literal_prefix() {
+ base_prefixes.push(literal.into_bytes());
+ base_prefixes_map.push(i);
+ } else if let Some(literal) = p.base_literal_suffix() {
+ base_suffixes.push(literal.into_bytes());
+ base_suffixes_map.push(i);
+ } else if p.is_only_basename() {
+ base_regexes.push(try!(Regex::new(&p.to_regex_with(o))));
+ base_regexes_map.push(i);
+ } else {
+ regexes.push(try!(Regex::new(&p.to_regex_with(o))));
+ regexes_map.push(i);
+ }
+ }
+ Ok(Set {
+ exts: exts,
+ literals: literals,
+ base_literals: base_literals,
+ base_prefixes: base_prefixes,
+ base_prefixes_map: base_prefixes_map,
+ base_suffixes: base_suffixes,
+ base_suffixes_map: base_suffixes_map,
+ base_regexes: base_regexes,
+ base_regexes_map: base_regexes_map,
+ regexes: regexes,
+ regexes_map: regexes_map,
+ })
+ }
+}
+
+/// SetBuilder builds a group of patterns that can be used to simultaneously
+/// match a file path.
+pub struct SetBuilder {
+ pats: Vec<(Pattern, MatchOptions)>,
+}
+
+impl SetBuilder {
+ /// Create a new SetBuilder. A SetBuilder can be used to add new patterns.
+ /// Once all patterns have been added, `build` should be called to produce
+ /// a `Set`, which can then be used for matching.
+ pub fn new() -> SetBuilder {
+ SetBuilder { pats: vec![] }
+ }
+
+ /// Builds a new matcher from all of the glob patterns added so far.
+ ///
+ /// Once a matcher is built, no new patterns can be added to it.
+ pub fn build(&self) -> Result<Set, regex::Error> {
+ Set::new(&self.pats)
+ }
+
+ /// Like `build`, but returns a matcher that can only answer yes/no.
+ pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
+ SetYesNo::new(&self.pats)
+ }
+
+ /// Add a new pattern to this set.
+ ///
+ /// If the pattern could not be parsed as a glob, then an error is
+ /// returned.
+ #[allow(dead_code)]
+ pub fn add(&mut self, pat: &str) -> Result<(), Error> {
+ self.add_with(pat, &MatchOptions::default())
+ }
+
+ /// Like add, but sets the match options for this particular pattern.
+ pub fn add_with(
+ &mut self,
+ pat: &str,
+ opts: &MatchOptions,
+ ) -> Result<(), Error> {
+ let parsed = try!(Pattern::new(pat));
+ // if let Some(ext) = parsed.ext() {
+ // eprintln!("ext :: {:?} :: {:?}", ext, pat);
+ // } else if let Some(lit) = parsed.literal() {
+ // eprintln!("literal :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal() {
+ // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal_prefix() {
+ // eprintln!("base_literal_prefix :: {:?} :: {:?}", lit, pat);
+ // } else if let Some(lit) = parsed.base_literal_suffix() {
+ // eprintln!("base_literal_suffix :: {:?} :: {:?}", lit, pat);
+ // } else if parsed.is_only_basename() {
+ // eprintln!("basename-regex :: {:?} :: {:?}", pat, parsed);
+ // } else {
+ // eprintln!("regex :: {:?} :: {:?}", pat, parsed);
+ // }
+ self.pats.push((parsed, opts.clone()));
+ Ok(())
+ }
+}
+
+/// Pattern represents a successfully parsed shell glob pattern.
+///
+/// It cannot be used directly to match file paths, but it can be converted
+/// to a regular expression string.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Pattern {
+ tokens: Vec<Token>,
+}
+
+/// Options to control the matching semantics of a glob. The default value
+/// has all options disabled.
+#[derive(Clone, Debug, Default)]
+pub struct MatchOptions {
+ /// When true, matching is done case insensitively.
+ pub case_insensitive: bool,
+ /// When true, neither `*` nor `?` match the current system's path
+ /// separator.
+ pub require_literal_separator: bool,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum Token {
+ Literal(char),
+ Any,
+ ZeroOrMore,
+ RecursivePrefix,
+ RecursiveSuffix,
+ RecursiveZeroOrMore,
+ Class {
+ negated: bool,
+ ranges: Vec<(char, char)>,
+ },
+ Alternates(Vec<Pattern>),
+}
+
+impl Pattern {
+ /// Parse a shell glob pattern.
+ ///
+ /// If the pattern is not a valid glob, then an error is returned.
+ pub fn new(pat: &str) -> Result<Pattern, Error> {
+ let mut p = Parser {
+ stack: vec![Pattern::default()],
+ chars: pat.chars().peekable(),
+ prev: None,
+ cur: None,
+ };
+ try!(p.parse());
+ if p.stack.is_empty() {
+ Err(Error::UnopenedAlternates)
+ } else if p.stack.len() > 1 {
+ Err(Error::UnclosedAlternates)
+ } else {
+ Ok(p.stack.pop().unwrap())
+ }
+ }
+
+ /// Returns an extension if this pattern exclusively matches it.
+ pub fn ext(&self) -> Option<OsString> {
+ if self.tokens.len() <= 3 {
+ return None;
+ }
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.get(1) {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ match self.tokens.get(2) {
+ Some(&Token::Literal(c)) if c == '.' => {}
+ _ => return None,
+ }
+ let mut lit = OsString::new();
+ for t in self.tokens[3..].iter() {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
+ return None;
+ }
+ Token::Literal(c) => lit.push(c.to_string()),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+    /// Returns the pattern as a literal if and only if the pattern exclusively
+ /// matches the basename of a file path *and* is a literal.
+ ///
+ /// The basic format of these patterns is `**/{literal}`, where `{literal}`
+ /// does not contain a path separator.
+ pub fn base_literal(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[1..] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns true if and only if this pattern only inspects the basename
+ /// of a path.
+ pub fn is_only_basename(&self) -> bool {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return false,
+ }
+ for t in &self.tokens[1..] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return false,
+ Token::RecursivePrefix
+ | Token::RecursiveSuffix
+ | Token::RecursiveZeroOrMore => return false,
+ _ => {}
+ }
+ }
+ true
+ }
+
+ /// Returns the pattern as a literal if and only if the pattern must match
+ /// an entire path exactly.
+ ///
+ /// The basic format of these patterns is `{literal}`.
+ pub fn literal(&self) -> Option<String> {
+ let mut lit = String::new();
+ for t in &self.tokens {
+ match *t {
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns a basename literal prefix of this pattern.
+ pub fn base_literal_prefix(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.last() {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[1..self.tokens.len()-1] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Returns a basename literal suffix of this pattern.
+ pub fn base_literal_suffix(&self) -> Option<String> {
+ match self.tokens.get(0) {
+ Some(&Token::RecursivePrefix) => {}
+ _ => return None,
+ }
+ match self.tokens.get(1) {
+ Some(&Token::ZeroOrMore) => {}
+ _ => return None,
+ }
+ let mut lit = String::new();
+ for t in &self.tokens[2..] {
+ match *t {
+ Token::Literal(c) if c == '/' || c == '\\' => return None,
+ Token::Literal(c) => lit.push(c),
+ _ => return None,
+ }
+ }
+ Some(lit)
+ }
+
+ /// Convert this pattern to a string that is guaranteed to be a valid
+ /// regular expression and will represent the matching semantics of this
+ /// glob pattern. This uses a default set of options.
+ #[allow(dead_code)]
+ pub fn to_regex(&self) -> String {
+ self.to_regex_with(&MatchOptions::default())
+ }
+
+ /// Convert this pattern to a string that is guaranteed to be a valid
+ /// regular expression and will represent the matching semantics of this
+ /// glob pattern and the options given.
+ pub fn to_regex_with(&self, options: &MatchOptions) -> String {
+ let mut re = String::new();
+ re.push_str("(?-u)");
+ if options.case_insensitive {
+ re.push_str("(?i)");
+ }
+ re.push('^');
+ // Special case. If the entire glob is just `**`, then it should match
+ // everything.
+ if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix {
+ re.push_str(".*");
+ re.push('$');
+ return re;
+ }
+ self.tokens_to_regex(options, &self.tokens, &mut re);
+ re.push('$');
+ re
+ }
+
+ fn tokens_to_regex(
+ &self,
+ options: &MatchOptions,
+ tokens: &[Token],
+ re: &mut String,
+ ) {
+ let seps = &*FILE_SEPARATORS;
+
+ for tok in tokens {
+ match *tok {
+ Token::Literal(c) => {
+ re.push_str(&regex::quote(&c.to_string()));
+ }
+ Token::Any => {
+ if options.require_literal_separator {
+ re.push_str(&format!("[^{}]", seps));
+ } else {
+ re.push_str(".");
+ }
+ }
+ Token::ZeroOrMore => {
+ if options.require_literal_separator {
+ re.push_str(&format!("[^{}]*", seps));
+ } else {
+ re.push_str(".*");
+ }
+ }
+ Token::RecursivePrefix => {
+ re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps));
+ }
+ Token::RecursiveSuffix => {
+ re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps));
+ }
+ Token::RecursiveZeroOrMore => {
+ re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])",
+ sep=seps));
+ }
+ Token::Class { negated, ref ranges } => {
+ re.push('[');
+ if negated {
+ re.push('^');
+ }
+ for r in ranges {
+ if r.0 == r.1 {
+ // Not strictly necessary, but nicer to look at.
+ re.push_str(&regex::quote(&r.0.to_string()));
+ } else {
+ re.push_str(&regex::quote(&r.0.to_string()));
+ re.push('-');
+ re.push_str(&regex::quote(&r.1.to_string()));
+ }
+ }
+ re.push(']');
+ }
+ Token::Alternates(ref patterns) => {
+ let mut parts = vec![];
+ for pat in patterns {
+ let mut altre = String::new();
+ self.tokens_to_regex(options, &pat.tokens, &mut altre);
+ parts.push(altre);
+ }
+ re.push_str(&parts.join("|"));
+ }
+ }
+ }
+ }
+}
+
+struct Parser<'a> {
+ stack: Vec<Pattern>,
+ chars: iter::Peekable<str::Chars<'a>>,
+ prev: Option<char>,
+ cur: Option<char>,
+}
+
+impl<'a> Parser<'a> {
+ fn parse(&mut self) -> Result<(), Error> {
+ while let Some(c) = self.bump() {
+ match c {
+ '?' => try!(self.push_token(Token::Any)),
+ '*' => try!(self.parse_star()),
+ '[' => try!(self.parse_class()),
+ '{' => try!(self.push_alternate()),
+ '}' => try!(self.pop_alternate()),
+ ',' => try!(self.parse_comma()),
+ c => try!(self.push_token(Token::Literal(c))),
+ }
+ }
+ Ok(())
+ }
+
+ fn push_alternate(&mut self) -> Result<(), Error> {
+ if self.stack.len() > 1 {
+ return Err(Error::NestedAlternates);
+ }
+ Ok(self.stack.push(Pattern::default()))
+ }
+
+ fn pop_alternate(&mut self) -> Result<(), Error> {
+ let mut alts = vec![];
+ while self.stack.len() >= 2 {
+ alts.push(self.stack.pop().unwrap());
+ }
+ self.push_token(Token::Alternates(alts))
+ }
+
+ fn push_token(&mut self, tok: Token) -> Result<(), Error> {
+ match self.stack.last_mut() {
+ None => Err(Error::UnopenedAlternates),
+ Some(ref mut pat) => Ok(pat.tokens.push(tok)),
+ }
+ }
+
+ fn pop_token(&mut self) -> Result<Token, Error> {
+ match self.stack.last_mut() {
+ None => Err(Error::UnopenedAlternates),
+ Some(ref mut pat) => Ok(pat.tokens.pop().unwrap()),
+ }
+ }
+
+ fn have_tokens(&self) -> Result<bool, Error> {
+ match self.stack.last() {
+ None => Err(Error::UnopenedAlternates),
+ Some(ref pat) => Ok(!pat.tokens.is_empty()),
+ }
+ }
+
+ fn parse_comma(&mut self) -> Result<(), Error> {
+ // If we aren't inside a group alternation, then don't
+ // treat commas specially. Otherwise, we need to start
+ // a new alternate.
+ if self.stack.len() <= 1 {
+ self.push_token(Token::Literal(','))
+ } else {
+ Ok(self.stack.push(Pattern::default()))
+ }
+ }
+
+ fn parse_star(&mut self) -> Result<(), Error> {
+ let prev = self.prev;
+ if self.chars.peek() != Some(&'*') {
+ try!(self.push_token(Token::ZeroOrMore));
+ return Ok(());
+ }
+ assert!(self.bump() == Some('*'));
+ if !try!(self.have_tokens()) {
+ try!(self.push_token(Token::RecursivePrefix));
+ let next = self.bump();
+ if !next.is_none() && next != Some('/') {
+ return Err(Error::InvalidRecursive);
+ }
+ return Ok(());
+ }
+ try!(self.pop_token());
+ if prev != Some('/') {
+ if self.stack.len() <= 1
+ || (prev != Some(',') && prev != Some('{')) {
+ return Err(Error::InvalidRecursive);
+ }
+ }
+ match self.chars.peek() {
+ None => {
+ assert!(self.bump().is_none());
+ self.push_token(Token::RecursiveSuffix)
+ }
+ Some(&',') | Some(&'}') if self.stack.len() >= 2 => {
+ self.push_token(Token::RecursiveSuffix)
+ }
+ Some(&'/') => {
+ assert!(self.bump() == Some('/'));
+ self.push_token(Token::RecursiveZeroOrMore)
+ }
+ _ => Err(Error::InvalidRecursive),
+ }
+ }
+
+ fn parse_class(&mut self) -> Result<(), Error> {
+ fn add_to_last_range(
+ r: &mut (char, char),
+ add: char,
+ ) -> Result<(), Error> {
+ r.1 = add;
+ if r.1 < r.0 {
+ Err(Error::InvalidRange(r.0, r.1))
+ } else {
+ Ok(())
+ }
+ }
+ let mut negated = false;
+ let mut ranges = vec![];
+ if self.chars.peek() == Some(&'!') {
+ assert!(self.bump() == Some('!'));
+ negated = true;
+ }
+ let mut first = true;
+ let mut in_range = false;
+ loop {
+ let c = match self.bump() {
+ Some(c) => c,
+ // The only way to successfully break this loop is to observe
+ // a ']'.
+ None => return Err(Error::UnclosedClass),
+ };
+ match c {
+ ']' => {
+ if first {
+ ranges.push((']', ']'));
+ } else {
+ break;
+ }
+ }
+ '-' => {
+ if first {
+ ranges.push(('-', '-'));
+ } else if in_range {
+ // invariant: in_range is only set when there is
+ // already at least one character seen.
+ let r = ranges.last_mut().unwrap();
+ try!(add_to_last_range(r, '-'));
+ in_range = false;
+ } else {
+ assert!(!ranges.is_empty());
+ in_range = true;
+ }
+ }
+ c => {
+ if in_range {
+ // invariant: in_range is only set when there is
+ // already at least one character seen.
+ try!(add_to_last_range(ranges.last_mut().unwrap(), c));
+ } else {
+ ranges.push((c, c));
+ }
+ in_range = false;
+ }
+ }
+ first = false;
+ }
+ if in_range {
+ // Means that the last character in the class was a '-', so add
+ // it as a literal.
+ ranges.push(('-', '-'));
+ }
+ self.push_token(Token::Class {
+ negated: negated,
+ ranges: ranges,
+ })
+ }
+
+ fn bump(&mut self) -> Option<char> {
+ self.prev = self.cur;
+ self.cur = self.chars.next();
+ self.cur
+ }
+}
+
+fn path_bytes(path: &Path) -> Cow<[u8]> {
+ os_str_bytes(path.as_os_str())
+}
+
+#[cfg(unix)]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+ use std::os::unix::ffi::OsStrExt;
+ Cow::Borrowed(s.as_bytes())
+}
+
+#[cfg(not(unix))]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+ // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
+ // if we could get at the raw bytes, they wouldn't be useful. We *must*
+ // convert to UTF-8 before doing path matching. Unfortunate, but necessary.
+ match s.to_string_lossy() {
+ Cow::Owned(s) => Cow::Owned(s.into_bytes()),
+ Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::path::Path;
+
+ use regex::bytes::Regex;
+
+ use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
+ use super::Token::*;
+
+ macro_rules! syntax {
+ ($name:ident, $pat:expr, $tokens:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($tokens, pat.tokens);
+ }
+ }
+ }
+
+ macro_rules! syntaxerr {
+ ($name:ident, $pat:expr, $err:expr) => {
+ #[test]
+ fn $name() {
+ let err = Pattern::new($pat).unwrap_err();
+ assert_eq!($err, err);
+ }
+ }
+ }
+
+ macro_rules! toregex {
+ ($name:ident, $pat:expr, $re:expr) => {
+ toregex!($name, $pat, $re, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $re:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!(
+ format!("(?-u){}", $re), pat.to_regex_with(&$options));
+ }
+ };
+ }
+
+ macro_rules! matches {
+ ($name:ident, $pat:expr, $path:expr) => {
+ matches!($name, $pat, $path, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let path = &Path::new($path).to_str().unwrap();
+ let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+ assert!(re.is_match(path.as_bytes()));
+ }
+ };
+ }
+
+ macro_rules! nmatches {
+ ($name:ident, $pat:expr, $path:expr) => {
+ nmatches!($name, $pat, $path, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let path = &Path::new($path).to_str().unwrap();
+ let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+ assert!(!re.is_match(path.as_bytes()));
+ }
+ };
+ }
+
+ macro_rules! ext {
+ ($name:ident, $pat:expr, $ext:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
+ assert_eq!($ext, ext.as_ref().map(|s| &**s));
+ }
+ };
+ }
+
+ macro_rules! baseliteral {
+ ($name:ident, $pat:expr, $yes:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($yes, pat.base_literal().is_some());
+ }
+ };
+ }
+
+ macro_rules! basesuffix {
+ ($name:ident, $pat:expr, $yes:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($yes, pat.is_literal_suffix());
+ }
+ };
+ }
+
+ fn class(s: char, e: char) -> Token {
+ Class { negated: false, ranges: vec![(s, e)] }
+ }
+
+ fn classn(s: char, e: char) -> Token {
+ Class { negated: true, ranges: vec![(s, e)] }
+ }
+
+ fn rclass(ranges: &[(char, char)]) -> Token {
+ Class { negated: false, ranges: ranges.to_vec() }
+ }
+
+ fn rclassn(ranges: &[(char, char)]) -> Token {
+ Class { negated: true, ranges: ranges.to_vec() }
+ }
+
+ syntax!(literal1, "a", vec![Literal('a')]);
+ syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]);
+ syntax!(any1, "?", vec![Any]);
+ syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]);
+ syntax!(seq1, "*", vec![ZeroOrMore]);
+ syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]);
+ syntax!(seq3, "*a*b*", vec![
+ ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore,
+ ]);
+ syntax!(rseq1, "**", vec![RecursivePrefix]);
+ syntax!(rseq2, "**/", vec![RecursivePrefix]);
+