Move glob implementation to new crate.

It is isolated and complex enough that it deserves attention all on its own. It's also eminently reusable.
author: Andrew Gallant <jamslam@gmail.com> 2016-09-30 19:42:41 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2016-09-30 19:42:41 -0400
commit: fdf24317ac4c34fc0a21efa0a775bc7d3663d498 (patch)
tree: a6e94a61439b0568bfb7bdb5d80e9a0138c48317 /globset
parent: b9d5f22a4d20862dfbcbdfc81a07284719cc71c4 (diff)
3 files changed, 1222 insertions, 0 deletions
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
new file mode 100644
index 00000000..48e375fb
--- /dev/null
+++ b/globset/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "globset"
+version = "0.1.0"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+
+[dependencies]
+fnv = "1.0"
+lazy_static = "0.2"
+memchr = "0.1"
+regex = "0.1.77"
diff --git a/globset/src/lib.rs b/globset/src/lib.rs
new file mode 100644
index 00000000..b5cbb5be
--- /dev/null
+++ b/globset/src/lib.rs
@@ -0,0 +1,1174 @@
+/*!
+The glob module provides standard shell globbing, but is specifically
+implemented by converting glob syntax to regular expressions. The reasoning is
+two fold:
+
+1. The regex library is *really* fast. Regaining performance in a distinct
+   implementation of globbing is non-trivial.
+2. Most crucially, a `RegexSet` can be used to match many globs simultaneously.
+
+This module is written with some amount of intention of eventually splitting it
+out into its own separate crate, but I didn't quite have the energy for all
+that rigamorole when I wrote this. In particular, it could be fast/good enough
+to make its way into `glob` proper.
+*/
+
+// TODO(burntsushi): I'm pretty dismayed by the performance of regex sets
+// here. For example, we do a first pass single-regex-of-all-globs filter
+// before actually running the regex set. This turns out to be faster,
+// especially in fresh checkouts of repos that don't have a lot of ignored
+// files. It's not clear how hard it is to make the regex set faster.
+//
+// An alternative avenue is to stop doing "regex all the things." (Which, to
+// be fair, is pretty fast---I just expected it to be faster.) We could do
+// something clever using assumptions along the lines of "oh, most ignore
+// patterns are either literals or are for ignoring file extensions." (Look
+// at the .gitignore for the chromium repo---just about every pattern satisfies
+// that assumption.)
+
+extern crate fnv;
+#[macro_use]
+extern crate lazy_static;
+extern crate memchr;
+extern crate regex;
+
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::error::Error as StdError;
+use std::ffi::{OsStr, OsString};
+use std::fmt;
+use std::hash;
+use std::iter;
+use std::path::Path;
+use std::str;
+
+use regex::bytes::Regex;
+
+use pathutil::file_name;
+
+mod pathutil;
+
+lazy_static! {
+    static ref FILE_SEPARATORS: String = regex::quote(r"/\");
+}
+
+/// Represents an error that can occur when parsing a glob pattern.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Error {
+    InvalidRecursive,
+    UnclosedClass,
+    InvalidRange(char, char),
+    UnopenedAlternates,
+    UnclosedAlternates,
+    NestedAlternates,
+}
+
+impl StdError for Error {
+    fn description(&self) -> &str {
+        match *self {
+            Error::InvalidRecursive => {
+                "invalid use of **; must be one path component"
+            }
+            Error::UnclosedClass => {
+                "unclosed character class; missing ']'"
+            }
+            Error::InvalidRange(_, _) => {
+                "invalid character range"
+            }
+            Error::UnopenedAlternates => {
+                "unopened alternate group; missing '{' \
+                (maybe escape '}' with '[}]'?)"
+            }
+            Error::UnclosedAlternates => {
+                "unclosed alternate group; missing '}' \
+                (maybe escape '{' with '[{]'?)"
+            }
+            Error::NestedAlternates => {
+                "nested alternate groups are not allowed"
+            }
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            Error::InvalidRecursive
+            | Error::UnclosedClass
+            | Error::UnopenedAlternates
+            | Error::UnclosedAlternates
+            | Error::NestedAlternates => {
+                write!(f, "{}", self.description())
+            }
+            Error::InvalidRange(s, e) => {
+                write!(f, "invalid range; '{}' > '{}'", s, e)
+            }
+        }
+    }
+}
+
+/// SetYesNo represents a group of globs that can be matched together in a
+/// single pass. SetYesNo can only determine whether a particular path matched
+/// any pattern in the set.
+#[derive(Clone, Debug)]
+pub struct SetYesNo {
+    re: Regex,
+}
+
+impl SetYesNo {
+    /// Returns true if and only if the given path matches at least one glob
+    /// in this set.
+    pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
+        self.re.is_match(&*path_bytes(path.as_ref()))
+    }
+
+    fn new(
+        pats: &[(Pattern, MatchOptions)],
+    ) -> Result<SetYesNo, regex::Error> {
+        let mut joined = String::new();
+        for &(ref p, ref o) in pats {
+            let part = format!("(?:{})", p.to_regex_with(o));
+            if !joined.is_empty() {
+                joined.push('|');
+            }
+            joined.push_str(&part);
+        }
+        Ok(SetYesNo { re: try!(Regex::new(&joined)) })
+    }
+}
+
+type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
+
+/// Set represents a group of globs that can be matched together in a single
+/// pass.
+#[derive(Clone, Debug)]
+pub struct Set {
+    exts: HashMap<OsString, Vec<usize>, Fnv>,
+    literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+    base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
+    base_prefixes: Vec<Vec<u8>>,
+    base_prefixes_map: Vec<usize>,
+    base_suffixes: Vec<Vec<u8>>,
+    base_suffixes_map: Vec<usize>,
+    base_regexes: Vec<Regex>,
+    base_regexes_map: Vec<usize>,
+    regexes: Vec<Regex>,
+    regexes_map: Vec<usize>,
+}
+
+impl Set {
+    /// Returns the sequence number of every glob pattern that matches the
+    /// given path.
+    #[allow(dead_code)]
+    pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
+        let mut into = vec![];
+        self.matches_into(path, &mut into);
+        into
+    }
+
+    /// Adds the sequence number of every glob pattern that matches the given
+    /// path to the vec given.
+    pub fn matches_into<T: AsRef<Path>>(
+        &self,
+        path: T,
+        into: &mut Vec<usize>,
+    ) {
+        into.clear();
+        let path = path.as_ref();
+        let path_bytes = &*path_bytes(path);
+        let basename = file_name(path).map(|b| os_str_bytes(b));
+        if !self.exts.is_empty() {
+            if let Some(ext) = path.extension() {
+                if let Some(matches) = self.exts.get(ext) {
+                    into.extend(matches.as_slice());
+                }
+            }
+        }
+        if !self.literals.is_empty() {
+            if let Some(matches) = self.literals.get(path_bytes) {
+                into.extend(matches.as_slice());
+            }
+        }
+        if !self.base_literals.is_empty() {
+            if let Some(ref basename) = basename {
+                if let Some(matches) = self.base_literals.get(&**basename) {
+                    into.extend(matches.as_slice());
+                }
+            }
+        }
+        if !self.base_prefixes.is_empty() {
+            if let Some(ref basename) = basename {
+                let basename = &**basename;
+                for (i, pre) in self.base_prefixes.iter().enumerate() {
+                    if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
+                        into.push(self.base_prefixes_map[i]);
+                    }
+                }
+            }
+        }
+        if !self.base_suffixes.is_empty() {
+            if let Some(ref basename) = basename {
+                let basename = &**basename;
+                for (i, suf) in self.base_suffixes.iter().enumerate() {
+                    if suf.len() > basename.len() {
+                        continue;
+                    }
+                    let (s, e) = (basename.len() - suf.len(), basename.len());
+                    if &**suf == &basename[s..e] {
+                        into.push(self.base_suffixes_map[i]);
+                    }
+                }
+            }
+        }
+        if let Some(ref basename) = basename {
+            for (i, re) in self.base_regexes.iter().enumerate() {
+                if re.is_match(&**basename) {
+                    into.push(self.base_regexes_map[i]);
+                }
+            }
+        }
+        for (i, re) in self.regexes.iter().enumerate() {
+            if re.is_match(path_bytes) {
+                into.push(self.regexes_map[i]);
+            }
+        }
+        into.sort();
+    }
+
+    fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
+        let fnv = Fnv::default();
+        let mut exts = HashMap::with_hasher(fnv.clone());
+        let mut literals = HashMap::with_hasher(fnv.clone());
+        let mut base_literals = HashMap::with_hasher(fnv.clone());
+        let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
+        let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
+        let (mut regexes, mut regexes_map) = (vec![], vec![]);
+        let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]);
+        for (i, &(ref p, ref o)) in pats.iter().enumerate() {
+            if let Some(ext) = p.ext() {
+                exts.entry(ext).or_insert(vec![]).push(i);
+            } else if let Some(literal) = p.literal() {
+                literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
+            } else if let Some(literal) = p.base_literal() {
+                base_literals
+                    .entry(literal.into_bytes()).or_insert(vec![]).push(i);
+            } else if let Some(literal) = p.base_literal_prefix() {
+                base_prefixes.push(literal.into_bytes());
+                base_prefixes_map.push(i);
+            } else if let Some(literal) = p.base_literal_suffix() {
+                base_suffixes.push(literal.into_bytes());
+                base_suffixes_map.push(i);
+            } else if p.is_only_basename() {
+                base_regexes.push(try!(Regex::new(&p.to_regex_with(o))));
+                base_regexes_map.push(i);
+            } else {
+                regexes.push(try!(Regex::new(&p.to_regex_with(o))));
+                regexes_map.push(i);
+            }
+        }
+        Ok(Set {
+            exts: exts,
+            literals: literals,
+            base_literals: base_literals,
+            base_prefixes: base_prefixes,
+            base_prefixes_map: base_prefixes_map,
+            base_suffixes: base_suffixes,
+            base_suffixes_map: base_suffixes_map,
+            base_regexes: base_regexes,
+            base_regexes_map: base_regexes_map,
+            regexes: regexes,
+            regexes_map: regexes_map,
+        })
+    }
+}
+
+/// SetBuilder builds a group of patterns that can be used to simultaneously
+/// match a file path.
+pub struct SetBuilder {
+    pats: Vec<(Pattern, MatchOptions)>,
+}
+
+impl SetBuilder {
+    /// Create a new SetBuilder. A SetBuilder can be used to add new patterns.
+    /// Once all patterns have been added, `build` should be called to produce
+    /// a `Set`, which can then be used for matching.
+    pub fn new() -> SetBuilder {
+        SetBuilder { pats: vec![] }
+    }
+
+    /// Builds a new matcher from all of the glob patterns added so far.
+    ///
+    /// Once a matcher is built, no new patterns can be added to it.
+    pub fn build(&self) -> Result<Set, regex::Error> {
+        Set::new(&self.pats)
+    }
+
+    /// Like `build`, but returns a matcher that can only answer yes/no.
+    pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
+        SetYesNo::new(&self.pats)
+    }
+
+    /// Add a new pattern to this set.
+    ///
+    /// If the pattern could not be parsed as a glob, then an error is
+    /// returned.
+    #[allow(dead_code)]
+    pub fn add(&mut self, pat: &str) -> Result<(), Error> {
+        self.add_with(pat, &MatchOptions::default())
+    }
+
+    /// Like add, but sets the match options for this particular pattern.
+    pub fn add_with(
+        &mut self,
+        pat: &str,
+        opts: &MatchOptions,
+    ) -> Result<(), Error> {
+        let parsed = try!(Pattern::new(pat));
+        // if let Some(ext) = parsed.ext() {
+            // eprintln!("ext :: {:?} :: {:?}", ext, pat);
+        // } else if let Some(lit) = parsed.literal() {
+            // eprintln!("literal :: {:?} :: {:?}", lit, pat);
+        // } else if let Some(lit) = parsed.base_literal() {
+            // eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
+        // } else if let Some(lit) = parsed.base_literal_prefix() {
+            // eprintln!("base_literal_prefix :: {:?} :: {:?}", lit, pat);
+        // } else if let Some(lit) = parsed.base_literal_suffix() {
+            // eprintln!("base_literal_suffix :: {:?} :: {:?}", lit, pat);
+        // } else if parsed.is_only_basename() {
+            // eprintln!("basename-regex :: {:?} :: {:?}", pat, parsed);
+        // } else {
+            // eprintln!("regex :: {:?} :: {:?}", pat, parsed);
+        // }
+        self.pats.push((parsed, opts.clone()));
+        Ok(())
+    }
+}
+
+/// Pattern represents a successfully parsed shell glob pattern.
+///
+/// It cannot be used directly to match file paths, but it can be converted
+/// to a regular expression string.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Pattern {
+    tokens: Vec<Token>,
+}
+
+/// Options to control the matching semantics of a glob. The default value
+/// has all options disabled.
+#[derive(Clone, Debug, Default)]
+pub struct MatchOptions {
+    /// When true, matching is done case insensitively.
+    pub case_insensitive: bool,
+    /// When true, neither `*` nor `?` match the current system's path
+    /// separator.
+    pub require_literal_separator: bool,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum Token {
+    Literal(char),
+    Any,
+    ZeroOrMore,
+    RecursivePrefix,
+    RecursiveSuffix,
+    RecursiveZeroOrMore,
+    Class {
+        negated: bool,
+        ranges: Vec<(char, char)>,
+    },
+    Alternates(Vec<Pattern>),
+}
+
+impl Pattern {
+    /// Parse a shell glob pattern.
+    ///
+    /// If the pattern is not a valid glob, then an error is returned.
+    pub fn new(pat: &str) -> Result<Pattern, Error> {
+        let mut p = Parser {
+            stack: vec![Pattern::default()],
+            chars: pat.chars().peekable(),
+            prev: None,
+            cur: None,
+        };
+        try!(p.parse());
+        if p.stack.is_empty() {
+            Err(Error::UnopenedAlternates)
+        } else if p.stack.len() > 1 {
+            Err(Error::UnclosedAlternates)
+        } else {
+            Ok(p.stack.pop().unwrap())
+        }
+    }
+
+    /// Returns an extension if this pattern exclusively matches it.
+    pub fn ext(&self) -> Option<OsString> {
+        if self.tokens.len() <= 3 {
+            return None;
+        }
+        match self.tokens.get(0) {
+            Some(&Token::RecursivePrefix) => {}
+            _ => return None,
+        }
+        match self.tokens.get(1) {
+            Some(&Token::ZeroOrMore) => {}
+            _ => return None,
+        }
+        match self.tokens.get(2) {
+            Some(&Token::Literal(c)) if c == '.' => {}
+            _ => return None,
+        }
+        let mut lit = OsString::new();
+        for t in self.tokens[3..].iter() {
+            match *t {
+                Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
+                    return None;
+                }
+                Token::Literal(c) => lit.push(c.to_string()),
+                _ => return None,
+            }
+        }
+        Some(lit)
+    }
+
+    /// Returns the pattern as a literal if and only if the pattern exclusiely
+    /// matches the basename of a file path *and* is a literal.
+    ///
+    /// The basic format of these patterns is `**/{literal}`, where `{literal}`
+    /// does not contain a path separator.
+    pub fn base_literal(&self) -> Option<String> {
+        match self.tokens.get(0) {
+            Some(&Token::RecursivePrefix) => {}
+            _ => return None,
+        }
+        let mut lit = String::new();
+        for t in &self.tokens[1..] {
+            match *t {
+                Token::Literal(c) if c == '/' || c == '\\' => return None,
+                Token::Literal(c) => lit.push(c),
+                _ => return None,
+            }
+        }
+        Some(lit)
+    }
+
+    /// Returns true if and only if this pattern only inspects the basename
+    /// of a path.
+    pub fn is_only_basename(&self) -> bool {
+        match self.tokens.get(0) {
+            Some(&Token::RecursivePrefix) => {}
+            _ => return false,
+        }
+        for t in &self.tokens[1..] {
+            match *t {
+                Token::Literal(c) if c == '/' || c == '\\' => return false,
+                Token::RecursivePrefix
+                | Token::RecursiveSuffix
+                | Token::RecursiveZeroOrMore => return false,
+                _ => {}
+            }
+        }
+        true
+    }
+
+    /// Returns the pattern as a literal if and only if the pattern must match
+    /// an entire path exactly.
+    ///
+    /// The basic format of these patterns is `{literal}`.
+    pub fn literal(&self) -> Option<String> {
+        let mut lit = String::new();
+        for t in &self.tokens {
+            match *t {
+                Token::Literal(c) => lit.push(c),
+                _ => return None,
+            }
+        }
+        Some(lit)
+    }
+
+    /// Returns a basename literal prefix of this pattern.
+    pub fn base_literal_prefix(&self) -> Option<String> {
+        match self.tokens.get(0) {
+            Some(&Token::RecursivePrefix) => {}
+            _ => return None,
+        }
+        match self.tokens.last() {
+            Some(&Token::ZeroOrMore) => {}
+            _ => return None,
+        }
+        let mut lit = String::new();
+        for t in &self.tokens[1..self.tokens.len()-1] {
+            match *t {
+                Token::Literal(c) if c == '/' || c == '\\' => return None,
+                Token::Literal(c) => lit.push(c),
+                _ => return None,
+            }
+        }
+        Some(lit)
+    }
+
+    /// Returns a basename literal suffix of this pattern.
+    pub fn base_literal_suffix(&self) -> Option<String> {
+        match self.tokens.get(0) {
+            Some(&Token::RecursivePrefix) => {}
+            _ => return None,
+        }
+        match self.tokens.get(1) {
+            Some(&Token::ZeroOrMore) => {}
+            _ => return None,
+        }
+        let mut lit = String::new();
+        for t in &self.tokens[2..] {
+            match *t {
+                Token::Literal(c) if c == '/' || c == '\\' => return None,
+                Token::Literal(c) => lit.push(c),
+                _ => return None,
+            }
+        }
+        Some(lit)
+    }
+
+    /// Convert this pattern to a string that is guaranteed to be a valid
+    /// regular expression and will represent the matching semantics of this
+    /// glob pattern. This uses a default set of options.
+    #[allow(dead_code)]
+    pub fn to_regex(&self) -> String {
+        self.to_regex_with(&MatchOptions::default())
+    }
+
+    /// Convert this pattern to a string that is guaranteed to be a valid
+    /// regular expression and will represent the matching semantics of this
+    /// glob pattern and the options given.
+    pub fn to_regex_with(&self, options: &MatchOptions) -> String {
+        let mut re = String::new();
+        re.push_str("(?-u)");
+        if options.case_insensitive {
+            re.push_str("(?i)");
+        }
+        re.push('^');
+        // Special case. If the entire glob is just `**`, then it should match
+        // everything.
+        if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix {
+            re.push_str(".*");
+            re.push('$');
+            return re;
+        }
+        self.tokens_to_regex(options, &self.tokens, &mut re);
+        re.push('$');
+        re
+    }
+
+    fn tokens_to_regex(
+        &self,
+        options: &MatchOptions,
+        tokens: &[Token],
+        re: &mut String,
+    ) {
+        let seps = &*FILE_SEPARATORS;
+
+        for tok in tokens {
+            match *tok {
+                Token::Literal(c) => {
+                    re.push_str(&regex::quote(&c.to_string()));
+                }
+                Token::Any => {
+                    if options.require_literal_separator {
+                        re.push_str(&format!("[^{}]", seps));
+                    } else {
+                        re.push_str(".");
+                    }
+                }
+                Token::ZeroOrMore => {
+                    if options.require_literal_separator {
+                        re.push_str(&format!("[^{}]*", seps));
+                    } else {
+                        re.push_str(".*");
+                    }
+                }
+                Token::RecursivePrefix => {
+                    re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps));
+                }
+                Token::RecursiveSuffix => {
+                    re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps));
+                }
+                Token::RecursiveZeroOrMore => {
+                    re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])",
+                                         sep=seps));
+                }
+                Token::Class { negated, ref ranges } => {
+                    re.push('[');
+                    if negated {
+                        re.push('^');
+                    }
+                    for r in ranges {
+                        if r.0 == r.1 {
+                            // Not strictly necessary, but nicer to look at.
+                            re.push_str(&regex::quote(&r.0.to_string()));
+                        } else {
+                            re.push_str(&regex::quote(&r.0.to_string()));
+                            re.push('-');
+                            re.push_str(&regex::quote(&r.1.to_string()));
+                        }
+                    }
+                    re.push(']');
+                }
+                Token::Alternates(ref patterns) => {
+                    let mut parts = vec![];
+                    for pat in patterns {
+                        let mut altre = String::new();
+                        self.tokens_to_regex(options, &pat.tokens, &mut altre);
+                        parts.push(altre);
+                    }
+                    re.push_str(&parts.join("|"));
+                }
+            }
+        }
+    }
+}
+
+struct Parser<'a> {
+    stack: Vec<Pattern>,
+    chars: iter::Peekable<str::Chars<'a>>,
+    prev: Option<char>,
+    cur: Option<char>,
+}
+
+impl<'a> Parser<'a> {
+    fn parse(&mut self) -> Result<(), Error> {
+        while let Some(c) = self.bump() {
+            match c {
+                '?' => try!(self.push_token(Token::Any)),
+                '*' => try!(self.parse_star()),
+                '[' => try!(self.parse_class()),
+                '{' => try!(self.push_alternate()),
+                '}' => try!(self.pop_alternate()),
+                ',' => try!(self.parse_comma()),
+                c => try!(self.push_token(Token::Literal(c))),
+            }
+        }
+        Ok(())
+    }
+
+    fn push_alternate(&mut self) -> Result<(), Error> {
+        if self.stack.len() > 1 {
+            return Err(Error::NestedAlternates);
+        }
+        Ok(self.stack.push(Pattern::default()))
+    }
+
+    fn pop_alternate(&mut self) -> Result<(), Error> {
+        let mut alts = vec![];
+        while self.stack.len() >= 2 {
+            alts.push(self.stack.pop().unwrap());
+        }
+        self.push_token(Token::Alternates(alts))
+    }
+
+    fn push_token(&mut self, tok: Token) -> Result<(), Error> {
+        match self.stack.last_mut() {
+            None => Err(Error::UnopenedAlternates),
+            Some(ref mut pat) => Ok(pat.tokens.push(tok)),
+        }
+    }
+
+    fn pop_token(&mut self) -> Result<Token, Error> {
+        match self.stack.last_mut() {
+            None => Err(Error::UnopenedAlternates),
+            Some(ref mut pat) => Ok(pat.tokens.pop().unwrap()),
+        }
+    }
+
+    fn have_tokens(&self) -> Result<bool, Error> {
+        match self.stack.last() {
+            None => Err(Error::UnopenedAlternates),
+            Some(ref pat) => Ok(!pat.tokens.is_empty()),
+        }
+    }
+
+    fn parse_comma(&mut self) -> Result<(), Error> {
+        // If we aren't inside a group alternation, then don't
+        // treat commas specially. Otherwise, we need to start
+        // a new alternate.
+        if self.stack.len() <= 1 {
+            self.push_token(Token::Literal(','))
+        } else {
+            Ok(self.stack.push(Pattern::default()))
+        }
+    }
+
+    fn parse_star(&mut self) -> Result<(), Error> {
+        let prev = self.prev;
+        if self.chars.peek() != Some(&'*') {
+            try!(self.push_token(Token::ZeroOrMore));
+            return Ok(());
+        }
+        assert!(self.bump() == Some('*'));
+        if !try!(self.have_tokens()) {
+            try!(self.push_token(Token::RecursivePrefix));
+            let next = self.bump();
+            if !next.is_none() && next != Some('/') {
+                return Err(Error::InvalidRecursive);
+            }
+            return Ok(());
+        }
+        try!(self.pop_token());
+        if prev != Some('/') {
+            if self.stack.len() <= 1
+                || (prev != Some(',') && prev != Some('{')) {
+                return Err(Error::InvalidRecursive);
+            }
+        }
+        match self.chars.peek() {
+            None => {
+                assert!(self.bump().is_none());
+                self.push_token(Token::RecursiveSuffix)
+            }
+            Some(&',') | Some(&'}') if self.stack.len() >= 2 => {
+                self.push_token(Token::RecursiveSuffix)
+            }
+            Some(&'/') => {
+                assert!(self.bump() == Some('/'));
+                self.push_token(Token::RecursiveZeroOrMore)
+            }
+            _ => Err(Error::InvalidRecursive),
+        }
+    }
+
+    fn parse_class(&mut self) -> Result<(), Error> {
+        fn add_to_last_range(
+            r: &mut (char, char),
+            add: char,
+        ) -> Result<(), Error> {
+            r.1 = add;
+            if r.1 < r.0 {
+                Err(Error::InvalidRange(r.0, r.1))
+            } else {
+                Ok(())
+            }
+        }
+        let mut negated = false;
+        let mut ranges = vec![];
+        if self.chars.peek() == Some(&'!') {
+            assert!(self.bump() == Some('!'));
+            negated = true;
+        }
+        let mut first = true;
+        let mut in_range = false;
+        loop {
+            let c = match self.bump() {
+                Some(c) => c,
+                // The only way to successfully break this loop is to observe
+                // a ']'.
+                None => return Err(Error::UnclosedClass),
+            };
+            match c {
+                ']' => {
+                    if first {
+                        ranges.push((']', ']'));
+                    } else {
+                        break;
+                    }
+                }
+                '-' => {
+                    if first {
+                        ranges.push(('-', '-'));
+                    } else if in_range {
+                        // invariant: in_range is only set when there is
+                        // already at least one character seen.
+                        let r = ranges.last_mut().unwrap();
+                        try!(add_to_last_range(r, '-'));
+                        in_range = false;
+                    } else {
+                        assert!(!ranges.is_empty());
+                        in_range = true;
+                    }
+                }
+                c => {
+                    if in_range {
+                        // invariant: in_range is only set when there is
+                        // already at least one character seen.
+                        try!(add_to_last_range(ranges.last_mut().unwrap(), c));
+                    } else {
+                        ranges.push((c, c));
+                    }
+                    in_range = false;
+                }
+            }
+            first = false;
+        }
+        if in_range {
+            // Means that the last character in the class was a '-', so add
+            // it as a literal.
+            ranges.push(('-', '-'));
+        }
+        self.push_token(Token::Class {
+            negated: negated,
+            ranges: ranges,
+        })
+    }
+
+    fn bump(&mut self) -> Option<char> {
+        self.prev = self.cur;
+        self.cur = self.chars.next();
+        self.cur
+    }
+}
+
+fn path_bytes(path: &Path) -> Cow<[u8]> {
+    os_str_bytes(path.as_os_str())
+}
+
+#[cfg(unix)]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+    use std::os::unix::ffi::OsStrExt;
+    Cow::Borrowed(s.as_bytes())
+}
+
+#[cfg(not(unix))]
+fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
+    // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
+    // if we could get at the raw bytes, they wouldn't be useful. We *must*
+    // convert to UTF-8 before doing path matching. Unfortunate, but necessary.
+    match s.to_string_lossy() {
+        Cow::Owned(s) => Cow::Owned(s.into_bytes()),
+        Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::Path;
+
+    use regex::bytes::Regex;
+
+    use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
+    use super::Token::*;
+
+    macro_rules! syntax {
+        ($name:ident, $pat:expr, $tokens:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                assert_eq!($tokens, pat.tokens);
+            }
+        }
+    }
+
+    macro_rules! syntaxerr {
+        ($name:ident, $pat:expr, $err:expr) => {
+            #[test]
+            fn $name() {
+                let err = Pattern::new($pat).unwrap_err();
+                assert_eq!($err, err);
+            }
+        }
+    }
+
+    macro_rules! toregex {
+        ($name:ident, $pat:expr, $re:expr) => {
+            toregex!($name, $pat, $re, MatchOptions::default());
+        };
+        ($name:ident, $pat:expr, $re:expr, $options:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                assert_eq!(
+                    format!("(?-u){}", $re), pat.to_regex_with(&$options));
+            }
+        };
+    }
+
+    macro_rules! matches {
+        ($name:ident, $pat:expr, $path:expr) => {
+            matches!($name, $pat, $path, MatchOptions::default());
+        };
+        ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                let path = &Path::new($path).to_str().unwrap();
+                let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+                assert!(re.is_match(path.as_bytes()));
+            }
+        };
+    }
+
+    macro_rules! nmatches {
+        ($name:ident, $pat:expr, $path:expr) => {
+            nmatches!($name, $pat, $path, MatchOptions::default());
+        };
+        ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                let path = &Path::new($path).to_str().unwrap();
+                let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+                assert!(!re.is_match(path.as_bytes()));
+            }
+        };
+    }
+
+    macro_rules! ext {
+        ($name:ident, $pat:expr, $ext:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
+                assert_eq!($ext, ext.as_ref().map(|s| &**s));
+            }
+        };
+    }
+
+    macro_rules! baseliteral {
+        ($name:ident, $pat:expr, $yes:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                assert_eq!($yes, pat.base_literal().is_some());
+            }
+        };
+    }
+
+    macro_rules! basesuffix {
+        ($name:ident, $pat:expr, $yes:expr) => {
+            #[test]
+            fn $name() {
+                let pat = Pattern::new($pat).unwrap();
+                assert_eq!($yes, pat.is_literal_suffix());
+            }
+        };
+    }
+
+    fn class(s: char, e: char) -> Token {
+        Class { negated: false, ranges: vec![(s, e)] }
+    }
+
+    fn classn(s: char, e: char) -> Token {
+        Class { negated: true, ranges: vec![(s, e)] }
+    }
+
+    fn rclass(ranges: &[(char, char)]) -> Token {
+        Class { negated: false, ranges: ranges.to_vec() }
+    }
+
+    fn rclassn(ranges: &[(char, char)]) -> Token {
+        Class { negated: true, ranges: ranges.to_vec() }
+    }
+
+    syntax!(literal1, "a", vec![Literal('a')]);
+    syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]);
+    syntax!(any1, "?", vec![Any]);
+    syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]);
+    syntax!(seq1, "*", vec![ZeroOrMore]);
+    syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]);
+    syntax!(seq3, "*a*b*", vec![
+        ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore,
+    ]);
+    syntax!(rseq1, "**", vec![RecursivePrefix]);
+    syntax!(rseq2, "**/", vec![RecursivePrefix]);
+
author	Andrew Gallant <jamslam@gmail.com>	2016-09-30 19:42:41 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2016-09-30 19:42:41 -0400
commit	fdf24317ac4c34fc0a21efa0a775bc7d3663d498 (patch)
tree	a6e94a61439b0568bfb7bdb5d80e9a0138c48317 /globset
parent	b9d5f22a4d20862dfbcbdfc81a07284719cc71c4 (diff)