summaryrefslogtreecommitdiffstats
path: root/globset
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-10-10 19:16:52 -0400
committerAndrew Gallant <jamslam@gmail.com>2016-10-10 19:24:18 -0400
commite96d93034a4829250f61b190901a8faf9a1eeb1f (patch)
treee402ad6c7a66b95cda7e3ac2dae058aac5bdc1fc /globset
parentbc5accc035846a930bc75cb5d710e477e4527a39 (diff)
Finish overhaul of glob matching.
This commit completes the initial move of glob matching to an external crate, including fixing up cross platform support, polishing the external crate for others to use and fixing a number of bugs in the process. Fixes #87, #127, #131
Diffstat (limited to 'globset')
-rw-r--r--globset/Cargo.toml7
-rw-r--r--globset/README.md122
-rw-r--r--globset/benches/bench.rs118
-rw-r--r--globset/src/glob.rs (renamed from globset/src/pattern.rs)315
-rw-r--r--globset/src/lib.rs269
-rw-r--r--globset/src/pathutil.rs54
6 files changed, 602 insertions, 283 deletions
diff --git a/globset/Cargo.toml b/globset/Cargo.toml
index cf63f397..67a954dd 100644
--- a/globset/Cargo.toml
+++ b/globset/Cargo.toml
@@ -3,6 +3,10 @@ name = "globset"
version = "0.1.0"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
+[lib]
+name = "globset"
+bench = false
+
[dependencies]
aho-corasick = "0.5.3"
fnv = "1.0"
@@ -10,3 +14,6 @@ lazy_static = "0.2"
log = "0.3"
memchr = "0.1"
regex = "0.1.77"
+
+[dev-dependencies]
+glob = "0.2"
diff --git a/globset/README.md b/globset/README.md
new file mode 100644
index 00000000..f40b8aac
--- /dev/null
+++ b/globset/README.md
@@ -0,0 +1,122 @@
+globset
+=======
+Cross platform single glob and glob set matching. Glob set matching is the
+process of matching one or more glob patterns against a single candidate path
+simultaneously, and returning all of the globs that matched.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.png)](https://travis-ci.org/BurntSushi/ripgrep)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
+[![](https://img.shields.io/crates/v/globset.svg)](https://crates.io/crates/globset)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+### Documentation
+
+[https://docs.rs/globset](https://docs.rs/globset)
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+globset = "0.1"
+```
+
+and this to your crate root:
+
+```rust
+extern crate globset;
+```
+
+### Example: one glob
+
+This example shows how to match a single glob against a single file path.
+
+```rust
+use globset::Glob;
+
+let glob = try!(Glob::new("*.rs")).compile_matcher();
+
+assert!(glob.is_match("foo.rs"));
+assert!(glob.is_match("foo/bar.rs"));
+assert!(!glob.is_match("Cargo.toml"));
+```
+
+### Example: configuring a glob matcher
+
+This example shows how to use a `GlobBuilder` to configure aspects of match
+semantics. In this example, we prevent wildcards from matching path separators.
+
+```rust
+use globset::GlobBuilder;
+
+let glob = try!(GlobBuilder::new("*.rs")
+ .literal_separator(true).build()).compile_matcher();
+
+assert!(glob.is_match("foo.rs"));
+assert!(!glob.is_match("foo/bar.rs")); // no longer matches
+assert!(!glob.is_match("Cargo.toml"));
+```
+
+### Example: match multiple globs at once
+
+This example shows how to match multiple glob patterns at once.
+
+```rust
+use globset::{Glob, GlobSetBuilder};
+
+let mut builder = GlobSetBuilder::new();
+// A GlobBuilder can be used to configure each glob's match semantics
+// independently.
+builder.add(try!(Glob::new("*.rs")));
+builder.add(try!(Glob::new("src/lib.rs")));
+builder.add(try!(Glob::new("src/**/foo.rs")));
+let set = try!(builder.build());
+
+assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]);
+```
+
+### Performance
+
+This crate implements globs by converting them to regular expressions, and
+executing them with the
+[`regex`](https://github.com/rust-lang-nursery/regex)
+crate.
+
+For single glob matching, performance of this crate should be roughly on par
+with the performance of the
+[`glob`](https://github.com/rust-lang-nursery/glob)
+crate. (`*_regex` correspond to benchmarks for this library while `*_glob`
+correspond to benchmarks for the `glob` library.)
+Optimizations in the `regex` crate may propel this library past `glob`,
+particularly when matching longer paths.
+
+```
+test ext_glob ... bench: 425 ns/iter (+/- 21)
+test ext_regex ... bench: 175 ns/iter (+/- 10)
+test long_glob ... bench: 182 ns/iter (+/- 11)
+test long_regex ... bench: 173 ns/iter (+/- 10)
+test short_glob ... bench: 69 ns/iter (+/- 4)
+test short_regex ... bench: 83 ns/iter (+/- 2)
+```
+
+The primary performance advantage of this crate is when matching multiple
+globs against a single path. With the `glob` crate, one must match each glob
+synchronously, one after the other. In this crate, many can be matched
+simultaneously. For example:
+
+```
+test many_short_glob ... bench: 1,063 ns/iter (+/- 47)
+test many_short_regex_set ... bench: 186 ns/iter (+/- 11)
+```
+
+### Comparison with the [`glob`](https://github.com/rust-lang-nursery/glob) crate
+
+* Supports alternate "or" globs, e.g., `*.{foo,bar}`.
+* Can match non-UTF-8 file paths correctly.
+* Supports matching multiple globs at once.
+* Doesn't provide a recursive directory iterator of matching file paths,
+ although I believe this crate should grow one eventually.
+* Supports case insensitive and require-literal-separator match options, but
+ **doesn't** support the require-literal-leading-dot option.
diff --git a/globset/benches/bench.rs b/globset/benches/bench.rs
new file mode 100644
index 00000000..a151645d
--- /dev/null
+++ b/globset/benches/bench.rs
@@ -0,0 +1,118 @@
+/*!
+This module benchmarks the glob implementation. For benchmarks on the ripgrep
+tool itself, see the benchsuite directory.
+*/
+#![feature(test)]
+
+extern crate glob;
+extern crate globset;
+#[macro_use]
+extern crate lazy_static;
+extern crate regex;
+extern crate test;
+
+use globset::{Candidate, Glob, GlobMatcher, GlobSet, GlobSetBuilder};
+
+const EXT: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt";
+const EXT_PAT: &'static str = "*.txt";
+
+const SHORT: &'static str = "some/needle.txt";
+const SHORT_PAT: &'static str = "some/**/needle.txt";
+
+const LONG: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt";
+const LONG_PAT: &'static str = "some/**/needle.txt";
+
+fn new_glob(pat: &str) -> glob::Pattern {
+ glob::Pattern::new(pat).unwrap()
+}
+
+fn new_reglob(pat: &str) -> GlobMatcher {
+ Glob::new(pat).unwrap().compile_matcher()
+}
+
+fn new_reglob_many(pats: &[&str]) -> GlobSet {
+ let mut builder = GlobSetBuilder::new();
+ for pat in pats {
+ builder.add(Glob::new(pat).unwrap());
+ }
+ builder.build().unwrap()
+}
+
+#[bench]
+fn ext_glob(b: &mut test::Bencher) {
+ let pat = new_glob(EXT_PAT);
+ b.iter(|| assert!(pat.matches(EXT)));
+}
+
+#[bench]
+fn ext_regex(b: &mut test::Bencher) {
+ let set = new_reglob(EXT_PAT);
+ let cand = Candidate::new(EXT);
+ b.iter(|| assert!(set.is_match_candidate(&cand)));
+}
+
+#[bench]
+fn short_glob(b: &mut test::Bencher) {
+ let pat = new_glob(SHORT_PAT);
+ b.iter(|| assert!(pat.matches(SHORT)));
+}
+
+#[bench]
+fn short_regex(b: &mut test::Bencher) {
+ let set = new_reglob(SHORT_PAT);
+ let cand = Candidate::new(SHORT);
+ b.iter(|| assert!(set.is_match_candidate(&cand)));
+}
+
+#[bench]
+fn long_glob(b: &mut test::Bencher) {
+ let pat = new_glob(LONG_PAT);
+ b.iter(|| assert!(pat.matches(LONG)));
+}
+
+#[bench]
+fn long_regex(b: &mut test::Bencher) {
+ let set = new_reglob(LONG_PAT);
+ let cand = Candidate::new(LONG);
+ b.iter(|| assert!(set.is_match_candidate(&cand)));
+}
+
+const MANY_SHORT_GLOBS: &'static [&'static str] = &[
+ // Taken from a random .gitignore on my system.
+ ".*.swp",
+ "tags",
+ "target",
+ "*.lock",
+ "tmp",
+ "*.csv",
+ "*.fst",
+ "*-got",
+ "*.csv.idx",
+ "words",
+ "98m*",
+ "dict",
+ "test",
+ "months",
+];
+
+const MANY_SHORT_SEARCH: &'static str = "98m-blah.csv.idx";
+
+#[bench]
+fn many_short_glob(b: &mut test::Bencher) {
+ let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| new_glob(s)).collect();
+ b.iter(|| {
+ let mut count = 0;
+ for pat in &pats {
+ if pat.matches(MANY_SHORT_SEARCH) {
+ count += 1;
+ }
+ }
+ assert_eq!(2, count);
+ })
+}
+
+#[bench]
+fn many_short_regex_set(b: &mut test::Bencher) {
+ let set = new_reglob_many(MANY_SHORT_GLOBS);
+ b.iter(|| assert_eq!(2, set.matches(MANY_SHORT_SEARCH).iter().count()));
+}
diff --git a/globset/src/pattern.rs b/globset/src/glob.rs
index 1eff726a..279d5201 100644
--- a/globset/src/pattern.rs
+++ b/globset/src/glob.rs
@@ -2,14 +2,13 @@ use std::ffi::{OsStr, OsString};
use std::fmt;
use std::iter;
use std::ops::{Deref, DerefMut};
-use std::path::Path;
+use std::path::{Path, is_separator};
use std::str;
use regex;
use regex::bytes::Regex;
-use {Error, FILE_SEPARATORS, new_regex};
-use pathutil::path_bytes;
+use {Candidate, Error, new_regex};
/// Describes a matching strategy for a particular pattern.
///
@@ -54,7 +53,7 @@ pub enum MatchStrategy {
impl MatchStrategy {
/// Returns a matching strategy for the given pattern.
- pub fn new(pat: &Pattern) -> MatchStrategy {
+ pub fn new(pat: &Glob) -> MatchStrategy {
if let Some(lit) = pat.basename_literal() {
MatchStrategy::BasenameLiteral(lit)
} else if let Some(lit) = pat.literal() {
@@ -73,19 +72,19 @@ impl MatchStrategy {
}
}
-/// Pattern represents a successfully parsed shell glob pattern.
+/// Glob represents a successfully parsed shell glob pattern.
///
/// It cannot be used directly to match file paths, but it can be converted
-/// to a regular expression string.
+/// to a regular expression string or a matcher.
#[derive(Clone, Debug, Eq, PartialEq)]
-pub struct Pattern {
+pub struct Glob {
glob: String,
re: String,
- opts: PatternOptions,
+ opts: GlobOptions,
tokens: Tokens,
}
-impl fmt::Display for Pattern {
+impl fmt::Display for Glob {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.glob.fmt(f)
}
@@ -93,52 +92,55 @@ impl fmt::Display for Pattern {
/// A matcher for a single pattern.
#[derive(Clone, Debug)]
-pub struct PatternMatcher {
+pub struct GlobMatcher {
/// The underlying pattern.
- pat: Pattern,
+ pat: Glob,
/// The pattern, as a compiled regex.
re: Regex,
}
-impl PatternMatcher {
+impl GlobMatcher {
/// Tests whether the given path matches this pattern or not.
pub fn is_match<P: AsRef<Path>>(&self, path: P) -> bool {
- self.re.is_match(&*path_bytes(path.as_ref()))
+ self.is_match_candidate(&Candidate::new(path.as_ref()))
+ }
+
+ /// Tests whether the given path matches this pattern or not.
+ pub fn is_match_candidate(&self, path: &Candidate) -> bool {
+ self.re.is_match(&path.path)
}
}
/// A strategic matcher for a single pattern.
#[cfg(test)]
#[derive(Clone, Debug)]
-struct PatternStrategic {
+struct GlobStrategic {
/// The match strategy to use.
strategy: MatchStrategy,
/// The underlying pattern.
- pat: Pattern,
+ pat: Glob,
/// The pattern, as a compiled regex.
re: Regex,
}
#[cfg(test)]
-impl PatternStrategic {
+impl GlobStrategic {
/// Tests whether the given path matches this pattern or not.
- pub fn is_match<P: AsRef<Path>>(&self, path: P) -> bool {
- use pathutil::file_name_ext;
+ fn is_match<P: AsRef<Path>>(&self, path: P) -> bool {
+ self.is_match_candidate(&Candidate::new(path.as_ref()))
+ }
- let cow_path = path_bytes(path.as_ref());
- let byte_path = &*cow_path;
+ /// Tests whether the given path matches this pattern or not.
+ fn is_match_candidate(&self, candidate: &Candidate) -> bool {
+ let byte_path = &*candidate.path;
match self.strategy {
MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,
MatchStrategy::BasenameLiteral(ref lit) => {
- let lit = OsStr::new(lit);
- path.as_ref().file_name().map(|n| n == lit).unwrap_or(false)
+ lit.as_bytes() == &*candidate.basename
}
MatchStrategy::Extension(ref ext) => {
- path.as_ref().file_name()
- .and_then(file_name_ext)
- .map(|got| got == ext)
- .unwrap_or(false)
+ candidate.ext == ext
}
MatchStrategy::Prefix(ref pre) => {
starts_with(pre.as_bytes(), byte_path)
@@ -150,10 +152,7 @@ impl PatternStrategic {
ends_with(suffix.as_bytes(), byte_path)
}
MatchStrategy::RequiredExtension(ref ext) => {
- path.as_ref().file_name()
- .and_then(file_name_ext)
- .map(|got| got == ext && self.re.is_match(byte_path))
- .unwrap_or(false)
+ candidate.ext == ext && self.re.is_match(byte_path)
}
MatchStrategy::Regex => self.re.is_match(byte_path),
}
@@ -167,15 +166,15 @@ impl PatternStrategic {
///
/// The lifetime `'a` refers to the lifetime of the pattern string.
#[derive(Clone, Debug)]
-pub struct PatternBuilder<'a> {
+pub struct GlobBuilder<'a> {
/// The glob pattern to compile.
glob: &'a str,
/// Options for the pattern.
- opts: PatternOptions,
+ opts: GlobOptions,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
-struct PatternOptions {
+struct GlobOptions {
/// Whether to match case insensitively.
case_insensitive: bool,
/// Whether to require a literal separator to match a separator in a file
@@ -210,17 +209,17 @@ enum Token {
Alternates(Vec<Tokens>),
}
-impl Pattern {
+impl Glob {
/// Builds a new pattern with default options.
- pub fn new(glob: &str) -> Result<Pattern, Error> {
- PatternBuilder::new(glob).build()
+ pub fn new(glob: &str) -> Result<Glob, Error> {
+ GlobBuilder::new(glob).build()
}
/// Returns a matcher for this pattern.
- pub fn compile_matcher(&self) -> PatternMatcher {
+ pub fn compile_matcher(&self) -> GlobMatcher {
let re = new_regex(&self.re)
.expect("regex compilation shouldn't fail");
- PatternMatcher {
+ GlobMatcher {
pat: self.clone(),
re: re,
}
@@ -230,13 +229,13 @@ impl Pattern {
///
/// This isn't exposed because it's not clear whether it's actually
/// faster than just running a regex for a *single* pattern. If it
- /// is faster, then PatternMatcher should do it automatically.
+ /// is faster, then GlobMatcher should do it automatically.
#[cfg(test)]
- fn compile_strategic_matcher(&self) -> PatternStrategic {
+ fn compile_strategic_matcher(&self) -> GlobStrategic {
let strategy = MatchStrategy::new(self);
let re = new_regex(&self.re)
.expect("regex compilation shouldn't fail");
- PatternStrategic {
+ GlobStrategic {
strategy: strategy,
pat: self.clone(),
re: re,
@@ -253,30 +252,11 @@ impl Pattern {
&self.re
}
- /// Returns true if and only if this pattern only inspects the basename
- /// of a path.
- pub fn is_only_basename(&self) -> bool {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return false,
- }
- for t in &self.tokens[1..] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return false,
- Token::RecursivePrefix
- | Token::RecursiveSuffix
- | Token::RecursiveZeroOrMore => return false,
- _ => {}
- }
- }
- true
- }
-
/// Returns the pattern as a literal if and only if the pattern must match
/// an entire path exactly.
///
/// The basic format of these patterns is `{literal}`.
- pub fn literal(&self) -> Option<String> {
+ fn literal(&self) -> Option<String> {
if self.opts.case_insensitive {
return None;
}
@@ -301,7 +281,7 @@ impl Pattern {
/// std::path::Path::extension returns. Namely, this extension includes
/// the '.'. Also, paths like `.rs` are considered to have an extension
/// of `.rs`.
- pub fn ext(&self) -> Option<OsString> {
+ fn ext(&self) -> Option<OsString> {
if self.opts.case_insensitive {
return None;
}
@@ -343,7 +323,7 @@ impl Pattern {
+ /// This is like `ext`, but returns an extension even if it isn't sufficient
/// to imply a match. Namely, if an extension is returned, then it is
/// necessary but not sufficient for a match.
- pub fn required_ext(&self) -> Option<OsString> {
+ fn required_ext(&self) -> Option<OsString> {
if self.opts.case_insensitive {
return None;
}
@@ -372,7 +352,7 @@ impl Pattern {
/// Returns a literal prefix of this pattern if the entire pattern matches
/// if the literal prefix matches.
- pub fn prefix(&self) -> Option<String> {
+ fn prefix(&self) -> Option<String> {
if self.opts.case_insensitive {
return None;
}
@@ -417,7 +397,7 @@ impl Pattern {
///
/// When this returns true, the suffix literal is guaranteed to start with
/// a `/`.
- pub fn suffix(&self) -> Option<(String, bool)> {
+ fn suffix(&self) -> Option<(String, bool)> {
if self.opts.case_insensitive {
return None;
}
@@ -520,16 +500,7 @@ impl Pattern {
///
/// The basic format of these patterns is `**/{literal}`, where `{literal}`
/// does not contain a path separator.
- pub fn basename_literal(&self) -> Option<String> {
- self.base_literal()
- }
-
- /// Returns the pattern as a literal if and only if the pattern exclusively
- /// matches the basename of a file path *and* is a literal.
- ///
- /// The basic format of these patterns is `**/{literal}`, where `{literal}`
- /// does not contain a path separator.
- pub fn base_literal(&self) -> Option<String> {
+ fn basename_literal(&self) -> Option<String> {
let tokens = match self.basename_tokens() {
None => return None,
Some(tokens) => tokens,
@@ -543,102 +514,21 @@ impl Pattern {
}
Some(lit)
}
-
- /// Returns a literal prefix of this pattern if and only if the entire
- /// pattern matches if the literal prefix matches.
- pub fn literal_prefix(&self) -> Option<String> {
- match self.tokens.last() {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[0..self.tokens.len()-1] {
- match *t {
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
- }
- Some(lit)
- }
-
- /// Returns a literal suffix of this pattern if and only if the entire
- /// pattern matches if the literal suffix matches.
- pub fn literal_suffix(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- let start =
- match self.tokens.get(1) {
- Some(&Token::ZeroOrMore) => 2,
- _ => 1,
- };
- let mut lit = String::new();
- for t in &self.tokens[start..] {
- match *t {
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
- }
- Some(lit)
- }
-
- /// Returns a basename literal prefix of this pattern.
- pub fn base_literal_prefix(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- match self.tokens.last() {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[1..self.tokens.len()-1] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return None,
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
- }
- Some(lit)
- }
-
- /// Returns a basename literal suffix of this pattern.
- pub fn base_literal_suffix(&self) -> Option<String> {
- match self.tokens.get(0) {
- Some(&Token::RecursivePrefix) => {}
- _ => return None,
- }
- match self.tokens.get(1) {
- Some(&Token::ZeroOrMore) => {}
- _ => return None,
- }
- let mut lit = String::new();
- for t in &self.tokens[2..] {
- match *t {
- Token::Literal(c) if c == '/' || c == '\\' => return None,
- Token::Literal(c) => lit.push(c),
- _ => return None,
- }
- }
- Some(lit)
- }
}
-impl<'a> PatternBuilder<'a> {
+impl<'a> GlobBuilder<'a> {
/// Create a new builder for the pattern given.
///
/// The pattern is not compiled until `build` is called.
- pub fn new(glob: &'a str) -> PatternBuilder<'a> {
- PatternBuilder {
+ pub fn new(glob: &'a str) -> GlobBuilder<'a> {
+ GlobBuilder {
glob: glob,
- opts: PatternOptions::default(),
+ opts: GlobOptions::default(),
}
}
/// Parses and builds the pattern.
- pub fn build(&self) -> Result<Pattern, Error> {
+ pub fn build(&self) -> Result<Glob, Error> {
let mut p = Parser {
stack: vec![Tokens::default()],
chars: self.glob.chars().peekable(),
@@ -652,7 +542,7 @@ impl<'a> PatternBuilder<'a> {
Err(Error::UnclosedAlternates)
} else {
let tokens = p.stack.pop().unwrap();
- Ok(Pattern {
+ Ok(Glob {
glob: self.glob.to_string(),
re: tokens.to_regex_with(&self.opts),
opts: self.opts,
@@ -664,13 +554,13 @@ impl<'a> PatternBuilder<'a> {
/// Toggle whether the pattern matches case insensitively or not.
///
/// This is disabled by default.
- pub fn case_insensitive(&mut self, yes: bool) -> &mut PatternBuilder<'a> {
+ pub fn case_insensitive(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
self.opts.case_insensitive = yes;
self
}
/// Toggle whether a literal `/` is required to match a path separator.
- pub fn literal_separator(&mut self, yes: bool) -> &mut PatternBuilder<'a> {
+ pub fn literal_separator(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
self.opts.literal_separator = yes;
self
}
@@ -680,7 +570,7 @@ impl Tokens {
/// Convert this pattern to a string that is guaranteed to be a valid
/// regular expression and will represent the matching semantics of this
/// glob pattern and the options given.
- fn to_regex_with(&self, options: &PatternOptions) -> String {
+ fn to_regex_with(&self, options: &GlobOptions) -> String {
let mut re = String::new();
re.push_str("(?-u)");
if options.case_insensitive {
@@ -699,43 +589,39 @@ impl Tokens {
re
}
-
fn tokens_to_regex(
&self,
- options: &PatternOptions,
+ options: &GlobOptions,
tokens: &[Token],
re: &mut String,
) {
- let seps = &*FILE_SEPARATORS;
-
for tok in tokens {
match *tok {
Token::Literal(c) => {
- re.push_str(&regex::quote(&c.to_string()));
+ re.push_str(&char_to_escaped_literal(c));
}
Token::Any => {
if options.literal_separator {
- re.push_str(&format!("[^{}]", seps));
+ re.push_str("[^/]");
} else {
re.push_str(".");
}
}
Token::ZeroOrMore => {
if options.literal_separator {
- re.push_str(&format!("[^{}]*", seps));
+ re.push_str("[^/]*");
} else {
re.push_str(".*");
}
}
Token::RecursivePrefix => {
- re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps));
+ re.push_str("(?:/?|.*/)");
}
Token::RecursiveSuffix => {
- re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps));
+ re.push_str("(?:/?|/.*)");
}
Token::RecursiveZeroOrMore => {
- re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])",
- sep=seps));
+ re.push_str("(?:/|/.*/)");
}
Token::Class { negated, ref ranges } => {
re.push('[');
@@ -745,11 +631,11 @@ impl Tokens {
for r in ranges {
if r.0 == r.1 {
// Not strictly necessary, but nicer to look at.
- re.push_str(&regex::quote(&r.0.to_string()));
+ re.push_str(&char_to_escaped_literal(r.0));
} else {
- re.push_str(&regex::quote(&r.0.to_string()));
+ re.push_str(&char_to_escaped_literal(r.0));
re.push('-');
- re.push_str(&regex::quote(&r.1.to_string()));
+ re.push_str(&char_to_escaped_literal(r.1));
}
}
re.push(']');
@@ -768,6 +654,26 @@ impl Tokens {
}
}
+/// Convert a Unicode scalar value to an escaped string suitable for use as
+/// a literal in a non-Unicode regex.
+fn char_to_escaped_literal(c: char) -> String {
+ bytes_to_escaped_literal(&c.to_string().into_bytes())
+}
+
+/// Converts an arbitrary sequence of bytes to a UTF-8 string. All non-ASCII
+/// code units are converted to their escaped form.
+fn bytes_to_escaped_literal(bs: &[u8]) -> String {
+ let mut s = String::with_capacity(bs.len());
+ for &b in bs {
+ if b <= 0x7F {
+ s.push_str(&regex::quote(&(b as char).to_string()));
+ } else {
+ s.push_str(&format!("\\x{:02x}", b));
+ }
+ }
+ s
+}
+
struct Parser<'a> {
stack: Vec<Tokens>,
chars: iter::Peekable<str::Chars<'a>>,
@@ -785,7 +691,14 @@ impl<'a> Parser<'a> {
'{' => try!(self.push_alternate()),
'}' => try!(self.pop_alternate()),
',' => try!(self.parse_comma()),
- c => try!(self.push_token(Token::Literal(c))),
+ c => {
+ if is_separator(c) {
+ // Normalize all patterns to use / as a separator.
+ try!(self.push_token(Token::Literal('/')))
+ } else {
+ try!(self.push_token(Token::Literal(c)))
+ }
+ }
}
}
Ok(())
@@ -848,13 +761,13 @@ impl<'a> Parser<'a> {
if !try!(self.have_tokens()) {
try!(self.push_token(Token::RecursivePrefix));
let next = self.bump();
- if !next.is_none() && next != Some('/') {
+ if !next.map(is_separator).unwrap_or(true) {
return Err(Error::InvalidRecursive);
}
return Ok(());
}
try!(self.pop_token());
- if prev != Some('/') {
+ if !prev.map(is_separator).unwrap_or(false) {
if self.stack.len() <= 1
|| (prev != Some(',') && prev != Some('{')) {
return Err(Error::InvalidRecursive);
@@ -868,8 +781,8 @@ impl<'a> Parser<'a> {
Some(&',') | Some(&'}') if self.stack.len() >= 2 => {
self.push_token(Token::RecursiveSuffix)
}
- Some(&'/') => {
- assert!(self.bump() == Some('/'));
+ Some(&c) if is_separator(c) => {
+ assert!(self.bump().map(is_separator).unwrap_or(false));
self.push_token(Token::RecursiveZeroOrMore)
}
_ => Err(Error::InvalidRecursive),
@@ -973,8 +886,8 @@ fn ends_with(needle: &[u8], haystack: &[u8]) -> bool {
mod tests {
use std::ffi::{OsStr, OsString};
- use {SetBuilder, Error};
- use super::{Pattern, PatternBuilder, Token};
+ use {GlobSetBuilder, Error};
+ use super::{Glob, GlobBuilder, Token};
use super::Token::*;
#[derive(Clone, Copy, Debug, Default)]
@@ -987,7 +900,7 @@ mod tests {
($name:ident, $pat:expr, $tokens:expr) => {
#[test]
fn $name() {
- let pat = Pattern::new($pat).unwrap();
+ let pat = Glob::new($pat).unwrap();
assert_eq!($tokens, pat.tokens.0);
}
}
@@ -997,7 +910,7 @@ mod tests {
($name:ident, $pat:expr, $err:expr) => {
#[test]
fn $name() {
- let err = Pattern::new($pat).unwrap_err();
+ let err = Glob::new($pat).unwrap_err();
assert_eq!($err, err);
}
}
@@ -1010,7 +923,7 @@ mod tests {
($name:ident, $pat:expr, $re:expr, $options:expr) => {
#[test]
fn $name() {
- let pat = PatternBuilder::new($pat)
+ let pat = GlobBuilder::new($pat)
.case_insensitive($options.casei)
.literal_separator($options.litsep)
.build()
@@ -1027,14 +940,14 @@ mod tests {
($name:ident, $pat:expr, $path:expr, $options:expr) => {
#[test]
fn $name() {
- let pat = PatternBuilder::new($pat)
+ let pat = GlobBuilder::new($pat)
.case_insensitive($options.casei)
.literal_separator($options.litsep)
.build()
.unwrap();
let matcher = pat.compile_matcher();
let strategic = pat.compile_strategic_matcher();
- let set = SetBuilder::new().add(pat).build().unwrap();
+ let set = GlobSetBuilder::new().add(pat).build().unwrap();
assert!(matcher.is_match($path));
assert!(strategic.is_match($path));
assert!(set.is_match($path));
@@ -1049,14 +962,14 @@ mod tests {
($name:ident, $pat:expr, $path:expr, $options:expr) => {
#[test]
fn $name() {
- let pat = PatternBuilder::new($pat)
+ let pat = GlobBuilder::new($pat)
.case_insensitive($options.casei)
.literal_separator($options.litsep)
.build()
.unwrap();
let matcher = pat.compile_matcher();
let strategic = pat.compile_strategic_matcher();
- let set = SetBuilder::new().add(pat).build().unwrap();
+ let set = GlobSetBuilder::new().add(pat).build().unwrap();
assert!(!matcher.is_match($path));
assert!(!strategic.is_match($path));
assert!(!set.is_match($path));
@@ -1146,8 +1059,8 @@ mod tests {
toregex!(re_casei, "a", "(?i)^a$", &CASEI);
- toregex!(re_slash1, "?", r"^[^/\\]$", SLASHLIT);
- toregex!(re_slash2, "*", r"^[^/\\]*$", SLASHLIT);
+ toregex!(re_slash1, "?", r"^[^/]$", SLASHLIT);
+ toregex!(re_slash2, "*", r"^[^/]*$", SLASHLIT);
toregex!(re1, "a", "^a$");
toregex!(re2, "?", "^.$");
@@ -1160,6 +1073,7 @@ mod tests {
toregex!(re9, "[+]", r"^[\+]$");
toregex!(re10, "+", r"^\+$");
toregex!(re11, "**", r"^.*$");
+ toregex!(re12, "☃", r"^\xe2\x98\x83$");
matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b");
@@ -1170,6 +1084,7 @@ mod tests {
matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd");
matches!(match9, "*.rs", ".rs");
+ matches!(match10, "☃", "☃");
matches!(matchrec1, "some/**/needle.txt", "some/needle.txt");
matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt");
@@ -1239,10 +1154,16 @@ mod tests {
matches!(matchalt13, "{*.foo,*.bar,*.wat}", "test.wat");
matches!(matchslash1, "abc/def", "abc/def", SLASHLIT);
+ #[cfg(unix)]
nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT);
- nmatches!(matchslash2_win, "abc?def", "abc\\def", SLASHLIT);
+ #[cfg(not(unix))]
+ nmatches!(matchslash2, "abc?def", "abc\\def", SLASHLIT);
nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT);
matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs
+ #[cfg(unix)]
+ nmatches!(matchslash5, "abc\\def", "abc/def", SLASHLIT);
+ #[cfg(not(unix))]
+ matches!(matchslash5, "abc\\def", "abc/def", SLASHLIT);
nmatches!(matchno