author    Andrew Gallant <jamslam@gmail.com>    2016-08-25 21:44:37 -0400
committer Andrew Gallant <jamslam@gmail.com>    2016-08-25 21:44:37 -0400
commit    b55ecf34c713392b012dd652fbbd11d7e0126d97 (patch)
tree      323cc2f2cbf8f58cfee742f4307094df6a13a753
parent    957f90c8988779fae8c466e2697dbc2ae542f0ff (diff)
globbing by regex
-rw-r--r--  Cargo.toml         11
-rw-r--r--  benches/bench.rs  122
-rw-r--r--  src/glob.rs       668
-rw-r--r--  src/main.rs       105
4 files changed, 867 insertions, 39 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 9f36d765..d4d59f63 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,14 +13,25 @@ readme = "README.md"
keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
+[[bin]]
+bench = false
+path = "src/main.rs"
+name = "xrep"
+
[dependencies]
docopt = "0.6"
grep = { version = "0.1", path = "grep" }
memchr = "0.1"
memmap = "0.2"
+num_cpus = "1"
regex = { version = "0.1", path = "/home/andrew/rust/regex" }
regex-syntax = { version = "0.3.1", path = "/home/andrew/rust/regex/regex-syntax" }
rustc-serialize = "0.3"
+walkdir = "0.1"
+
+[dev-dependencies]
+glob = "0.2"
+lazy_static = "0.2"
[profile.release]
debug = true
diff --git a/benches/bench.rs b/benches/bench.rs
new file mode 100644
index 00000000..a529172f
--- /dev/null
+++ b/benches/bench.rs
@@ -0,0 +1,122 @@
+#![feature(test)]
+
+extern crate glob;
+#[macro_use]
+extern crate lazy_static;
+extern crate regex;
+extern crate test;
+
+const SHORT: &'static str = "some/needle.txt";
+const SHORT_PAT: &'static str = "some/**/needle.txt";
+
+const LONG: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt";
+const LONG_PAT: &'static str = "some/**/needle.txt";
+
+#[allow(dead_code, unused_variables)]
+#[path = "../src/glob.rs"]
+mod reglob;
+
+fn new_glob(pat: &str) -> glob::Pattern {
+ glob::Pattern::new(pat).unwrap()
+}
+
+fn new_reglob(pat: &str) -> reglob::Set {
+ let mut builder = reglob::SetBuilder::new();
+ builder.add(pat).unwrap();
+ builder.build().unwrap()
+}
+
+fn new_reglob_many(pats: &[&str]) -> reglob::Set {
+ let mut builder = reglob::SetBuilder::new();
+ for pat in pats {
+ builder.add(pat).unwrap();
+ }
+ builder.build().unwrap()
+}
+
+#[bench]
+fn short_glob(b: &mut test::Bencher) {
+ let pat = new_glob(SHORT_PAT);
+ b.iter(|| assert!(pat.matches(SHORT)));
+}
+
+#[bench]
+fn short_regex(b: &mut test::Bencher) {
+ let set = new_reglob(SHORT_PAT);
+ b.iter(|| assert!(set.is_match(SHORT)));
+}
+
+#[bench]
+fn long_glob(b: &mut test::Bencher) {
+ let pat = new_glob(LONG_PAT);
+ b.iter(|| assert!(pat.matches(LONG)));
+}
+
+#[bench]
+fn long_regex(b: &mut test::Bencher) {
+ let set = new_reglob(LONG_PAT);
+ b.iter(|| assert!(set.is_match(LONG)));
+}
+
+const MANY_SHORT_GLOBS: &'static [&'static str] = &[
+ // Taken from a random .gitignore on my system.
+ ".*.swp",
+ "tags",
+ "target",
+ "*.lock",
+ "tmp",
+ "*.csv",
+ "*.fst",
+ "*-got",
+ "*.csv.idx",
+ "words",
+ "98m*",
+ "dict",
+ "test",
+ "months",
+];
+
+const MANY_SHORT_SEARCH: &'static str = "98m-blah.csv.idx";
+
+#[bench]
+fn many_short_glob(b: &mut test::Bencher) {
+ let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| new_glob(s)).collect();
+ b.iter(|| {
+ let mut count = 0;
+ for pat in &pats {
+ if pat.matches(MANY_SHORT_SEARCH) {
+ count += 1;
+ }
+ }
+ assert_eq!(2, count);
+ })
+}
+
+#[bench]
+fn many_short_regex_set(b: &mut test::Bencher) {
+ let set = new_reglob_many(MANY_SHORT_GLOBS);
+ b.iter(|| assert_eq!(2, set.matches(MANY_SHORT_SEARCH).iter().count()));
+}
+
+// This is the fastest on my system (beating many_short_glob by about 2x). This
+// suggests that a RegexSet needs quite a few regexes (or a larger haystack)
+// in order for it to scale.
+//
+// TODO(burntsushi): come up with a benchmark that uses more complex patterns
+// or a longer haystack.
+#[bench]
+fn many_short_regex_pattern(b: &mut test::Bencher) {
+ let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| {
+ let pat = reglob::Pattern::new(s).unwrap();
+ regex::Regex::new(&pat.to_regex()).unwrap()
+ }).collect();
+ b.iter(|| {
+ let mut count = 0;
+ for pat in &pats {
+ if pat.is_match(MANY_SHORT_SEARCH) {
+ count += 1;
+ }
+ }
+ assert_eq!(2, count);
+ })
+}
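
The comment on many_short_regex_pattern above contrasts a loop over individually
compiled regexes with a single RegexSet pass. Here is a minimal standalone
sketch of the two strategies, using the string-based regex API rather than the
regex::bytes API that src/glob.rs uses, with the two regexes hand-translated
from the globs `98m*` and `*.csv.idx` (the two patterns that match
MANY_SHORT_SEARCH):

    extern crate regex;

    use regex::{Regex, RegexSet};

    fn main() {
        // Regexes corresponding to the globs `98m*` and `*.csv.idx`.
        let pats = [r"^98m.*$", r"^.*\.csv\.idx$"];
        let path = "98m-blah.csv.idx";

        // Strategy 1: one compiled Regex per glob, checked in a loop
        // (the shape of many_short_regex_pattern).
        let singles: Vec<Regex> =
            pats.iter().map(|p| Regex::new(p).unwrap()).collect();
        assert_eq!(2, singles.iter().filter(|re| re.is_match(path)).count());

        // Strategy 2: every glob compiled into one RegexSet and matched in
        // a single pass (the shape of many_short_regex_set).
        let set = RegexSet::new(pats.iter().cloned()).unwrap();
        assert_eq!(2, set.matches(path).iter().count());
    }

Since bench.rs enables #![feature(test)], the benchmarks themselves only build
on a nightly toolchain.
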
diff --git a/src/glob.rs b/src/glob.rs
new file mode 100644
index 00000000..6ae57a34
--- /dev/null
+++ b/src/glob.rs
@@ -0,0 +1,668 @@
+/*!
+The glob submodule provides standard shell globbing, but is specifically
+implemented by converting glob syntax to regular expressions. The reasoning
+is twofold:
+
+1. The regex library is *really* fast. Regaining performance in a distinct
+ implementation of globbing is non-trivial.
+2. Most crucially, a `RegexSet` can be used to match many globs simultaneously.
+
+This module is written with the intention of eventually splitting it out into
+its own separate crate, but I didn't quite have the energy for all that
+rigmarole when I wrote this. In particular, it could be fast/good enough to
+make its way into `glob` proper.
+*/
+
+use std::error::Error as StdError;
+use std::fmt;
+use std::iter;
+use std::path;
+use std::str;
+
+use regex;
+use regex::bytes::{RegexSet, SetMatches};
+
+/// Represents an error that can occur when parsing a glob pattern.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum Error {
+ InvalidRecursive,
+ UnclosedClass,
+ InvalidRange(char, char),
+}
+
+impl StdError for Error {
+ fn description(&self) -> &str {
+ match *self {
+ Error::InvalidRecursive => {
+ "invalid use of **; must be one path component"
+ }
+ Error::UnclosedClass => {
+ "unclosed character class; missing ']'"
+ }
+ Error::InvalidRange(_, _) => {
+ "invalid character range"
+ }
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ match *self {
+ Error::InvalidRecursive | Error::UnclosedClass => {
+ write!(f, "{}", self.description())
+ }
+ Error::InvalidRange(s, e) => {
+ write!(f, "invalid range; '{}' > '{}'", s, e)
+ }
+ }
+ }
+}
+
+/// Set represents a group of globs that can be matched together in a single
+/// pass.
+#[derive(Clone, Debug)]
+pub struct Set {
+ re: RegexSet,
+}
+
+impl Set {
+ /// Returns true if and only if the given path matches at least one glob
+ /// in this set.
+ pub fn is_match<T: AsRef<[u8]>>(&self, path: T) -> bool {
+ self.re.is_match(path.as_ref())
+ }
+
+ /// Returns every glob pattern (by sequence number) that matches the given
+ /// path.
+ pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
+ self.re.matches(path.as_ref())
+ }
+
+ /// Fills the given slice with a boolean for each pattern, indicating
+ /// whether the corresponding pattern matched the given path.
+ pub fn matches_with<T: AsRef<[u8]>>(
+ &self,
+ path: T,
+ matches: &mut [bool],
+ ) -> bool {
+ self.re.matches_with(path.as_ref(), matches)
+ }
+
+ /// Returns the number of glob patterns in this set.
+ pub fn len(&self) -> usize {
+ self.re.len()
+ }
+}
+
+/// SetBuilder builds a group of patterns that can be used to simultaneously
+/// match a file path.
+pub struct SetBuilder {
+ pats: Vec<(Pattern, MatchOptions)>,
+}
+
+impl SetBuilder {
+ /// Create a new SetBuilder. A SetBuilder can be used to add new patterns.
+ /// Once all patterns have been added, `build` should be called to produce
+ /// a `Set`, which can then be used for matching.
+ pub fn new() -> SetBuilder {
+ SetBuilder { pats: vec![] }
+ }
+
+ /// Builds a new matcher from all of the glob patterns added so far.
+ ///
+ /// Once a matcher is built, no new patterns can be added to it.
+ pub fn build(&self) -> Result<Set, regex::Error> {
+ let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o));
+ let re = try!(RegexSet::new(it));
+ Ok(Set { re: re })
+ }
+
+ /// Add a new pattern to this set.
+ ///
+ /// If the pattern could not be parsed as a glob, then an error is
+ /// returned.
+ pub fn add(&mut self, pat: &str) -> Result<(), Error> {
+ self.add_with(pat, &MatchOptions::default())
+ }
+
+ /// Like add, but sets the match options for this particular pattern.
+ pub fn add_with(
+ &mut self,
+ pat: &str,
+ opts: &MatchOptions,
+ ) -> Result<(), Error> {
+ let pat = try!(Pattern::new(pat));
+ self.pats.push((pat, opts.clone()));
+ Ok(())
+ }
+}
+
+/// Pattern represents a successfully parsed shell glob pattern.
+///
+/// It cannot be used directly to match file paths, but it can be converted
+/// to a regular expression string.
+#[derive(Clone, Debug, Default)]
+pub struct Pattern {
+ tokens: Vec<Token>,
+}
+
+/// Options to control the matching semantics of a glob. The default value
+/// has all options disabled.
+#[derive(Clone, Debug, Default)]
+pub struct MatchOptions {
+ /// When true, matching is done case insensitively.
+ case_insensitive: bool,
+ /// When true, neither `*` nor `?` match the current system's path
+ /// separator.
+ require_literal_separator: bool,
+}
+
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum Token {
+ Literal(char),
+ Any,
+ ZeroOrMore,
+ RecursivePrefix,
+ RecursiveSuffix,
+ RecursiveZeroOrMore,
+ Class {
+ negated: bool,
+ ranges: Vec<(char, char)>,
+ },
+}
+
+impl Pattern {
+ /// Parse a shell glob pattern.
+ ///
+ /// If the pattern is not a valid glob, then an error is returned.
+ pub fn new(pat: &str) -> Result<Pattern, Error> {
+ let mut p = Parser {
+ p: Pattern::default(),
+ chars: pat.chars().peekable(),
+ prev: None,
+ cur: None,
+ };
+ try!(p.parse());
+ Ok(p.p)
+ }
+
+ /// Convert this pattern to a string that is guaranteed to be a valid
+ /// regular expression and will represent the matching semantics of this
+ /// glob pattern. This uses a default set of options.
+ pub fn to_regex(&self) -> String {
+ self.to_regex_with(&MatchOptions::default())
+ }
+
+ /// Convert this pattern to a string that is guaranteed to be a valid
+ /// regular expression and will represent the matching semantics of this
+ /// glob pattern and the options given.
+ pub fn to_regex_with(&self, options: &MatchOptions) -> String {
+ let sep = path::MAIN_SEPARATOR.to_string();
+ let mut re = String::new();
+ if options.case_insensitive {
+ re.push_str("(?i)");
+ }
+ re.push('^');
+ // Special case. If the entire glob is just `**`, then it should match
+ // everything.
+ if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix {
+ re.push_str(".*");
+ re.push('$');
+ return re;
+ }
+ for tok in &self.tokens {
+ match *tok {
+ Token::Literal(c) => {
+ re.push_str(&regex::quote(&c.to_string()));
+ }
+ Token::Any => {
+ if options.require_literal_separator {
+ re.push_str(&format!("[^{}]", regex::quote(&sep)));
+ } else {
+ re.push_str(".");
+ }
+ }
+ Token::ZeroOrMore => {
+ if options.require_literal_separator {
+ re.push_str(&format!("[^{}]*", regex::quote(&sep)));
+ } else {
+ re.push_str(".*");
+ }
+ }
+ Token::RecursivePrefix => {
+ re.push_str(&format!("(?:{sep}?|.*{sep})", sep=sep));
+ }
+ Token::RecursiveSuffix => {
+ re.push_str(&format!("(?:{sep}?|{sep}.*)", sep=sep));
+ }
+ Token::RecursiveZeroOrMore => {
+ re.push_str(&format!("(?:{sep}|{sep}.*{sep})", sep=sep));
+ }
+ Token::Class { negated, ref ranges } => {
+ re.push('[');
+ if negated {
+ re.push('^');
+ }
+ for r in ranges {
+ if r.0 == r.1 {
+ // Not strictly necessary, but nicer to look at.
+ re.push_str(&regex::quote(&r.0.to_string()));
+ } else {
+ re.push_str(&regex::quote(&r.0.to_string()));
+ re.push('-');
+ re.push_str(&regex::quote(&r.1.to_string()));
+ }
+ }
+ re.push(']');
+ }
+ }
+ }
+ re.push('$');
+ re
+ }
+}
+
+struct Parser<'a> {
+ p: Pattern,
+ chars: iter::Peekable<str::Chars<'a>>,
+ prev: Option<char>,
+ cur: Option<char>,
+}
+
+impl<'a> Parser<'a> {
+ fn parse(&mut self) -> Result<(), Error> {
+ while let Some(c) = self.bump() {
+ match c {
+ '?' => self.p.tokens.push(Token::Any),
+ '*' => try!(self.parse_star()),
+ '[' => try!(self.parse_class()),
+ c => self.p.tokens.push(Token::Literal(c)),
+ }
+ }
+ Ok(())
+ }
+
+ fn parse_star(&mut self) -> Result<(), Error> {
+ let prev = self.prev;
+ if self.chars.peek() != Some(&'*') {
+ self.p.tokens.push(Token::ZeroOrMore);
+ return Ok(());
+ }
+ assert!(self.bump() == Some('*'));
+ if self.p.tokens.is_empty() {
+ self.p.tokens.push(Token::RecursivePrefix);
+ let next = self.bump();
+ if !next.is_none() && next != Some('/') {
+ return Err(Error::InvalidRecursive);
+ }
+ return Ok(());
+ }
+ let last = self.p.tokens.pop().unwrap();
+ if prev != Some('/') {
+ return Err(Error::InvalidRecursive);
+ }
+ let next = self.bump();
+ if next.is_none() {
+ self.p.tokens.push(Token::RecursiveSuffix);
+ return Ok(());
+ }
+ if next != Some('/') {
+ return Err(Error::InvalidRecursive);
+ }
+ self.p.tokens.push(Token::RecursiveZeroOrMore);
+ Ok(())
+ }
+
+ fn parse_class(&mut self) -> Result<(), Error> {
+ fn add_to_last_range(
+ r: &mut (char, char),
+ add: char,
+ ) -> Result<(), Error> {
+ r.1 = add;
+ if r.1 < r.0 {
+ Err(Error::InvalidRange(r.0, r.1))
+ } else {
+ Ok(())
+ }
+ }
+ let mut negated = false;
+ let mut ranges = vec![];
+ if self.chars.peek() == Some(&'!') {
+ assert!(self.bump() == Some('!'));
+ negated = true;
+ }
+ let mut first = true;
+ let mut in_range = false;
+ loop {
+ let c = match self.bump() {
+ Some(c) => c,
+ // The only way to successfully break this loop is to observe
+ // a ']'.
+ None => return Err(Error::UnclosedClass),
+ };
+ match c {
+ ']' => {
+ if first {
+ ranges.push((']', ']'));
+ } else {
+ break;
+ }
+ }
+ '-' => {
+ if first {
+ ranges.push(('-', '-'));
+ } else if in_range {
+ // invariant: in_range is only set when there is
+ // already at least one character seen.
+ let r = ranges.last_mut().unwrap();
+ try!(add_to_last_range(r, '-'));
+ in_range = false;
+ } else {
+ assert!(!ranges.is_empty());
+ in_range = true;
+ }
+ }
+ c => {
+ if in_range {
+ // invariant: in_range is only set when there is
+ // already at least one character seen.
+ try!(add_to_last_range(ranges.last_mut().unwrap(), c));
+ } else {
+ ranges.push((c, c));
+ }
+ in_range = false;
+ }
+ }
+ first = false;
+ }
+ if in_range {
+ // Means that the last character in the class was a '-', so add
+ // it as a literal.
+ ranges.push(('-', '-'));
+ }
+ self.p.tokens.push(Token::Class {
+ negated: negated,
+ ranges: ranges,
+ });
+ Ok(())
+ }
+
+ fn bump(&mut self) -> Option<char> {
+ self.prev = self.cur;
+ self.cur = self.chars.next();
+ self.cur
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use regex::Regex;
+
+ use super::{Error, Pattern, MatchOptions, SetBuilder, Token};
+ use super::Token::*;
+
+ macro_rules! syntax {
+ ($name:ident, $pat:expr, $tokens:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($tokens, pat.tokens);
+ }
+ }
+ }
+
+ macro_rules! syntaxerr {
+ ($name:ident, $pat:expr, $err:expr) => {
+ #[test]
+ fn $name() {
+ let err = Pattern::new($pat).unwrap_err();
+ assert_eq!($err, err);
+ }
+ }
+ }
+
+ macro_rules! toregex {
+ ($name:ident, $pat:expr, $re:expr) => {
+ toregex!($name, $pat, $re, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $re:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ assert_eq!($re, pat.to_regex_with(&$options));
+ }
+ };
+ }
+
+ macro_rules! matches {
+ ($name:ident, $pat:expr, $path:expr) => {
+ matches!($name, $pat, $path, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+ assert!(re.is_match($path));
+ }
+ };
+ }
+
+ macro_rules! nmatches {
+ ($name:ident, $pat:expr, $path:expr) => {
+ nmatches!($name, $pat, $path, MatchOptions::default());
+ };
+ ($name:ident, $pat:expr, $path:expr, $options:expr) => {
+ #[test]
+ fn $name() {
+ let pat = Pattern::new($pat).unwrap();
+ let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
+ // println!("{:?}", re);
+ assert!(!re.is_match($path));
+ }
+ };
+ }
+
+ fn class(s: char, e: char) -> Token {
+ Class { negated: false, ranges: vec![(s, e)] }
+ }
+
+ fn classn(s: char, e: char) -> Token {
+ Class { negated: true, ranges: vec![(s, e)] }
+ }
+
+ fn rclass(ranges: &[(char, char)]) -> Token {
+ Class { negated: false, ranges: ranges.to_vec() }
+ }
+
+ fn rclassn(ranges: &[(char, char)]) -> Token {
+ Class { negated: true, ranges: ranges.to_vec() }
+ }
+
+ syntax!(literal1, "a", vec![Literal('a')]);
+ syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]);
+ syntax!(any1, "?", vec![Any]);
+ syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]);
+ syntax!(seq1, "*", vec![ZeroOrMore]);
+ syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]);
+ syntax!(seq3, "*a*b*", vec![
+ ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore,
+ ]);
+ syntax!(rseq1, "**", vec![RecursivePrefix]);
+ syntax!(rseq2, "**/", vec![RecursivePrefix]);
+ syntax!(rseq3, "/**", vec![RecursiveSuffix]);
+ syntax!(rseq4, "/**/", vec![RecursiveZeroOrMore]);
+ syntax!(rseq5, "a/**/b", vec![
+ Literal('a'), RecursiveZeroOrMore, Literal('b'),
+ ]);
+ syntax!(cls1, "[a]", vec![class('a', 'a')]);
+ syntax!(cls2, "[!a]", vec![classn('a', 'a')]);
+ syntax!(cls3, "[a-z]", vec![class('a', 'z')]);
+ syntax!(cls4, "[!a-z]", vec![classn('a', 'z')]);
+ syntax!(cls5, "[-]", vec![class('-', '-')]);
+ syntax!(cls6, "[]]", vec![class(']', ']')]);
+ syntax!(cls7, "[*]", vec![class('*', '*')]);
+ syntax!(cls8, "[!!]", vec![classn('!', '!')]);
+ syntax!(cls9, "[a-]", vec![rclass(&[('a', 'a'), ('-', '-')])]);
+ syntax!(cls10, "[-a-z]", vec![rclass(&[('-', '-'), ('a', 'z')])]);
+ syntax!(cls11, "[a-z-]", vec![rclass(&[('a', 'z'), ('-', '-')])]);
+ syntax!(cls12, "[-a-z-]", vec![
+ rclass(&[('-', '-'), ('a', 'z'), ('-', '-')]),
+ ]);
+ syntax!(cls13, "[]-z]", vec![class(']', 'z')]);
+ syntax!(cls14, "[--z]", vec![class('-', 'z')]);
+ syntax!(cls15, "[ --]", vec![class(' ', '-')]);
+ syntax!(cls16, "[0-9a-z]", vec![rclass(&[('0', '9'), ('a', 'z')])]);
+ syntax!(cls17, "[a-z0-9]", vec![rclass(&[('a', 'z'), ('0', '9')])]);
+ syntax!(cls18, "[!0-9a-z]", vec![rclassn(&[('0', '9'), ('a', 'z')])]);
+ syntax!(cls19, "[!a-z0-9]", vec![rclassn(&[('a', 'z'), ('0', '9')])]);
+
+ syntaxerr!(err_rseq1, "a**", Error::InvalidRecursive);
+ syntaxerr!(err_rseq2, "**a", Error::InvalidRecursive);
+ syntaxerr!(err_rseq3, "a**b", Error::InvalidRecursive);
+ syntaxerr!(err_rseq4, "***", Error::InvalidRecursive);
+ syntaxerr!(err_rseq5, "/a**", Error::InvalidRecursive);
+ syntaxerr!(err_rseq6, "/**a", Error::InvalidRecursive);
+ syntaxerr!(err_rseq7, "/a**b", Error::InvalidRecursive);
+ syntaxerr!(err_unclosed1, "[", Error::UnclosedClass);
+ syntaxerr!(err_unclosed2, "[]", Error::UnclosedClass);
+ syntaxerr!(err_unclosed3, "[!", Error::UnclosedClass);
+ syntaxerr!(err_unclosed4, "[!]", Error::UnclosedClass);
+ syntaxerr!(err_range1, "[z-a]", Error::InvalidRange('z', 'a'));
+ syntaxerr!(err_range2, "[z--]", Error::InvalidRange('z', '-'));
+
+ const SLASHLIT: MatchOptions = MatchOptions {
+ case_insensitive: false,
+ require_literal_separator: true,
+ };
+ const CASEI: MatchOptions = MatchOptions {
+ case_insensitive: true,
+ require_literal_separator: false,
+ };
+ const SEP: char = ::std::path::MAIN_SEPARATOR;
+
+ toregex!(re_casei, "a", "(?i)^a$", &CASEI);
+
+ toregex!(re_slash1, "?", format!("^[^{}]$", SEP), SLASHLIT);
+ toregex!(re_slash2, "*", format!("^[^{}]*$", SEP), SLASHLIT);
+
+ toregex!(re1, "a", "^a$");
+ toregex!(re2, "?", "^.$");
+ toregex!(re3, "*", "^.*$");
+ toregex!(re4, "a?", "^a.$");
+ toregex!(re5, "?a", "^.a$");
+ toregex!(re6, "a*", "^a.*$");
+ toregex!(re7, "*a", "^.*a$");
+ toregex!(re8, "[*]", r"^[\*]$");
+ toregex!(re9, "[+]", r"^[\+]$");
+ toregex!(re10, "+", r"^\+$");
+ toregex!(re11, "**", r"^.*$");
+
+ matches!(match1, "a", "a");
+ matches!(match2, "a*b", "a_b");
+ matches!(match3, "a*b*c", "abc");
+ matches!(match4, "a*b*c", "a_b_c");
+ matches!(match5, "a*b*c", "a___b___c");
+ matches!(match6, "abc*abc*abc", "abcabcabcabcabcabcabc");
+ matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
+ matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd");
+
+ matches!(matchrec1, "some/**/needle.txt", "some/needle.txt");
+ matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt");
+ matches!(matchrec3, "some/**/needle.txt", "some/one/two/needle.txt");
+ matches!(matchrec4, "some/**/needle.txt", "some/other/needle.txt");
+ matches!(matchrec5, "**", "abcde");
+ matches!(matchrec6, "**", "");
+ matches!(matchrec7, "**", ".asdf");
+ matches!(matchrec8, "**", "/x/.asdf");
+ matches!(matchrec9, "some/**/**/needle.txt", "some/needle.txt");
+ matches!(matchrec10, "some/**/**/needle.txt", "some/one/needle.txt");
+ matches!(matchrec11, "some/**/**/needle.txt", "some/one/two/needle.txt");
+ matches!(matchrec12, "some/**/**/needle.txt", "some/other/needle.txt");
+ matches!(matchrec13, "**/test", "one/two/test");
+ matches!(matchrec14, "**/test", "one/test");
+ matches!(matchrec15, "**/test", "test");
+ matches!(matchrec16, "/**/test", "/one/two/test");
+ matches!(matchrec17, "/**/test", "/one/test");
+ matches!(matchrec18, "/**/test", "/test");
+ matches!(matchrec19, "**/.*", ".abc");
+ matches!(matchrec20, "**/.*", "abc/.abc");
+ matches!(matchrec21, ".*/**", ".abc");
+ matches!(matchrec22, ".*/**", ".abc/abc");
+
+ matches!(matchrange1, "a[0-9]b", "a0b");
+ matches!(matchrange2, "a[0-9]b", "a9b");
+ matches!(matchrange3, "a[!0-9]b", "a_b");
+ matches!(matchrange4, "[a-z123]", "1");
+ matches!(matchrange5, "[1a-z23]", "1");
+ matches!(matchrange6, "[123a-z]", "1");
+ matches!(matchrange7, "[abc-]", "-");
+ matches!(matchrange8, "[-abc]", "-");
+ matches!(matchrange9, "[-a-c]", "b");
+ matches!(matchrange10, "[a-c-]", "b");
+ matches!(matchrange11, "[-]", "-");
+
+ matches!(matchpat1, "*hello.txt", "hello.txt");
+ matches!(matchpat2, "*hello.txt", "gareth_says_hello.txt");
+ matches!(matchpat3, "*hello.txt", "some/path/to/hello.txt");
+ matches!(matchpat4, "*hello.txt", "some\\path\\to\\hello.txt");
+ matches!(matchpat5, "*hello.txt", "/an/absolute/path/to/hello.txt");
+ matches!(matchpat6, "*some/path/to/hello.txt", "some/path/to/hello.txt");
+ matches!(matchpat7, "*some/path/to/hello.txt",
+ "a/bigger/some/path/to/hello.txt");
+
+ matches!(matchescape, "_[[]_[]]_[?]_[*]_!_", "_[_]_?_*_!_");
+
+ matches!(matchcasei1, "aBcDeFg", "aBcDeFg", CASEI);
+ matches!(matchcasei2, "aBcDeFg", "abcdefg", CASEI);
+ matches!(matchcasei3, "aBcDeFg", "ABCDEFG", CASEI);
+ matches!(matchcasei4, "aBcDeFg", "AbCdEfG", CASEI);
+
+ matches!(matchslash1, "abc/def", "abc/def", SLASHLIT);
+ nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT);
+ nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT);
+ matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs
+
+ nmatches!(matchnot1, "a*b*c", "abcd");
+ nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca");
+ nmatches!(matchnot3, "some/**/needle.txt", "some/other/notthis.txt");
+ nmatches!(matchnot4, "some/**/**/needle.txt", "some/other/notthis.txt");
+ nmatches!(matchnot5, "/**/test", "test");
+ nmatches!(matchnot6, "/**/test", "/one/notthis");
+ nmatches!(matchnot7, "/**/test", "/notthis");
+ nmatches!(matchnot8, "**/.*", "ab.c");
+ nmatches!(matchnot9, "**/.*", "abc/ab.c");
+ nmatches!(matchnot10, ".*/**", "a.bc");
+ nmatches!(matchnot11, ".*/**", "abc/a.bc");
+ nmatches!(matchnot12, "a[0-9]b", "a_b");
+ nmatches!(matchnot13, "a[!0-9]b", "a0b");
+ nmatches!(matchnot14, "a[!0-9]b", "a9b");
+ nmatches!(matchnot15, "[!-]", "-");
+ nmatches!(matchnot16, "*hello.txt", "hello.txt-and-then-some");
+ nmatches!(matchnot17, "*hello.txt", "goodbye.txt");
+ nmatches!(matchnot18, "*some/path/to/hello.txt",
+ "some/path/to/hello.txt-and-then-some");
+ nmatches!(matchnot19, "*some/path/to/hello.txt",
+ "some/other/path/to/hello.txt");
+
+ #[test]
+ fn set_works() {
+ let mut builder = SetBuilder::new();
+ builder.add("src/**/*.rs").unwrap();
+ builder.add("*.c").unwrap();
+ builder.add("src/lib.rs").unwrap();
+ let set = builder.build().unwrap();
+
+ assert!(set.is_match("foo.c"));
+ assert!(set.is_match("src/foo.c"));
+ assert!(!set.is_match("foo.rs"));
+ assert!(!set.is_match("tests/foo.rs"));
+ assert!(set.is_match("src/foo.rs"));
+ assert!(set.is_match("src/grep/src/main.rs"));
+
+ assert_eq!(2, set.matches("src/lib.rs").iter().count());
+ assert!(set.matches("src/lib.rs").matched(0));
+ assert!(!set.matches("src/lib.rs").matched(1));
+ assert!(set.matches("src/lib.rs").matched(2));
+ }
+}
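
Taken together, the new module exposes two entry points: Pattern::to_regex for
inspecting the regex a single glob compiles to, and SetBuilder/Set for matching
many globs in one pass. A minimal usage sketch follows (the function name is
hypothetical; it assumes glob.rs is compiled in as `mod glob`, which src/main.rs
below declares, and a Unix path separator, since to_regex_with embeds
path::MAIN_SEPARATOR):

    fn glob_example() {
        // A single glob becomes an anchored regex string (the same
        // expectation as the `re4` test above).
        let pat = glob::Pattern::new("a?").unwrap();
        assert_eq!("^a.$", pat.to_regex());

        // Several globs compiled into one Set and matched together (the
        // same shape as the `set_works` test above). Set::is_match accepts
        // anything that is AsRef<[u8]>, so &str paths work directly.
        let mut builder = glob::SetBuilder::new();
        builder.add("src/**/*.rs").unwrap();
        builder.add("*.c").unwrap();
        let set = builder.build().unwrap();
        assert!(set.is_match("foo.c"));
        assert!(set.is_match("src/grep/src/main.rs"));
        assert!(!set.is_match("foo.rs"));
    }
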
diff --git a/src/main.rs b/src/main.rs
index 999b6d98..4d90cae7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,15 +4,21 @@ extern crate docopt;
extern crate grep;
extern crate memchr;
extern crate memmap;
+extern crate num_cpus;
extern crate regex;
extern crate regex_syntax as syntax;
extern crate rustc_serialize;
+extern crate walkdir;
const USAGE: &'static str = "
-Usage: rep [options] <pattern> [<file> ...]
+Usage: xrep [options] <pattern> <path> ...
+
+xrep is like the silver searcher, but faster than it and grep.
+
+At least one path is required. Searching stdin isn't yet supported.
Options:
- -c, --count Suppress normal output and show count of matches.
+ -c, --count Suppress normal output and show count of line matches.
";
use std::error::Error;
@@ -21,22 +27,37 @@ use std::process;
use std::result;
use docopt::Docopt;
+use grep::Grep;
+use walkdir::{WalkDir, WalkDirIterator};
+
+macro_rules! errored {
+ ($($tt:tt)*) => {
+ return Err(From::from(format!($($tt)*)));
+ }
+}
-use grep::{Grep, GrepBuilder};
+macro_rules! eprintln {
+ ($($tt:tt)*) => {{
+ use std::io::Write;
+ let _ = writeln!(&mut ::std::io::stderr(), $($tt)*);
+ }}
+}
+
+mod glob;
pub type Result<T> = result::Result<T, Box<Error + Send + Sync>>;
#[derive(RustcDecodable)]
struct Args {
arg_pattern: String,
- arg_file: Vec<String>,
+ arg_path: Vec<String>,
flag_count: bool,
}
fn main() {
- let args = Docopt::new(USAGE).and_then(|d| d.decode())
- .unwrap_or_else(|e| e.exit());
- match run(&args) {
+ let args: Args = Docopt::new(USAGE).and_then(|d| d.decode())
+ .unwrap_or_else(|e| e.exit());
+ match args.run() {
Ok(count) if count == 0 => process::exit(1),
Ok(_) => process::exit(0),
Err(err) => {
@@ -46,42 +67,48 @@ fn main() {
}
}
-fn run(args: &Args) -> Result<u64> {
- if args.arg_file.is_empty() {
- unimplemented!()
- } else {
- let searcher = try!(GrepBuilder::new(&args.arg_pattern).create());
- if args.flag_count {
- run_mmap_count_only(args, &searcher)
- } else {
- run_mmap(args, &searcher)
+impl Args {
+ fn run(&self) -> Result<u64> {
+ if self.arg_path.is_empty() {
+ return errored!("Searching stdin is not currently supported.");
+ }
+ for p in &self.arg_path {
+ let mut it = WalkDir::new(p).into_iter();
+ loop {
+ let ent = match it.next() {
+ None => break,
+ Some(Err(err)) => {
+ eprintln!("{}", err);
+ continue;
+ }
+ Some(Ok(ent)) => ent,
+ };
+ if is_hidden(&ent) {
+ if ent.file_type().is_dir() {
+ it.skip_current_dir();
+ }
+ continue;
+ }
+ println!("{}", ent.path().display());
+ }
}
+ Ok(0)
}
-}
-fn run_mmap(args: &Args, searcher: &Grep) -> Result<u64> {
- unimplemented!()
- /*
- for m in searcher.iter(text) {
- if !args.flag_count {
- try!(wtr.write(&text[m.start()..m.end()]));
- try!(wtr.write(b"\n"));
- }
- count += 1;
+ fn run_mmap_count_only(&self, searcher: &Grep) -> Result<u64> {
+ use memmap::{Mmap, Protection};
+
+ assert!(self.arg_path.len() == 1);
+ let mut wtr = io::BufWriter::new(io::stdout());
+ let mmap = try!(Mmap::open_path(&self.arg_path[0], Protection::Read));
+ let text = unsafe { mmap.as_slice() };
+ let count = searcher.iter(text).count() as u64;
+ try!(writeln!(wtr, "{}", count));
+ Ok(count)
}
- Ok(count)
- */
}
-#[inline(never)]
-fn run_mmap_count_only(args: &Args, searcher: &Grep) -> Result<u64> {
- use memmap::{Mmap, Protection};
-
- assert!(args.arg_file.len() == 1);
- let mut wtr = io::BufWriter::new(io::stdout());
- let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read));
- let text = unsafe { mmap.as_slice() };
- let count = searcher.iter(text).count() as u64;
- try!(writeln!(wtr, "{}", count));
- Ok(count)
+fn is_hidden(ent: &walkdir::DirEntry) -> bool {
+ ent.depth() > 0 &&
+ ent.file_name().to_str().map(|s| s.starts_with(".")).unwrap_or(false)
}
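
At this commit, Args::run only walks the given paths and prints every entry
that survives the is_hidden filter; the mmap-backed run_mmap_count_only path is
defined but does not appear to be called from run yet. The hidden check only
applies at depth greater than zero, so a dot-directory named explicitly on the
command line is still walked even though its own dot-children are skipped. A
minimal sketch of just the file-name test (a hypothetical standalone helper;
the real is_hidden operates on a walkdir::DirEntry and also requires
ent.depth() > 0):

    // Hypothetical standalone version of the name check inside is_hidden.
    fn name_is_hidden(file_name: &str) -> bool {
        file_name.starts_with(".")
    }

    fn main() {
        assert!(name_is_hidden(".git"));
        assert!(name_is_hidden(".asdf"));
        assert!(!name_is_hidden("src"));
        assert!(!name_is_hidden("needle.txt"));
    }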