From b55ecf34c713392b012dd652fbbd11d7e0126d97 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 25 Aug 2016 21:44:37 -0400 Subject: globbing by regex --- Cargo.toml | 11 + benches/bench.rs | 122 ++++++++++ src/glob.rs | 668 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 105 +++++---- 4 files changed, 867 insertions(+), 39 deletions(-) create mode 100644 benches/bench.rs create mode 100644 src/glob.rs diff --git a/Cargo.toml b/Cargo.toml index 9f36d765..d4d59f63 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,14 +13,25 @@ readme = "README.md" keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" +[[bin]] +bench = false +path = "src/main.rs" +name = "xrep" + [dependencies] docopt = "0.6" grep = { version = "0.1", path = "grep" } memchr = "0.1" memmap = "0.2" +num_cpus = "1" regex = { version = "0.1", path = "/home/andrew/rust/regex" } regex-syntax = { version = "0.3.1", path = "/home/andrew/rust/regex/regex-syntax" } rustc-serialize = "0.3" +walkdir = "0.1" + +[dev-dependencies] +glob = "0.2" +lazy_static = "0.2" [profile.release] debug = true diff --git a/benches/bench.rs b/benches/bench.rs new file mode 100644 index 00000000..a529172f --- /dev/null +++ b/benches/bench.rs @@ -0,0 +1,122 @@ +#![feature(test)] + +extern crate glob; +#[macro_use] +extern crate lazy_static; +extern crate regex; +extern crate test; + +const SHORT: &'static str = "some/needle.txt"; +const SHORT_PAT: &'static str = "some/**/needle.txt"; + +const LONG: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; +const LONG_PAT: &'static str = "some/**/needle.txt"; + +#[allow(dead_code, unused_variables)] +#[path = "../src/glob.rs"] +mod reglob; + +fn new_glob(pat: &str) -> glob::Pattern { + glob::Pattern::new(pat).unwrap() +} + +fn new_reglob(pat: &str) -> reglob::Set { + let mut builder = reglob::SetBuilder::new(); + builder.add(pat).unwrap(); + builder.build().unwrap() +} + +fn new_reglob_many(pats: &[&str]) -> reglob::Set { + let mut builder = reglob::SetBuilder::new(); + for pat in pats { + builder.add(pat).unwrap(); + } + builder.build().unwrap() +} + +#[bench] +fn short_glob(b: &mut test::Bencher) { + let pat = new_glob(SHORT_PAT); + b.iter(|| assert!(pat.matches(SHORT))); +} + +#[bench] +fn short_regex(b: &mut test::Bencher) { + let set = new_reglob(SHORT_PAT); + b.iter(|| assert!(set.is_match(SHORT))); +} + +#[bench] +fn long_glob(b: &mut test::Bencher) { + let pat = new_glob(LONG_PAT); + b.iter(|| assert!(pat.matches(LONG))); +} + +#[bench] +fn long_regex(b: &mut test::Bencher) { + let set = new_reglob(LONG_PAT); + b.iter(|| assert!(set.is_match(LONG))); +} + +const MANY_SHORT_GLOBS: &'static [&'static str] = &[ + // Taken from a random .gitignore on my system. + ".*.swp", + "tags", + "target", + "*.lock", + "tmp", + "*.csv", + "*.fst", + "*-got", + "*.csv.idx", + "words", + "98m*", + "dict", + "test", + "months", +]; + +const MANY_SHORT_SEARCH: &'static str = "98m-blah.csv.idx"; + +#[bench] +fn many_short_glob(b: &mut test::Bencher) { + let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| new_glob(s)).collect(); + b.iter(|| { + let mut count = 0; + for pat in &pats { + if pat.matches(MANY_SHORT_SEARCH) { + count += 1; + } + } + assert_eq!(2, count); + }) +} + +#[bench] +fn many_short_regex_set(b: &mut test::Bencher) { + let set = new_reglob_many(MANY_SHORT_GLOBS); + b.iter(|| assert_eq!(2, set.matches(MANY_SHORT_SEARCH).iter().count())); +} + +// This is the fastest on my system (beating many_glob by about 2x). This +// suggests that a RegexSet needs quite a few regexes (or a larger haystack) +// in order for it to scale. +// +// TODO(burntsushi): come up with a benchmark that uses more complex patterns +// or a longer haystack. +#[bench] +fn many_short_regex_pattern(b: &mut test::Bencher) { + let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| { + let pat = reglob::Pattern::new(s).unwrap(); + regex::Regex::new(&pat.to_regex()).unwrap() + }).collect(); + b.iter(|| { + let mut count = 0; + for pat in &pats { + if pat.is_match(MANY_SHORT_SEARCH) { + count += 1; + } + } + assert_eq!(2, count); + }) +} diff --git a/src/glob.rs b/src/glob.rs new file mode 100644 index 00000000..6ae57a34 --- /dev/null +++ b/src/glob.rs @@ -0,0 +1,668 @@ +/*! +The glob submodule provides standard shell globbing, but is specifically +implemented by converting glob syntax to regular expressions. The reasoning +is two fold: + +1. The regex library is *really* fast. Regaining performance in a distinct + implementation of globbing is non-trivial. +2. Most crucially, a `RegexSet` can be used to match many globs simultaneously. + +This module is written with some amount of intention of eventually splitting it +out into its own separate crate, but I didn't quite have the energy for all +that rigamorole when I wrote this. In particular, it could be fast/good enough +to make its way into `glob` proper. +*/ + +use std::error::Error as StdError; +use std::fmt; +use std::iter; +use std::path; +use std::str; + +use regex; +use regex::bytes::{RegexSet, SetMatches}; + +/// Represents an error that can occur when parsing a glob pattern. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum Error { + InvalidRecursive, + UnclosedClass, + InvalidRange(char, char), +} + +impl StdError for Error { + fn description(&self) -> &str { + match *self { + Error::InvalidRecursive => { + "invalid use of **; must be one path component" + } + Error::UnclosedClass => { + "unclosed character class; missing ']'" + } + Error::InvalidRange(_, _) => { + "invalid character range" + } + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::InvalidRecursive | Error::UnclosedClass => { + write!(f, "{}", self.description()) + } + Error::InvalidRange(s, e) => { + write!(f, "invalid range; '{}' > '{}'", s, e) + } + } + } +} + +/// Set represents a group of globs that can be matched together in a single +/// pass. +#[derive(Clone, Debug)] +pub struct Set { + re: RegexSet, +} + +impl Set { + /// Returns true if and only if the given path matches at least one glob + /// in this set. + pub fn is_match>(&self, path: T) -> bool { + self.re.is_match(path.as_ref()) + } + + /// Returns every glob pattern (by sequence number) that matches the given + /// path. + pub fn matches>(&self, path: T) -> SetMatches { + self.re.matches(path.as_ref()) + } + + /// Populates the given slice with corresponding patterns that matched. + pub fn matches_with>( + &self, + path: T, + matches: &mut [bool], + ) -> bool { + self.re.matches_with(path.as_ref(), matches) + } + + /// Returns the number of glob patterns in this set. + pub fn len(&self) -> usize { + self.re.len() + } +} + +/// SetBuilder builds a group of patterns that can be used to simultaneously +/// match a file path. +pub struct SetBuilder { + pats: Vec<(Pattern, MatchOptions)>, +} + +impl SetBuilder { + /// Create a new SetBuilder. A SetBuilder can be used to add new patterns. + /// Once all patterns have been added, `build` should be called to produce + /// a `Set`, which can then be used for matching. + pub fn new() -> SetBuilder { + SetBuilder { pats: vec![] } + } + + /// Builds a new matcher from all of the glob patterns added so far. + /// + /// Once a matcher is built, no new patterns can be added to it. + pub fn build(&self) -> Result { + let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o)); + let re = try!(RegexSet::new(it)); + Ok(Set { re: re }) + } + + /// Add a new pattern to this set. + /// + /// If the pattern could not be parsed as a glob, then an error is + /// returned. + pub fn add(&mut self, pat: &str) -> Result<(), Error> { + self.add_with(pat, &MatchOptions::default()) + } + + /// Like add, but sets the match options for this particular pattern. + pub fn add_with( + &mut self, + pat: &str, + opts: &MatchOptions, + ) -> Result<(), Error> { + let pat = try!(Pattern::new(pat)); + self.pats.push((pat, opts.clone())); + Ok(()) + } +} + +/// Pattern represents a successfully parsed shell glob pattern. +/// +/// It cannot be used directly to match file paths, but it can be converted +/// to a regular expression string. +#[derive(Clone, Debug, Default)] +pub struct Pattern { + tokens: Vec, +} + +/// Options to control the matching semantics of a glob. The default value +/// has all options disabled. +#[derive(Clone, Debug, Default)] +pub struct MatchOptions { + /// When true, matching is done case insensitively. + case_insensitive: bool, + /// When true, neither `*` nor `?` match the current system's path + /// separator. + require_literal_separator: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum Token { + Literal(char), + Any, + ZeroOrMore, + RecursivePrefix, + RecursiveSuffix, + RecursiveZeroOrMore, + Class { + negated: bool, + ranges: Vec<(char, char)>, + }, +} + +impl Pattern { + /// Parse a shell glob pattern. + /// + /// If the pattern is not a valid glob, then an error is returned. + pub fn new(pat: &str) -> Result { + let mut p = Parser { + p: Pattern::default(), + chars: pat.chars().peekable(), + prev: None, + cur: None, + }; + try!(p.parse()); + Ok(p.p) + } + + /// Convert this pattern to a string that is guaranteed to be a valid + /// regular expression and will represent the matching semantics of this + /// glob pattern. This uses a default set of options. + pub fn to_regex(&self) -> String { + self.to_regex_with(&MatchOptions::default()) + } + + /// Convert this pattern to a string that is guaranteed to be a valid + /// regular expression and will represent the matching semantics of this + /// glob pattern and the options given. + pub fn to_regex_with(&self, options: &MatchOptions) -> String { + let sep = path::MAIN_SEPARATOR.to_string(); + let mut re = String::new(); + if options.case_insensitive { + re.push_str("(?i)"); + } + re.push('^'); + // Special case. If the entire glob is just `**`, then it should match + // everything. + if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix { + re.push_str(".*"); + re.push('$'); + return re; + } + for tok in &self.tokens { + match *tok { + Token::Literal(c) => { + re.push_str(®ex::quote(&c.to_string())); + } + Token::Any => { + if options.require_literal_separator { + re.push_str(&format!("[^{}]", regex::quote(&sep))); + } else { + re.push_str("."); + } + } + Token::ZeroOrMore => { + if options.require_literal_separator { + re.push_str(&format!("[^{}]*", regex::quote(&sep))); + } else { + re.push_str(".*"); + } + } + Token::RecursivePrefix => { + re.push_str(&format!("(?:{sep}?|.*{sep})", sep=sep)); + } + Token::RecursiveSuffix => { + re.push_str(&format!("(?:{sep}?|{sep}.*)", sep=sep)); + } + Token::RecursiveZeroOrMore => { + re.push_str(&format!("(?:{sep}|{sep}.*{sep})", sep=sep)); + } + Token::Class { negated, ref ranges } => { + re.push('['); + if negated { + re.push('^'); + } + for r in ranges { + if r.0 == r.1 { + // Not strictly necessary, but nicer to look at. + re.push_str(®ex::quote(&r.0.to_string())); + } else { + re.push_str(®ex::quote(&r.0.to_string())); + re.push('-'); + re.push_str(®ex::quote(&r.1.to_string())); + } + } + re.push(']'); + } + } + } + re.push('$'); + re + } +} + +struct Parser<'a> { + p: Pattern, + chars: iter::Peekable>, + prev: Option, + cur: Option, +} + +impl<'a> Parser<'a> { + fn parse(&mut self) -> Result<(), Error> { + while let Some(c) = self.bump() { + match c { + '?' => self.p.tokens.push(Token::Any), + '*' => try!(self.parse_star()), + '[' => try!(self.parse_class()), + c => self.p.tokens.push(Token::Literal(c)), + } + } + Ok(()) + } + + fn parse_star(&mut self) -> Result<(), Error> { + let prev = self.prev; + if self.chars.peek() != Some(&'*') { + self.p.tokens.push(Token::ZeroOrMore); + return Ok(()); + } + assert!(self.bump() == Some('*')); + if self.p.tokens.is_empty() { + self.p.tokens.push(Token::RecursivePrefix); + let next = self.bump(); + if !next.is_none() && next != Some('/') { + return Err(Error::InvalidRecursive); + } + return Ok(()); + } + let last = self.p.tokens.pop().unwrap(); + if prev != Some('/') { + return Err(Error::InvalidRecursive); + } + let next = self.bump(); + if next.is_none() { + self.p.tokens.push(Token::RecursiveSuffix); + return Ok(()); + } + if next != Some('/') { + return Err(Error::InvalidRecursive); + } + self.p.tokens.push(Token::RecursiveZeroOrMore); + Ok(()) + } + + fn parse_class(&mut self) -> Result<(), Error> { + fn add_to_last_range( + r: &mut (char, char), + add: char, + ) -> Result<(), Error> { + r.1 = add; + if r.1 < r.0 { + Err(Error::InvalidRange(r.0, r.1)) + } else { + Ok(()) + } + } + let mut negated = false; + let mut ranges = vec![]; + if self.chars.peek() == Some(&'!') { + assert!(self.bump() == Some('!')); + negated = true; + } + let mut first = true; + let mut in_range = false; + loop { + let c = match self.bump() { + Some(c) => c, + // The only way to successfully break this loop is to observe + // a ']'. + None => return Err(Error::UnclosedClass), + }; + match c { + ']' => { + if first { + ranges.push((']', ']')); + } else { + break; + } + } + '-' => { + if first { + ranges.push(('-', '-')); + } else if in_range { + // invariant: in_range is only set when there is + // already at least one character seen. + let r = ranges.last_mut().unwrap(); + try!(add_to_last_range(r, '-')); + in_range = false; + } else { + assert!(!ranges.is_empty()); + in_range = true; + } + } + c => { + if in_range { + // invariant: in_range is only set when there is + // already at least one character seen. + try!(add_to_last_range(ranges.last_mut().unwrap(), c)); + } else { + ranges.push((c, c)); + } + in_range = false; + } + } + first = false; + } + if in_range { + // Means that the last character in the class was a '-', so add + // it as a literal. + ranges.push(('-', '-')); + } + self.p.tokens.push(Token::Class { + negated: negated, + ranges: ranges, + }); + Ok(()) + } + + fn bump(&mut self) -> Option { + self.prev = self.cur; + self.cur = self.chars.next(); + self.cur + } +} + +#[cfg(test)] +mod tests { + use regex::Regex; + + use super::{Error, Pattern, MatchOptions, SetBuilder, Token}; + use super::Token::*; + + macro_rules! syntax { + ($name:ident, $pat:expr, $tokens:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + assert_eq!($tokens, pat.tokens); + } + } + } + + macro_rules! syntaxerr { + ($name:ident, $pat:expr, $err:expr) => { + #[test] + fn $name() { + let err = Pattern::new($pat).unwrap_err(); + assert_eq!($err, err); + } + } + } + + macro_rules! toregex { + ($name:ident, $pat:expr, $re:expr) => { + toregex!($name, $pat, $re, MatchOptions::default()); + }; + ($name:ident, $pat:expr, $re:expr, $options:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + assert_eq!($re, pat.to_regex_with(&$options)); + } + }; + } + + macro_rules! matches { + ($name:ident, $pat:expr, $path:expr) => { + matches!($name, $pat, $path, MatchOptions::default()); + }; + ($name:ident, $pat:expr, $path:expr, $options:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + let re = Regex::new(&pat.to_regex_with(&$options)).unwrap(); + assert!(re.is_match($path)); + } + }; + } + + macro_rules! nmatches { + ($name:ident, $pat:expr, $path:expr) => { + nmatches!($name, $pat, $path, MatchOptions::default()); + }; + ($name:ident, $pat:expr, $path:expr, $options:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + let re = Regex::new(&pat.to_regex_with(&$options)).unwrap(); + // println!("{:?}", re); + assert!(!re.is_match($path)); + } + }; + } + + fn class(s: char, e: char) -> Token { + Class { negated: false, ranges: vec![(s, e)] } + } + + fn classn(s: char, e: char) -> Token { + Class { negated: true, ranges: vec![(s, e)] } + } + + fn rclass(ranges: &[(char, char)]) -> Token { + Class { negated: false, ranges: ranges.to_vec() } + } + + fn rclassn(ranges: &[(char, char)]) -> Token { + Class { negated: true, ranges: ranges.to_vec() } + } + + syntax!(literal1, "a", vec![Literal('a')]); + syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]); + syntax!(any1, "?", vec![Any]); + syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]); + syntax!(seq1, "*", vec![ZeroOrMore]); + syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]); + syntax!(seq3, "*a*b*", vec![ + ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore, + ]); + syntax!(rseq1, "**", vec![RecursivePrefix]); + syntax!(rseq2, "**/", vec![RecursivePrefix]); + syntax!(rseq3, "/**", vec![RecursiveSuffix]); + syntax!(rseq4, "/**/", vec![RecursiveZeroOrMore]); + syntax!(rseq5, "a/**/b", vec![ + Literal('a'), RecursiveZeroOrMore, Literal('b'), + ]); + syntax!(cls1, "[a]", vec![class('a', 'a')]); + syntax!(cls2, "[!a]", vec![classn('a', 'a')]); + syntax!(cls3, "[a-z]", vec![class('a', 'z')]); + syntax!(cls4, "[!a-z]", vec![classn('a', 'z')]); + syntax!(cls5, "[-]", vec![class('-', '-')]); + syntax!(cls6, "[]]", vec![class(']', ']')]); + syntax!(cls7, "[*]", vec![class('*', '*')]); + syntax!(cls8, "[!!]", vec![classn('!', '!')]); + syntax!(cls9, "[a-]", vec![rclass(&[('a', 'a'), ('-', '-')])]); + syntax!(cls10, "[-a-z]", vec![rclass(&[('-', '-'), ('a', 'z')])]); + syntax!(cls11, "[a-z-]", vec![rclass(&[('a', 'z'), ('-', '-')])]); + syntax!(cls12, "[-a-z-]", vec![ + rclass(&[('-', '-'), ('a', 'z'), ('-', '-')]), + ]); + syntax!(cls13, "[]-z]", vec![class(']', 'z')]); + syntax!(cls14, "[--z]", vec![class('-', 'z')]); + syntax!(cls15, "[ --]", vec![class(' ', '-')]); + syntax!(cls16, "[0-9a-z]", vec![rclass(&[('0', '9'), ('a', 'z')])]); + syntax!(cls17, "[a-z0-9]", vec![rclass(&[('a', 'z'), ('0', '9')])]); + syntax!(cls18, "[!0-9a-z]", vec![rclassn(&[('0', '9'), ('a', 'z')])]); + syntax!(cls19, "[!a-z0-9]", vec![rclassn(&[('a', 'z'), ('0', '9')])]); + + syntaxerr!(err_rseq1, "a**", Error::InvalidRecursive); + syntaxerr!(err_rseq2, "**a", Error::InvalidRecursive); + syntaxerr!(err_rseq3, "a**b", Error::InvalidRecursive); + syntaxerr!(err_rseq4, "***", Error::InvalidRecursive); + syntaxerr!(err_rseq5, "/a**", Error::InvalidRecursive); + syntaxerr!(err_rseq6, "/**a", Error::InvalidRecursive); + syntaxerr!(err_rseq7, "/a**b", Error::InvalidRecursive); + syntaxerr!(err_unclosed1, "[", Error::UnclosedClass); + syntaxerr!(err_unclosed2, "[]", Error::UnclosedClass); + syntaxerr!(err_unclosed3, "[!", Error::UnclosedClass); + syntaxerr!(err_unclosed4, "[!]", Error::UnclosedClass); + syntaxerr!(err_range1, "[z-a]", Error::InvalidRange('z', 'a')); + syntaxerr!(err_range2, "[z--]", Error::InvalidRange('z', '-')); + + const SLASHLIT: MatchOptions = MatchOptions { + case_insensitive: false, + require_literal_separator: true, + }; + const CASEI: MatchOptions = MatchOptions { + case_insensitive: true, + require_literal_separator: false, + }; + const SEP: char = ::std::path::MAIN_SEPARATOR; + + toregex!(re_casei, "a", "(?i)^a$", &CASEI); + + toregex!(re_slash1, "?", format!("^[^{}]$", SEP), SLASHLIT); + toregex!(re_slash2, "*", format!("^[^{}]*$", SEP), SLASHLIT); + + toregex!(re1, "a", "^a$"); + toregex!(re2, "?", "^.$"); + toregex!(re3, "*", "^.*$"); + toregex!(re4, "a?", "^a.$"); + toregex!(re5, "?a", "^.a$"); + toregex!(re6, "a*", "^a.*$"); + toregex!(re7, "*a", "^.*a$"); + toregex!(re8, "[*]", r"^[\*]$"); + toregex!(re9, "[+]", r"^[\+]$"); + toregex!(re10, "+", r"^\+$"); + toregex!(re11, "**", r"^.*$"); + + matches!(match1, "a", "a"); + matches!(match2, "a*b", "a_b"); + matches!(match3, "a*b*c", "abc"); + matches!(match4, "a*b*c", "a_b_c"); + matches!(match5, "a*b*c", "a___b___c"); + matches!(match6, "abc*abc*abc", "abcabcabcabcabcabcabc"); + matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd"); + + matches!(matchrec1, "some/**/needle.txt", "some/needle.txt"); + matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt"); + matches!(matchrec3, "some/**/needle.txt", "some/one/two/needle.txt"); + matches!(matchrec4, "some/**/needle.txt", "some/other/needle.txt"); + matches!(matchrec5, "**", "abcde"); + matches!(matchrec6, "**", ""); + matches!(matchrec7, "**", ".asdf"); + matches!(matchrec8, "**", "/x/.asdf"); + matches!(matchrec9, "some/**/**/needle.txt", "some/needle.txt"); + matches!(matchrec10, "some/**/**/needle.txt", "some/one/needle.txt"); + matches!(matchrec11, "some/**/**/needle.txt", "some/one/two/needle.txt"); + matches!(matchrec12, "some/**/**/needle.txt", "some/other/needle.txt"); + matches!(matchrec13, "**/test", "one/two/test"); + matches!(matchrec14, "**/test", "one/test"); + matches!(matchrec15, "**/test", "test"); + matches!(matchrec16, "/**/test", "/one/two/test"); + matches!(matchrec17, "/**/test", "/one/test"); + matches!(matchrec18, "/**/test", "/test"); + matches!(matchrec19, "**/.*", ".abc"); + matches!(matchrec20, "**/.*", "abc/.abc"); + matches!(matchrec21, ".*/**", ".abc"); + matches!(matchrec22, ".*/**", ".abc/abc"); + + matches!(matchrange1, "a[0-9]b", "a0b"); + matches!(matchrange2, "a[0-9]b", "a9b"); + matches!(matchrange3, "a[!0-9]b", "a_b"); + matches!(matchrange4, "[a-z123]", "1"); + matches!(matchrange5, "[1a-z23]", "1"); + matches!(matchrange6, "[123a-z]", "1"); + matches!(matchrange7, "[abc-]", "-"); + matches!(matchrange8, "[-abc]", "-"); + matches!(matchrange9, "[-a-c]", "b"); + matches!(matchrange10, "[a-c-]", "b"); + matches!(matchrange11, "[-]", "-"); + + matches!(matchpat1, "*hello.txt", "hello.txt"); + matches!(matchpat2, "*hello.txt", "gareth_says_hello.txt"); + matches!(matchpat3, "*hello.txt", "some/path/to/hello.txt"); + matches!(matchpat4, "*hello.txt", "some\\path\\to\\hello.txt"); + matches!(matchpat5, "*hello.txt", "/an/absolute/path/to/hello.txt"); + matches!(matchpat6, "*some/path/to/hello.txt", "some/path/to/hello.txt"); + matches!(matchpat7, "*some/path/to/hello.txt", + "a/bigger/some/path/to/hello.txt"); + + matches!(matchescape, "_[[]_[]]_[?]_[*]_!_", "_[_]_?_*_!_"); + + matches!(matchcasei1, "aBcDeFg", "aBcDeFg", CASEI); + matches!(matchcasei2, "aBcDeFg", "abcdefg", CASEI); + matches!(matchcasei3, "aBcDeFg", "ABCDEFG", CASEI); + matches!(matchcasei4, "aBcDeFg", "AbCdEfG", CASEI); + + matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); + nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT); + nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT); + matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs + + nmatches!(matchnot1, "a*b*c", "abcd"); + nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca"); + nmatches!(matchnot3, "some/**/needle.txt", "some/other/notthis.txt"); + nmatches!(matchnot4, "some/**/**/needle.txt", "some/other/notthis.txt"); + nmatches!(matchnot5, "/**/test", "test"); + nmatches!(matchnot6, "/**/test", "/one/notthis"); + nmatches!(matchnot7, "/**/test", "/notthis"); + nmatches!(matchnot8, "**/.*", "ab.c"); + nmatches!(matchnot9, "**/.*", "abc/ab.c"); + nmatches!(matchnot10, ".*/**", "a.bc"); + nmatches!(matchnot11, ".*/**", "abc/a.bc"); + nmatches!(matchnot12, "a[0-9]b", "a_b"); + nmatches!(matchnot13, "a[!0-9]b", "a0b"); + nmatches!(matchnot14, "a[!0-9]b", "a9b"); + nmatches!(matchnot15, "[!-]", "-"); + nmatches!(matchnot16, "*hello.txt", "hello.txt-and-then-some"); + nmatches!(matchnot17, "*hello.txt", "goodbye.txt"); + nmatches!(matchnot18, "*some/path/to/hello.txt", + "some/path/to/hello.txt-and-then-some"); + nmatches!(matchnot19, "*some/path/to/hello.txt", + "some/other/path/to/hello.txt"); + + #[test] + fn set_works() { + let mut builder = SetBuilder::new(); + builder.add("src/**/*.rs").unwrap(); + builder.add("*.c").unwrap(); + builder.add("src/lib.rs").unwrap(); + let set = builder.build().unwrap(); + + assert!(set.is_match("foo.c")); + assert!(set.is_match("src/foo.c")); + assert!(!set.is_match("foo.rs")); + assert!(!set.is_match("tests/foo.rs")); + assert!(set.is_match("src/foo.rs")); + assert!(set.is_match("src/grep/src/main.rs")); + + assert_eq!(2, set.matches("src/lib.rs").iter().count()); + assert!(set.matches("src/lib.rs").matched(0)); + assert!(!set.matches("src/lib.rs").matched(1)); + assert!(set.matches("src/lib.rs").matched(2)); + } +} diff --git a/src/main.rs b/src/main.rs index 999b6d98..4d90cae7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,15 +4,21 @@ extern crate docopt; extern crate grep; extern crate memchr; extern crate memmap; +extern crate num_cpus; extern crate regex; extern crate regex_syntax as syntax; extern crate rustc_serialize; +extern crate walkdir; const USAGE: &'static str = " -Usage: rep [options] [ ...] +Usage: xrep [options] ... + +xrep is like the silver searcher, but faster than it and grep. + +At least one path is required. Searching stdin isn't yet supported. Options: - -c, --count Suppress normal output and show count of matches. + -c, --count Suppress normal output and show count of line matches. "; use std::error::Error; @@ -21,22 +27,37 @@ use std::process; use std::result; use docopt::Docopt; +use grep::Grep; +use walkdir::{WalkDir, WalkDirIterator}; + +macro_rules! errored { + ($($tt:tt)*) => { + return Err(From::from(format!($($tt)*))); + } +} -use grep::{Grep, GrepBuilder}; +macro_rules! eprintln { + ($($tt:tt)*) => {{ + use std::io::Write; + let _ = writeln!(&mut ::std::io::stderr(), $($tt)*); + }} +} + +mod glob; pub type Result = result::Result>; #[derive(RustcDecodable)] struct Args { arg_pattern: String, - arg_file: Vec, + arg_path: Vec, flag_count: bool, } fn main() { - let args = Docopt::new(USAGE).and_then(|d| d.decode()) - .unwrap_or_else(|e| e.exit()); - match run(&args) { + let args: Args = Docopt::new(USAGE).and_then(|d| d.decode()) + .unwrap_or_else(|e| e.exit()); + match args.run() { Ok(count) if count == 0 => process::exit(1), Ok(_) => process::exit(0), Err(err) => { @@ -46,42 +67,48 @@ fn main() { } } -fn run(args: &Args) -> Result { - if args.arg_file.is_empty() { - unimplemented!() - } else { - let searcher = try!(GrepBuilder::new(&args.arg_pattern).create()); - if args.flag_count { - run_mmap_count_only(args, &searcher) - } else { - run_mmap(args, &searcher) +impl Args { + fn run(&self) -> Result { + if self.arg_path.is_empty() { + return errored!("Searching stdin is not currently supported."); + } + for p in &self.arg_path { + let mut it = WalkDir::new(p).into_iter(); + loop { + let ent = match it.next() { + None => break, + Some(Err(err)) => { + eprintln!("{}", err); + continue; + } + Some(Ok(ent)) => ent, + }; + if is_hidden(&ent) { + if ent.file_type().is_dir() { + it.skip_current_dir(); + } + continue; + } + println!("{}", ent.path().display()); + } } + Ok(0) } -} -fn run_mmap(args: &Args, searcher: &Grep) -> Result { - unimplemented!() - /* - for m in searcher.iter(text) { - if !args.flag_count { - try!(wtr.write(&text[m.start()..m.end()])); - try!(wtr.write(b"\n")); - } - count += 1; + fn run_mmap_count_only(&self, searcher: &Grep) -> Result { + use memmap::{Mmap, Protection}; + + assert!(self.arg_path.len() == 1); + let mut wtr = io::BufWriter::new(io::stdout()); + let mmap = try!(Mmap::open_path(&self.arg_path[0], Protection::Read)); + let text = unsafe { mmap.as_slice() }; + let count = searcher.iter(text).count() as u64; + try!(writeln!(wtr, "{}", count)); + Ok(count) } - Ok(count) - */ } -#[inline(never)] -fn run_mmap_count_only(args: &Args, searcher: &Grep) -> Result { - use memmap::{Mmap, Protection}; - - assert!(args.arg_file.len() == 1); - let mut wtr = io::BufWriter::new(io::stdout()); - let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read)); - let text = unsafe { mmap.as_slice() }; - let count = searcher.iter(text).count() as u64; - try!(writeln!(wtr, "{}", count)); - Ok(count) +fn is_hidden(ent: &walkdir::DirEntry) -> bool { + ent.depth() > 0 && + ent.file_name().to_str().map(|s| s.starts_with(".")).unwrap_or(false) } -- cgit v1.2.3