diff options
author | Andrew Gallant <jamslam@gmail.com> | 2016-09-21 19:12:07 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2016-09-21 19:12:07 -0400 |
commit | 2a2b1506d42c64916e27d848cc86f1e8330881c4 (patch) | |
tree | a99bb58f32f1bcbc4093b5b67e70e356f60a7bd8 | |
parent | 4d6b3c727e88ce840a709e5a13cc3f9cfc5960a9 (diff) |
Fix a performance bug where using -w could result in very bad performance.
The specific issue is that -w causes the regex to be wrapped in Unicode
word boundaries. Regrettably, Unicode word boundaries are the one thing
our regex engine can't handle well in the presence of non-ASCII text. We
work around its slowness by stripping word boundaries in some
circumstances, and using the resulting expression as a way to produce match
candidates that are then verified by the full original regex.
This doesn't fix all cases, but it should fix all cases where -w is used.
-rw-r--r-- | grep/src/lib.rs | 1 | ||||
-rw-r--r-- | grep/src/search.rs | 35 | ||||
-rw-r--r-- | grep/src/word_boundary.rs | 54 |
3 files changed, 80 insertions, 10 deletions
diff --git a/grep/src/lib.rs b/grep/src/lib.rs index 7526621d..1fd01269 100644 --- a/grep/src/lib.rs +++ b/grep/src/lib.rs @@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match}; mod literals; mod nonl; mod search; +mod word_boundary; /// Result is a convenient type alias that fixes the type of the error to /// the `Error` type defined in this crate. diff --git a/grep/src/search.rs b/grep/src/search.rs index 33cb0fbe..6bff2ba9 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -4,6 +4,8 @@ use syntax; use literals::LiteralSets; use nonl; +use syntax::Expr; +use word_boundary::strip_unicode_word_boundaries; use Result; /// A matched line. @@ -127,22 +129,35 @@ impl GrepBuilder { pub fn build(self) -> Result<Grep> { let expr = try!(self.parse()); let literals = LiteralSets::create(&expr); - let re = try!( - RegexBuilder::new(&expr.to_string()) - .case_insensitive(self.opts.case_insensitive) - .multi_line(true) - .unicode(true) - .size_limit(self.opts.size_limit) - .dfa_size_limit(self.opts.dfa_size_limit) - .compile() - ); + let re = try!(self.regex(&expr)); + let required = literals.to_regex().or_else(|| { + let expr = match strip_unicode_word_boundaries(&expr) { + None => return None, + Some(expr) => expr, + }; + debug!("Stripped Unicode word boundaries. New AST:\n{:?}", expr); + self.regex(&expr).ok() + }); Ok(Grep { re: re, - required: literals.to_regex(), + required: required, opts: self.opts, }) } + /// Creates a new regex from the given expression with the current + /// configuration. + fn regex(&self, expr: &Expr) -> Result<Regex> { + RegexBuilder::new(&expr.to_string()) + .case_insensitive(self.opts.case_insensitive) + .multi_line(true) + .unicode(true) + .size_limit(self.opts.size_limit) + .dfa_size_limit(self.opts.dfa_size_limit) + .compile() + .map_err(From::from) + } + /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. fn parse(&self) -> Result<syntax::Expr> { diff --git a/grep/src/word_boundary.rs b/grep/src/word_boundary.rs new file mode 100644 index 00000000..6df5c657 --- /dev/null +++ b/grep/src/word_boundary.rs @@ -0,0 +1,54 @@ +use syntax::Expr; + +/// Strips Unicode word boundaries from the given expression. +/// +/// The key invariant this maintains is that the expression returned will match +/// *at least* every where the expression given will match. Namely, a match of +/// the returned expression can report false positives but it will never report +/// false negatives. +/// +/// If no word boundaries could be stripped, then None is returned. +pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> { + // The real reason we do this is because Unicode word boundaries are the + // one thing that Rust's regex DFA engine can't handle. When it sees a + // Unicode word boundary among non-ASCII text, it falls back to one of the + // slower engines. We work around this limitation by attempting to use + // a regex to find candidate matches without a Unicode word boundary. We'll + // only then use the full (and slower) regex to confirm a candidate as a + // match or not during search. + use syntax::Expr::*; + + match *expr { + Concat(ref es) if !es.is_empty() => { + let first = is_unicode_word_boundary(&es[0]); + let last = is_unicode_word_boundary(es.last().unwrap()); + // Be careful not to strip word boundaries if there are no other + // expressions to match. + match (first, last) { + (true, false) if es.len() > 1 => { + Some(Concat(es[1..].to_vec())) + } + (false, true) if es.len() > 1 => { + Some(Concat(es[..es.len() - 1].to_vec())) + } + (true, true) if es.len() > 2 => { + Some(Concat(es[1..es.len() - 1].to_vec())) + } + _ => None, + } + } + _ => None, + } +} + +/// Returns true if the given expression is a Unicode word boundary. +fn is_unicode_word_boundary(expr: &Expr) -> bool { + use syntax::Expr::*; + + match *expr { + WordBoundary => true, + NotWordBoundary => true, + Group { ref e, .. } => is_unicode_word_boundary(e), + _ => false, + } +} |