author     Andrew Gallant <jamslam@gmail.com>   2016-09-21 19:12:07 -0400
committer  Andrew Gallant <jamslam@gmail.com>   2016-09-21 19:12:07 -0400
commit     2a2b1506d42c64916e27d848cc86f1e8330881c4
tree       a99bb58f32f1bcbc4093b5b67e70e356f60a7bd8
parent     4d6b3c727e88ce840a709e5a13cc3f9cfc5960a9
Fix a performance bug where using -w could make searches very slow.

The specific issue is that -w causes the regex to be wrapped in Unicode word boundaries. Regrettably, Unicode word boundaries are the one thing our regex engine can't handle well in the presence of non-ASCII text. We work around its slowness by stripping the word boundaries in some circumstances and using the resulting expression to produce match candidates, which are then verified by the full original regex. This doesn't fix every case of Unicode word boundary slowness, but it should fix all cases where -w is used.
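The candidate-then-verify idea described in this message can be sketched outside the diff. Below is a minimal, hypothetical example using the regex crate (it is not the code added by this commit): a cheaper regex with the word boundaries stripped filters lines quickly, and the full pattern with \b intact confirms real matches. The pattern and input lines are made up for illustration.

    // Sketch only: the candidate regex may report false positives but never
    // false negatives, so every real match survives the fast filter.
    use regex::Regex;

    fn main() {
        // Full pattern, as produced by -w: wrapped in Unicode word boundaries.
        let full = Regex::new(r"\bδοκιμή\b").unwrap();
        // Candidate pattern with the boundaries stripped.
        let candidate = Regex::new(r"δοκιμή").unwrap();

        let lines = ["η δοκιμή πέτυχε", "ηδοκιμήπέτυχε", "nothing here"];
        for &line in lines.iter() {
            // Fast path: reject lines that cannot possibly match.
            if !candidate.is_match(line) {
                continue;
            }
            // Slow path: confirm the candidate with the full (slower) regex.
            if full.is_match(line) {
                println!("match: {}", line);
            }
        }
    }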
-rw-r--r--   grep/src/lib.rs            |  1
-rw-r--r--   grep/src/search.rs         | 35
-rw-r--r--   grep/src/word_boundary.rs  | 54
3 files changed, 80 insertions, 10 deletions
diff --git a/grep/src/lib.rs b/grep/src/lib.rs
index 7526621d..1fd01269 100644
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
 mod literals;
 mod nonl;
 mod search;
+mod word_boundary;
 
 /// Result is a convenient type alias that fixes the type of the error to
 /// the `Error` type defined in this crate.
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 33cb0fbe..6bff2ba9 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -4,6 +4,8 @@ use syntax;
 
 use literals::LiteralSets;
 use nonl;
+use syntax::Expr;
+use word_boundary::strip_unicode_word_boundaries;
 use Result;
 
 /// A matched line.
@@ -127,22 +129,35 @@ impl GrepBuilder {
     pub fn build(self) -> Result<Grep> {
         let expr = try!(self.parse());
         let literals = LiteralSets::create(&expr);
-        let re = try!(
-            RegexBuilder::new(&expr.to_string())
-                .case_insensitive(self.opts.case_insensitive)
-                .multi_line(true)
-                .unicode(true)
-                .size_limit(self.opts.size_limit)
-                .dfa_size_limit(self.opts.dfa_size_limit)
-                .compile()
-        );
+        let re = try!(self.regex(&expr));
+        let required = literals.to_regex().or_else(|| {
+            let expr = match strip_unicode_word_boundaries(&expr) {
+                None => return None,
+                Some(expr) => expr,
+            };
+            debug!("Stripped Unicode word boundaries. New AST:\n{:?}", expr);
+            self.regex(&expr).ok()
+        });
         Ok(Grep {
             re: re,
-            required: literals.to_regex(),
+            required: required,
             opts: self.opts,
         })
     }
 
+    /// Creates a new regex from the given expression with the current
+    /// configuration.
+    fn regex(&self, expr: &Expr) -> Result<Regex> {
+        RegexBuilder::new(&expr.to_string())
+            .case_insensitive(self.opts.case_insensitive)
+            .multi_line(true)
+            .unicode(true)
+            .size_limit(self.opts.size_limit)
+            .dfa_size_limit(self.opts.dfa_size_limit)
+            .compile()
+            .map_err(From::from)
+    }
+
     /// Parses the underlying pattern and ensures the pattern can never match
     /// the line terminator.
     fn parse(&self) -> Result<syntax::Expr> {
diff --git a/grep/src/word_boundary.rs b/grep/src/word_boundary.rs
new file mode 100644
index 00000000..6df5c657
--- /dev/null
+++ b/grep/src/word_boundary.rs
@@ -0,0 +1,54 @@
+use syntax::Expr;
+
+/// Strips Unicode word boundaries from the given expression.
+///
+/// The key invariant this maintains is that the expression returned will match
+/// *at least* everywhere the given expression will match. Namely, a match of
+/// the returned expression can report false positives, but it will never
+/// report false negatives.
+///
+/// If no word boundaries could be stripped, then None is returned.
+pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
+    // The real reason we do this is because Unicode word boundaries are the
+    // one thing that Rust's regex DFA engine can't handle. When it sees a
+    // Unicode word boundary among non-ASCII text, it falls back to one of the
+    // slower engines. We work around this limitation by attempting to use
+    // a regex to find candidate matches without a Unicode word boundary. We'll
+    // only then use the full (and slower) regex to confirm a candidate as a
+    // match or not during search.
+    use syntax::Expr::*;
+
+    match *expr {
+        Concat(ref es) if !es.is_empty() => {
+            let first = is_unicode_word_boundary(&es[0]);
+            let last = is_unicode_word_boundary(es.last().unwrap());
+            // Be careful not to strip word boundaries if there are no other
+            // expressions to match.
+            match (first, last) {
+                (true, false) if es.len() > 1 => {
+                    Some(Concat(es[1..].to_vec()))
+                }
+                (false, true) if es.len() > 1 => {
+                    Some(Concat(es[..es.len() - 1].to_vec()))
+                }
+                (true, true) if es.len() > 2 => {
+                    Some(Concat(es[1..es.len() - 1].to_vec()))
+                }
+                _ => None,
+            }
+        }
+        _ => None,
+    }
+}
+
+/// Returns true if the given expression is a Unicode word boundary.
+fn is_unicode_word_boundary(expr: &Expr) -> bool {
+    use syntax::Expr::*;
+
+    match *expr {
+        WordBoundary => true,
+        NotWordBoundary => true,
+        Group { ref e, .. } => is_unicode_word_boundary(e),
+        _ => false,
+    }
+}