regex: avoid using literal optimizations when whitespace is detected

If a literal is entirely whitespace, then it's quite likely that it is very common. So when that case occurs, just don't do (inner) literal optimizations at all. The regex engine may still make sub-optimal decisions here, but that's a problem for another day. Fixes #1087
author: Andrew Gallant <jamslam@gmail.com> 2020-03-15 12:01:42 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2020-03-15 13:19:14 -0400
commit: e772a95b58b77604e9a4657e949431539ec15b08 (patch)
tree: 9329292c5fffee78ada284f68e4f9ca881569f51 /crates
parent: 9dd4bf8d7fe291ebdb3d3277d06d4708d98169e9 (diff)
2 files changed, 28 insertions, 2 deletions
diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs
index 91fe93b1..4004a692 100644
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@@ -5,6 +5,7 @@ An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
 #![deny(missing_docs)]
 
 extern crate aho_corasick;
+extern crate bstr;
 extern crate grep_matcher;
 #[macro_use]
 extern crate log;
diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs
index cbb49dbe..49bc4ca2 100644
--- a/crates/regex/src/literal.rs
+++ b/crates/regex/src/literal.rs
@@ -5,6 +5,7 @@ the regex engine doesn't look for inner literals. Since we're doing line based
 searching, we can use them, so we need to do it ourselves.
 */
 
+use bstr::ByteSlice;
 use regex_syntax::hir::literal::{Literal, Literals};
 use regex_syntax::hir::{self, Hir, HirKind};
 
@@ -99,7 +100,12 @@ impl LiteralSets {
         // helps with case insensitive matching, which can generate lots of
         // inner required literals.
         let any_empty = req_lits.iter().any(|lit| lit.is_empty());
-        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
+        let any_white = has_only_whitespace(&req_lits);
+        if req.len() > lit.len()
+            && req_lits.len() > 1
+            && !any_empty
+            && !any_white
+        {
             debug!("required literals found: {:?}", req_lits);
             let alts: Vec<String> = req_lits
                 .into_iter()
@@ -121,7 +127,7 @@ impl LiteralSets {
             // with the highest minimum length and use that to build our "fast"
             // regex.
             //
-            // This manifest in fairly common scenarios. e.g.,
+            // This manifests in fairly common scenarios. e.g.,
             //
             //     rg -w 'foo|bar|baz|quux'
             //
@@ -159,12 +165,20 @@ impl LiteralSets {
             };
 
             debug!("prefix/suffix literals found: {:?}", lits);
+            if has_only_whitespace(lits) {
+                debug!("dropping literals because one was whitespace");
+                return None;
+            }
             let alts: Vec<String> =
                 lits.into_iter().map(|x| util::bytes_to_regex(x)).collect();
             // We're matching raw bytes, so disable Unicode mode.
             Some(format!("(?-u:{})", alts.join("|")))
         } else {
             debug!("required literal found: {:?}", util::show_bytes(lit));
+            if lit.chars().all(|c| c.is_whitespace()) {
+                debug!("dropping literal because one was whitespace");
+                return None;
+            }
             Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
         }
     }
@@ -328,6 +342,17 @@ fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
     cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
 }
 
+/// Returns true if and only if any of the literals in the given set is
+/// entirely whitespace.
+fn has_only_whitespace(lits: &[Literal]) -> bool {
+    for lit in lits {
+        if lit.chars().all(|c| c.is_whitespace()) {
+            return true;
+        }
+    }
+    false
+}
+
 #[cfg(test)]
 mod tests {
     use super::LiteralSets;
author	Andrew Gallant <jamslam@gmail.com>	2020-03-15 12:01:42 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2020-03-15 13:19:14 -0400
commit	e772a95b58b77604e9a4657e949431539ec15b08 (patch)
tree	9329292c5fffee78ada284f68e4f9ca881569f51 /crates
parent	9dd4bf8d7fe291ebdb3d3277d06d4708d98169e9 (diff)