grep-regex: fix inner literal detection

It seems the inner literal detector fails spectacularly in cases of concatenations that involve groups. The issue here is that if the prefix of a group inside a concatenation can match the empty string, then any literals generated to that point in the concatenation need to be cut such that they are never extended. The detector isn't really built to handle this case, so we just act conservative cut literals whenever we see a sub-group. This may make some regexes slower, but the inner literal detector already misses plenty of cases. Literal detection (including in the regex engine) is a key component that needs to be completely rethought at some point. Fixes #1064
author: Andrew Gallant <jamslam@gmail.com> 2018-09-24 19:24:33 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2018-09-25 16:56:04 -0400
commit: ba503eb6775f0031c684a3bfeeb1c9cb07601dd6 (patch)
tree: f762c930ce30d8b84651159545fead545484f492
parent: f72c2dfd90f98cfbd678d3d957e3d104237ab15b (diff)
2 files changed, 30 insertions, 2 deletions
diff --git a/grep-regex/src/literal.rs b/grep-regex/src/literal.rs
index c3960ae7..b8a0c1d5 100644
--- a/grep-regex/src/literal.rs
+++ b/grep-regex/src/literal.rs
@@ -166,10 +166,10 @@ fn union_required(expr: &Hir, lits: &mut Literals) {
                     lits.cut();
                     continue;
                 }
-                if lits2.contains_empty() {
+                if lits2.contains_empty() || !is_simple(&e) {
                     lits.cut();
                 }
-                if !lits.cross_product(&lits2) {
+                if !lits.cross_product(&lits2) || !lits2.any_complete() {
                     // If this expression couldn't yield any literal that
                     // could be extended, then we need to quit. Since we're
                     // short-circuiting, we also need to freeze every member.
@@ -250,6 +250,20 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
     }
 }
 
+fn is_simple(expr: &Hir) -> bool {
+    match *expr.kind() {
+        HirKind::Empty
+        | HirKind::Literal(_)
+        | HirKind::Class(_)
+        | HirKind::Repetition(_)
+        | HirKind::Concat(_)
+        | HirKind::Alternation(_) => true,
+        HirKind::Anchor(_)
+        | HirKind::WordBoundary(_)
+        | HirKind::Group(_) => false,
+    }
+}
+
 /// Return the number of characters in the given class.
 fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
     cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
@@ -301,4 +315,12 @@ mod tests {
         // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
         // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
     }
+
+    #[test]
+    fn regression_1064() {
+        // Regression from:
+        // https://github.com/BurntSushi/ripgrep/issues/1064
+        // assert_eq!(one_regex(r"a.*c"), pat("a"));
+        assert_eq!(one_regex(r"a(.*c)"), pat("a"));
+    }
 }
diff --git a/tests/regression.rs b/tests/regression.rs
index f2553d96..abf29bdd 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -562,3 +562,9 @@ rgtest!(r900, |dir: Dir, mut cmd: TestCommand| {
 
     cmd.arg("-fpat").arg("sherlock").assert_err();
 });
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1064
+rgtest!(r1064, |dir: Dir, mut cmd: TestCommand| {
+    dir.create("input", "abc");
+    eqnice!("input:abc\n", cmd.arg("a(.*c)").stdout());
+});
author	Andrew Gallant <jamslam@gmail.com>	2018-09-24 19:24:33 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2018-09-25 16:56:04 -0400
commit	ba503eb6775f0031c684a3bfeeb1c9cb07601dd6 (patch)
tree	f762c930ce30d8b84651159545fead545484f492
parent	f72c2dfd90f98cfbd678d3d957e3d104237ab15b (diff)