impl: fix --multiline anchored match bug

This fixes a bug where using \A or (?-m)^ in combination with -U/--multiline would permit matches that aren't anchored to the beginning of the file. The underlying cause was an optimization that occurred when mmaps couldn't be used. Namely, ripgrep tries to still read the input incrementally if it knows the pattern can't match through a new line. But the detection logic was flawed, since it didn't account for line anchors. This commit fixes that. Fixes #1878, Fixes #1879
author: Andrew Gallant <jamslam@gmail.com> 2021-05-29 07:34:14 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2021-05-29 07:37:28 -0400
commit: 581a35e568c3acd32461d276a4cfe746524e17cd (patch)
tree: ef3de275dfeb8a0f93db684157d03dadcd6c5386
parent: ba965962fe2fc3513aeeaa99665f09099d92045d (diff)
3 files changed, 29 insertions, 1 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d0e1175..fc64451d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,8 @@ Bug fixes:
   Fix stdin detection when using PowerShell in UNIX environments.
 * [BUG #1866](https://github.com/BurntSushi/ripgrep/issues/1866#issuecomment-841635553):
   Fix bug when computing column numbers in `--vimgrep` mode.
+* [BUG #1878](https://github.com/BurntSushi/ripgrep/issues/1878):
+  Fix bug where `\A` could produce unanchored matches in multiline search.
 
 
 12.1.1 (2020-05-29)
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index 2270f94d..e2e0755b 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -13,7 +13,10 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
 /// the given expression.
 fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
     match *expr.kind() {
-        HirKind::Empty | HirKind::Anchor(_) | HirKind::WordBoundary(_) => {}
+        HirKind::Empty | HirKind::WordBoundary(_) => {}
+        HirKind::Anchor(_) => {
+            set.remove(b'\n');
+        }
         HirKind::Literal(hir::Literal::Unicode(c)) => {
             for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
                 set.remove(b);
diff --git a/tests/regression.rs b/tests/regression.rs
index 2935a43e..9aba2746 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -882,3 +882,26 @@ test:3:5:foo quux
 ";
     eqnice!(expected, cmd.stdout());
 });
+
+rgtest!(r1878, |dir: Dir, _: TestCommand| {
+    dir.create("test", "a\nbaz\nabc\n");
+
+    // Since ripgrep enables (?m) by default, '^' will match at the beginning
+    // of a line, even when -U/--multiline is used.
+    let args = &["-U", "--no-mmap", r"^baz", "test"];
+    eqnice!("baz\n", dir.command().args(args).stdout());
+    let args = &["-U", "--mmap", r"^baz", "test"];
+    eqnice!("baz\n", dir.command().args(args).stdout());
+
+    // But when (?m) is disabled via (?-m), or when \A is used, there should be
+    // no matches that aren't anchored to the beginning of the file.
+    let args = &["-U", "--no-mmap", r"(?-m)^baz", "test"];
+    dir.command().args(args).assert_err();
+    let args = &["-U", "--mmap", r"(?-m)^baz", "test"];
+    dir.command().args(args).assert_err();
+
+    let args = &["-U", "--no-mmap", r"\Abaz", "test"];
+    dir.command().args(args).assert_err();
+    let args = &["-U", "--mmap", r"\Abaz", "test"];
+    dir.command().args(args).assert_err();
+});
author	Andrew Gallant <jamslam@gmail.com>	2021-05-29 07:34:14 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2021-05-29 07:37:28 -0400
commit	581a35e568c3acd32461d276a4cfe746524e17cd (patch)
tree	ef3de275dfeb8a0f93db684157d03dadcd6c5386
parent	ba965962fe2fc3513aeeaa99665f09099d92045d (diff)