From 8e57989cd20379acfca28a341da61df177daca1c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 15 Jul 2022 09:53:39 -0400 Subject: regex: fix matching bug when text anchors are used It turns out that if there are text anchors (that is, \A or \z, or ^/$ when multi-line is disabled), then the "fast" line searching path isn't quite correct. Since searching without multi-line mode is exceptionally rare, we just look for the presence of text anchors and specifically disable the line terminator option in 'grep-regex'. This in turn inhibits the "fast" line searching path. Fixes #2260 --- crates/regex/src/config.rs | 30 ++++++++++++++++++++++++++++++ crates/regex/src/matcher.rs | 6 +++++- crates/searcher/src/searcher/core.rs | 2 +- crates/searcher/src/searcher/glue.rs | 27 +++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 4f3cc0fd..75dfa575 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -175,6 +175,36 @@ impl ConfiguredHIR { self.config.crlf && self.expr.is_line_anchored_end() } + /// Returns the line terminator configured on this expression. + /// + /// When we have beginning/end anchors (NOT line anchors), the fast line + /// searching path isn't quite correct. Or at least, doesn't match the + /// slow path. Namely, the slow path strips line terminators while the + /// fast path does not. Since '$' (when multi-line mode is disabled) + /// doesn't match at line boundaries, the existence of a line terminator + /// might cause it to not match when it otherwise would with the line + /// terminator stripped. + /// + /// Since searching with text anchors is exceptionally rare in the + /// context of line oriented searching (multi-line mode is basically + /// always enabled), we just disable this optimization when there are + /// text anchors. We disable it by not returning a line terminator, since + /// without a line terminator, the fast search path can't be executed. + /// + /// See: https://github.com/BurntSushi/ripgrep/issues/2260 + pub fn line_terminator(&self) -> Option { + if self.is_any_anchored() { + None + } else { + self.config.line_terminator + } + } + + /// Returns true if and only if the underlying HIR has any text anchors. + fn is_any_anchored(&self) -> bool { + self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end() + } + /// Builds a regular expression from this HIR expression. pub fn regex(&self) -> Result { self.pattern_to_regex(&self.expr.to_string()) diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 92ef5c76..725aae3b 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -52,8 +52,12 @@ impl RegexMatcherBuilder { let matcher = RegexMatcherImpl::new(&chir)?; log::trace!("final regex: {:?}", matcher.regex()); + let mut config = self.config.clone(); + // We override the line terminator in case the configured expr doesn't + // support it. + config.line_terminator = chir.line_terminator(); Ok(RegexMatcher { - config: self.config.clone(), + config, matcher, fast_line_regex, non_matching_bytes, diff --git a/crates/searcher/src/searcher/core.rs b/crates/searcher/src/searcher/core.rs index 7d6ccd66..e68780ac 100644 --- a/crates/searcher/src/searcher/core.rs +++ b/crates/searcher/src/searcher/core.rs @@ -108,7 +108,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { } pub fn match_by_line(&mut self, buf: &[u8]) -> Result { - if self.is_line_by_line_fast() { + if dbg!(self.is_line_by_line_fast()) { self.match_by_line_fast(buf) } else { self.match_by_line_slow(buf) diff --git a/crates/searcher/src/searcher/glue.rs b/crates/searcher/src/searcher/glue.rs index 1ff208bc..217c70e4 100644 --- a/crates/searcher/src/searcher/glue.rs +++ b/crates/searcher/src/searcher/glue.rs @@ -1512,4 +1512,31 @@ and exhibited clearly, with a label attached.\ ) .unwrap(); } + + // See: https://github.com/BurntSushi/ripgrep/issues/2260 + #[test] + fn regression_2260() { + use grep_regex::RegexMatcherBuilder; + + use crate::SearcherBuilder; + + let matcher = RegexMatcherBuilder::new() + .line_terminator(Some(b'\n')) + .build(r"^\w+$") + .unwrap(); + let mut searcher = SearcherBuilder::new().line_number(true).build(); + + let mut matched = false; + searcher + .search_slice( + &matcher, + b"GATC\n", + crate::sinks::UTF8(|_, _| { + matched = true; + Ok(true) + }), + ) + .unwrap(); + assert!(matched); + } } -- cgit v1.2.3