summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2022-07-15 09:53:39 -0400
committerAndrew Gallant <jamslam@gmail.com>2022-07-15 09:53:39 -0400
commit8e57989cd20379acfca28a341da61df177daca1c (patch)
treeedadca9c55694a060ed8ad6faa5fee634627cf63
parentb9f5835534f0cf726e536c534c0725ea2848644e (diff)
regex: fix matching bug when text anchors are used
It turns out that if there are text anchors (that is, \A or \z, or ^/$ when multi-line is disabled), then the "fast" line searching path isn't quite correct. Since searching without multi-line mode is exceptionally rare, we just look for the presence of text anchors and specifically disable the line terminator option in 'grep-regex'. This in turn inhibits the "fast" line searching path. Fixes #2260
-rw-r--r--crates/regex/src/config.rs30
-rw-r--r--crates/regex/src/matcher.rs6
-rw-r--r--crates/searcher/src/searcher/core.rs2
-rw-r--r--crates/searcher/src/searcher/glue.rs27
4 files changed, 63 insertions, 2 deletions
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index 4f3cc0fd..75dfa575 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -175,6 +175,36 @@ impl ConfiguredHIR {
self.config.crlf && self.expr.is_line_anchored_end()
}
+ /// Returns the line terminator configured on this expression.
+ ///
+ /// When we have beginning/end anchors (NOT line anchors), the fast line
+ /// searching path isn't quite correct. Or at least, doesn't match the
+ /// slow path. Namely, the slow path strips line terminators while the
+ /// fast path does not. Since '$' (when multi-line mode is disabled)
+ /// doesn't match at line boundaries, the existence of a line terminator
+ /// might cause it to not match when it otherwise would with the line
+ /// terminator stripped.
+ ///
+ /// Since searching with text anchors is exceptionally rare in the
+ /// context of line oriented searching (multi-line mode is basically
+ /// always enabled), we just disable this optimization when there are
+ /// text anchors. We disable it by not returning a line terminator, since
+ /// without a line terminator, the fast search path can't be executed.
+ ///
+ /// See: https://github.com/BurntSushi/ripgrep/issues/2260
+ pub fn line_terminator(&self) -> Option<LineTerminator> {
+ if self.is_any_anchored() {
+ None
+ } else {
+ self.config.line_terminator
+ }
+ }
+
+ /// Returns true if and only if the underlying HIR has any text anchors.
+ fn is_any_anchored(&self) -> bool {
+ self.expr.is_any_anchored_start() || self.expr.is_any_anchored_end()
+ }
+
/// Builds a regular expression from this HIR expression.
pub fn regex(&self) -> Result<Regex, Error> {
self.pattern_to_regex(&self.expr.to_string())
diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs
index 92ef5c76..725aae3b 100644
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -52,8 +52,12 @@ impl RegexMatcherBuilder {
let matcher = RegexMatcherImpl::new(&chir)?;
log::trace!("final regex: {:?}", matcher.regex());
+ let mut config = self.config.clone();
+ // We override the line terminator in case the configured expr doesn't
+ // support it.
+ config.line_terminator = chir.line_terminator();
Ok(RegexMatcher {
- config: self.config.clone(),
+ config,
matcher,
fast_line_regex,
non_matching_bytes,
diff --git a/crates/searcher/src/searcher/core.rs b/crates/searcher/src/searcher/core.rs
index 7d6ccd66..e68780ac 100644
--- a/crates/searcher/src/searcher/core.rs
+++ b/crates/searcher/src/searcher/core.rs
@@ -108,7 +108,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
}
pub fn match_by_line(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
- if self.is_line_by_line_fast() {
+ if dbg!(self.is_line_by_line_fast()) {
self.match_by_line_fast(buf)
} else {
self.match_by_line_slow(buf)
diff --git a/crates/searcher/src/searcher/glue.rs b/crates/searcher/src/searcher/glue.rs
index 1ff208bc..217c70e4 100644
--- a/crates/searcher/src/searcher/glue.rs
+++ b/crates/searcher/src/searcher/glue.rs
@@ -1512,4 +1512,31 @@ and exhibited clearly, with a label attached.\
)
.unwrap();
}
+
+ // See: https://github.com/BurntSushi/ripgrep/issues/2260
+ #[test]
+ fn regression_2260() {
+ use grep_regex::RegexMatcherBuilder;
+
+ use crate::SearcherBuilder;
+
+ let matcher = RegexMatcherBuilder::new()
+ .line_terminator(Some(b'\n'))
+ .build(r"^\w+$")
+ .unwrap();
+ let mut searcher = SearcherBuilder::new().line_number(true).build();
+
+ let mut matched = false;
+ searcher
+ .search_slice(
+ &matcher,
+ b"GATC\n",
+ crate::sinks::UTF8(|_, _| {
+ matched = true;
+ Ok(true)
+ }),
+ )
+ .unwrap();
+ assert!(matched);
+ }
}