summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG.md 2
-rw-r--r-- crates/regex/src/word.rs 16
-rw-r--r-- tests/regression.rs 15
3 files changed, 31 insertions, 2 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b435273..c637aeae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,8 @@ Bug fixes:
Fix bug when using inline regex flags with `-e/--regexp`.
* [BUG #2523](https://github.com/BurntSushi/ripgrep/issues/2523):
Make executable searching take `.com` into account on Windows.
+* [BUG #2574](https://github.com/BurntSushi/ripgrep/issues/2574):
+ Fix bug in `-w/--word-regexp` that would result in incorrect match offsets.
13.0.0 (2021-06-12)
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
index af4480ab..52fb61ce 100644
--- a/crates/regex/src/word.rs
+++ b/crates/regex/src/word.rs
@@ -128,6 +128,9 @@ impl WordMatcher {
// The reason why we cannot handle the ^/$ cases here is because we
// can't assume anything about the original pattern. (Try commenting
// out the checks for ^/$ below and run the tests to see examples.)
+ //
+ // NOTE(2023-07-31): After fixing #2574, this logic honestly still
+ // doesn't seem correct. Regex composition is hard.
let input = Input::new(haystack).span(at..haystack.len());
let mut cand = match self.regex.find(input) {
None => return Ok(None),
@@ -136,8 +139,17 @@ impl WordMatcher {
if cand.start() == 0 || cand.end() == haystack.len() {
return Err(());
}
- let (_, slen) = bstr::decode_utf8(&haystack[cand]);
- let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
+ // We decode the chars on either side of the match. If either char is
+ // a word character, then that means the ^/$ matched and not \W. In
+ // that case, we defer to the slower engine.
+ let (ch, slen) = bstr::decode_utf8(&haystack[cand]);
+ if ch.map_or(true, regex_syntax::is_word_character) {
+ return Err(());
+ }
+ let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]);
+ if ch.map_or(true, regex_syntax::is_word_character) {
+ return Err(());
+ }
let new_start = cand.start() + slen;
let new_end = cand.end() - elen;
// This occurs when the original regex can match the empty string. In this
diff --git a/tests/regression.rs b/tests/regression.rs
index b9076803..5ef741cf 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -1173,3 +1173,18 @@ rgtest!(r2480, |dir: Dir, mut cmd: TestCommand| {
cmd.args(&["--only-matching", "-e", "(?i)notfoo", "-e", "bar", "file"]);
cmd.assert_err();
});
+
+// See: https://github.com/BurntSushi/ripgrep/issues/2574
+rgtest!(r2574, |dir: Dir, mut cmd: TestCommand| {
+ dir.create("haystack", "some.domain.com\nsome.domain.com/x\n");
+ let got = cmd
+ .args(&[
+ "--no-filename",
+ "--no-unicode",
+ "-w",
+ "-o",
+ r"(\w+\.)*domain\.(\w+)",
+ ])
+ .stdout();
+ eqnice!("some.domain.com\nsome.domain.com\n", got);
+});