summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2019-08-01 16:58:12 -0400
committerAndrew Gallant <jamslam@gmail.com>2019-08-01 16:58:12 -0400
commitadb9332f52b884c8b872d1c969a7841188fe3089 (patch)
tree2871eb4f72be03e97c3135780502ee5d49276d77
parentbc37c32717301abf26e5aecc942688d2ddd63692 (diff)
regex: fix -F aho-corasick optimization
It turns out that when the -F flag was used, if any of the patterns contained a regex meta character (such as `.`), then we winded up escaping the pattern first before handing it off to Aho-Corasick, which treats all patterns literally. We continue to apply band-aides here and just avoid Aho-Corasick if there is an escape in any of the literal patterns. This is unfortunate, but making this work better requires more refactoring, and the right solution is to get this optimization pushed down into the regex engine. Fixes #1334
-rw-r--r--grep-regex/src/matcher.rs25
-rw-r--r--tests/regression.rs10
2 files changed, 33 insertions, 2 deletions
diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs
index 7f30252a..61af0518 100644
--- a/grep-regex/src/matcher.rs
+++ b/grep-regex/src/matcher.rs
@@ -71,10 +71,31 @@ impl RegexMatcherBuilder {
&self,
literals: &[B],
) -> Result<RegexMatcher, Error> {
- let slices: Vec<_> = literals.iter().map(|s| s.as_ref()).collect();
- if !self.config.can_plain_aho_corasick() || literals.len() < 40 {
+ let mut has_escape = false;
+ let mut slices = vec![];
+ for lit in literals {
+ slices.push(lit.as_ref());
+ has_escape = has_escape || lit.as_ref().contains('\\');
+ }
+ // Even when we have a fixed set of literals, we might still want to
+ // use the regex engine. Specifically, if any string has an escape
+ // in it, then we probably can't feed it to Aho-Corasick without
+ // removing the escape. Additionally, if there are any particular
+ // special match semantics we need to honor, that Aho-Corasick isn't
+ // enough. Finally, the regex engine can do really well with a small
+ // number of literals (at time of writing, this is changing soon), so
+ // we use it when there's a small set.
+ //
+ // Yes, this is one giant hack. Ideally, this entirely separate literal
+ // matcher that uses Aho-Corasick would be pushed down into the regex
+ // engine.
+ if has_escape
+ || !self.config.can_plain_aho_corasick()
+ || literals.len() < 40
+ {
return self.build(&slices.join("|"));
}
+
let matcher = MultiLiteralMatcher::new(&slices)?;
let imp = RegexMatcherImpl::MultiLiteral(matcher);
Ok(RegexMatcher {
diff --git a/tests/regression.rs b/tests/regression.rs
index 40a84654..88f2194d 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -716,3 +716,13 @@ rgtest!(r1259_drop_last_byte_nonl, |dir: Dir, mut cmd: TestCommand| {
cmd = dir.command();
eqnice!("fz\n", cmd.arg("-f").arg("patterns-nl").arg("test").stdout());
});
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1334
+rgtest!(r1334_crazy_literals, |dir: Dir, mut cmd: TestCommand| {
+ dir.create("patterns", &"1.208.0.0/12\n".repeat(40));
+ dir.create("corpus", "1.208.0.0/12\n");
+ eqnice!(
+ "1.208.0.0/12\n",
+ cmd.arg("-Ff").arg("patterns").arg("corpus").stdout()
+ );
+});