summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Andrew Gallant <jamslam@gmail.com> 2019-04-05 21:02:47 -0400
committer Andrew Gallant <jamslam@gmail.com> 2019-04-05 23:24:08 -0400
commit 9f15e3b671d64efa261f6de0fd4c6c06e0caa21c (patch)
tree 587d0dcb9a62eabe31c83ea529920cb5d2b9c2cc
parent 254b8b67bbe0245a2f117c030d2c10a2dde2ed46 (diff)
regex: fix a perf bug when using -w flag
When looking for an inner literal to speed up searches, if only a prefix is found, then we generally give up doing inner literal optimizations since the regex engine will generally handle it for us. Unfortunately, this decision was being made *before* we wrap the regex in (^|\W)...($|\W) when using the -w/--word-regexp flag, which would then defeat the literal optimizations inside the regex engine. We fix this with a bit of a hack that says, "if we're doing a word regexp, then give me back any literal you find, even if it's a prefix."
-rw-r--r-- grep-regex/src/config.rs 2
-rw-r--r-- grep-regex/src/literal.rs 19
2 files changed, 13 insertions, 8 deletions
diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs
index efed9d48..4defc151 100644
--- a/grep-regex/src/config.rs
+++ b/grep-regex/src/config.rs
@@ -207,7 +207,7 @@ impl ConfiguredHIR {
if self.config.line_terminator.is_none() {
return Ok(None);
}
- match LiteralSets::new(&self.expr).one_regex() {
+ match LiteralSets::new(&self.expr).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
}
diff --git a/grep-regex/src/literal.rs b/grep-regex/src/literal.rs
index b8a0c1d5..1563ca05 100644
--- a/grep-regex/src/literal.rs
+++ b/grep-regex/src/literal.rs
@@ -47,18 +47,23 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals.
- pub fn one_regex(&self) -> Option<String> {
+ pub fn one_regex(&self, word: bool) -> Option<String> {
// TODO: The logic in this function is basically inscrutable. It grew
// organically in the old grep 0.1 crate. Ideally, it would be
// re-worked. In fact, the entire inner literal extraction should be
// re-worked. Actually, most of regex-syntax's literal extraction
// should also be re-worked. Alas... only so much time in the day.
- if self.prefixes.all_complete() && !self.prefixes.is_empty() {
- debug!("literal prefixes detected: {:?}", self.prefixes);
- // When this is true, the regex engine will do a literal scan,
- // so we don't need to return anything.
- return None;
+ if !word {
+ if self.prefixes.all_complete() && !self.prefixes.is_empty() {
+ debug!("literal prefixes detected: {:?}", self.prefixes);
+ // When this is true, the regex engine will do a literal scan,
+ // so we don't need to return anything. But we only do this
+ // if we aren't doing a word regex, since a word regex adds
+ // a `(?:\W|^)` to the beginning of the regex, thereby
+ // defeating the regex engine's literal detection.
+ return None;
+ }
}
// Out of inner required literals, prefixes and suffixes, which one
@@ -285,7 +290,7 @@ mod tests {
}
fn one_regex(pattern: &str) -> Option<String> {
- sets(pattern).one_regex()
+ sets(pattern).one_regex(false)
}
// Put a pattern into the same format as the one returned by `one_regex`.