From db7e828989dec76ab2d09dfe64c8936f2eca5ecc Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 5 Apr 2019 21:02:47 -0400 Subject: regex: fix a perf bug when using -w flag When looking for an inner literal to speed up searches, if only a prefix is found, then we generally give up doing inner literal optimizations since the regex engine will generally handle it for us. Unfortunately, this decision was being made *before* we wrap the regex in (^|\W)...($|\W) when using the -w/--word-regexp flag, which would then defeat the literal optimizations inside the regex engine. We fix this with a bit of a hack that says, "if we're doing a word regexp, then give me back any literal you find, even if it's a prefix." --- grep-regex/src/config.rs | 2 +- grep-regex/src/literal.rs | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs index efed9d48..4defc151 100644 --- a/grep-regex/src/config.rs +++ b/grep-regex/src/config.rs @@ -207,7 +207,7 @@ impl ConfiguredHIR { if self.config.line_terminator.is_none() { return Ok(None); } - match LiteralSets::new(&self.expr).one_regex() { + match LiteralSets::new(&self.expr).one_regex(self.config.word) { None => Ok(None), Some(pattern) => self.pattern_to_regex(&pattern).map(Some), } diff --git a/grep-regex/src/literal.rs b/grep-regex/src/literal.rs index b8a0c1d5..1563ca05 100644 --- a/grep-regex/src/literal.rs +++ b/grep-regex/src/literal.rs @@ -47,18 +47,23 @@ impl LiteralSets { /// generated these literal sets. The idea here is that the pattern /// returned by this method is much cheaper to search for. i.e., It is /// usually a single literal or an alternation of literals. - pub fn one_regex(&self) -> Option<String> { + pub fn one_regex(&self, word: bool) -> Option<String> { // TODO: The logic in this function is basically inscrutable. It grew // organically in the old grep 0.1 crate. Ideally, it would be // re-worked. 
In fact, the entire inner literal extraction should be // re-worked. Actually, most of regex-syntax's literal extraction // should also be re-worked. Alas... only so much time in the day. - if self.prefixes.all_complete() && !self.prefixes.is_empty() { - debug!("literal prefixes detected: {:?}", self.prefixes); - // When this is true, the regex engine will do a literal scan, - // so we don't need to return anything. - return None; + if !word { + if self.prefixes.all_complete() && !self.prefixes.is_empty() { + debug!("literal prefixes detected: {:?}", self.prefixes); + // When this is true, the regex engine will do a literal scan, + // so we don't need to return anything. But we only do this + // if we aren't doing a word regex, since a word regex adds + // a `(?:\W|^)` to the beginning of the regex, thereby + // defeating the regex engine's literal detection. + return None; + } } // Out of inner required literals, prefixes and suffixes, which one @@ -285,7 +290,7 @@ mod tests { } fn one_regex(pattern: &str) -> Option<String> { - sets(pattern).one_regex() + sets(pattern).one_regex(false) } // Put a pattern into the same format as the one returned by `one_regex`. -- cgit v1.2.3