summaryrefslogtreecommitdiffstats
path: root/grep-regex/src/config.rs
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2019-04-07 18:43:01 -0400
committerAndrew Gallant <jamslam@gmail.com>2019-04-07 19:11:03 -0400
commit09108b7fda7af6db7c1c4f0366301f9a21cc485d (patch)
treefc1afd8e8b036312f97965f51d7de1e5e9d0db7d /grep-regex/src/config.rs
parent743d64f2e4093a3302895e128fbbc58e6fb8ed18 (diff)
regex: make multi-literal searcher faster
This makes the case of searching for a dictionary of a very large number of literals much much faster. (~10x or so.) In particular, we achieve this by short-circuiting the construction of a full regex when we know we have a simple alternation of literals. Building the regex for a large dictionary (>100,000 literals) turns out to be quite slow, even if it internally will dispatch to Aho-Corasick. Even that isn't quite enough. It turns out that even *parsing* such a regex is quite slow. So when the -F/--fixed-strings flag is set, we short circuit regex parsing completely and jump straight to Aho-Corasick. We aren't quite as fast as GNU grep here, but it's much closer (less than 2x slower). In general, this is somewhat of a hack. In particular, it seems plausible that this optimization could be implemented entirely in the regex engine. Unfortunately, the regex engine's internals are just not amenable to this at all, so it would require a larger refactoring effort. For now, it's good enough to add this fairly simple hack at a higher level. Unfortunately, if you don't pass -F/--fixed-strings, then ripgrep will be slower, because of the aforementioned missing optimization. Moreover, passing flags like `-i` or `-S` will cause ripgrep to abandon this optimization and fall back to something potentially much slower. Again, this fix really needs to happen inside the regex engine, although we might be able to special case -i when the input literals are pure ASCII via Aho-Corasick's `ascii_case_insensitive`. Fixes #497, Fixes #838
Diffstat (limited to 'grep-regex/src/config.rs')
-rw-r--r--grep-regex/src/config.rs49
1 files changed, 35 insertions, 14 deletions
diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs
index 4defc151..e04582e4 100644
--- a/grep-regex/src/config.rs
+++ b/grep-regex/src/config.rs
@@ -1,12 +1,13 @@
use grep_matcher::{ByteSet, LineTerminator};
use regex::bytes::{Regex, RegexBuilder};
use regex_syntax::ast::{self, Ast};
-use regex_syntax::hir::Hir;
+use regex_syntax::hir::{self, Hir};
use ast::AstAnalysis;
use crlf::crlfify;
use error::Error;
use literal::LiteralSets;
+use multi::alternation_literals;
use non_matching::non_matching_bytes;
use strip::strip_from_match;
@@ -67,19 +68,17 @@ impl Config {
/// If there was a problem parsing the given expression then an error
/// is returned.
pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
- let analysis = self.analysis(pattern)?;
- let expr = ::regex_syntax::ParserBuilder::new()
- .nest_limit(self.nest_limit)
- .octal(self.octal)
+ let ast = self.ast(pattern)?;
+ let analysis = self.analysis(&ast)?;
+ let expr = hir::translate::TranslatorBuilder::new()
.allow_invalid_utf8(true)
- .ignore_whitespace(self.ignore_whitespace)
- .case_insensitive(self.is_case_insensitive(&analysis)?)
+ .case_insensitive(self.is_case_insensitive(&analysis))
.multi_line(self.multi_line)
.dot_matches_new_line(self.dot_matches_new_line)
.swap_greed(self.swap_greed)
.unicode(self.unicode)
.build()
- .parse(pattern)
+ .translate(pattern, &ast)
.map_err(Error::regex)?;
let expr = match self.line_terminator {
None => expr,
@@ -99,21 +98,34 @@ impl Config {
fn is_case_insensitive(
&self,
analysis: &AstAnalysis,
- ) -> Result<bool, Error> {
+ ) -> bool {
if self.case_insensitive {
- return Ok(true);
+ return true;
}
if !self.case_smart {
- return Ok(false);
+ return false;
}
- Ok(analysis.any_literal() && !analysis.any_uppercase())
+ analysis.any_literal() && !analysis.any_uppercase()
+ }
+
+ /// Returns true if and only if this config is simple enough such that
+ /// if the pattern is a simple alternation of literals, then it can be
+ /// constructed via a plain Aho-Corasick automaton.
+ ///
+ /// Note that it is OK to return true even when settings like `multi_line`
+ /// are enabled, since if multi-line can impact the match semantics of a
+ /// regex, then it is by definition not a simple alternation of literals.
+ pub fn can_plain_aho_corasick(&self) -> bool {
+ !self.word
+ && !self.case_insensitive
+ && !self.case_smart
}
/// Perform analysis on the AST of this pattern.
///
/// This returns an error if the given pattern failed to parse.
- fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
- Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
+ fn analysis(&self, ast: &Ast) -> Result<AstAnalysis, Error> {
+ Ok(AstAnalysis::from_ast(ast))
}
/// Parse the given pattern into its abstract syntax.
@@ -173,6 +185,15 @@ impl ConfiguredHIR {
self.pattern_to_regex(&self.expr.to_string())
}
+ /// If this HIR corresponds to an alternation of literals with no
+ /// capturing groups, then this returns those literals.
+ pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
+ if !self.config.can_plain_aho_corasick() {
+ return None;
+ }
+ alternation_literals(&self.expr)
+ }
+
/// Applies the given function to the concrete syntax of this HIR and then
/// generates a new HIR based on the result of the function in a way that
/// preserves the configuration.