summary refs log tree commit diff stats
path: root/crates/regex/src/config.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/regex/src/config.rs')
-rw-r--r-- crates/regex/src/config.rs 288
1 files changed, 288 insertions, 0 deletions
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
new file mode 100644
index 00000000..1f81a802
--- /dev/null
+++ b/crates/regex/src/config.rs
@@ -0,0 +1,288 @@
+use grep_matcher::{ByteSet, LineTerminator};
+use regex::bytes::{Regex, RegexBuilder};
+use regex_syntax::ast::{self, Ast};
+use regex_syntax::hir::{self, Hir};
+
+use ast::AstAnalysis;
+use crlf::crlfify;
+use error::Error;
+use literal::LiteralSets;
+use multi::alternation_literals;
+use non_matching::non_matching_bytes;
+use strip::strip_from_match;
+
+/// Config represents the configuration of a regex matcher in this crate.
+/// The configuration is itself a rough combination of the knobs found in
+/// the `regex` crate itself, along with additional `grep-matcher` specific
+/// options.
+///
+/// The configuration can be used to build a "configured" HIR expression. A
+/// configured HIR expression is an HIR expression that is aware of the
+/// configuration which generated it, and provides transformations on that HIR
+/// such that the configuration is preserved.
+#[derive(Clone, Debug)]
+pub struct Config {
+ pub case_insensitive: bool, // when true, `hir` always translates case insensitively
+ pub case_smart: bool, // when true, case is decided from the pattern's AST analysis
+ pub multi_line: bool, // forwarded to the HIR translator and the regex builder
+ pub dot_matches_new_line: bool, // forwarded to the HIR translator and the regex builder
+ pub swap_greed: bool, // applied only by the HIR translator in `hir`
+ pub ignore_whitespace: bool, // applied only by the AST parser
+ pub unicode: bool, // forwarded to the HIR translator and the regex builder
+ pub octal: bool, // forwarded to the AST parser and the regex builder
+ pub size_limit: usize, // forwarded to `RegexBuilder::size_limit`
+ pub dfa_size_limit: usize, // forwarded to `RegexBuilder::dfa_size_limit`
+ pub nest_limit: u32, // forwarded to the AST parser and the regex builder
+ pub line_terminator: Option<LineTerminator>, // when set, `hir` applies `strip_from_match`
+ pub crlf: bool, // when true, `hir` applies `crlfify` to the expression
+ pub word: bool, // disables plain Aho-Corasick; passed to `LiteralSets::one_regex`
+}
+
+impl Default for Config {
+ fn default() -> Config {
+ Config {
+ case_insensitive: false,
+ case_smart: false,
+ multi_line: false,
+ dot_matches_new_line: false,
+ swap_greed: false,
+ ignore_whitespace: false,
+ unicode: true,
+ octal: false,
+ // These size limits (100 MiB and 1000 MiB respectively) are much
+ // bigger than what's in the regex crate.
+ size_limit: 100 * (1 << 20),
+ dfa_size_limit: 1000 * (1 << 20),
+ nest_limit: 250,
+ line_terminator: None,
+ crlf: false,
+ word: false,
+ }
+ }
+}
+
+impl Config {
+ /// Parse the given pattern and return its HIR expression along with
+ /// the current configuration.
+ ///
+ /// If parsing or translating the pattern fails, or if `strip_from_match`
+ /// returns an error, then an error is returned.
+ pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+ let ast = self.ast(pattern)?;
+ let analysis = self.analysis(&ast)?;
+ let expr = hir::translate::TranslatorBuilder::new()
+ .allow_invalid_utf8(true)
+ .case_insensitive(self.is_case_insensitive(&analysis))
+ .multi_line(self.multi_line)
+ .dot_matches_new_line(self.dot_matches_new_line)
+ .swap_greed(self.swap_greed)
+ .unicode(self.unicode)
+ .build()
+ .translate(pattern, &ast)
+ .map_err(Error::regex)?;
+ let expr = match self.line_terminator {
+ None => expr,
+ Some(line_term) => strip_from_match(expr, line_term)?,
+ };
+ Ok(ConfiguredHIR {
+ original: pattern.to_string(),
+ config: self.clone(),
+ analysis: analysis,
+ // If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
+ expr: if self.crlf { crlfify(expr) } else { expr },
+ })
+ }
+
+ /// Accounting for the `case_smart` config knob, return true if and only if
+ /// this pattern should be matched case insensitively.
+ fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool {
+ if self.case_insensitive {
+ return true;
+ }
+ if !self.case_smart {
+ return false;
+ }
+ analysis.any_literal() && !analysis.any_uppercase()
+ }
+
+ /// Returns true if and only if this config is simple enough such that
+ /// if the pattern is a simple alternation of literals, then it can be
+ /// constructed via a plain Aho-Corasick automaton.
+ ///
+ /// Note that it is OK to return true even when settings like `multi_line`
+ /// are enabled, since if multi-line can impact the match semantics of a
+ /// regex, then it is by definition not a simple alternation of literals.
+ pub fn can_plain_aho_corasick(&self) -> bool {
+ !self.word && !self.case_insensitive && !self.case_smart
+ }
+
+ /// Perform analysis on the AST of this pattern.
+ ///
+ /// This currently always returns `Ok`; the `Result` never carries an error.
+ fn analysis(&self, ast: &Ast) -> Result<AstAnalysis, Error> {
+ Ok(AstAnalysis::from_ast(ast))
+ }
+
+ /// Parse the given pattern into its abstract syntax.
+ ///
+ /// This returns an error if the given pattern failed to parse.
+ fn ast(&self, pattern: &str) -> Result<Ast, Error> {
+ ast::parse::ParserBuilder::new()
+ .nest_limit(self.nest_limit)
+ .octal(self.octal)
+ .ignore_whitespace(self.ignore_whitespace)
+ .build()
+ .parse(pattern)
+ .map_err(Error::regex)
+ }
+}
+
+/// A "configured" HIR expression, which is aware of the configuration which
+/// produced this HIR.
+///
+/// Since the configuration is tracked, values with this type can be
+/// transformed into other HIR expressions (or regular expressions) in a way
+/// that preserves the configuration. For example, the `fast_line_regex`
+/// method will apply literal extraction to the inner HIR and use that to build
+/// a new regex that matches the extracted literals in a way that is
+/// consistent with the configuration that produced this HIR. Similarly, the
+/// size limits set on the configured HIR will be propagated out to any
+/// subsequently constructed HIR or regular expression.
+#[derive(Clone, Debug)]
+pub struct ConfiguredHIR {
+ original: String, // the pattern string that was given to `Config::hir`
+ config: Config, // a clone of the configuration that produced this HIR
+ analysis: AstAnalysis, // the analysis computed from the original pattern's AST
+ expr: Hir, // the HIR expression itself
+}
+
+impl ConfiguredHIR {
+ /// Return the configuration for this HIR expression.
+ pub fn config(&self) -> &Config {
+ &self.config
+ }
+
+ /// Compute the set of non-matching bytes for this HIR expression.
+ pub fn non_matching_bytes(&self) -> ByteSet {
+ non_matching_bytes(&self.expr)
+ }
+
+ /// Returns true if and only if this regex needs to have its match offsets
+ /// tweaked because of CRLF support. Specifically, this occurs when the
+ /// CRLF hack is enabled and the regex is line anchored at the end. In
+ /// this case, matches that end with a `\r` have the `\r` stripped.
+ pub fn needs_crlf_stripped(&self) -> bool {
+ self.config.crlf && self.expr.is_line_anchored_end()
+ }
+
+ /// Builds a regular expression from the printed form of this HIR expression.
+ pub fn regex(&self) -> Result<Regex, Error> {
+ self.pattern_to_regex(&self.expr.to_string())
+ }
+
+ /// If this HIR is an alternation of literals with no capturing groups and
+ /// the config allows plain Aho-Corasick, then this returns those literals.
+ pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
+ if !self.config.can_plain_aho_corasick() {
+ return None;
+ }
+ alternation_literals(&self.expr)
+ }
+
+ /// Applies the given function to the concrete syntax of this HIR and then
+ /// generates a new HIR based on the result of the function in a way that
+ /// preserves the configuration.
+ ///
+ /// For example, this can be used to wrap a user provided regular
+ /// expression with additional semantics. e.g., See the `WordMatcher`.
+ pub fn with_pattern<F: FnMut(&str) -> String>(
+ &self,
+ mut f: F,
+ ) -> Result<ConfiguredHIR, Error> {
+ self.pattern_to_hir(&f(&self.expr.to_string()))
+ }
+
+ /// If the current configuration has a line terminator set and if useful
+ /// literals could be extracted, then a regular expression matching those
+ /// literals is returned. If no line terminator is set, or if
+ /// `LiteralSets::one_regex` returns `None`, then `None` is returned.
+ ///
+ /// If compiling the resulting regular expression failed, then an error
+ /// is returned.
+ ///
+ /// This method only returns something when a line terminator is set
+ /// because matches from this regex are generally candidates that must be
+ /// confirmed before reporting a match. When performing a line oriented
+ /// search, confirmation is easy: just extend the candidate match to its
+ /// respective line boundaries and then re-search that line for a full
+ /// match. This only works when the line terminator is set because the line
+ /// terminator setting guarantees that the regex itself can never match
+ /// through the line terminator byte.
+ pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
+ if self.config.line_terminator.is_none() {
+ return Ok(None);
+ }
+ match LiteralSets::new(&self.expr).one_regex(self.config.word) {
+ None => Ok(None),
+ Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
+ }
+ }
+
+ /// Create a regex from the given pattern using this HIR's configuration.
+ fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
+ // The settings we explicitly set here are intentionally a subset
+ // of the settings we have. The key point here is that our HIR
+ // expression is computed with the settings in mind, such that setting
+ // them here could actually lead to unintended behavior. For example,
+ // consider the pattern `(?U)a+`. This will get folded into the HIR
+ // as a non-greedy repetition operator which will in turn get printed
+ // to the concrete syntax as `a+?`, which is correct. But if we
+ // set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
+ // which is equal to `a+` which is not the same as what we were given.
+ //
+ // We also don't need to apply `case_insensitive` since this gets
+ // folded into the HIR and would just cause us to do redundant work.
+ //
+ // Finally, we don't need to set `ignore_whitespace` since the concrete
+ // syntax emitted by the HIR printer never needs it.
+ //
+ // We set the rest of the options. Some of them are important, such as
+ // the size limit, and some of them are necessary to preserve the
+ // intention of the original pattern. For example, the Unicode flag
+ // will impact how the WordMatcher functions, namely, whether its
+ // word boundaries are Unicode aware or not.
+ RegexBuilder::new(&pattern)
+ .nest_limit(self.config.nest_limit)
+ .octal(self.config.octal)
+ .multi_line(self.config.multi_line)
+ .dot_matches_new_line(self.config.dot_matches_new_line)
+ .unicode(self.config.unicode)
+ .size_limit(self.config.size_limit)
+ .dfa_size_limit(self.config.dfa_size_limit)
+ .build()
+ .map_err(Error::regex)
+ }
+
+ /// Create an HIR expression from the given pattern using this HIR's
+ /// configuration.
+ fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+ // See `pattern_to_regex` comment for explanation of why we only set
+ // a subset of knobs here. e.g., `swap_greed` is explicitly left out.
+ let expr = ::regex_syntax::ParserBuilder::new()
+ .nest_limit(self.config.nest_limit)
+ .octal(self.config.octal)
+ .allow_invalid_utf8(true)
+ .multi_line(self.config.multi_line)
+ .dot_matches_new_line(self.config.dot_matches_new_line)
+ .unicode(self.config.unicode)
+ .build()
+ .parse(pattern)
+ .map_err(Error::regex)?;
+ Ok(ConfiguredHIR {
+ original: self.original.clone(), // carried over from `self`, not the new `pattern`
+ config: self.config.clone(),
+ analysis: self.analysis.clone(), // carried over from `self`; not recomputed here
+ expr: expr,
+ })
+ }
+}