diff options
Diffstat (limited to 'crates/regex/src/config.rs')
-rw-r--r-- | crates/regex/src/config.rs | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs new file mode 100644 index 00000000..1f81a802 --- /dev/null +++ b/crates/regex/src/config.rs @@ -0,0 +1,288 @@ +use grep_matcher::{ByteSet, LineTerminator}; +use regex::bytes::{Regex, RegexBuilder}; +use regex_syntax::ast::{self, Ast}; +use regex_syntax::hir::{self, Hir}; + +use ast::AstAnalysis; +use crlf::crlfify; +use error::Error; +use literal::LiteralSets; +use multi::alternation_literals; +use non_matching::non_matching_bytes; +use strip::strip_from_match; + +/// Config represents the configuration of a regex matcher in this crate. +/// The configuration is itself a rough combination of the knobs found in +/// the `regex` crate itself, along with additional `grep-matcher` specific +/// options. +/// +/// The configuration can be used to build a "configured" HIR expression. A +/// configured HIR expression is an HIR expression that is aware of the +/// configuration which generated it, and provides transformation on that HIR +/// such that the configuration is preserved. +#[derive(Clone, Debug)] +pub struct Config { + pub case_insensitive: bool, + pub case_smart: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub line_terminator: Option<LineTerminator>, + pub crlf: bool, + pub word: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + case_insensitive: false, + case_smart: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + // These size limits are much bigger than what's in the regex + // crate. + size_limit: 100 * (1 << 20), + dfa_size_limit: 1000 * (1 << 20), + nest_limit: 250, + line_terminator: None, + crlf: false, + word: false, + } + } +} + +impl Config { + /// Parse the given pattern and returned its HIR expression along with + /// the current configuration. + /// + /// If there was a problem parsing the given expression then an error + /// is returned. + pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> { + let ast = self.ast(pattern)?; + let analysis = self.analysis(&ast)?; + let expr = hir::translate::TranslatorBuilder::new() + .allow_invalid_utf8(true) + .case_insensitive(self.is_case_insensitive(&analysis)) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .unicode(self.unicode) + .build() + .translate(pattern, &ast) + .map_err(Error::regex)?; + let expr = match self.line_terminator { + None => expr, + Some(line_term) => strip_from_match(expr, line_term)?, + }; + Ok(ConfiguredHIR { + original: pattern.to_string(), + config: self.clone(), + analysis: analysis, + // If CRLF mode is enabled, replace `$` with `(?:\r?$)`. + expr: if self.crlf { crlfify(expr) } else { expr }, + }) + } + + /// Accounting for the `smart_case` config knob, return true if and only if + /// this pattern should be matched case insensitively. + fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool { + if self.case_insensitive { + return true; + } + if !self.case_smart { + return false; + } + analysis.any_literal() && !analysis.any_uppercase() + } + + /// Returns true if and only if this config is simple enough such that + /// if the pattern is a simple alternation of literals, then it can be + /// constructed via a plain Aho-Corasick automaton. + /// + /// Note that it is OK to return true even when settings like `multi_line` + /// are enabled, since if multi-line can impact the match semantics of a + /// regex, then it is by definition not a simple alternation of literals. + pub fn can_plain_aho_corasick(&self) -> bool { + !self.word && !self.case_insensitive && !self.case_smart + } + + /// Perform analysis on the AST of this pattern. + /// + /// This returns an error if the given pattern failed to parse. + fn analysis(&self, ast: &Ast) -> Result<AstAnalysis, Error> { + Ok(AstAnalysis::from_ast(ast)) + } + + /// Parse the given pattern into its abstract syntax. + /// + /// This returns an error if the given pattern failed to parse. + fn ast(&self, pattern: &str) -> Result<Ast, Error> { + ast::parse::ParserBuilder::new() + .nest_limit(self.nest_limit) + .octal(self.octal) + .ignore_whitespace(self.ignore_whitespace) + .build() + .parse(pattern) + .map_err(Error::regex) + } +} + +/// A "configured" HIR expression, which is aware of the configuration which +/// produced this HIR. +/// +/// Since the configuration is tracked, values with this type can be +/// transformed into other HIR expressions (or regular expressions) in a way +/// that preserves the configuration. For example, the `fast_line_regex` +/// method will apply literal extraction to the inner HIR and use that to build +/// a new regex that matches the extracted literals in a way that is +/// consistent with the configuration that produced this HIR. For example, the +/// size limits set on the configured HIR will be propagated out to any +/// subsequently constructed HIR or regular expression. +#[derive(Clone, Debug)] +pub struct ConfiguredHIR { + original: String, + config: Config, + analysis: AstAnalysis, + expr: Hir, +} + +impl ConfiguredHIR { + /// Return the configuration for this HIR expression. + pub fn config(&self) -> &Config { + &self.config + } + + /// Compute the set of non-matching bytes for this HIR expression. + pub fn non_matching_bytes(&self) -> ByteSet { + non_matching_bytes(&self.expr) + } + + /// Returns true if and only if this regex needs to have its match offsets + /// tweaked because of CRLF support. Specifically, this occurs when the + /// CRLF hack is enabled and the regex is line anchored at the end. In + /// this case, matches that end with a `\r` have the `\r` stripped. + pub fn needs_crlf_stripped(&self) -> bool { + self.config.crlf && self.expr.is_line_anchored_end() + } + + /// Builds a regular expression from this HIR expression. + pub fn regex(&self) -> Result<Regex, Error> { + self.pattern_to_regex(&self.expr.to_string()) + } + + /// If this HIR corresponds to an alternation of literals with no + /// capturing groups, then this returns those literals. + pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> { + if !self.config.can_plain_aho_corasick() { + return None; + } + alternation_literals(&self.expr) + } + + /// Applies the given function to the concrete syntax of this HIR and then + /// generates a new HIR based on the result of the function in a way that + /// preserves the configuration. + /// + /// For example, this can be used to wrap a user provided regular + /// expression with additional semantics. e.g., See the `WordMatcher`. + pub fn with_pattern<F: FnMut(&str) -> String>( + &self, + mut f: F, + ) -> Result<ConfiguredHIR, Error> { + self.pattern_to_hir(&f(&self.expr.to_string())) + } + + /// If the current configuration has a line terminator set and if useful + /// literals could be extracted, then a regular expression matching those + /// literals is returned. If no line terminator is set, then `None` is + /// returned. + /// + /// If compiling the resulting regular expression failed, then an error + /// is returned. + /// + /// This method only returns something when a line terminator is set + /// because matches from this regex are generally candidates that must be + /// confirmed before reporting a match. When performing a line oriented + /// search, confirmation is easy: just extend the candidate match to its + /// respective line boundaries and then re-search that line for a full + /// match. This only works when the line terminator is set because the line + /// terminator setting guarantees that the regex itself can never match + /// through the line terminator byte. + pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> { + if self.config.line_terminator.is_none() { + return Ok(None); + } + match LiteralSets::new(&self.expr).one_regex(self.config.word) { + None => Ok(None), + Some(pattern) => self.pattern_to_regex(&pattern).map(Some), + } + } + + /// Create a regex from the given pattern using this HIR's configuration. + fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> { + // The settings we explicitly set here are intentionally a subset + // of the settings we have. The key point here is that our HIR + // expression is computed with the settings in mind, such that setting + // them here could actually lead to unintended behavior. For example, + // consider the pattern `(?U)a+`. This will get folded into the HIR + // as a non-greedy repetition operator which will in turn get printed + // to the concrete syntax as `a+?`, which is correct. But if we + // set the `swap_greed` option again, then we'll wind up with `(?U)a+?` + // which is equal to `a+` which is not the same as what we were given. + // + // We also don't need to apply `case_insensitive` since this gets + // folded into the HIR and would just cause us to do redundant work. + // + // Finally, we don't need to set `ignore_whitespace` since the concrete + // syntax emitted by the HIR printer never needs it. + // + // We set the rest of the options. Some of them are important, such as + // the size limit, and some of them are necessary to preserve the + // intention of the original pattern. For example, the Unicode flag + // will impact how the WordMatcher functions, namely, whether its + // word boundaries are Unicode aware or not. + RegexBuilder::new(&pattern) + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .size_limit(self.config.size_limit) + .dfa_size_limit(self.config.dfa_size_limit) + .build() + .map_err(Error::regex) + } + + /// Create an HIR expression from the given pattern using this HIR's + /// configuration. + fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> { + // See `pattern_to_regex` comment for explanation of why we only set + // a subset of knobs here. e.g., `swap_greed` is explicitly left out. + let expr = ::regex_syntax::ParserBuilder::new() + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .allow_invalid_utf8(true) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .build() + .parse(pattern) + .map_err(Error::regex)?; + Ok(ConfiguredHIR { + original: self.original.clone(), + config: self.config.clone(), + analysis: self.analysis.clone(), + expr: expr, + }) + } +} |