use grep_matcher::{ByteSet, LineTerminator}; use regex::bytes::{Regex, RegexBuilder}; use regex_syntax::ast::{self, Ast}; use regex_syntax::hir::{self, Hir}; use ast::AstAnalysis; use crlf::crlfify; use error::Error; use literal::LiteralSets; use multi::alternation_literals; use non_matching::non_matching_bytes; use strip::strip_from_match; /// Config represents the configuration of a regex matcher in this crate. /// The configuration is itself a rough combination of the knobs found in /// the `regex` crate itself, along with additional `grep-matcher` specific /// options. /// /// The configuration can be used to build a "configured" HIR expression. A /// configured HIR expression is an HIR expression that is aware of the /// configuration which generated it, and provides transformation on that HIR /// such that the configuration is preserved. #[derive(Clone, Debug)] pub struct Config { pub case_insensitive: bool, pub case_smart: bool, pub multi_line: bool, pub dot_matches_new_line: bool, pub swap_greed: bool, pub ignore_whitespace: bool, pub unicode: bool, pub octal: bool, pub size_limit: usize, pub dfa_size_limit: usize, pub nest_limit: u32, pub line_terminator: Option, pub crlf: bool, pub word: bool, } impl Default for Config { fn default() -> Config { Config { case_insensitive: false, case_smart: false, multi_line: false, dot_matches_new_line: false, swap_greed: false, ignore_whitespace: false, unicode: true, octal: false, // These size limits are much bigger than what's in the regex // crate. size_limit: 100 * (1<<20), dfa_size_limit: 1000 * (1<<20), nest_limit: 250, line_terminator: None, crlf: false, word: false, } } } impl Config { /// Parse the given pattern and returned its HIR expression along with /// the current configuration. /// /// If there was a problem parsing the given expression then an error /// is returned. pub fn hir(&self, pattern: &str) -> Result { let ast = self.ast(pattern)?; let analysis = self.analysis(&ast)?; let expr = hir::translate::TranslatorBuilder::new() .allow_invalid_utf8(true) .case_insensitive(self.is_case_insensitive(&analysis)) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) .swap_greed(self.swap_greed) .unicode(self.unicode) .build() .translate(pattern, &ast) .map_err(Error::regex)?; let expr = match self.line_terminator { None => expr, Some(line_term) => strip_from_match(expr, line_term)?, }; Ok(ConfiguredHIR { original: pattern.to_string(), config: self.clone(), analysis: analysis, // If CRLF mode is enabled, replace `$` with `(?:\r?$)`. expr: if self.crlf { crlfify(expr) } else { expr }, }) } /// Accounting for the `smart_case` config knob, return true if and only if /// this pattern should be matched case insensitively. fn is_case_insensitive( &self, analysis: &AstAnalysis, ) -> bool { if self.case_insensitive { return true; } if !self.case_smart { return false; } analysis.any_literal() && !analysis.any_uppercase() } /// Returns true if and only if this config is simple enough such that /// if the pattern is a simple alternation of literals, then it can be /// constructed via a plain Aho-Corasick automaton. /// /// Note that it is OK to return true even when settings like `multi_line` /// are enabled, since if multi-line can impact the match semantics of a /// regex, then it is by definition not a simple alternation of literals. pub fn can_plain_aho_corasick(&self) -> bool { !self.word && !self.case_insensitive && !self.case_smart } /// Perform analysis on the AST of this pattern. /// /// This returns an error if the given pattern failed to parse. fn analysis(&self, ast: &Ast) -> Result { Ok(AstAnalysis::from_ast(ast)) } /// Parse the given pattern into its abstract syntax. /// /// This returns an error if the given pattern failed to parse. fn ast(&self, pattern: &str) -> Result { ast::parse::ParserBuilder::new() .nest_limit(self.nest_limit) .octal(self.octal) .ignore_whitespace(self.ignore_whitespace) .build() .parse(pattern) .map_err(Error::regex) } } /// A "configured" HIR expression, which is aware of the configuration which /// produced this HIR. /// /// Since the configuration is tracked, values with this type can be /// transformed into other HIR expressions (or regular expressions) in a way /// that preserves the configuration. For example, the `fast_line_regex` /// method will apply literal extraction to the inner HIR and use that to build /// a new regex that matches the extracted literals in a way that is /// consistent with the configuration that produced this HIR. For example, the /// size limits set on the configured HIR will be propagated out to any /// subsequently constructed HIR or regular expression. #[derive(Clone, Debug)] pub struct ConfiguredHIR { original: String, config: Config, analysis: AstAnalysis, expr: Hir, } impl ConfiguredHIR { /// Return the configuration for this HIR expression. pub fn config(&self) -> &Config { &self.config } /// Compute the set of non-matching bytes for this HIR expression. pub fn non_matching_bytes(&self) -> ByteSet { non_matching_bytes(&self.expr) } /// Returns true if and only if this regex needs to have its match offsets /// tweaked because of CRLF support. Specifically, this occurs when the /// CRLF hack is enabled and the regex is line anchored at the end. In /// this case, matches that end with a `\r` have the `\r` stripped. pub fn needs_crlf_stripped(&self) -> bool { self.config.crlf && self.expr.is_line_anchored_end() } /// Builds a regular expression from this HIR expression. pub fn regex(&self) -> Result { self.pattern_to_regex(&self.expr.to_string()) } /// If this HIR corresponds to an alternation of literals with no /// capturing groups, then this returns those literals. pub fn alternation_literals(&self) -> Option>> { if !self.config.can_plain_aho_corasick() { return None; } alternation_literals(&self.expr) } /// Applies the given function to the concrete syntax of this HIR and then /// generates a new HIR based on the result of the function in a way that /// preserves the configuration. /// /// For example, this can be used to wrap a user provided regular /// expression with additional semantics. e.g., See the `WordMatcher`. pub fn with_pattern String>( &self, mut f: F, ) -> Result { self.pattern_to_hir(&f(&self.expr.to_string())) } /// If the current configuration has a line terminator set and if useful /// literals could be extracted, then a regular expression matching those /// literals is returned. If no line terminator is set, then `None` is /// returned. /// /// If compiling the resulting regular expression failed, then an error /// is returned. /// /// This method only returns something when a line terminator is set /// because matches from this regex are generally candidates that must be /// confirmed before reporting a match. When performing a line oriented /// search, confirmation is easy: just extend the candidate match to its /// respective line boundaries and then re-search that line for a full /// match. This only works when the line terminator is set because the line /// terminator setting guarantees that the regex itself can never match /// through the line terminator byte. pub fn fast_line_regex(&self) -> Result, Error> { if self.config.line_terminator.is_none() { return Ok(None); } match LiteralSets::new(&self.expr).one_regex(self.config.word) { None => Ok(None), Some(pattern) => self.pattern_to_regex(&pattern).map(Some), } } /// Create a regex from the given pattern using this HIR's configuration. fn pattern_to_regex(&self, pattern: &str) -> Result { // The settings we explicitly set here are intentionally a subset // of the settings we have. The key point here is that our HIR // expression is computed with the settings in mind, such that setting // them here could actually lead to unintended behavior. For example, // consider the pattern `(?U)a+`. This will get folded into the HIR // as a non-greedy repetition operator which will in turn get printed // to the concrete syntax as `a+?`, which is correct. But if we // set the `swap_greed` option again, then we'll wind up with `(?U)a+?` // which is equal to `a+` which is not the same as what we were given. // // We also don't need to apply `case_insensitive` since this gets // folded into the HIR and would just cause us to do redundant work. // // Finally, we don't need to set `ignore_whitespace` since the concrete // syntax emitted by the HIR printer never needs it. // // We set the rest of the options. Some of them are important, such as // the size limit, and some of them are necessary to preserve the // intention of the original pattern. For example, the Unicode flag // will impact how the WordMatcher functions, namely, whether its // word boundaries are Unicode aware or not. RegexBuilder::new(&pattern) .nest_limit(self.config.nest_limit) .octal(self.config.octal) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) .unicode(self.config.unicode) .size_limit(self.config.size_limit) .dfa_size_limit(self.config.dfa_size_limit) .build() .map_err(Error::regex) } /// Create an HIR expression from the given pattern using this HIR's /// configuration. fn pattern_to_hir(&self, pattern: &str) -> Result { // See `pattern_to_regex` comment for explanation of why we only set // a subset of knobs here. e.g., `swap_greed` is explicitly left out. let expr = ::regex_syntax::ParserBuilder::new() .nest_limit(self.config.nest_limit) .octal(self.config.octal) .allow_invalid_utf8(true) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) .unicode(self.config.unicode) .build() .parse(pattern) .map_err(Error::regex)?; Ok(ConfiguredHIR { original: self.original.clone(), config: self.config.clone(), analysis: self.analysis.clone(), expr: expr, }) } }