diff options
Diffstat (limited to 'grep-regex/src/matcher.rs')
-rw-r--r-- | grep-regex/src/matcher.rs | 216 |
1 files changed, 170 insertions, 46 deletions
diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs index d71f5777..7f30252a 100644 --- a/grep-regex/src/matcher.rs +++ b/grep-regex/src/matcher.rs @@ -8,6 +8,7 @@ use regex::bytes::{CaptureLocations, Regex}; use config::{Config, ConfiguredHIR}; use crlf::CRLFMatcher; use error::Error; +use multi::MultiLiteralMatcher; use word::WordMatcher; /// A builder for constructing a `Matcher` using regular expressions. @@ -52,7 +53,7 @@ impl RegexMatcherBuilder { } let matcher = RegexMatcherImpl::new(&chir)?; - trace!("final regex: {:?}", matcher.regex().to_string()); + trace!("final regex: {:?}", matcher.regex()); Ok(RegexMatcher { config: self.config.clone(), matcher: matcher, @@ -61,6 +62,29 @@ impl RegexMatcherBuilder { }) } + /// Build a new matcher from a plain alternation of literals. + /// + /// Depending on the configuration set by the builder, this may be able to + /// build a matcher substantially faster than by joining the patterns with + /// a `|` and calling `build`. + pub fn build_literals<B: AsRef<str>>( + &self, + literals: &[B], + ) -> Result<RegexMatcher, Error> { + let slices: Vec<_> = literals.iter().map(|s| s.as_ref()).collect(); + if !self.config.can_plain_aho_corasick() || literals.len() < 40 { + return self.build(&slices.join("|")); + } + let matcher = MultiLiteralMatcher::new(&slices)?; + let imp = RegexMatcherImpl::MultiLiteral(matcher); + Ok(RegexMatcher { + config: self.config.clone(), + matcher: imp, + fast_line_regex: None, + non_matching_bytes: ByteSet::empty(), + }) + } + /// Set the value for the case insensitive (`i`) flag. /// /// When enabled, letters in the pattern will match both upper case and @@ -348,6 +372,8 @@ impl RegexMatcher { enum RegexMatcherImpl { /// The standard matcher used for all regular expressions. Standard(StandardMatcher), + /// A matcher for an alternation of plain literals. + MultiLiteral(MultiLiteralMatcher), /// A matcher that strips `\r` from the end of matches. /// /// This is only used when the CRLF hack is enabled and the regex is line @@ -370,16 +396,23 @@ impl RegexMatcherImpl { } else if expr.needs_crlf_stripped() { Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?)) } else { + if let Some(lits) = expr.alternation_literals() { + if lits.len() >= 40 { + let matcher = MultiLiteralMatcher::new(&lits)?; + return Ok(RegexMatcherImpl::MultiLiteral(matcher)); + } + } Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) } } /// Return the underlying regex object used. - fn regex(&self) -> &Regex { + fn regex(&self) -> String { match *self { - RegexMatcherImpl::Word(ref x) => x.regex(), - RegexMatcherImpl::CRLF(ref x) => x.regex(), - RegexMatcherImpl::Standard(ref x) => &x.regex, + RegexMatcherImpl::Word(ref x) => x.regex().to_string(), + RegexMatcherImpl::CRLF(ref x) => x.regex().to_string(), + RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(), + RegexMatcherImpl::Standard(ref x) => x.regex.to_string(), } } } @@ -399,6 +432,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_at(haystack, at), + MultiLiteral(ref m) => m.find_at(haystack, at), CRLF(ref m) => m.find_at(haystack, at), Word(ref m) => m.find_at(haystack, at), } @@ -408,6 +442,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.new_captures(), + MultiLiteral(ref m) => m.new_captures(), CRLF(ref m) => m.new_captures(), Word(ref m) => m.new_captures(), } @@ -417,6 +452,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_count(), + MultiLiteral(ref m) => m.capture_count(), CRLF(ref m) => m.capture_count(), Word(ref m) => m.capture_count(), } @@ -426,6 +462,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.capture_index(name), + MultiLiteral(ref m) => m.capture_index(name), CRLF(ref m) => m.capture_index(name), Word(ref m) => m.capture_index(name), } @@ -435,6 +472,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find(haystack), + MultiLiteral(ref m) => m.find(haystack), CRLF(ref m) => m.find(haystack), Word(ref m) => m.find(haystack), } @@ -450,6 +488,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.find_iter(haystack, matched), + MultiLiteral(ref m) => m.find_iter(haystack, matched), CRLF(ref m) => m.find_iter(haystack, matched), Word(ref m) => m.find_iter(haystack, matched), } @@ -465,6 +504,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_find_iter(haystack, matched), + MultiLiteral(ref m) => m.try_find_iter(haystack, matched), CRLF(ref m) => m.try_find_iter(haystack, matched), Word(ref m) => m.try_find_iter(haystack, matched), } @@ -478,6 +518,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures(haystack, caps), + MultiLiteral(ref m) => m.captures(haystack, caps), CRLF(ref m) => m.captures(haystack, caps), Word(ref m) => m.captures(haystack, caps), } @@ -494,6 +535,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_iter(haystack, caps, matched), + MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched), CRLF(ref m) => m.captures_iter(haystack, caps, matched), Word(ref m) => m.captures_iter(haystack, caps, matched), } @@ -510,6 +552,9 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.try_captures_iter(haystack, caps, matched), + MultiLiteral(ref m) => { + m.try_captures_iter(haystack, caps, matched) + } CRLF(ref m) => m.try_captures_iter(haystack, caps, matched), Word(ref m) => m.try_captures_iter(haystack, caps, matched), } @@ -524,6 +569,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.captures_at(haystack, at, caps), + MultiLiteral(ref m) => m.captures_at(haystack, at, caps), CRLF(ref m) => m.captures_at(haystack, at, caps), Word(ref m) => m.captures_at(haystack, at, caps), } @@ -540,6 +586,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.replace(haystack, dst, append), + MultiLiteral(ref m) => m.replace(haystack, dst, append), CRLF(ref m) => m.replace(haystack, dst, append), Word(ref m) => m.replace(haystack, dst, append), } @@ -559,6 +606,9 @@ impl Matcher for RegexMatcher { Standard(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } + MultiLiteral(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } CRLF(ref m) => { m.replace_with_captures(haystack, caps, dst, append) } @@ -572,6 +622,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match(haystack), + MultiLiteral(ref m) => m.is_match(haystack), CRLF(ref m) => m.is_match(haystack), Word(ref m) => m.is_match(haystack), } @@ -585,6 +636,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.is_match_at(haystack, at), + MultiLiteral(ref m) => m.is_match_at(haystack, at), CRLF(ref m) => m.is_match_at(haystack, at), Word(ref m) => m.is_match_at(haystack, at), } @@ -597,6 +649,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match(haystack), + MultiLiteral(ref m) => m.shortest_match(haystack), CRLF(ref m) => m.shortest_match(haystack), Word(ref m) => m.shortest_match(haystack), } @@ -610,6 +663,7 @@ impl Matcher for RegexMatcher { use self::RegexMatcherImpl::*; match self.matcher { Standard(ref m) => m.shortest_match_at(haystack, at), + MultiLiteral(ref m) => m.shortest_match_at(haystack, at), CRLF(ref m) => m.shortest_match_at(haystack, at), Word(ref m) => m.shortest_match_at(haystack, at), } @@ -710,7 +764,9 @@ impl Matcher for StandardMatcher { at: usize, caps: &mut RegexCaptures, ) -> Result<bool, NoError> { - Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some()) + Ok(self.regex.captures_read_at( + &mut caps.locations_mut(), haystack, at, + ).is_some()) } fn shortest_match_at( @@ -737,54 +793,84 @@ impl Matcher for StandardMatcher { /// index of the group using the corresponding matcher's `capture_index` /// method, and then use that index with `RegexCaptures::get`. #[derive(Clone, Debug)] -pub struct RegexCaptures { - /// Where the locations are stored. - locs: CaptureLocations, - /// These captures behave as if the capturing groups begin at the given - /// offset. When set to `0`, this has no affect and capture groups are - /// indexed like normal. - /// - /// This is useful when building matchers that wrap arbitrary regular - /// expressions. For example, `WordMatcher` takes an existing regex `re` - /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex - /// has been wrapped from the caller. In order to do this, the matcher - /// and the capturing groups must behave as if `(re)` is the `0`th capture - /// group. - offset: usize, - /// When enable, the end of a match has `\r` stripped from it, if one - /// exists. - strip_crlf: bool, +pub struct RegexCaptures(RegexCapturesImp); + +#[derive(Clone, Debug)] +enum RegexCapturesImp { + AhoCorasick { + /// The start and end of the match, corresponding to capture group 0. + mat: Option<Match>, + }, + Regex { + /// Where the locations are stored. + locs: CaptureLocations, + /// These captures behave as if the capturing groups begin at the given + /// offset. When set to `0`, this has no affect and capture groups are + /// indexed like normal. + /// + /// This is useful when building matchers that wrap arbitrary regular + /// expressions. For example, `WordMatcher` takes an existing regex + /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that + /// the regex has been wrapped from the caller. In order to do this, + /// the matcher and the capturing groups must behave as if `(re)` is + /// the `0`th capture group. + offset: usize, + /// When enable, the end of a match has `\r` stripped from it, if one + /// exists. + strip_crlf: bool, + }, } impl Captures for RegexCaptures { fn len(&self) -> usize { - self.locs.len().checked_sub(self.offset).unwrap() + match self.0 { + RegexCapturesImp::AhoCorasick { .. } => 1, + RegexCapturesImp::Regex { ref locs, offset, .. } => { + locs.len().checked_sub(offset).unwrap() + } + } } fn get(&self, i: usize) -> Option<Match> { - if !self.strip_crlf { - let actual = i.checked_add(self.offset).unwrap(); - return self.locs.pos(actual).map(|(s, e)| Match::new(s, e)); - } - - // currently don't support capture offsetting with CRLF stripping - assert_eq!(self.offset, 0); - let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) { - None => return None, - Some(m) => m, - }; - // If the end position of this match corresponds to the end position - // of the overall match, then we apply our CRLF stripping. Otherwise, - // we cannot assume stripping is correct. - if i == 0 || m.end() == self.locs.pos(0).unwrap().1 { - Some(m.with_end(m.end() - 1)) - } else { - Some(m) + match self.0 { + RegexCapturesImp::AhoCorasick { mat, .. } => { + if i == 0 { + mat + } else { + None + } + } + RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => { + if !strip_crlf { + let actual = i.checked_add(offset).unwrap(); + return locs.pos(actual).map(|(s, e)| Match::new(s, e)); + } + + // currently don't support capture offsetting with CRLF + // stripping + assert_eq!(offset, 0); + let m = match locs.pos(i).map(|(s, e)| Match::new(s, e)) { + None => return None, + Some(m) => m, + }; + // If the end position of this match corresponds to the end + // position of the overall match, then we apply our CRLF + // stripping. Otherwise, we cannot assume stripping is correct. + if i == 0 || m.end() == locs.pos(0).unwrap().1 { + Some(m.with_end(m.end() - 1)) + } else { + Some(m) + } + } } } } impl RegexCaptures { + pub(crate) fn simple() -> RegexCaptures { + RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None }) + } + pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures { RegexCaptures::with_offset(locs, 0) } @@ -793,15 +879,53 @@ impl RegexCaptures { locs: CaptureLocations, offset: usize, ) -> RegexCaptures { - RegexCaptures { locs, offset, strip_crlf: false } + RegexCaptures(RegexCapturesImp::Regex { + locs, offset, strip_crlf: false, + }) + } + + pub(crate) fn locations(&self) -> &CaptureLocations { + match self.0 { + RegexCapturesImp::AhoCorasick { .. } => { + panic!("getting locations for simple captures is invalid") + } + RegexCapturesImp::Regex { ref locs, .. } => { + locs + } + } } - pub(crate) fn locations(&mut self) -> &mut CaptureLocations { - &mut self.locs + pub(crate) fn locations_mut(&mut self) -> &mut CaptureLocations { + match self.0 { + RegexCapturesImp::AhoCorasick { .. } => { + panic!("getting locations for simple captures is invalid") + } + RegexCapturesImp::Regex { ref mut locs, .. } => { + locs + } + } } pub(crate) fn strip_crlf(&mut self, yes: bool) { - self.strip_crlf = yes; + match self.0 { + RegexCapturesImp::AhoCorasick { .. } => { + panic!("setting strip_crlf for simple captures is invalid") + } + RegexCapturesImp::Regex { ref mut strip_crlf, .. } => { + *strip_crlf = yes; + } + } + } + + pub(crate) fn set_simple(&mut self, one: Option<Match>) { + match self.0 { + RegexCapturesImp::AhoCorasick { ref mut mat } => { + *mat = one; + } + RegexCapturesImp::Regex { .. } => { + panic!("setting simple captures for regex is invalid") + } + } } } |