summary refs log tree commit diff stats
path: root/grep-regex/src/matcher.rs
diff options
context:
space:
mode:
Diffstat (limited to 'grep-regex/src/matcher.rs')
-rw-r--r-- grep-regex/src/matcher.rs 216
1 files changed, 170 insertions, 46 deletions
diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs
index d71f5777..7f30252a 100644
--- a/grep-regex/src/matcher.rs
+++ b/grep-regex/src/matcher.rs
@@ -8,6 +8,7 @@ use regex::bytes::{CaptureLocations, Regex};
use config::{Config, ConfiguredHIR};
use crlf::CRLFMatcher;
use error::Error;
+use multi::MultiLiteralMatcher;
use word::WordMatcher;
/// A builder for constructing a `Matcher` using regular expressions.
@@ -52,7 +53,7 @@ impl RegexMatcherBuilder {
}
let matcher = RegexMatcherImpl::new(&chir)?;
- trace!("final regex: {:?}", matcher.regex().to_string());
+ trace!("final regex: {:?}", matcher.regex());
Ok(RegexMatcher {
config: self.config.clone(),
matcher: matcher,
@@ -61,6 +62,29 @@ impl RegexMatcherBuilder {
})
}
+    /// Build a new matcher from a plain alternation of literals.
+    ///
+    /// The result is meant to match what joining `literals` with `|` and
+    /// calling `build` would match. Depending on the configuration set by the
+    /// builder, this may be able to build that matcher substantially faster
+    /// than going through `build`.
+    ///
+    /// The fast path is only taken when the configuration permits it, there
+    /// are at least 40 literals, and no literal contains a `\`. Otherwise the
+    /// literals are joined with `|` and handed to `build`.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `build` or `MultiLiteralMatcher::new` fails.
+    pub fn build_literals<B: AsRef<str>>(
+        &self,
+        literals: &[B],
+    ) -> Result<RegexMatcher, Error> {
+        let slices: Vec<_> = literals.iter().map(|s| s.as_ref()).collect();
+        // The fallback path parses every entry as regex syntax, so an entry
+        // like `a\.b` means the literal `a.b` there. The fast path passes the
+        // strings through unparsed (presumably as raw literals; confirm
+        // against `MultiLiteralMatcher::new`), which would change what such
+        // an entry matches. Keep escaped entries on the regex path so both
+        // paths agree.
+        let has_escape = slices.iter().any(|s| s.contains('\\'));
+        if has_escape
+            || !self.config.can_plain_aho_corasick()
+            || literals.len() < 40
+        {
+            return self.build(&slices.join("|"));
+        }
+        let matcher = MultiLiteralMatcher::new(&slices)?;
+        let imp = RegexMatcherImpl::MultiLiteral(matcher);
+        Ok(RegexMatcher {
+            config: self.config.clone(),
+            matcher: imp,
+            // No line-oriented fast regex or byte set is derived for the
+            // literal-only matcher.
+            fast_line_regex: None,
+            non_matching_bytes: ByteSet::empty(),
+        })
+    }
+
/// Set the value for the case insensitive (`i`) flag.
///
/// When enabled, letters in the pattern will match both upper case and
@@ -348,6 +372,8 @@ impl RegexMatcher {
enum RegexMatcherImpl {
/// The standard matcher used for all regular expressions.
Standard(StandardMatcher),
+ /// A matcher for an alternation of plain literals.
+ MultiLiteral(MultiLiteralMatcher),
/// A matcher that strips `\r` from the end of matches.
///
/// This is only used when the CRLF hack is enabled and the regex is line
@@ -370,16 +396,23 @@ impl RegexMatcherImpl {
} else if expr.needs_crlf_stripped() {
Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
} else {
+ if let Some(lits) = expr.alternation_literals() {
+ if lits.len() >= 40 {
+ let matcher = MultiLiteralMatcher::new(&lits)?;
+ return Ok(RegexMatcherImpl::MultiLiteral(matcher));
+ }
+ }
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
}
}
/// Return the underlying regex object used.
+ ///
+ /// The regex is rendered with `to_string()` and returned as an owned
+ /// `String`. The `MultiLiteral` variant yields the placeholder `<N/A>`
+ /// instead of a pattern.
- fn regex(&self) -> &Regex {
+ fn regex(&self) -> String {
match *self {
- RegexMatcherImpl::Word(ref x) => x.regex(),
- RegexMatcherImpl::CRLF(ref x) => x.regex(),
- RegexMatcherImpl::Standard(ref x) => &x.regex,
+ RegexMatcherImpl::Word(ref x) => x.regex().to_string(),
+ RegexMatcherImpl::CRLF(ref x) => x.regex().to_string(),
+ RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
+ RegexMatcherImpl::Standard(ref x) => x.regex.to_string(),
}
}
}
@@ -399,6 +432,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.find_at(haystack, at),
+ MultiLiteral(ref m) => m.find_at(haystack, at),
CRLF(ref m) => m.find_at(haystack, at),
Word(ref m) => m.find_at(haystack, at),
}
@@ -408,6 +442,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.new_captures(),
+ MultiLiteral(ref m) => m.new_captures(),
CRLF(ref m) => m.new_captures(),
Word(ref m) => m.new_captures(),
}
@@ -417,6 +452,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.capture_count(),
+ MultiLiteral(ref m) => m.capture_count(),
CRLF(ref m) => m.capture_count(),
Word(ref m) => m.capture_count(),
}
@@ -426,6 +462,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.capture_index(name),
+ MultiLiteral(ref m) => m.capture_index(name),
CRLF(ref m) => m.capture_index(name),
Word(ref m) => m.capture_index(name),
}
@@ -435,6 +472,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.find(haystack),
+ MultiLiteral(ref m) => m.find(haystack),
CRLF(ref m) => m.find(haystack),
Word(ref m) => m.find(haystack),
}
@@ -450,6 +488,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.find_iter(haystack, matched),
+ MultiLiteral(ref m) => m.find_iter(haystack, matched),
CRLF(ref m) => m.find_iter(haystack, matched),
Word(ref m) => m.find_iter(haystack, matched),
}
@@ -465,6 +504,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.try_find_iter(haystack, matched),
+ MultiLiteral(ref m) => m.try_find_iter(haystack, matched),
CRLF(ref m) => m.try_find_iter(haystack, matched),
Word(ref m) => m.try_find_iter(haystack, matched),
}
@@ -478,6 +518,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.captures(haystack, caps),
+ MultiLiteral(ref m) => m.captures(haystack, caps),
CRLF(ref m) => m.captures(haystack, caps),
Word(ref m) => m.captures(haystack, caps),
}
@@ -494,6 +535,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.captures_iter(haystack, caps, matched),
+ MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched),
CRLF(ref m) => m.captures_iter(haystack, caps, matched),
Word(ref m) => m.captures_iter(haystack, caps, matched),
}
@@ -510,6 +552,9 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
+ MultiLiteral(ref m) => {
+ m.try_captures_iter(haystack, caps, matched)
+ }
CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
}
@@ -524,6 +569,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.captures_at(haystack, at, caps),
+ MultiLiteral(ref m) => m.captures_at(haystack, at, caps),
CRLF(ref m) => m.captures_at(haystack, at, caps),
Word(ref m) => m.captures_at(haystack, at, caps),
}
@@ -540,6 +586,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.replace(haystack, dst, append),
+ MultiLiteral(ref m) => m.replace(haystack, dst, append),
CRLF(ref m) => m.replace(haystack, dst, append),
Word(ref m) => m.replace(haystack, dst, append),
}
@@ -559,6 +606,9 @@ impl Matcher for RegexMatcher {
Standard(ref m) => {
m.replace_with_captures(haystack, caps, dst, append)
}
+ MultiLiteral(ref m) => {
+ m.replace_with_captures(haystack, caps, dst, append)
+ }
CRLF(ref m) => {
m.replace_with_captures(haystack, caps, dst, append)
}
@@ -572,6 +622,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.is_match(haystack),
+ MultiLiteral(ref m) => m.is_match(haystack),
CRLF(ref m) => m.is_match(haystack),
Word(ref m) => m.is_match(haystack),
}
@@ -585,6 +636,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.is_match_at(haystack, at),
+ MultiLiteral(ref m) => m.is_match_at(haystack, at),
CRLF(ref m) => m.is_match_at(haystack, at),
Word(ref m) => m.is_match_at(haystack, at),
}
@@ -597,6 +649,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.shortest_match(haystack),
+ MultiLiteral(ref m) => m.shortest_match(haystack),
CRLF(ref m) => m.shortest_match(haystack),
Word(ref m) => m.shortest_match(haystack),
}
@@ -610,6 +663,7 @@ impl Matcher for RegexMatcher {
use self::RegexMatcherImpl::*;
match self.matcher {
Standard(ref m) => m.shortest_match_at(haystack, at),
+ MultiLiteral(ref m) => m.shortest_match_at(haystack, at),
CRLF(ref m) => m.shortest_match_at(haystack, at),
Word(ref m) => m.shortest_match_at(haystack, at),
}
@@ -710,7 +764,9 @@ impl Matcher for StandardMatcher {
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
- Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some())
+ Ok(self.regex.captures_read_at(
+ &mut caps.locations_mut(), haystack, at,
+ ).is_some())
}
fn shortest_match_at(
@@ -737,54 +793,84 @@ impl Matcher for StandardMatcher {
/// index of the group using the corresponding matcher's `capture_index`
/// method, and then use that index with `RegexCaptures::get`.
#[derive(Clone, Debug)]
-pub struct RegexCaptures {
- /// Where the locations are stored.
- locs: CaptureLocations,
- /// These captures behave as if the capturing groups begin at the given
- /// offset. When set to `0`, this has no affect and capture groups are
- /// indexed like normal.
- ///
- /// This is useful when building matchers that wrap arbitrary regular
- /// expressions. For example, `WordMatcher` takes an existing regex `re`
- /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex
- /// has been wrapped from the caller. In order to do this, the matcher
- /// and the capturing groups must behave as if `(re)` is the `0`th capture
- /// group.
- offset: usize,
- /// When enable, the end of a match has `\r` stripped from it, if one
- /// exists.
- strip_crlf: bool,
+pub struct RegexCaptures(RegexCapturesImp);
+
+/// The storage behind `RegexCaptures`.
+///
+/// Either a single optional match (`AhoCorasick`) or a full set of regex
+/// capture locations with its adjustments (`Regex`).
+#[derive(Clone, Debug)]
+enum RegexCapturesImp {
+ /// Storage that records only the overall match and no other groups.
+ AhoCorasick {
+ /// The start and end of the match, corresponding to capture group 0.
+ mat: Option<Match>,
+ },
+ /// Storage for capture locations reported by a regex.
+ Regex {
+ /// Where the locations are stored.
+ locs: CaptureLocations,
+ /// These captures behave as if the capturing groups begin at the given
+ /// offset. When set to `0`, this has no effect and capture groups are
+ /// indexed like normal.
+ ///
+ /// This is useful when building matchers that wrap arbitrary regular
+ /// expressions. For example, `WordMatcher` takes an existing regex
+ /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that
+ /// the regex has been wrapped from the caller. In order to do this,
+ /// the matcher and the capturing groups must behave as if `(re)` is
+ /// the `0`th capture group.
+ offset: usize,
+ /// When enabled, the end of a match has `\r` stripped from it, if one
+ /// exists.
+ strip_crlf: bool,
+ },
}
impl Captures for RegexCaptures {
fn len(&self) -> usize {
- self.locs.len().checked_sub(self.offset).unwrap()
+ match self.0 {
+ // Only the overall match (group 0) is ever stored for this variant.
+ RegexCapturesImp::AhoCorasick { .. } => 1,
+ RegexCapturesImp::Regex { ref locs, offset, .. } => {
+ // Groups below `offset` are hidden from the caller. The unwrap
+ // panics if `offset` exceeds the number of stored groups.
+ locs.len().checked_sub(offset).unwrap()
+ }
+ }
}
fn get(&self, i: usize) -> Option<Match> {
- if !self.strip_crlf {
- let actual = i.checked_add(self.offset).unwrap();
- return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
- }
-
- // currently don't support capture offsetting with CRLF stripping
- assert_eq!(self.offset, 0);
- let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
- None => return None,
- Some(m) => m,
- };
- // If the end position of this match corresponds to the end position
- // of the overall match, then we apply our CRLF stripping. Otherwise,
- // we cannot assume stripping is correct.
- if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
- Some(m.with_end(m.end() - 1))
- } else {
- Some(m)
+ match self.0 {
+ RegexCapturesImp::AhoCorasick { mat, .. } => {
+ // Group 0 is the only group here; every other index reports
+ // no match.
+ if i == 0 {
+ mat
+ } else {
+ None
+ }
+ }
+ RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
+ // Without CRLF stripping, translate the caller's index by
+ // `offset` and report the stored span unchanged.
+ if !strip_crlf {
+ let actual = i.checked_add(offset).unwrap();
+ return locs.pos(actual).map(|(s, e)| Match::new(s, e));
+ }
+
+ // currently don't support capture offsetting with CRLF
+ // stripping
+ assert_eq!(offset, 0);
+ let m = match locs.pos(i).map(|(s, e)| Match::new(s, e)) {
+ None => return None,
+ Some(m) => m,
+ };
+ // If the end position of this match corresponds to the end
+ // position of the overall match, then we apply our CRLF
+ // stripping. Otherwise, we cannot assume stripping is correct.
+ if i == 0 || m.end() == locs.pos(0).unwrap().1 {
+ // Drops exactly one byte from the end of the span.
+ Some(m.with_end(m.end() - 1))
+ } else {
+ Some(m)
+ }
+ }
}
}
}
impl RegexCaptures {
+ /// Create captures backed by the `AhoCorasick` variant, with no match
+ /// recorded yet.
+ pub(crate) fn simple() -> RegexCaptures {
+ RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None })
+ }
+
+ /// Create captures over `locs` with a capture group offset of `0`.
pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
RegexCaptures::with_offset(locs, 0)
}
@@ -793,15 +879,53 @@ impl RegexCaptures {
locs: CaptureLocations,
offset: usize,
) -> RegexCaptures {
- RegexCaptures { locs, offset, strip_crlf: false }
+ RegexCaptures(RegexCapturesImp::Regex {
+ locs, offset, strip_crlf: false,
+ })
+ }
+
+ /// Borrow the underlying regex capture locations.
+ ///
+ /// # Panics
+ ///
+ /// Panics if these captures use the `AhoCorasick` variant, which stores no
+ /// `CaptureLocations`.
+ pub(crate) fn locations(&self) -> &CaptureLocations {
+ match self.0 {
+ RegexCapturesImp::AhoCorasick { .. } => {
+ panic!("getting locations for simple captures is invalid")
+ }
+ RegexCapturesImp::Regex { ref locs, .. } => {
+ locs
+ }
+ }
}
- pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
- &mut self.locs
+ /// Mutably borrow the underlying regex capture locations.
+ ///
+ /// # Panics
+ ///
+ /// Panics if these captures use the `AhoCorasick` variant, which stores no
+ /// `CaptureLocations`.
+ pub(crate) fn locations_mut(&mut self) -> &mut CaptureLocations {
+ match self.0 {
+ RegexCapturesImp::AhoCorasick { .. } => {
+ panic!("getting locations for simple captures is invalid")
+ }
+ RegexCapturesImp::Regex { ref mut locs, .. } => {
+ locs
+ }
+ }
}
+ /// Set whether `get` should apply CRLF stripping to reported matches.
+ ///
+ /// # Panics
+ ///
+ /// Panics if these captures use the `AhoCorasick` variant, which has no
+ /// `strip_crlf` setting.
pub(crate) fn strip_crlf(&mut self, yes: bool) {
- self.strip_crlf = yes;
+ match self.0 {
+ RegexCapturesImp::AhoCorasick { .. } => {
+ panic!("setting strip_crlf for simple captures is invalid")
+ }
+ RegexCapturesImp::Regex { ref mut strip_crlf, .. } => {
+ *strip_crlf = yes;
+ }
+ }
+ }
+
+ /// Record `one` as the stored match (capture group 0), replacing any
+ /// previous value. Passing `None` clears it.
+ ///
+ /// # Panics
+ ///
+ /// Panics if these captures use the `Regex` variant.
+ pub(crate) fn set_simple(&mut self, one: Option<Match>) {
+ match self.0 {
+ RegexCapturesImp::AhoCorasick { ref mut mat } => {
+ *mat = one;
+ }
+ RegexCapturesImp::Regex { .. } => {
+ panic!("setting simple captures for regex is invalid")
+ }
+ }
}
}