From 47744a47c63fc6c22530504cba5d394fd7fa7e2d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 21 Jun 2023 20:38:02 -0400 Subject: regex: add new inner literal extractor This is mostly a copy of the prefix literal extractor in regex-syntax, but with a tweaked notion of Seq that keeps track of whether it's a prefix of an expression or not. If it isn't, then we can't cross it as a suffix to another Seq. This new extractor should be a lot more robust than the old one. We actually will keep going through the regex to try and find the "best" literals to search for (according to some heuristic). --- crates/regex/src/literal.rs | 967 +++++++++++++++++++++++++++++++++++++++++++- crates/regex/src/matcher.rs | 10 +- 2 files changed, 958 insertions(+), 19 deletions(-) diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs index 518e54bf..e52fb899 100644 --- a/crates/regex/src/literal.rs +++ b/crates/regex/src/literal.rs @@ -1,15 +1,42 @@ -#![allow(warnings)] - use { regex_automata::meta::Regex, regex_syntax::hir::{ - literal::{self, Seq}, + self, + literal::{Literal, Seq}, Hir, }, }; -use crate::config::ConfiguredHIR; +use crate::{config::ConfiguredHIR, error::Error}; +/// A type that encapsulates "inner" literal extraction from a regex. +/// +/// It uses a huge pile of heuristics to try to pluck out literals from a regex +/// that are in turn used to build a simpler regex that is more amenable to +/// optimization. +/// +/// The main idea underlying the validity of this technique is the fact +/// that ripgrep searches individual lines and not across lines. (Unless +/// -U/--multiline is enabled.) Namely, we can pluck literals out of the regex, +/// search for them, find the bounds of the line in which that literal occurs +/// and then run the original regex on only that line. This overall works +/// really really well in throughput-oriented searches because it potentially +/// allows ripgrep to spend a lot more time in a fast vectorized routine for +/// finding literals as opposed to the (much) slower regex engine. +/// +/// This optimization was far more important in the old days, but since then, +/// Rust's regex engine has actually grown its own (albeit limited) support for +/// inner literal optimizations. So this technique doesn't apply as much as it +/// used to. +/// +/// A good example of a regex where this particular extractor helps is +/// `\s+(Sherlock|[A-Z]atso[a-z]|Moriarty)\s+`. The `[A-Z]` before the `atso` +/// in particular is what inhibits the regex engine's own inner literal +/// optimizations from kicking in. This particular regex also did not have any +/// inner literals extracted in the old implementation (ripgrep <=13). So this +/// particular implementation represents a strict improvement from both the old +/// implementation and from the regex engine's own optimizations. (Which could +/// in theory be improved still.) #[derive(Clone, Debug)] pub(crate) struct InnerLiterals { seq: Seq, @@ -26,10 +53,35 @@ impl InnerLiterals { /// because it will query some state about the compiled regex. That state /// may influence inner literal extraction. pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals { - let seq = Seq::infinite(); - if chir.config().line_terminator.is_none() || re.is_accelerated() { + // If there's no line terminator, then the inner literal optimization + // at this level is not valid. 
+ if chir.config().line_terminator.is_none() { + log::trace!( + "skipping inner literal extraction, \ + no line terminator is set" + ); + return InnerLiterals::none(); + } + // If we believe the regex is already accelerated, then just let + // the regex engine do its thing. We'll skip the inner literal + // optimization. + if re.is_accelerated() { + log::trace!( + "skipping inner literal extraction, \ + existing regex is believed to already be accelerated", + ); return InnerLiterals::none(); + } + // In this case, we pretty much know that the regex engine will handle + // it as best as possible, even if it isn't reported as accelerated. + if chir.hir().properties().is_alternation_literal() { + log::trace!( + "skipping inner literal extraction, \ + found alternation of literals, deferring to regex engine", + ); + return InnerLiterals::none(); + } + let seq = Extractor::new().extract_untagged(chir.hir()); InnerLiterals { seq } } @@ -45,10 +97,905 @@ impl InnerLiterals { /// generated these literal sets. The idea here is that the pattern /// returned by this method is much cheaper to search for. i.e., It is /// usually a single literal or an alternation of literals. - pub(crate) fn one_regex(&self) -> Option<String> { - if !self.seq.is_finite() { - return None; + pub(crate) fn one_regex(&self) -> Result<Option<Regex>, Error> { + let Some(lits) = self.seq.literals() else { return Ok(None) }; + if lits.is_empty() { + return Ok(None); + } + let mut alts = vec![]; + for lit in lits.iter() { + alts.push(Hir::literal(lit.as_bytes())); + } + let hir = Hir::alternation(alts); + log::debug!("extracted fast line regex: {:?}", hir.to_string()); + let re = Regex::builder() + .configure(Regex::config().utf8_empty(false)) + .build_from_hir(&hir) + .map_err(Error::regex)?; + Ok(Some(re)) + } +} + +/// An inner literal extractor. +/// +/// This is a somewhat stripped down version of the extractor from +/// regex-syntax. The main difference is that we try to identify a "best" set +/// of required literals while traversing the HIR. +#[derive(Debug)] +struct Extractor { + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new inner literal extractor with a default configuration. + fn new() -> Extractor { + Extractor { + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 64, + } + } + + /// Execute the extractor at the top-level and return an untagged sequence + /// of literals. + fn extract_untagged(&self, hir: &Hir) -> Seq { + let mut seq = self.extract(hir); + log::trace!("extracted inner literals: {:?}", seq.seq); + seq.seq.optimize_for_prefix_by_preference(); + log::trace!( + "extracted inner literals after optimization: {:?}", + seq.seq + ); + if !seq.is_good() { + log::trace!( + "throwing away inner literals because they might be slow" + ); + seq.make_infinite(); + } + seq.seq + } + + /// Execute the extractor and return a sequence of literals. + fn extract(&self, hir: &Hir) -> TSeq { + use regex_syntax::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => TSeq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + TSeq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Capture(hir::Capture { ref sub, .. 
}) => self.extract(sub), + Concat(ref hirs) => self.extract_concat(hirs.iter()), + Alternation(ref hirs) => self.extract_alternation(hirs.iter()), + } + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> TSeq { + let mut seq = TSeq::singleton(self::Literal::exact(vec![])); + let mut prev: Option<TSeq> = None; + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. + if seq.is_inexact() { + // If a concatenation has an empty sequence anywhere, then + // it's impossible for the concatenation to ever match. So we + // can just quit now. + if seq.is_empty() { + return seq; + } + if seq.is_really_good() { + return seq; + } + prev = Some(match prev { + None => seq, + Some(prev) => prev.choose(seq), + }); + seq = TSeq::singleton(self::Literal::exact(vec![])); + seq.make_not_prefix(); + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, self.extract(hir)); + } + if let Some(prev) = prev { + prev.choose(seq) + } else { + seq + } + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>( + &self, + it: I, + ) -> TSeq { + let mut seq = TSeq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best. Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5}' => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> TSeq { + let mut subseq = self.extract(&rep.sub); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = TSeq::singleton(Literal::exact(vec![])); + if !greedy { + std::mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. 
} if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = TSeq::singleton(Literal::exact(vec![])); + for _ in 0..std::cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, max: Some(max), .. } if min < max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = TSeq::singleton(Literal::exact(vec![])); + for _ in 0..std::cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, subseq.clone()); + } + seq.make_inexact(); + seq + } + hir::Repetition { .. } => { + subseq.make_inexact(); + subseq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> TSeq { + if self.class_over_limit_unicode(cls) { + return TSeq::infinite(); + } + let mut seq = TSeq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> TSeq { + if self.class_over_limit_bytes(cls) { + return TSeq::infinite(); + } + let mut seq = TSeq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. + fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Returns true if the given byte class exceeds the configured limits on + /// this extractor. + fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Compute the cross product of the two sequences if the result would be + /// within configured limits. Otherwise, make `seq2` infinite and cross the + /// infinite sequence with `seq1`. + fn cross(&self, mut seq1: TSeq, mut seq2: TSeq) -> TSeq { + if !seq2.prefix { + return seq1.choose(seq2); + } + if seq1 + .max_cross_len(&seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); } - None + seq1.cross_forward(&mut seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + self.enforce_literal_len(&mut seq1); + seq1 + } + + /// Union the two sequences if the result would be within configured + /// limits. Otherwise, make `seq2` infinite and union the infinite sequence + /// with `seq1`. + fn union(&self, mut seq1: TSeq, seq2: &mut TSeq) -> TSeq { + if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) + { + // We try to trim our literal sequences to see if we can make + // room for more literals. 
The idea is that we'd rather trim down + // literals already in our sequence if it means we can add a few + // more and retain a finite sequence. Otherwise, we'll union with + // an infinite sequence and that infects everything and effectively + // stops literal extraction in its tracks. + // + // Why do we keep 4 bytes here? Well, it's a bit of an abstraction + // leakage. Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut TSeq) { + seq.keep_first_bytes(self.limit_literal_len); + } +} + +#[derive(Clone, Debug)] +struct TSeq { + seq: Seq, + prefix: bool, +} + +#[allow(dead_code)] +impl TSeq { + fn empty() -> TSeq { + TSeq { seq: Seq::empty(), prefix: true } + } + + fn infinite() -> TSeq { + TSeq { seq: Seq::infinite(), prefix: true } + } + + fn singleton(lit: Literal) -> TSeq { + TSeq { seq: Seq::singleton(lit), prefix: true } + } + + fn new<I, B>(it: I) -> TSeq + where + I: IntoIterator<Item = B>, + B: AsRef<[u8]>, + { + TSeq { seq: Seq::new(it), prefix: true } + } + + fn literals(&self) -> Option<&[Literal]> { + self.seq.literals() + } + + fn push(&mut self, lit: Literal) { + self.seq.push(lit); + } + + fn make_inexact(&mut self) { + self.seq.make_inexact(); + } + + fn make_infinite(&mut self) { + self.seq.make_infinite(); + } + + fn cross_forward(&mut self, other: &mut TSeq) { + assert!(other.prefix); + self.seq.cross_forward(&mut other.seq); + } + + fn union(&mut self, other: &mut TSeq) { + self.seq.union(&mut other.seq); + } + + fn dedup(&mut self) { + self.seq.dedup(); + } + + fn sort(&mut self) { + self.seq.sort(); + } + + fn keep_first_bytes(&mut self, len: usize) { + self.seq.keep_first_bytes(len); + } + + fn is_finite(&self) -> bool { + self.seq.is_finite() + } + + fn is_empty(&self) -> bool { + self.seq.is_empty() + } + + fn len(&self) -> Option<usize> { + self.seq.len() + } + + fn is_exact(&self) -> bool { + self.seq.is_exact() + } + + fn is_inexact(&self) -> bool { + self.seq.is_inexact() + } + + fn max_union_len(&self, other: &TSeq) -> Option<usize> { + self.seq.max_union_len(&other.seq) + } + + fn max_cross_len(&self, other: &TSeq) -> Option<usize> { + assert!(other.prefix); + self.seq.max_cross_len(&other.seq) + } + + fn min_literal_len(&self) -> Option<usize> { + self.seq.min_literal_len() + } + + fn max_literal_len(&self) -> Option<usize> { + self.seq.max_literal_len() + } + + // Below are methods specific to a TSeq that aren't just forwarding calls + // to a Seq method. + + /// Tags this sequence as "not a prefix." When this happens, this sequence + /// can't be crossed as a suffix of another sequence. + fn make_not_prefix(&mut self) { + self.prefix = false; + } + + /// Returns true if it's believed that the sequence given is "good" for + /// acceleration. 
This is useful for determining whether a sequence of + /// literals has any shot of being fast. + fn is_good(&self) -> bool { + if self.has_poisonous_literal() { + return false; + } + let Some(min) = self.min_literal_len() else { return false }; + let Some(len) = self.len() else { return false }; + // If we have some very short literals, then let's require that our + // sequence is itself very small. + if min <= 1 { + return len <= 3; + } + min >= 2 && len <= 64 + } + + /// Returns true if it's believed that the sequence given is "really + /// good" for acceleration. This is useful for short circuiting literal + /// extraction. + fn is_really_good(&self) -> bool { + if self.has_poisonous_literal() { + return false; + } + let Some(min) = self.min_literal_len() else { return false }; + let Some(len) = self.len() else { return false }; + min >= 3 && len <= 8 + } + + /// Returns true if the given sequence contains a poisonous literal. + fn has_poisonous_literal(&self) -> bool { + let Some(lits) = self.literals() else { return false }; + lits.iter().any(is_poisonous) + } + + /// Compare the two sequences and return the one that is believed to be best + /// according to a hodge podge of heuristics. + fn choose(self, other: TSeq) -> TSeq { + let (seq1, seq2) = (self, other); + if !seq1.is_finite() { + return seq2; + } else if !seq2.is_finite() { + return seq1; + } + if seq1.has_poisonous_literal() { + return seq2; + } else if seq2.has_poisonous_literal() { + return seq1; + } + let Some(min1) = seq1.min_literal_len() else { return seq2 }; + let Some(min2) = seq2.min_literal_len() else { return seq1 }; + if min1 < min2 { + return seq2; + } else if min2 < min1 { + return seq1; + } + // OK because we know both sequences are finite, otherwise they wouldn't + // have a minimum literal length. + let len1 = seq1.len().unwrap(); + let len2 = seq2.len().unwrap(); + if len1 < len2 { + return seq2; + } else if len2 < len1 { + return seq1; + } + // We could do extra stuff like looking at a background frequency + // distribution of bytes and picking the one that looks more rare, but for + // now we just pick one. + seq1 + } +} + +impl FromIterator<Literal> for TSeq { + fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> TSeq { + TSeq { seq: Seq::from_iter(it), prefix: true } + } +} + +/// Returns true if it is believed that this literal is likely to match very +/// frequently, and is thus not a good candidate for a prefilter. 
+fn is_poisonous(lit: &Literal) -> bool { + use regex_syntax::hir::literal::rank; + + lit.is_empty() || (lit.len() == 1 && rank(lit.as_bytes()[0]) >= 250) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn e(pattern: impl AsRef<str>) -> Seq { + let pattern = pattern.as_ref(); + let hir = regex_syntax::ParserBuilder::new() + .utf8(false) + .build() + .parse(pattern) + .unwrap(); + Extractor::new().extract_untagged(&hir) + } + + #[allow(non_snake_case)] + fn E(x: &str) -> Literal { + Literal::exact(x.as_bytes()) + } + + #[allow(non_snake_case)] + fn I(x: &str) -> Literal { + Literal::inexact(x.as_bytes()) + } + + fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq { + Seq::from_iter(it) + } + + fn inexact<I>(it: I) -> Seq + where + I: IntoIterator<Item = Literal>, + { + Seq::from_iter(it) + } + + fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> Seq { + Seq::new(it) + } + + #[test] + fn various() { + assert_eq!(e(r"foo"), seq([E("foo")])); + assert_eq!(e(r"[a-z]foo[a-z]"), seq([I("foo")])); + assert_eq!(e(r"[a-z](foo)(bar)[a-z]"), seq([I("foobar")])); + assert_eq!(e(r"[a-z]([a-z]foo)(bar[a-z])[a-z]"), seq([I("foobar")])); + assert_eq!(e(r"[a-z]([a-z]foo)([a-z]foo)[a-z]"), seq([I("foo")])); + assert_eq!(e(r"(\d{1,3}\.){3}\d{1,3}"), seq([I(".")])); + assert_eq!(e(r"[a-z]([a-z]foo){3}[a-z]"), seq([I("foo")])); + assert_eq!(e(r"[a-z](foo[a-z]){3}[a-z]"), seq([I("foo")])); + assert_eq!(e(r"[a-z]([a-z]foo[a-z]){3}[a-z]"), seq([I("foo")])); + assert_eq!( + e(r"[a-z]([a-z]foo){3}(bar[a-z]){3}[a-z]"), + seq([I("foobar")]) + ); + } + + // These test that some of our suspicious heuristics try to "pick better + // literals." + #[test] + fn heuristics() { + // Here, the first literals we stumble across are {ab, cd, ef}. But we + // keep going and our heuristics decide that {hiya} is better. (And it + // should be, since it's just one literal and it's longer.) + assert_eq!(e(r"[a-z]+(ab|cd|ef)[a-z]+hiya[a-z]+"), seq([I("hiya")])); + // But here, the first alternation becomes "good enough" that literal + // extraction short circuits early. {hiya} is probably still a better + // choice here, but {abc, def, ghi} is not bad. 
+ assert_eq!( + e(r"[a-z]+(abc|def|ghi)[a-z]+hiya[a-z]+"), + seq([I("abc"), I("def"), I("ghi")]) + ); + } + + #[test] + fn literal() { + assert_eq!(exact(["a"]), e("a")); + assert_eq!(exact(["aaaaa"]), e("aaaaa")); + assert_eq!(exact(["A", "a"]), e("(?i-u)a")); + assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); + assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); + + assert_eq!(Seq::infinite(), e(r"(?-u:\xFF)")); + assert_eq!(exact([b"Z"]), e(r"Z")); + + assert_eq!(exact(["☃"]), e("☃")); + assert_eq!(exact(["☃"]), e("(?i)☃")); + assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); + + assert_eq!(exact(["Δ"]), e("Δ")); + assert_eq!(exact(["δ"]), e("δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + assert_eq!(seq([I("aZ"), E("ab")]), e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(Seq::infinite(), e(r"a?")); + assert_eq!(Seq::infinite(), e(r"a??")); + assert_eq!(Seq::infinite(), e(r"a*")); + assert_eq!(Seq::infinite(), e(r"a*?")); + assert_eq!(inexact([I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!(inexact([I("aZ"), E("ab")]), e(r"aZ*b")); + assert_eq!(inexact([E("ab"), I("aZ")]), e(r"aZ*?b")); + assert_eq!(inexact([I("aZ")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")]), e(r"aZ{2,3}b")); + + assert_eq!(Seq::infinite(), e(r"(abc)?")); + assert_eq!(Seq::infinite(), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+")); + + assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"a*b*c")); + assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a+)?(b+)?c")); + 
assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a+|)(b+|)c")); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!(Seq::infinite(), e(r"a*b*c*")); + assert_eq!(inexact([I("a"), I("b"), I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")]), e(r"ab*")); + assert_eq!(inexact([I("ab"), E("ac")]), e(r"ab*c")); + assert_eq!(inexact([I("ab")]), e(r"ab+")); + assert_eq!(inexact([I("ab")]), e(r"ab+c")); + + assert_eq!(inexact([I("z"), E("azb")]), e(r"z*azb")); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ]); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(Seq::infinite(), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact([E("abc"), I("mZ"), E("mo"), E("xyz")]), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa"]), e(r"(?:|aa)aaa")); + assert_eq!(Seq::infinite(), e(r"(?:|aa)(?:aaa)*")); + assert_eq!(Seq::infinite(), e(r"(?:|aa)(?:aaa)*?")); + + assert_eq!(Seq::infinite(), e(r"a|b*")); + assert_eq!(inexact([E("a"), I("b")]), e(r"a|b+")); + + assert_eq!(inexact([I("a"), E("b"), E("c")]), e(r"a*b|c")); + + assert_eq!(Seq::infinite(), e(r"a|(?:b|c*)")); + + assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a|b)*c|(a|ab)*c")); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + } + + #[test] + fn impossible() { + // N.B. The extractor in this module "optimizes" the sequence and makes + // it infinite if it isn't "good." An empty sequence (generated by a + // concatenantion containing an expression that can never match) is + // considered "not good." Since infinite sequences are not actionably + // and disable optimizations, this winds up being okay. + // + // The literal extractor in regex-syntax doesn't combine these two + // steps and makes the caller choose to optimize. That is, it returns + // the sequences as they are. Which in this case, for some of the tests + // below, would be an empty Seq and not an infinite Seq. 
+ assert_eq!(Seq::infinite(), e(r"[a&&b]")); + assert_eq!(Seq::infinite(), e(r"a[a&&b]")); + assert_eq!(Seq::infinite(), e(r"[a&&b]b")); + assert_eq!(Seq::infinite(), e(r"a[a&&b]b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(Seq::infinite(), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. + // + // Note that unlike in regex-syntax, some of these have "inner" literals + // extracted where a prefix or suffix would otherwise not be found. + #[test] + fn anything() { + assert_eq!(Seq::infinite(), e(r".")); + assert_eq!(Seq::infinite(), e(r"(?s).")); + assert_eq!(Seq::infinite(), e(r"[A-Za-z]")); + assert_eq!(Seq::infinite(), e(r"[A-Z]")); + assert_eq!(Seq::infinite(), e(r"[A-Z]{0}")); + assert_eq!(Seq::infinite(), e(r"[A-Z]?")); + assert_eq!(Seq::infinite(), e(r"[A-Z]*")); + assert_eq!(Seq::infinite(), e(r"[A-Z]+")); + assert_eq!(seq([I("1")]), e(r"1[A-Z]")); + assert_eq!(seq([I("1")]), e(r"1[A-Z]2")); + assert_eq!(seq([E("123")]), e(r"[A-Z]+123")); + assert_eq!(seq([I("123")]), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(Seq::infinite(), e(r"1|[A-Z]|3")); + assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|2[A-Z]|3"),); + assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|[A-Z]2[A-Z]|3"),); + assert_eq!(seq([E("1"), E("2"), E("3")]), e(r"1|[A-Z]2|3"),); + assert_eq!(seq([E("1"), I("2"), E("4")]), e(r"1|2[A-Z]3|4"),); + assert_eq!(seq([E("2")]), e(r"(?:|1)[A-Z]2")); + assert_eq!(inexact([I("a")]), e(r"a.z")); + } + + #[test] + fn empty() { + assert_eq!(Seq::infinite(), e(r"")); + assert_eq!(Seq::infinite(), e(r"^")); + assert_eq!(Seq::infinite(), e(r"$")); + assert_eq!(Seq::infinite(), e(r"(?m:^)")); + assert_eq!(Seq::infinite(), e(r"(?m:$)")); + assert_eq!(Seq::infinite(), e(r"\b")); + assert_eq!(Seq::infinite(), e(r"\B")); + assert_eq!(Seq::infinite(), e(r"(?-u:\b)")); + assert_eq!(Seq::infinite(), e(r"(?-u:\B)")); + } + + #[test] + fn crazy_repeats() { + assert_eq!(Seq::infinite(), e(r"(?:){4294967295}")); + assert_eq!(Seq::infinite(), e(r"(?:){64}{64}{64}{64}{64}{64}")); + assert_eq!(Seq::infinite(), e(r"x{0}{4294967295}")); + assert_eq!(Seq::infinite(), e(r"(?:|){4294967295}")); + + assert_eq!( + Seq::infinite(), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let s = e(r"foobarfoobar|foobar|foobarzfoobar|foobarfoobar"); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. 
+ let s = e(r"abba|akka|abccba"); + assert_eq!(exact(["abba", "akka", "abccba"]), s); + + let s = e(r"sam|samwise"); + assert_eq!(seq([E("sam")]), s); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let s = e(r"foobarfoo|foo||foozfoo|foofoo"); + assert_eq!(Seq::infinite(), s); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. + // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) + let s = e(r"foobarfoo|foo| |foofoo"); + assert_eq!(Seq::infinite(), s); } } diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 73863374..7b132476 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -76,15 +76,7 @@ impl RegexMatcherBuilder { // then run the original regex on only that line. (In this case, the // regex engine is likely to handle this case for us since it's so // simple, but the idea applies.) - let fast_line_regex = match InnerLiterals::new(chir, re).one_regex() { - None => None, - Some(pattern) => { - Some(chir.config().build_many(&[pattern])?.to_regex()?) - } - }; - if let Some(ref re) = fast_line_regex { - log::debug!("extracted fast line regex: {:?}", re); - } + let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?; // We override the line terminator in case the configured HIR doesn't // support it. -- cgit v1.2.3