diff options
author | Andrew Gallant <jamslam@gmail.com> | 2023-10-09 18:23:36 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2023-10-09 20:29:52 -0400 |
commit | 9626f167573527858f9736a3054882de87d6cd79 (patch) | |
tree | 83a5bd062eff724bc857d8b83b7c008cba882ad9 /crates | |
parent | f7ff34fdf9d2853f9763aceb28f5dcb014728045 (diff) |
progress
Diffstat (limited to 'crates')
-rw-r--r-- | crates/globset/Cargo.toml | 4 | ||||
-rw-r--r-- | crates/ignore/Cargo.toml | 2 | ||||
-rw-r--r-- | crates/regex/Cargo.toml | 4 | ||||
-rw-r--r-- | crates/regex/src/ast.rs | 6 | ||||
-rw-r--r-- | crates/regex/src/config.rs | 91 | ||||
-rw-r--r-- | crates/regex/src/error.rs | 4 | ||||
-rw-r--r-- | crates/regex/src/lib.rs | 1 | ||||
-rw-r--r-- | crates/regex/src/matcher.rs | 397 | ||||
-rw-r--r-- | crates/regex/src/non_matching.rs | 9 | ||||
-rw-r--r-- | crates/regex/src/word.rs | 341 |
10 files changed, 80 insertions, 779 deletions
diff --git a/crates/globset/Cargo.toml b/crates/globset/Cargo.toml index b0602239..decc7804 100644 --- a/crates/globset/Cargo.toml +++ b/crates/globset/Cargo.toml @@ -26,12 +26,12 @@ log = { version = "0.4.20", optional = true } serde = { version = "1.0.188", optional = true } [dependencies.regex-syntax] -version = "0.7.5" +version = "0.8.0" default-features = false features = ["std"] [dependencies.regex-automata] -version = "0.3.8" +version = "0.4.0" default-features = false features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"] diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml index 31771f17..81dc9284 100644 --- a/crates/ignore/Cargo.toml +++ b/crates/ignore/Cargo.toml @@ -27,7 +27,7 @@ same-file = "1.0.6" walkdir = "2.4.0" [dependencies.regex-automata] -version = "0.3.8" +version = "0.4.0" default-features = false features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"] diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index f0ca8394..f3266081 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -17,5 +17,5 @@ edition = "2021" bstr = "1.6.2" grep-matcher = { version = "0.1.6", path = "../matcher" } log = "0.4.20" -regex-automata = { version = "0.3.8" } -regex-syntax = "0.7.5" +regex-automata = { version = "0.4.0" } +regex-syntax = "0.8.0" diff --git a/crates/regex/src/ast.rs b/crates/regex/src/ast.rs index 4d170565..a5a0573a 100644 --- a/crates/regex/src/ast.rs +++ b/crates/regex/src/ast.rs @@ -62,12 +62,12 @@ impl AstAnalysis { Ast::Flags(_) | Ast::Dot(_) | Ast::Assertion(_) - | Ast::Class(ast::Class::Unicode(_)) - | Ast::Class(ast::Class::Perl(_)) => {} + | Ast::ClassUnicode(_) + | Ast::ClassPerl(_) => {} Ast::Literal(ref x) => { self.from_ast_literal(x); } - Ast::Class(ast::Class::Bracketed(ref x)) => { + Ast::ClassBracketed(ref x) => { self.from_ast_class_set(&x.kind); } Ast::Repetition(ref x) => { diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 
79642580..8c69ef54 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -3,7 +3,7 @@ use { regex_automata::meta::Regex, regex_syntax::{ ast, - hir::{self, Hir, HirKind}, + hir::{self, Hir}, }, }; @@ -296,35 +296,6 @@ impl ConfiguredHIR { } } - /// Turns this configured HIR into one that only matches when both sides of - /// the match correspond to a word boundary. - /// - /// Note that the HIR returned is like turning `pat` into - /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group - /// `1` and not `0`. - pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> { - // In theory building the HIR for \W should never fail, but there are - // likely some pathological cases (particularly with respect to certain - // values of limits) where it could in theory fail. - let non_word = { - let mut config = self.config.clone(); - config.fixed_strings = false; - ConfiguredHIR::new(config, &[r"\W"])? - }; - let line_anchor_start = Hir::look(self.line_anchor_start()); - let line_anchor_end = Hir::look(self.line_anchor_end()); - let hir = Hir::concat(vec![ - Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]), - Hir::capture(hir::Capture { - index: 1, - name: None, - sub: Box::new(renumber_capture_indices(self.hir)?), - }), - Hir::alternation(vec![non_word.hir, line_anchor_end]), - ]); - Ok(ConfiguredHIR { config: self.config, hir }) - } - /// Turns this configured HIR into an equivalent one, but where it must /// match at the start and end of a line. pub(crate) fn into_whole_line(self) -> ConfiguredHIR { @@ -336,12 +307,20 @@ impl ConfiguredHIR { } /// Turns this configured HIR into an equivalent one, but where it must - /// match at the start and end of the haystack. - pub(crate) fn into_anchored(self) -> ConfiguredHIR { + /// match at word boundaries. 
+ pub(crate) fn into_word(self) -> ConfiguredHIR { let hir = Hir::concat(vec![ - Hir::look(hir::Look::Start), + Hir::look(if self.config.unicode { + hir::Look::WordStartHalfUnicode + } else { + hir::Look::WordStartHalfAscii + }), self.hir, - Hir::look(hir::Look::End), + Hir::look(if self.config.unicode { + hir::Look::WordEndHalfUnicode + } else { + hir::Look::WordEndHalfAscii + }), ]); ConfiguredHIR { config: self.config, hir } } @@ -365,50 +344,6 @@ impl ConfiguredHIR { } } -/// This increments the index of every capture group in the given hir by 1. If -/// any increment results in an overflow, then an error is returned. -fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> { - Ok(match hir.into_kind() { - HirKind::Empty => Hir::empty(), - HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit), - HirKind::Class(cls) => Hir::class(cls), - HirKind::Look(x) => Hir::look(x), - HirKind::Repetition(mut x) => { - x.sub = Box::new(renumber_capture_indices(*x.sub)?); - Hir::repetition(x) - } - HirKind::Capture(mut cap) => { - cap.index = match cap.index.checked_add(1) { - Some(index) => index, - None => { - // This error message kind of sucks, but it's probably - // impossible for it to happen. The only way a capture - // index can overflow addition is if the regex is huge - // (or something else has gone horribly wrong). - let msg = "could not renumber capture index, too big"; - return Err(Error::any(msg)); - } - }; - cap.sub = Box::new(renumber_capture_indices(*cap.sub)?); - Hir::capture(cap) - } - HirKind::Concat(subs) => { - let subs = subs - .into_iter() - .map(|sub| renumber_capture_indices(sub)) - .collect::<Result<Vec<Hir>, Error>>()?; - Hir::concat(subs) - } - HirKind::Alternation(subs) => { - let subs = subs - .into_iter() - .map(|sub| renumber_capture_indices(sub)) - .collect::<Result<Vec<Hir>, Error>>()?; - Hir::alternation(subs) - } - }) -} - /// Returns true if the given literal string contains any byte from the line /// terminator given. 
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool { diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index 1c921773..88a8adbe 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -30,10 +30,6 @@ impl Error { Error { kind: ErrorKind::Regex(err.to_string()) } } - pub(crate) fn any<E: ToString>(msg: E) -> Error { - Error { kind: ErrorKind::Regex(msg.to_string()) } - } - /// Return the kind of this error. pub fn kind(&self) -> &ErrorKind { &self.kind diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs index 068c7c71..4693bff1 100644 --- a/crates/regex/src/lib.rs +++ b/crates/regex/src/lib.rs @@ -15,4 +15,3 @@ mod literal; mod matcher; mod non_matching; mod strip; -mod word; diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 65c61d27..f3f673ff 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use { grep_matcher::{ ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, @@ -11,12 +9,7 @@ use { }, }; -use crate::{ - config::{Config, ConfiguredHIR}, - error::Error, - literal::InnerLiterals, - word::WordMatcher, -}; +use crate::{config::Config, error::Error, literal::InnerLiterals}; /// A builder for constructing a `Matcher` using regular expressions. /// @@ -61,9 +54,15 @@ impl RegexMatcherBuilder { &self, patterns: &[P], ) -> Result<RegexMatcher, Error> { - let chir = self.config.build_many(patterns)?; - let matcher = RegexMatcherImpl::new(chir)?; - let (chir, re) = (matcher.chir(), matcher.regex()); + let mut chir = self.config.build_many(patterns)?; + // 'whole_line' is a strict subset of 'word', so when it is enabled, + // we don't need to both with any specific to word matching. 
+ if chir.config().whole_line { + chir = chir.into_whole_line(); + } else if chir.config().word { + chir = chir.into_word(); + } + let regex = chir.to_regex()?; log::trace!("final regex: {:?}", chir.hir().to_string()); let non_matching_bytes = chir.non_matching_bytes(); @@ -76,18 +75,13 @@ impl RegexMatcherBuilder { // then run the original regex on only that line. (In this case, the // regex engine is likely to handle this case for us since it's so // simple, but the idea applies.) - let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?; + let fast_line_regex = InnerLiterals::new(&chir, &regex).one_regex()?; // We override the line terminator in case the configured HIR doesn't // support it. let mut config = self.config.clone(); config.line_terminator = chir.line_terminator(); - Ok(RegexMatcher { - config, - matcher, - fast_line_regex, - non_matching_bytes, - }) + Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes }) } /// Build a new matcher from a plain alternation of literals. @@ -357,8 +351,9 @@ impl RegexMatcherBuilder { pub struct RegexMatcher { /// The configuration specified by the caller. config: Config, - /// The underlying matcher implementation. - matcher: RegexMatcherImpl, + /// The regular expression compiled from the pattern provided by the + /// caller. + regex: Regex, /// A regex that never reports false negatives but may report false /// positives that is believed to be capable of being matched more quickly /// than `regex`. Typically, this is a single literal or an alternation @@ -392,53 +387,6 @@ impl RegexMatcher { } } -/// An encapsulation of the type of matcher we use in `RegexMatcher`. -#[derive(Clone, Debug)] -enum RegexMatcherImpl { - /// The standard matcher used for all regular expressions. - Standard(StandardMatcher), - /// A matcher that only matches at word boundaries. This transforms the - /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. 
- /// Because of this, the WordMatcher provides its own implementation of - /// `Matcher` to encapsulate its use of capture groups to make them - /// invisible to the caller. - Word(WordMatcher), -} - -impl RegexMatcherImpl { - /// Based on the configuration, create a new implementation of the - /// `Matcher` trait. - fn new(mut chir: ConfiguredHIR) -> Result<RegexMatcherImpl, Error> { - // When whole_line is set, we don't use a word matcher even if word - // matching was requested. Why? Because `(?m:^)(pat)(?m:$)` implies - // word matching. - Ok(if chir.config().word && !chir.config().whole_line { - RegexMatcherImpl::Word(WordMatcher::new(chir)?) - } else { - if chir.config().whole_line { - chir = chir.into_whole_line(); - } - RegexMatcherImpl::Standard(StandardMatcher::new(chir)?) - }) - } - - /// Return the underlying regex object used. - fn regex(&self) -> &Regex { - match *self { - RegexMatcherImpl::Word(ref x) => x.regex(), - RegexMatcherImpl::Standard(ref x) => &x.regex, - } - } - - /// Return the underlying HIR of the regex used for searching. - fn chir(&self) -> &ConfiguredHIR { - match *self { - RegexMatcherImpl::Word(ref x) => x.chir(), - RegexMatcherImpl::Standard(ref x) => &x.chir, - } - } -} - // This implementation just dispatches on the internal matcher impl except // for the line terminator optimization, which is possibly executed via // `fast_line_regex`. 
@@ -446,221 +394,84 @@ impl Matcher for RegexMatcher { type Captures = RegexCaptures; type Error = NoError; + #[inline] fn find_at( &self, haystack: &[u8], at: usize, ) -> Result<Option<Match>, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_at(haystack, at), - Word(ref m) => m.find_at(haystack, at), - } + let input = Input::new(haystack).span(at..haystack.len()); + Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end()))) } + #[inline] fn new_captures(&self) -> Result<RegexCaptures, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.new_captures(), - Word(ref m) => m.new_captures(), - } + Ok(RegexCaptures::new(self.regex.create_captures())) } + #[inline] fn capture_count(&self) -> usize { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_count(), - Word(ref m) => m.capture_count(), - } + self.regex.captures_len() } + #[inline] fn capture_index(&self, name: &str) -> Option<usize> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.capture_index(name), - Word(ref m) => m.capture_index(name), - } - } - - fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find(haystack), - Word(ref m) => m.find(haystack), - } - } - - fn find_iter<F>(&self, haystack: &[u8], matched: F) -> Result<(), NoError> - where - F: FnMut(Match) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.find_iter(haystack, matched), - Word(ref m) => m.find_iter(haystack, matched), - } + self.regex.group_info().to_index(PatternID::ZERO, name) } + #[inline] fn try_find_iter<F, E>( &self, haystack: &[u8], - matched: F, + mut matched: F, ) -> Result<Result<(), E>, NoError> where F: FnMut(Match) -> Result<bool, E>, { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_find_iter(haystack, 
matched), - Word(ref m) => m.try_find_iter(haystack, matched), - } - } - - fn captures( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - ) -> Result<bool, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures(haystack, caps), - Word(ref m) => m.captures(haystack, caps), - } - } - - fn captures_iter<F>( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - matched: F, - ) -> Result<(), NoError> - where - F: FnMut(&RegexCaptures) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_iter(haystack, caps, matched), - Word(ref m) => m.captures_iter(haystack, caps, matched), - } - } - - fn try_captures_iter<F, E>( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - matched: F, - ) -> Result<Result<(), E>, NoError> - where - F: FnMut(&RegexCaptures) -> Result<bool, E>, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.try_captures_iter(haystack, caps, matched), - Word(ref m) => m.try_captures_iter(haystack, caps, matched), + for m in self.regex.find_iter(haystack) { + match matched(Match::new(m.start(), m.end())) { + Ok(true) => continue, + Ok(false) => return Ok(Ok(())), + Err(err) => return Ok(Err(err)), + } } + Ok(Ok(())) } + #[inline] fn captures_at( &self, haystack: &[u8], at: usize, caps: &mut RegexCaptures, ) -> Result<bool, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.captures_at(haystack, at, caps), - Word(ref m) => m.captures_at(haystack, at, caps), - } - } - - fn replace<F>( - &self, - haystack: &[u8], - dst: &mut Vec<u8>, - append: F, - ) -> Result<(), NoError> - where - F: FnMut(Match, &mut Vec<u8>) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.replace(haystack, dst, append), - Word(ref m) => m.replace(haystack, dst, append), - } - } - - fn replace_with_captures<F>( - &self, - haystack: &[u8], - caps: &mut RegexCaptures, - dst: 
&mut Vec<u8>, - append: F, - ) -> Result<(), NoError> - where - F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool, - { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } - Word(ref m) => { - m.replace_with_captures(haystack, caps, dst, append) - } - } - } - - fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match(haystack), - Word(ref m) => m.is_match(haystack), - } - } - - fn is_match_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result<bool, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.is_match_at(haystack, at), - Word(ref m) => m.is_match_at(haystack, at), - } - } - - fn shortest_match( - &self, - haystack: &[u8], - ) -> Result<Option<usize>, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match(haystack), - Word(ref m) => m.shortest_match(haystack), - } + let input = Input::new(haystack).span(at..haystack.len()); + let caps = caps.captures_mut(); + self.regex.search_captures(&input, caps); + Ok(caps.is_match()) } + #[inline] fn shortest_match_at( &self, haystack: &[u8], at: usize, ) -> Result<Option<usize>, NoError> { - use self::RegexMatcherImpl::*; - match self.matcher { - Standard(ref m) => m.shortest_match_at(haystack, at), - Word(ref m) => m.shortest_match_at(haystack, at), - } + let input = Input::new(haystack).span(at..haystack.len()); + Ok(self.regex.search_half(&input).map(|hm| hm.offset())) } + #[inline] fn non_matching_bytes(&self) -> Option<&ByteSet> { Some(&self.non_matching_bytes) } + #[inline] fn line_terminator(&self) -> Option<LineTerminator> { self.config.line_terminator } + #[inline] fn find_candidate_line( &self, haystack: &[u8], @@ -679,93 +490,6 @@ impl Matcher for RegexMatcher { } } -/// The implementation of the standard regex matcher. 
-#[derive(Clone, Debug)] -struct StandardMatcher { - /// The regular expression compiled from the pattern provided by the - /// caller. - regex: Regex, - /// The HIR that produced this regex. - /// - /// We put this in an `Arc` because by the time it gets here, it won't - /// change. And because cloning and dropping an `Hir` is somewhat expensive - /// due to its deep recursive representation. - chir: Arc<ConfiguredHIR>, -} - -impl StandardMatcher { - fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> { - let chir = Arc::new(chir); - let regex = chir.to_regex()?; - Ok(StandardMatcher { regex, chir }) - } -} - -impl Matcher for StandardMatcher { - type Captures = RegexCaptures; - type Error = NoError; - - fn find_at( - &self, - haystack: &[u8], - at: usize, - ) -> Result<Option<Match>, NoError> { - let input = Input::new(haystack).span(at..haystack.len()); - Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end()))) - } - - fn new_captures(&self) -> Result<RegexCaptures, NoError> { - Ok(RegexCaptures::new(self.regex.create_captures())) - } - - fn capture_count(&self) -> usize { - self.regex.captures_len() - } - - fn capture_index(&self, name: &str) -> Option<usize> { - self.regex.group_info().to_index(PatternID::ZERO, name) - } - - fn try_find_iter<F, E>( - &self, - haystack: &[u8], - mut matched: F, - ) -> Result<Result<(), E>, NoError> - where - F: FnMut(Match) -> Result<bool, E>, - { - for m in self.regex.find_iter(haystack) { - match matched(Match::new(m.start(), m.end())) { - Ok(true) => continue, - Ok(false) => return Ok(Ok(())), - Err(err) => return Ok(Err(err)), - } - } - Ok(Ok(())) - } - - fn captures_at( - &self, - haystack: &[u8], - at: usize, - caps: &mut RegexCaptures, - ) -> Result<bool, NoError> { - let input = Input::new(haystack).span(at..haystack.len()); - let caps = caps.captures_mut(); - self.regex.search_captures(&input, caps); - Ok(caps.is_match()) - } - - fn shortest_match_at( - &self, - haystack: &[u8], - at: usize, - ) -> 
Result<Option<usize>, NoError> { - let input = Input::new(haystack).span(at..haystack.len()); - Ok(self.regex.search_half(&input).map(|hm| hm.offset())) - } -} - /// Represents the match offsets of each capturing group in a match. /// /// The first, or `0`th capture group, always corresponds to the entire match @@ -784,46 +508,27 @@ impl Matcher for StandardMatcher { pub struct RegexCaptures { /// Where the captures are stored. caps: AutomataCaptures, - /// These captures behave as if the capturing groups begin at the given - /// offset. When set to `0`, this has no affect and capture groups are - /// indexed like normal. - /// - /// This is useful when building matchers that wrap arbitrary regular - /// expressions. For example, `WordMatcher` takes an existing regex - /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that - /// the regex has been wrapped from the caller. In order to do this, - /// the matcher and the capturing groups must behave as if `(re)` is - /// the `0`th capture group. 
- offset: usize, } impl Captures for RegexCaptures { + #[inline] fn len(&self) -> usize { - self.caps - .group_info() - .all_group_len() - .checked_sub(self.offset) - .unwrap() + self.caps.group_info().all_group_len() } + #[inline] fn get(&self, i: usize) -> Option<Match> { - let actual = i.checked_add(self.offset).unwrap(); - self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end)) + self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end)) } } impl RegexCaptures { + #[inline] pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures { - RegexCaptures::with_offset(caps, 0) - } - - pub(crate) fn with_offset( - caps: AutomataCaptures, - offset: usize, - ) -> RegexCaptures { - RegexCaptures { caps, offset } + RegexCaptures { caps } } + #[inline] pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures { &mut self.caps } diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs index 7fde6c46..f93ed13b 100644 --- a/crates/regex/src/non_matching.rs +++ b/crates/regex/src/non_matching.rs @@ -19,7 +19,14 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { match *expr.kind() { HirKind::Empty | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate) - | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {} + | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) + | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode) + | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode) + | HirKind::Look( + Look::WordStartHalfAscii | Look::WordStartHalfUnicode, + ) + | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => { + } HirKind::Look(Look::Start | Look::End) => { // FIXME: This is wrong, but not doing this leads to incorrect // results because of how anchored searches are implemented in diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs deleted file mode 100644 index 52fb61ce..00000000 --- a/crates/regex/src/word.rs +++ /dev/null @@ -1,341 +0,0 @@ -use std::{ - 
collections::HashMap, - panic::{RefUnwindSafe, UnwindSafe}, - sync::Arc, -}; - -use { - grep_matcher::{Match, Matcher, NoError}, - regex_automata::{ - meta::Regex, util::captures::Captures, util::pool::Pool, Input, - PatternID, - }, -}; - -use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; - -type PoolFn = - Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>; - -/// A matcher for implementing "word match" semantics. -#[derive(Debug)] -pub(crate) struct WordMatcher { - /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`. - regex: Regex, - /// The HIR that produced the regex above. We don't keep the HIR for the - /// `original` regex. - /// - /// We put this in an `Arc` because by the time it gets here, it won't - /// change. And because cloning and dropping an `Hir` is somewhat expensive - /// due to its deep recursive representation. - chir: Arc<ConfiguredHIR>, - /// The original regex supplied by the user, which we use in a fast path - /// to try and detect matches before deferring to slower engines. - original: Regex, - /// A map from capture group name to capture group index. - names: HashMap<String, usize>, - /// A thread-safe pool of reusable buffers for finding the match offset of - /// the inner group. - caps: Arc<Pool<Captures, PoolFn>>, -} - -impl Clone for WordMatcher { - fn clone(&self) -> WordMatcher { - // We implement Clone manually so that we get a fresh Pool such that it - // can set its own thread owner. This permits each thread usings `caps` - // to hit the fast path. - // - // Note that cloning a regex is "cheap" since it uses reference - // counting internally. 
- let re = self.regex.clone(); - WordMatcher { - regex: self.regex.clone(), - chir: Arc::clone(&self.chir), - original: self.original.clone(), - names: self.names.clone(), - caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))), - } - } -} - -impl WordMatcher { - /// Create a new matcher from the given pattern that only produces matches - /// that are considered "words." - /// - /// The given options are used to construct the regular expression - /// internally. - pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> { - let original = chir.clone().into_anchored().to_regex()?; - let chir = Arc::new(chir.into_word()?); - let regex = chir.to_regex()?; - let caps = Arc::new(Pool::new({ - let regex = regex.clone(); - Box::new(move || regex.create_captures()) as PoolFn - })); - - let mut names = HashMap::new(); - let it = regex.group_info().pattern_names(PatternID::ZERO); - for (i, optional_name) in it.enumerate() { - if let Some(name) = optional_name { - names.insert(name.to_string(), i.checked_sub(1).unwrap()); - } - } - Ok(WordMatcher { regex, chir, original, names, caps }) - } - - /// Return the underlying regex used to match at word boundaries. - /// - /// The original regex is in the capture group at index 1. - pub(crate) fn regex(&self) -> &Regex { - &self.regex - } - - /// Return the underlying HIR for the regex used to match at word - /// boundaries. - pub(crate) fn chir(&self) -> &ConfiguredHIR { - &self.chir - } - - /// Attempt to do a fast confirmation of a word match that covers a subset - /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned - /// when a match is found. Ok(None) is returned when there is definitively - /// no match. Err(()) is returned when this routine could not detect - /// whether there was a match or not. - fn fast_find( - &self, - haystack: &[u8], - at: usize, - ) -> Result<Option<Match>, ()> { - // This is a bit hairy. 
The whole point here is to avoid running a - // slower regex engine to extract capture groups. Remember, our word - // regex looks like this: - // - // (^|\W)(<original regex>)(\W|$) - // - // What we want are the match offsets of <original regex>. So in the - // easy/common case, the original regex will be sandwiched between - // two codepoints that are in the \W class. So our approach here is to - // look for a match of the overall word regexp, strip the \W ends and - // then check whether the original regex matches what's left. If so, - // then we are guaranteed a correct match. - // - // This only works though if we know that the match is sandwiched - // between two \W codepoints. This only occurs when neither ^ nor $ - // match. This in turn only occurs when the match is at either the - // beginning or end of the haystack. In either of those cases, we - // declare defeat and defer to the slower implementation. - // - // The reason why we cannot handle the ^/$ cases here is because we - // can't assume anything about the original pattern. (Try commenting - // out the checks for ^/$ below and run the tests to see examples.) - // - // NOTE(2023-07-31): After fixing #2574, this logic honestly still - // doesn't seem correct. Regex composition is hard. - let input = Input::new(haystack).span(at..haystack.len()); - let mut cand = match self.regex.find(input) { - None => return Ok(None),< |