diff options
author | Andrew Gallant <jamslam@gmail.com> | 2023-06-15 15:05:07 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2023-07-05 14:04:29 -0400 |
commit | e028ea37928930c80e5c3172d1df306b85a86758 (patch) | |
tree | fed757294923a7a44bfd6e8a8fe14955ea5f4de2 | |
parent | 1035f6b1ff502eb5b1a5fc49a79f45971c772d47 (diff) |
regex: migrate grep-regex to regex-automata
We just do a "basic" dumb migration. We don't try to improve anything
here.
-rw-r--r-- | Cargo.lock | 1 | ||||
-rw-r--r-- | crates/globset/src/lib.rs | 12 | ||||
-rw-r--r-- | crates/ignore/Cargo.toml | 4 | ||||
-rw-r--r-- | crates/regex/Cargo.toml | 1 | ||||
-rw-r--r-- | crates/regex/src/config.rs | 73 | ||||
-rw-r--r-- | crates/regex/src/crlf.rs | 40 | ||||
-rw-r--r-- | crates/regex/src/error.rs | 36 | ||||
-rw-r--r-- | crates/regex/src/literal.rs | 20 | ||||
-rw-r--r-- | crates/regex/src/matcher.rs | 121 | ||||
-rw-r--r-- | crates/regex/src/multi.rs | 2 | ||||
-rw-r--r-- | crates/regex/src/non_matching.rs | 1 | ||||
-rw-r--r-- | crates/regex/src/word.rs | 55 |
12 files changed, 237 insertions, 129 deletions
@@ -200,6 +200,7 @@ dependencies = [ "grep-matcher", "log", "regex", + "regex-automata 0.3.0", "regex-syntax", "thread_local", ] diff --git a/crates/globset/src/lib.rs b/crates/globset/src/lib.rs index 8ea9af11..dca0f7e0 100644 --- a/crates/globset/src/lib.rs +++ b/crates/globset/src/lib.rs @@ -498,13 +498,23 @@ impl GlobSetBuilder { /// Constructing candidates has a very small cost associated with it, so /// callers may find it beneficial to amortize that cost when matching a single /// path against multiple globs or sets of globs. -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct Candidate<'a> { path: Cow<'a, [u8]>, basename: Cow<'a, [u8]>, ext: Cow<'a, [u8]>, } +impl<'a> std::fmt::Debug for Candidate<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Candidate") + .field("path", &self.path.as_bstr()) + .field("basename", &self.basename.as_bstr()) + .field("ext", &self.ext.as_bstr()) + .finish() + } +} + impl<'a> Candidate<'a> { /// Create a new candidate for matching from the given path. 
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> { diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml index 1fad5865..b946e2db 100644 --- a/crates/ignore/Cargo.toml +++ b/crates/ignore/Cargo.toml @@ -22,8 +22,8 @@ bench = false globset = { version = "0.4.10", path = "../globset" } lazy_static = "1.1" log = "0.4.5" -memchr = "2.1" -regex = "1.1" +memchr = "2.5" +regex = "1.8.3" same-file = "1.0.4" thread_local = "1" walkdir = "2.2.7" diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index 4f9b2951..52293a33 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -19,5 +19,6 @@ bstr = "1.5.0" grep-matcher = { version = "0.1.6", path = "../matcher" } log = "0.4.5" regex = "1.8.3" +regex-automata = { version = "0.3.0" } regex-syntax = "0.7.2" thread_local = "1.1.7" diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index bb7430ee..ed1c2f86 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -1,15 +1,15 @@ -use grep_matcher::{ByteSet, LineTerminator}; -use regex::bytes::{Regex, RegexBuilder}; -use regex_syntax::ast::{self, Ast}; -use regex_syntax::hir::{self, Hir}; +use { + grep_matcher::{ByteSet, LineTerminator}, + regex_automata::meta::Regex, + regex_syntax::ast::{self, Ast}, + regex_syntax::hir::{self, Hir}, +}; -use crate::ast::AstAnalysis; -use crate::crlf::crlfify; -use crate::error::Error; -use crate::literal::LiteralSets; -use crate::multi::alternation_literals; -use crate::non_matching::non_matching_bytes; -use crate::strip::strip_from_match; +use crate::{ + ast::AstAnalysis, crlf::crlfify, error::Error, literal::LiteralSets, + multi::alternation_literals, non_matching::non_matching_bytes, + strip::strip_from_match, +}; /// Config represents the configuration of a regex matcher in this crate. 
/// The configuration is itself a rough combination of the knobs found in @@ -79,7 +79,7 @@ impl Config { .unicode(self.unicode) .build() .translate(pattern, &ast) - .map_err(Error::regex)?; + .map_err(Error::generic)?; let expr = match self.line_terminator { None => expr, Some(line_term) => strip_from_match(expr, line_term)?, @@ -133,7 +133,7 @@ impl Config { .ignore_whitespace(self.ignore_whitespace) .build() .parse(pattern) - .map_err(Error::regex) + .map_err(Error::generic) } } @@ -212,7 +212,13 @@ impl ConfiguredHIR { /// Builds a regular expression from this HIR expression. pub fn regex(&self) -> Result<Regex, Error> { - self.pattern_to_regex(&self.expr.to_string()) + self.pattern_to_regex(&self.pattern()) + } + + /// Returns the pattern string by converting this HIR to its concrete + /// syntax. + pub fn pattern(&self) -> String { + self.expr.to_string() } /// If this HIR corresponds to an alternation of literals with no @@ -234,7 +240,7 @@ impl ConfiguredHIR { &self, mut f: F, ) -> Result<ConfiguredHIR, Error> { - self.pattern_to_hir(&f(&self.expr.to_string())) + self.pattern_to_hir(&f(&self.pattern())) } /// If the current configuration has a line terminator set and if useful @@ -286,15 +292,21 @@ impl ConfiguredHIR { // intention of the original pattern. For example, the Unicode flag // will impact how the WordMatcher functions, namely, whether its // word boundaries are Unicode aware or not. 
- RegexBuilder::new(&pattern) + let syntax = regex_automata::util::syntax::Config::new() + .utf8(false) .nest_limit(self.config.nest_limit) .octal(self.config.octal) .multi_line(self.config.multi_line) .dot_matches_new_line(self.config.dot_matches_new_line) - .unicode(self.config.unicode) - .size_limit(self.config.size_limit) - .dfa_size_limit(self.config.dfa_size_limit) - .build() + .unicode(self.config.unicode); + let meta = Regex::config() + .utf8_empty(false) + .nfa_size_limit(Some(self.config.size_limit)) + .hybrid_cache_capacity(self.config.dfa_size_limit); + Regex::builder() + .syntax(syntax) + .configure(meta) + .build(pattern) .map_err(Error::regex) } @@ -303,7 +315,7 @@ impl ConfiguredHIR { fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> { // See `pattern_to_regex` comment for explanation of why we only set // a subset of knobs here. e.g., `swap_greed` is explicitly left out. - let expr = ::regex_syntax::ParserBuilder::new() + let expr = regex_syntax::ParserBuilder::new() .nest_limit(self.config.nest_limit) .octal(self.config.octal) .utf8(false) @@ -312,7 +324,7 @@ impl ConfiguredHIR { .unicode(self.config.unicode) .build() .parse(pattern) - .map_err(Error::regex)?; + .map_err(Error::generic)?; Ok(ConfiguredHIR { original: self.original.clone(), config: self.config.clone(), @@ -320,4 +332,21 @@ impl ConfiguredHIR { expr, }) } + + /* + fn syntax_config(&self) -> regex_automata::util::syntax::Config { + regex_automata::util::syntax::Config::new() + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + } + + fn meta_config(&self) -> regex_automata::meta::Config { + Regex::config() + .nfa_size_limit(Some(self.config.size_limit)) + .hybrid_cache_capacity(self.config.dfa_size_limit) + } + */ } diff --git a/crates/regex/src/crlf.rs b/crates/regex/src/crlf.rs index b0c85c84..e32204ef 100644 --- 
a/crates/regex/src/crlf.rs +++ b/crates/regex/src/crlf.rs @@ -1,18 +1,20 @@ use std::collections::HashMap; -use grep_matcher::{Match, Matcher, NoError}; -use regex::bytes::Regex; -use regex_syntax::hir::{self, Hir, HirKind}; +use { + grep_matcher::{Match, Matcher, NoError}, + regex_automata::{meta::Regex, Input, PatternID}, + regex_syntax::hir::{self, Hir, HirKind}, +}; -use crate::config::ConfiguredHIR; -use crate::error::Error; -use crate::matcher::RegexCaptures; +use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; /// A matcher for implementing "word match" semantics. #[derive(Clone, Debug)] pub struct CRLFMatcher { /// The regex. regex: Regex, + /// The pattern string corresponding to the regex above. + pattern: String, /// A map from capture group name to capture group index. names: HashMap<String, usize>, } @@ -26,18 +28,21 @@ impl CRLFMatcher { assert!(expr.needs_crlf_stripped()); let regex = expr.regex()?; + let pattern = expr.pattern(); let mut names = HashMap::new(); - for (i, optional_name) in regex.capture_names().enumerate() { + let it = regex.group_info().pattern_names(PatternID::ZERO); + for (i, optional_name) in it.enumerate() { if let Some(name) = optional_name { names.insert(name.to_string(), i.checked_sub(1).unwrap()); } } - Ok(CRLFMatcher { regex, names }) + Ok(CRLFMatcher { regex, pattern, names }) } - /// Return the underlying regex used by this matcher. - pub fn regex(&self) -> &Regex { - &self.regex + /// Return the underlying pattern string for the regex used by this + /// matcher. 
+ pub fn pattern(&self) -> &str { + &self.pattern } } @@ -50,7 +55,8 @@ impl Matcher for CRLFMatcher { haystack: &[u8], at: usize, ) -> Result<Option<Match>, NoError> { - let m = match self.regex.find_at(haystack, at) { + let input = Input::new(haystack).span(at..haystack.len()); + let m = match self.regex.find(input) { None => return Ok(None), Some(m) => Match::new(m.start(), m.end()), }; @@ -58,7 +64,7 @@ impl Matcher for CRLFMatcher { } fn new_captures(&self) -> Result<RegexCaptures, NoError> { - Ok(RegexCaptures::new(self.regex.capture_locations())) + Ok(RegexCaptures::new(self.regex.create_captures())) } fn capture_count(&self) -> usize { @@ -76,15 +82,15 @@ impl Matcher for CRLFMatcher { caps: &mut RegexCaptures, ) -> Result<bool, NoError> { caps.strip_crlf(false); - let r = - self.regex.captures_read_at(caps.locations_mut(), haystack, at); - if !r.is_some() { + let input = Input::new(haystack).span(at..haystack.len()); + self.regex.search_captures(&input, caps.locations_mut()); + if !caps.locations().is_match() { return Ok(false); } // If the end of our match includes a `\r`, then strip it from all // capture groups ending at the same location. 
- let end = caps.locations().get(0).unwrap().1; + let end = caps.locations().get_match().unwrap().end(); if end > 0 && haystack.get(end - 1) == Some(&b'\r') { caps.strip_crlf(true); } diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs index c5358551..2320276c 100644 --- a/crates/regex/src/error.rs +++ b/crates/regex/src/error.rs @@ -18,7 +18,21 @@ impl Error { Error { kind } } - pub(crate) fn regex<E: error::Error>(err: E) -> Error { + pub(crate) fn regex(err: regex_automata::meta::BuildError) -> Error { + // Error { kind: ErrorKind::Regex(err.to_string()) } + if let Some(size_limit) = err.size_limit() { + let kind = ErrorKind::Regex(format!( + "compiled regex exceeds size limit of {size_limit}", + )); + Error { kind } + } else if let Some(ref err) = err.syntax_error() { + Error::generic(err) + } else { + Error::generic(err) + } + } + + pub(crate) fn generic<E: error::Error>(err: E) -> Error { Error { kind: ErrorKind::Regex(err.to_string()) } } @@ -30,6 +44,7 @@ impl Error { /// The kind of an error that can occur. #[derive(Clone, Debug)] +#[non_exhaustive] pub enum ErrorKind { /// An error that occurred as a result of parsing a regular expression. /// This can be a syntax error or an error that results from attempting to @@ -51,25 +66,9 @@ pub enum ErrorKind { /// /// The invalid byte is included in this error. InvalidLineTerminator(u8), - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) 
- #[doc(hidden)] - __Nonexhaustive, } -impl error::Error for Error { - fn description(&self) -> &str { - match self.kind { - ErrorKind::Regex(_) => "regex error", - ErrorKind::NotAllowed(_) => "literal not allowed", - ErrorKind::InvalidLineTerminator(_) => "invalid line terminator", - ErrorKind::__Nonexhaustive => unreachable!(), - } - } -} +impl error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -82,7 +81,6 @@ impl fmt::Display for Error { let x = util::show_bytes(&[byte]); write!(f, "line terminators must be ASCII, but '{}' is not", x) } - ErrorKind::__Nonexhaustive => unreachable!(), } } } diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs index 8058d618..19d0ccc2 100644 --- a/crates/regex/src/literal.rs +++ b/crates/regex/src/literal.rs @@ -1,5 +1,25 @@ use regex_syntax::hir::Hir; +// BREADCRUMBS: +// +// The way we deal with line terminators in the regex is clunky, but probably +// the least bad option for now unfortunately. +// +// The `non_matching_bytes` routine currently hardcodes line terminators for +// anchors. But it's not really clear it should even care about line terminators +// anyway, since anchors aren't actually part of a match. If we fix that +// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't +// implemented correctly in multi-line search, because it defers to the fast +// line-by-line strategy, which ends up being wrong. I think the way forward +// there is to: +// +// 1) Add something in the grep-matcher interface that exposes a way to +// query for \A and \z specifically. If they're in the pattern, then we can +// decide how to handle them. +// +// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when +// multi-line search is not enabled. 
+ #[derive(Clone, Debug)] pub struct LiteralSets {} diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs index 350cf0c8..a32f4f31 100644 --- a/crates/regex/src/matcher.rs +++ b/crates/regex/src/matcher.rs @@ -1,15 +1,21 @@ -use std::collections::HashMap; - -use grep_matcher::{ - ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, +use { + grep_matcher::{ + ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, + NoError, + }, + regex_automata::{ + meta::Regex, util::captures::Captures as AutomataCaptures, Input, + PatternID, + }, }; -use regex::bytes::{CaptureLocations, Regex}; -use crate::config::{Config, ConfiguredHIR}; -use crate::crlf::CRLFMatcher; -use crate::error::Error; -use crate::multi::MultiLiteralMatcher; -use crate::word::WordMatcher; +use crate::{ + config::{Config, ConfiguredHIR}, + crlf::CRLFMatcher, + error::Error, + multi::MultiLiteralMatcher, + word::WordMatcher, +}; /// A builder for constructing a `Matcher` using regular expressions. /// @@ -73,6 +79,33 @@ impl RegexMatcherBuilder { &self, literals: &[B], ) -> Result<RegexMatcher, Error> { + // BREADCRUMBS: Ideally we would remove this method and just let the + // underlying regex engine handle this case. But... this is tricky. + // Part of the problem is that ripgrep escapes all patterns by the + // time the regex engine is constructed, which is necessary for PCRE2 + // for example. So that logic would need to change so that we don't + // escape things first. + // + // If we adjusted that, then I think we could just build an HIR value + // directly from the literals, thus skipping the parser altogether. + // + // But that still requires using and keeping this method. But we could + // at least get rid of the MultiLiteral matcher since the regex engine + // should now handle that case. + // + // Getting rid of this method is trickier, unless we make multi-pattern + // support a first class concept. 
But I don't think I want to go down + // that path? That implies we still need to accept a single pattern + // everywhere, which in turn means ripgrep would be forced to join + // the literals together using | and escape meta characters. By that + // point, we've lost. So I do think we still need this special method. + // But we can at least simplify the implementation. + // + // I still wonder if "fast parse" is still a good idea though. + // Basically, reject all nesting except for single-depth alternation. + // And reject character classes and all options. Just basically + // support `foo|bar|..|quux`. Maybe skip this for now I think. + let mut has_escape = false; let mut slices = vec![]; for lit in literals { @@ -430,10 +463,10 @@ impl RegexMatcherImpl { /// Return the underlying regex object used. fn regex(&self) -> String { match *self { - RegexMatcherImpl::Word(ref x) => x.regex().to_string(), - RegexMatcherImpl::CRLF(ref x) => x.regex().to_string(), + RegexMatcherImpl::Word(ref x) => x.pattern().to_string(), + RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(), RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(), - RegexMatcherImpl::Standard(ref x) => x.regex.to_string(), + RegexMatcherImpl::Standard(ref x) => x.pattern.clone(), } } } @@ -706,7 +739,10 @@ impl Matcher for RegexMatcher { ) -> Result<Option<LineMatchKind>, NoError> { Ok(match self.fast_line_regex { Some(ref regex) => { - regex.shortest_match(haystack).map(LineMatchKind::Candidate) + let input = Input::new(haystack); + regex + .search_half(&input) + .map(|hm| LineMatchKind::Candidate(hm.offset())) } None => { self.shortest_match(haystack)?.map(LineMatchKind::Confirmed) @@ -721,20 +757,15 @@ struct StandardMatcher { /// The regular expression compiled from the pattern provided by the /// caller. regex: Regex, - /// A map from capture group name to its corresponding index. - names: HashMap<String, usize>, + /// The underlying pattern string for the regex. 
+ pattern: String, } impl StandardMatcher { fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> { let regex = expr.regex()?; - let mut names = HashMap::new(); - for (i, optional_name) in regex.capture_names().enumerate() { - if let Some(name) = optional_name { - names.insert(name.to_string(), i); - } - } - Ok(StandardMatcher { regex, names }) + let pattern = expr.pattern(); + Ok(StandardMatcher { regex, pattern }) } } @@ -747,14 +778,12 @@ impl Matcher for StandardMatcher { haystack: &[u8], at: usize, ) -> Result<Option<Match>, NoError> { - Ok(self - .regex - .find_at(haystack, at) - .map(|m| Match::new(m.start(), m.end()))) + let input = Input::new(haystack).span(at..haystack.len()); + Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end()))) } fn new_captures(&self) -> Result<RegexCaptures, NoError> { - Ok(RegexCaptures::new(self.regex.capture_locations())) + Ok(RegexCaptures::new(self.regex.create_captures())) } fn capture_count(&self) -> usize { @@ -762,7 +791,7 @@ impl Matcher for StandardMatcher { } fn capture_index(&self, name: &str) -> Option<usize> { - self.names.get(name).map(|i| *i) + self.regex.group_info().to_index(PatternID::ZERO, name) } fn try_find_iter<F, E>( @@ -789,10 +818,10 @@ impl Matcher for StandardMatcher { at: usize, caps: &mut RegexCaptures, ) -> Result<bool, NoError> { - Ok(self - .regex - .captures_read_at(&mut caps.locations_mut(), haystack, at) - .is_some()) + let input = Input::new(haystack).span(at..haystack.len()); + let caps = caps.locations_mut(); + self.regex.search_captures(&input, caps); + Ok(caps.is_match()) } fn shortest_match_at( @@ -800,7 +829,8 @@ impl Matcher for StandardMatcher { haystack: &[u8], at: usize, ) -> Result<Option<usize>, NoError> { - Ok(self.regex.shortest_match_at(haystack, at)) + let input = Input::new(haystack).span(at..haystack.len()); + Ok(self.regex.search_half(&input).map(|hm| hm.offset())) } } @@ -829,7 +859,7 @@ enum RegexCapturesImp { }, Regex { /// Where the locations are 
stored. - locs: CaptureLocations, + locs: AutomataCaptures, /// These captures behave as if the capturing groups begin at the given /// offset. When set to `0`, this has no effect and capture groups are /// indexed like normal. @@ -852,7 +882,7 @@ impl Captures for RegexCaptures { match self.0 { RegexCapturesImp::AhoCorasick { .. } => 1, RegexCapturesImp::Regex { ref locs, offset, .. } => { - locs.len().checked_sub(offset).unwrap() + locs.group_info().all_group_len().checked_sub(offset).unwrap() } } } @@ -869,20 +899,25 @@ impl Captures for RegexCaptures { RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => { if !strip_crlf { let actual = i.checked_add(offset).unwrap(); - return locs.pos(actual).map(|(s, e)| Match::new(s, e)); + return locs + .get_group(actual) + .map(|sp| Match::new(sp.start, sp.end)); } // currently don't support capture offsetting with CRLF // stripping assert_eq!(offset, 0); - let m = match locs.pos(i).map(|(s, e)| Match::new(s, e)) { + let m = match locs + .get_group(i) + .map(|sp| Match::new(sp.start, sp.end)) + { None => return None, Some(m) => m, }; // If the end position of this match corresponds to the end // position of the overall match, then we apply our CRLF // stripping. Otherwise, we cannot assume stripping is correct. 
- if i == 0 || m.end() == locs.pos(0).unwrap().1 { + if i == 0 || m.end() == locs.get_group(0).unwrap().end { Some(m.with_end(m.end() - 1)) } else { Some(m) @@ -897,12 +932,12 @@ impl RegexCaptures { RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None }) } - pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures { + pub(crate) fn new(locs: AutomataCaptures) -> RegexCaptures { RegexCaptures::with_offset(locs, 0) } pub(crate) fn with_offset( - locs: CaptureLocations, + locs: AutomataCaptures, offset: usize, ) -> RegexCaptures { RegexCaptures(RegexCapturesImp::Regex { @@ -912,7 +947,7 @@ impl RegexCaptures { }) } - pub(crate) fn locations(&self) -> &CaptureLocations { + pub(crate) fn locations(&self) -> &AutomataCaptures { match self.0 { RegexCapturesImp::AhoCorasick { .. } => { panic!("getting locations for simple captures is invalid") @@ -921,7 +956,7 @@ impl RegexCaptures { } } - pub(crate) fn locations_mut(&mut self) -> &mut CaptureLocations { + pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures { match self.0 { RegexCapturesImp::AhoCorasick { .. 
} => { panic!("getting locations for simple captures is invalid") diff --git a/crates/regex/src/multi.rs b/crates/regex/src/multi.rs index 9d2b6135..8c24a845 100644 --- a/crates/regex/src/multi.rs +++ b/crates/regex/src/multi.rs @@ -26,7 +26,7 @@ impl MultiLiteralMatcher { let ac = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) .build(literals) - .map_err(Error::regex)?; + .map_err(Error::generic)?; Ok(MultiLiteralMatcher { ac }) } } diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs index eb890821..d19119cc 100644 --- a/crates/regex/src/non_matching.rs +++ b/crates/regex/src/non_matching.rs @@ -18,7 +18,6 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet { fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) { match *expr.kind() { HirKind::Empty - // | HirKind::Look(Look::Start | Look::End) | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate) | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {} HirKind::Look(Look::Start | Look::End) => { diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs index aa08164b..289c9923 100644 --- a/crates/regex/src/word.rs +++ b/crates/regex/src/word.rs @@ -1,27 +1,29 @@ -use std::cell::RefCell; -use std::collections::HashMap; -use std::sync::Arc; +use std::{cell::RefCell, collections::HashMap, sync::Arc}; -use grep_matcher::{Match, Matcher, NoError}; -use regex::bytes::{CaptureLocations, Regex}; -use thread_local::ThreadLocal; +use { + grep_matcher::{Match, Matcher, NoError}, + regex_automata::{ + meta::Regex, util::captures::Captures, Input, PatternID, + }, + thread_local::ThreadLocal, +}; -use crate::config::ConfiguredHIR; -use crate::error::Error; -use crate::matcher::RegexCaptures; +use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; /// A matcher for implementing "word match" semantics. #[derive(Debug)] pub struct WordMatcher { /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`. 
regex: Regex, + /// The pattern string corresponding to the above regex. + pattern: String, /// The original regex supplied by the user, which we use in a fast path /// to try and detect matches before deferring to slower engines. original: Regex, /// A map from capture group name to capture group index. names: HashMap<String, usize>, /// A reusable buffer for finding the match location of the inner group. - locs: Arc<ThreadLocal<RefCell<CaptureLocations>>>, + locs: Arc<ThreadLocal<RefCell<Captures>>>, } impl Clone for WordMatcher { @@ -31,6 +33,7 @@ impl Clone for WordMatcher { // usings `locs` to hit the fast path. WordMatcher { regex: self.regex.clone(), + pattern: self.pattern.clone(), original: self.original.clone(), names: self.names.clone(), locs: Arc::new(ThreadLocal::new()), @@ -53,20 +56,23 @@ impl WordMatcher { pat })?; let regex = word_expr.regex()?; + let pattern = word_expr.pattern(); let locs = Arc::new(ThreadLocal::new()); let mut names = HashMap::new(); - for (i, optional_name) in regex.capture_names().enumerate() { + let it = regex.group_info().pattern_names(PatternID::ZERO); + for (i, optional_name) in it.enumerate() { if let Some(name) = optional_name { names.insert(name.to_string(), i.checked_sub(1).unwrap()); } } - Ok(WordMatcher { regex, original, names, locs }) + Ok(WordMatcher { regex, pattern, original, names, locs }) } - /// Return the underlying regex used by this matcher. - pub fn regex(&self) -> &Regex { - &self.regex + /// Return the underlying pattern string for the regex used by this + /// matcher. + pub fn pattern(&self) -> &str { + &self.pattern } /// Attempt to do a fast confirmation of a word match that covers a subset @@ -102,7 +108,8 @@ impl WordMatcher { // The reason why we cannot handle the ^/$ cases here is because we // can't assume anything about the original pattern. (Try commenting // out the checks for ^/$ below and run the tests to see examples.) 
- let mut cand = match self.regex.find_at(haystack, at) { + let input = Input::new(haystack).span(at..haystack.len()); + let mut cand = match self.regex.find(input) { None => return Ok(None), Some(m) => Match::new(m.start(), m.end()), }; @@ -154,14 +161,15 @@ impl Matcher for WordMatcher { } let cell = - self.locs.get_or(|| RefCell::new(self.regex.capture_locations())); + self.locs.get_or(|| RefCell::new(self.regex.create_captures())); + let input = Input::new(haystack).span(at..haystack.len()); let mut caps = cell.borrow_mut(); - self.regex.captures_read_at(&mut caps, haystack, at); - Ok(caps.get(1).map(|m| Match::new(m.0, m.1))) + self.regex.search_captures(&input, &mut caps); + Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end))) } fn new_captures(&self) -> Result<RegexCaptures, NoError> { - Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1)) + Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1)) } fn capture_count(&self) -> usize { @@ -178,9 +186,10 @@ impl Matcher for WordMatcher { at: usize, caps: &mut RegexCaptures, ) -> Result<bool, NoError> { - let r = - self.regex.captures_read_at(caps.locations_mut(), haystack, at); - Ok(r.is_some()) + let input = Input::new(haystack).span(at..haystack.len()); + let caps = caps.locations_mut(); + self.regex.search_captures(&input, caps); + Ok(caps.is_match()) } // We specifically do not implement other methods like find_iter or |