regex: migrate grep-regex to regex-automata

We just do a "basic" dumb migration. We don't try to improve anything here.
author: Andrew Gallant <jamslam@gmail.com> 2023-06-15 15:05:07 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2023-07-05 14:04:29 -0400
commit: e028ea37928930c80e5c3172d1df306b85a86758 (patch)
tree: fed757294923a7a44bfd6e8a8fe14955ea5f4de2
parent: 1035f6b1ff502eb5b1a5fc49a79f45971c772d47 (diff)
12 files changed, 237 insertions, 129 deletions
diff --git a/Cargo.lock b/Cargo.lock
index a721f8c7..b86f49d6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -200,6 +200,7 @@ dependencies = [
  "grep-matcher",
  "log",
  "regex",
+ "regex-automata 0.3.0",
  "regex-syntax",
  "thread_local",
 ]
diff --git a/crates/globset/src/lib.rs b/crates/globset/src/lib.rs
index 8ea9af11..dca0f7e0 100644
--- a/crates/globset/src/lib.rs
+++ b/crates/globset/src/lib.rs
@@ -498,13 +498,23 @@ impl GlobSetBuilder {
 /// Constructing candidates has a very small cost associated with it, so
 /// callers may find it beneficial to amortize that cost when matching a single
 /// path against multiple globs or sets of globs.
-#[derive(Clone, Debug)]
+#[derive(Clone)]
 pub struct Candidate<'a> {
     path: Cow<'a, [u8]>,
     basename: Cow<'a, [u8]>,
     ext: Cow<'a, [u8]>,
 }
 
+impl<'a> std::fmt::Debug for Candidate<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.debug_struct("Candidate")
+            .field("path", &self.path.as_bstr())
+            .field("basename", &self.basename.as_bstr())
+            .field("ext", &self.ext.as_bstr())
+            .finish()
+    }
+}
+
 impl<'a> Candidate<'a> {
     /// Create a new candidate for matching from the given path.
     pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml
index 1fad5865..b946e2db 100644
--- a/crates/ignore/Cargo.toml
+++ b/crates/ignore/Cargo.toml
@@ -22,8 +22,8 @@ bench = false
 globset = { version = "0.4.10", path = "../globset" }
 lazy_static = "1.1"
 log = "0.4.5"
-memchr = "2.1"
-regex = "1.1"
+memchr = "2.5"
+regex = "1.8.3"
 same-file = "1.0.4"
 thread_local = "1"
 walkdir = "2.2.7"
diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml
index 4f9b2951..52293a33 100644
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -19,5 +19,6 @@ bstr = "1.5.0"
 grep-matcher = { version = "0.1.6", path = "../matcher" }
 log = "0.4.5"
 regex = "1.8.3"
+regex-automata = { version = "0.3.0" }
 regex-syntax = "0.7.2"
 thread_local = "1.1.7"
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index bb7430ee..ed1c2f86 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -1,15 +1,15 @@
-use grep_matcher::{ByteSet, LineTerminator};
-use regex::bytes::{Regex, RegexBuilder};
-use regex_syntax::ast::{self, Ast};
-use regex_syntax::hir::{self, Hir};
+use {
+    grep_matcher::{ByteSet, LineTerminator},
+    regex_automata::meta::Regex,
+    regex_syntax::ast::{self, Ast},
+    regex_syntax::hir::{self, Hir},
+};
 
-use crate::ast::AstAnalysis;
-use crate::crlf::crlfify;
-use crate::error::Error;
-use crate::literal::LiteralSets;
-use crate::multi::alternation_literals;
-use crate::non_matching::non_matching_bytes;
-use crate::strip::strip_from_match;
+use crate::{
+    ast::AstAnalysis, crlf::crlfify, error::Error, literal::LiteralSets,
+    multi::alternation_literals, non_matching::non_matching_bytes,
+    strip::strip_from_match,
+};
 
 /// Config represents the configuration of a regex matcher in this crate.
 /// The configuration is itself a rough combination of the knobs found in
@@ -79,7 +79,7 @@ impl Config {
             .unicode(self.unicode)
             .build()
             .translate(pattern, &ast)
-            .map_err(Error::regex)?;
+            .map_err(Error::generic)?;
         let expr = match self.line_terminator {
             None => expr,
             Some(line_term) => strip_from_match(expr, line_term)?,
@@ -133,7 +133,7 @@ impl Config {
             .ignore_whitespace(self.ignore_whitespace)
             .build()
             .parse(pattern)
-            .map_err(Error::regex)
+            .map_err(Error::generic)
     }
 }
 
@@ -212,7 +212,13 @@ impl ConfiguredHIR {
 
     /// Builds a regular expression from this HIR expression.
     pub fn regex(&self) -> Result<Regex, Error> {
-        self.pattern_to_regex(&self.expr.to_string())
+        self.pattern_to_regex(&self.pattern())
+    }
+
+    /// Returns the pattern string by converting this HIR to its concrete
+    /// syntax.
+    pub fn pattern(&self) -> String {
+        self.expr.to_string()
     }
 
     /// If this HIR corresponds to an alternation of literals with no
@@ -234,7 +240,7 @@ impl ConfiguredHIR {
         &self,
         mut f: F,
     ) -> Result<ConfiguredHIR, Error> {
-        self.pattern_to_hir(&f(&self.expr.to_string()))
+        self.pattern_to_hir(&f(&self.pattern()))
     }
 
     /// If the current configuration has a line terminator set and if useful
@@ -286,15 +292,21 @@ impl ConfiguredHIR {
         // intention of the original pattern. For example, the Unicode flag
         // will impact how the WordMatcher functions, namely, whether its
         // word boundaries are Unicode aware or not.
-        RegexBuilder::new(&pattern)
+        let syntax = regex_automata::util::syntax::Config::new()
+            .utf8(false)
             .nest_limit(self.config.nest_limit)
             .octal(self.config.octal)
             .multi_line(self.config.multi_line)
             .dot_matches_new_line(self.config.dot_matches_new_line)
-            .unicode(self.config.unicode)
-            .size_limit(self.config.size_limit)
-            .dfa_size_limit(self.config.dfa_size_limit)
-            .build()
+            .unicode(self.config.unicode);
+        let meta = Regex::config()
+            .utf8_empty(false)
+            .nfa_size_limit(Some(self.config.size_limit))
+            .hybrid_cache_capacity(self.config.dfa_size_limit);
+        Regex::builder()
+            .syntax(syntax)
+            .configure(meta)
+            .build(pattern)
             .map_err(Error::regex)
     }
 
@@ -303,7 +315,7 @@ impl ConfiguredHIR {
     fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
         // See `pattern_to_regex` comment for explanation of why we only set
         // a subset of knobs here. e.g., `swap_greed` is explicitly left out.
-        let expr = ::regex_syntax::ParserBuilder::new()
+        let expr = regex_syntax::ParserBuilder::new()
             .nest_limit(self.config.nest_limit)
             .octal(self.config.octal)
             .utf8(false)
@@ -312,7 +324,7 @@ impl ConfiguredHIR {
             .unicode(self.config.unicode)
             .build()
             .parse(pattern)
-            .map_err(Error::regex)?;
+            .map_err(Error::generic)?;
         Ok(ConfiguredHIR {
             original: self.original.clone(),
             config: self.config.clone(),
@@ -320,4 +332,21 @@ impl ConfiguredHIR {
             expr,
         })
     }
+
+    /*
+    fn syntax_config(&self) -> regex_automata::util::syntax::Config {
+        regex_automata::util::syntax::Config::new()
+            .nest_limit(self.config.nest_limit)
+            .octal(self.config.octal)
+            .multi_line(self.config.multi_line)
+            .dot_matches_new_line(self.config.dot_matches_new_line)
+            .unicode(self.config.unicode)
+    }
+
+    fn meta_config(&self) -> regex_automata::meta::Config {
+        Regex::config()
+            .nfa_size_limit(Some(self.config.size_limit))
+            .hybrid_cache_capacity(self.config.dfa_size_limit)
+    }
+    */
 }
diff --git a/crates/regex/src/crlf.rs b/crates/regex/src/crlf.rs
index b0c85c84..e32204ef 100644
--- a/crates/regex/src/crlf.rs
+++ b/crates/regex/src/crlf.rs
@@ -1,18 +1,20 @@
 use std::collections::HashMap;
 
-use grep_matcher::{Match, Matcher, NoError};
-use regex::bytes::Regex;
-use regex_syntax::hir::{self, Hir, HirKind};
+use {
+    grep_matcher::{Match, Matcher, NoError},
+    regex_automata::{meta::Regex, Input, PatternID},
+    regex_syntax::hir::{self, Hir, HirKind},
+};
 
-use crate::config::ConfiguredHIR;
-use crate::error::Error;
-use crate::matcher::RegexCaptures;
+use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
 
 /// A matcher for implementing "word match" semantics.
 #[derive(Clone, Debug)]
 pub struct CRLFMatcher {
     /// The regex.
     regex: Regex,
+    /// The pattern string corresponding to the regex above.
+    pattern: String,
     /// A map from capture group name to capture group index.
     names: HashMap<String, usize>,
 }
@@ -26,18 +28,21 @@ impl CRLFMatcher {
         assert!(expr.needs_crlf_stripped());
 
         let regex = expr.regex()?;
+        let pattern = expr.pattern();
         let mut names = HashMap::new();
-        for (i, optional_name) in regex.capture_names().enumerate() {
+        let it = regex.group_info().pattern_names(PatternID::ZERO);
+        for (i, optional_name) in it.enumerate() {
             if let Some(name) = optional_name {
                 names.insert(name.to_string(), i.checked_sub(1).unwrap());
             }
         }
-        Ok(CRLFMatcher { regex, names })
+        Ok(CRLFMatcher { regex, pattern, names })
     }
 
-    /// Return the underlying regex used by this matcher.
-    pub fn regex(&self) -> &Regex {
-        &self.regex
+    /// Return the underlying pattern string for the regex used by this
+    /// matcher.
+    pub fn pattern(&self) -> &str {
+        &self.pattern
     }
 }
 
@@ -50,7 +55,8 @@ impl Matcher for CRLFMatcher {
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<Match>, NoError> {
-        let m = match self.regex.find_at(haystack, at) {
+        let input = Input::new(haystack).span(at..haystack.len());
+        let m = match self.regex.find(input) {
             None => return Ok(None),
             Some(m) => Match::new(m.start(), m.end()),
         };
@@ -58,7 +64,7 @@ impl Matcher for CRLFMatcher {
     }
 
     fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        Ok(RegexCaptures::new(self.regex.capture_locations()))
+        Ok(RegexCaptures::new(self.regex.create_captures()))
     }
 
     fn capture_count(&self) -> usize {
@@ -76,15 +82,15 @@ impl Matcher for CRLFMatcher {
         caps: &mut RegexCaptures,
     ) -> Result<bool, NoError> {
         caps.strip_crlf(false);
-        let r =
-            self.regex.captures_read_at(caps.locations_mut(), haystack, at);
-        if !r.is_some() {
+        let input = Input::new(haystack).span(at..haystack.len());
+        self.regex.search_captures(&input, caps.locations_mut());
+        if !caps.locations().is_match() {
             return Ok(false);
         }
 
         // If the end of our match includes a `\r`, then strip it from all
         // capture groups ending at the same location.
-        let end = caps.locations().get(0).unwrap().1;
+        let end = caps.locations().get_match().unwrap().end();
         if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
             caps.strip_crlf(true);
         }
diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs
index c5358551..2320276c 100644
--- a/crates/regex/src/error.rs
+++ b/crates/regex/src/error.rs
@@ -18,7 +18,21 @@ impl Error {
         Error { kind }
     }
 
-    pub(crate) fn regex<E: error::Error>(err: E) -> Error {
+    pub(crate) fn regex(err: regex_automata::meta::BuildError) -> Error {
+        // Error { kind: ErrorKind::Regex(err.to_string()) }
+        if let Some(size_limit) = err.size_limit() {
+            let kind = ErrorKind::Regex(format!(
+                "compiled regex exceeds size limit of {size_limit}",
+            ));
+            Error { kind }
+        } else if let Some(ref err) = err.syntax_error() {
+            Error::generic(err)
+        } else {
+            Error::generic(err)
+        }
+    }
+
+    pub(crate) fn generic<E: error::Error>(err: E) -> Error {
         Error { kind: ErrorKind::Regex(err.to_string()) }
     }
 
@@ -30,6 +44,7 @@ impl Error {
 
 /// The kind of an error that can occur.
 #[derive(Clone, Debug)]
+#[non_exhaustive]
 pub enum ErrorKind {
     /// An error that occurred as a result of parsing a regular expression.
     /// This can be a syntax error or an error that results from attempting to
@@ -51,25 +66,9 @@ pub enum ErrorKind {
     ///
     /// The invalid byte is included in this error.
     InvalidLineTerminator(u8),
-    /// Hints that destructuring should not be exhaustive.
-    ///
-    /// This enum may grow additional variants, so this makes sure clients
-    /// don't count on exhaustive matching. (Otherwise, adding a new variant
-    /// could break existing code.)
-    #[doc(hidden)]
-    __Nonexhaustive,
 }
 
-impl error::Error for Error {
-    fn description(&self) -> &str {
-        match self.kind {
-            ErrorKind::Regex(_) => "regex error",
-            ErrorKind::NotAllowed(_) => "literal not allowed",
-            ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
-            ErrorKind::__Nonexhaustive => unreachable!(),
-        }
-    }
-}
+impl error::Error for Error {}
 
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -82,7 +81,6 @@ impl fmt::Display for Error {
                 let x = util::show_bytes(&[byte]);
                 write!(f, "line terminators must be ASCII, but '{}' is not", x)
             }
-            ErrorKind::__Nonexhaustive => unreachable!(),
         }
     }
 }
diff --git a/crates/regex/src/literal.rs b/crates/regex/src/literal.rs
index 8058d618..19d0ccc2 100644
--- a/crates/regex/src/literal.rs
+++ b/crates/regex/src/literal.rs
@@ -1,5 +1,25 @@
 use regex_syntax::hir::Hir;
 
+// BREADCRUMBS:
+//
+// The way we deal with line terminators in the regex is clunky, but probably
+// the least bad option for now unfortunately.
+//
+// The `non_matching_bytes` routine currently hardcodes line terminators for
+// anchors. But it's not really clear it should even care about line terminators
+// anyway, since anchors aren't actually part of a match. If we fix that
+// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't
+// implemented correctly in multi-line search, because it defers to the fast
+// line-by-line strategy, which ends up being wrong. I think the way forward
+// there is to:
+//
+// 1) Add something in the grep-matcher interface that exposes a way to
+// query for \A and \z specifically. If they're in the pattern, then we can
+// decide how to handle them.
+//
+// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when
+// multi-line search is not enabled.
+
 #[derive(Clone, Debug)]
 pub struct LiteralSets {}
 
diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs
index 350cf0c8..a32f4f31 100644
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -1,15 +1,21 @@
-use std::collections::HashMap;
-
-use grep_matcher::{
-    ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError,
+use {
+    grep_matcher::{
+        ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
+        NoError,
+    },
+    regex_automata::{
+        meta::Regex, util::captures::Captures as AutomataCaptures, Input,
+        PatternID,
+    },
 };
-use regex::bytes::{CaptureLocations, Regex};
 
-use crate::config::{Config, ConfiguredHIR};
-use crate::crlf::CRLFMatcher;
-use crate::error::Error;
-use crate::multi::MultiLiteralMatcher;
-use crate::word::WordMatcher;
+use crate::{
+    config::{Config, ConfiguredHIR},
+    crlf::CRLFMatcher,
+    error::Error,
+    multi::MultiLiteralMatcher,
+    word::WordMatcher,
+};
 
 /// A builder for constructing a `Matcher` using regular expressions.
 ///
@@ -73,6 +79,33 @@ impl RegexMatcherBuilder {
         &self,
         literals: &[B],
     ) -> Result<RegexMatcher, Error> {
+        // BREADCRUMBS: Ideally we would remove this method and just let the
+        // underlying regex engine handle this case. But... this is tricky.
+        // Part of the problem is that ripgrep escapes all patterns by the
+        // time the regex engine is constructed, which is necessary for PCRE2
+        // for example. So that logic would need to change so that we don't
+        // escape things first.
+        //
+        // If we adjusted that, then I think we could just build an HIR value
+        // directly from the literals, thus skipping the parser altogether.
+        //
+        // But that still requires using and keeping this method. But we could
+        // at least get rid of the MultiLiteral matcher since the regex engine
+        // should now handle that case.
+        //
+        // Getting rid of this method is trickier, unless we make multi-pattern
+        // support a first class concept. But I don't think I want to go down
+        // that path? That implies we still need to accept a single pattern
+        // everywhere, which in turn means ripgrep would be forced to join
+        // the literals together using | and escape meta characters. By that
+        // point, we've lost. So I do think we still need this special method.
+        // But we can at least simplify the implementation.
+        //
+        // I still wonder if "fast parse" is still a good idea though.
+        // Basically, reject all nesting except for single-depth alternation.
+        // And reject character classes and all options. Just basically
+        // support `foo|bar|..|quux`. Maybe skip this for now I think.
+
         let mut has_escape = false;
         let mut slices = vec![];
         for lit in literals {
@@ -430,10 +463,10 @@ impl RegexMatcherImpl {
     /// Return the underlying regex object used.
     fn regex(&self) -> String {
         match *self {
-            RegexMatcherImpl::Word(ref x) => x.regex().to_string(),
-            RegexMatcherImpl::CRLF(ref x) => x.regex().to_string(),
+            RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
+            RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(),
             RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
-            RegexMatcherImpl::Standard(ref x) => x.regex.to_string(),
+            RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
         }
     }
 }
@@ -706,7 +739,10 @@ impl Matcher for RegexMatcher {
     ) -> Result<Option<LineMatchKind>, NoError> {
         Ok(match self.fast_line_regex {
             Some(ref regex) => {
-                regex.shortest_match(haystack).map(LineMatchKind::Candidate)
+                let input = Input::new(haystack);
+                regex
+                    .search_half(&input)
+                    .map(|hm| LineMatchKind::Candidate(hm.offset()))
             }
             None => {
                 self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
@@ -721,20 +757,15 @@ struct StandardMatcher {
     /// The regular expression compiled from the pattern provided by the
     /// caller.
     regex: Regex,
-    /// A map from capture group name to its corresponding index.
-    names: HashMap<String, usize>,
+    /// The underlying pattern string for the regex.
+    pattern: String,
 }
 
 impl StandardMatcher {
     fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
         let regex = expr.regex()?;
-        let mut names = HashMap::new();
-        for (i, optional_name) in regex.capture_names().enumerate() {
-            if let Some(name) = optional_name {
-                names.insert(name.to_string(), i);
-            }
-        }
-        Ok(StandardMatcher { regex, names })
+        let pattern = expr.pattern();
+        Ok(StandardMatcher { regex, pattern })
     }
 }
 
@@ -747,14 +778,12 @@ impl Matcher for StandardMatcher {
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<Match>, NoError> {
-        Ok(self
-            .regex
-            .find_at(haystack, at)
-            .map(|m| Match::new(m.start(), m.end())))
+        let input = Input::new(haystack).span(at..haystack.len());
+        Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
     }
 
     fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        Ok(RegexCaptures::new(self.regex.capture_locations()))
+        Ok(RegexCaptures::new(self.regex.create_captures()))
     }
 
     fn capture_count(&self) -> usize {
@@ -762,7 +791,7 @@ impl Matcher for StandardMatcher {
     }
 
     fn capture_index(&self, name: &str) -> Option<usize> {
-        self.names.get(name).map(|i| *i)
+        self.regex.group_info().to_index(PatternID::ZERO, name)
     }
 
     fn try_find_iter<F, E>(
@@ -789,10 +818,10 @@ impl Matcher for StandardMatcher {
         at: usize,
         caps: &mut RegexCaptures,
     ) -> Result<bool, NoError> {
-        Ok(self
-            .regex
-            .captures_read_at(&mut caps.locations_mut(), haystack, at)
-            .is_some())
+        let input = Input::new(haystack).span(at..haystack.len());
+        let caps = caps.locations_mut();
+        self.regex.search_captures(&input, caps);
+        Ok(caps.is_match())
     }
 
     fn shortest_match_at(
@@ -800,7 +829,8 @@ impl Matcher for StandardMatcher {
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<usize>, NoError> {
-        Ok(self.regex.shortest_match_at(haystack, at))
+        let input = Input::new(haystack).span(at..haystack.len());
+        Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
     }
 }
 
@@ -829,7 +859,7 @@ enum RegexCapturesImp {
     },
     Regex {
         /// Where the locations are stored.
-        locs: CaptureLocations,
+        locs: AutomataCaptures,
         /// These captures behave as if the capturing groups begin at the given
         /// offset. When set to `0`, this has no affect and capture groups are
         /// indexed like normal.
@@ -852,7 +882,7 @@ impl Captures for RegexCaptures {
         match self.0 {
             RegexCapturesImp::AhoCorasick { .. } => 1,
             RegexCapturesImp::Regex { ref locs, offset, .. } => {
-                locs.len().checked_sub(offset).unwrap()
+                locs.group_info().all_group_len().checked_sub(offset).unwrap()
             }
         }
     }
@@ -869,20 +899,25 @@ impl Captures for RegexCaptures {
             RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
                 if !strip_crlf {
                     let actual = i.checked_add(offset).unwrap();
-                    return locs.pos(actual).map(|(s, e)| Match::new(s, e));
+                    return locs
+                        .get_group(actual)
+                        .map(|sp| Match::new(sp.start, sp.end));
                 }
 
                 // currently don't support capture offsetting with CRLF
                 // stripping
                 assert_eq!(offset, 0);
-                let m = match locs.pos(i).map(|(s, e)| Match::new(s, e)) {
+                let m = match locs
+                    .get_group(i)
+                    .map(|sp| Match::new(sp.start, sp.end))
+                {
                     None => return None,
                     Some(m) => m,
                 };
                 // If the end position of this match corresponds to the end
                 // position of the overall match, then we apply our CRLF
                 // stripping. Otherwise, we cannot assume stripping is correct.
-                if i == 0 || m.end() == locs.pos(0).unwrap().1 {
+                if i == 0 || m.end() == locs.get_group(0).unwrap().end {
                     Some(m.with_end(m.end() - 1))
                 } else {
                     Some(m)
@@ -897,12 +932,12 @@ impl RegexCaptures {
         RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None })
     }
 
-    pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
+    pub(crate) fn new(locs: AutomataCaptures) -> RegexCaptures {
         RegexCaptures::with_offset(locs, 0)
     }
 
     pub(crate) fn with_offset(
-        locs: CaptureLocations,
+        locs: AutomataCaptures,
         offset: usize,
     ) -> RegexCaptures {
         RegexCaptures(RegexCapturesImp::Regex {
@@ -912,7 +947,7 @@ impl RegexCaptures {
         })
     }
 
-    pub(crate) fn locations(&self) -> &CaptureLocations {
+    pub(crate) fn locations(&self) -> &AutomataCaptures {
         match self.0 {
             RegexCapturesImp::AhoCorasick { .. } => {
                 panic!("getting locations for simple captures is invalid")
@@ -921,7 +956,7 @@ impl RegexCaptures {
         }
     }
 
-    pub(crate) fn locations_mut(&mut self) -> &mut CaptureLocations {
+    pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures {
         match self.0 {
             RegexCapturesImp::AhoCorasick { .. } => {
                 panic!("getting locations for simple captures is invalid")
diff --git a/crates/regex/src/multi.rs b/crates/regex/src/multi.rs
index 9d2b6135..8c24a845 100644
--- a/crates/regex/src/multi.rs
+++ b/crates/regex/src/multi.rs
@@ -26,7 +26,7 @@ impl MultiLiteralMatcher {
         let ac = AhoCorasick::builder()
             .match_kind(MatchKind::LeftmostFirst)
             .build(literals)
-            .map_err(Error::regex)?;
+            .map_err(Error::generic)?;
         Ok(MultiLiteralMatcher { ac })
     }
 }
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index eb890821..d19119cc 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -18,7 +18,6 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
 fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
     match *expr.kind() {
         HirKind::Empty
-        // | HirKind::Look(Look::Start | Look::End)
         | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
         | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
         HirKind::Look(Look::Start | Look::End) => {
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
index aa08164b..289c9923 100644
--- a/crates/regex/src/word.rs
+++ b/crates/regex/src/word.rs
@@ -1,27 +1,29 @@
-use std::cell::RefCell;
-use std::collections::HashMap;
-use std::sync::Arc;
+use std::{cell::RefCell, collections::HashMap, sync::Arc};
 
-use grep_matcher::{Match, Matcher, NoError};
-use regex::bytes::{CaptureLocations, Regex};
-use thread_local::ThreadLocal;
+use {
+    grep_matcher::{Match, Matcher, NoError},
+    regex_automata::{
+        meta::Regex, util::captures::Captures, Input, PatternID,
+    },
+    thread_local::ThreadLocal,
+};
 
-use crate::config::ConfiguredHIR;
-use crate::error::Error;
-use crate::matcher::RegexCaptures;
+use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
 
 /// A matcher for implementing "word match" semantics.
 #[derive(Debug)]
 pub struct WordMatcher {
     /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
     regex: Regex,
+    /// The pattern string corresponding to the above regex.
+    pattern: String,
     /// The original regex supplied by the user, which we use in a fast path
     /// to try and detect matches before deferring to slower engines.
     original: Regex,
     /// A map from capture group name to capture group index.
     names: HashMap<String, usize>,
     /// A reusable buffer for finding the match location of the inner group.
-    locs: Arc<ThreadLocal<RefCell<CaptureLocations>>>,
+    locs: Arc<ThreadLocal<RefCell<Captures>>>,
 }
 
 impl Clone for WordMatcher {
@@ -31,6 +33,7 @@ impl Clone for WordMatcher {
         // usings `locs` to hit the fast path.
         WordMatcher {
             regex: self.regex.clone(),
+            pattern: self.pattern.clone(),
             original: self.original.clone(),
             names: self.names.clone(),
             locs: Arc::new(ThreadLocal::new()),
@@ -53,20 +56,23 @@ impl WordMatcher {
             pat
         })?;
         let regex = word_expr.regex()?;
+        let pattern = word_expr.pattern();
         let locs = Arc::new(ThreadLocal::new());
 
         let mut names = HashMap::new();
-        for (i, optional_name) in regex.capture_names().enumerate() {
+        let it = regex.group_info().pattern_names(PatternID::ZERO);
+        for (i, optional_name) in it.enumerate() {
             if let Some(name) = optional_name {
                 names.insert(name.to_string(), i.checked_sub(1).unwrap());
             }
         }
-        Ok(WordMatcher { regex, original, names, locs })
+        Ok(WordMatcher { regex, pattern, original, names, locs })
     }
 
-    /// Return the underlying regex used by this matcher.
-    pub fn regex(&self) -> &Regex {
-        &self.regex
+    /// Return the underlying pattern string for the regex used by this
+    /// matcher.
+    pub fn pattern(&self) -> &str {
+        &self.pattern
     }
 
     /// Attempt to do a fast confirmation of a word match that covers a subset
@@ -102,7 +108,8 @@ impl WordMatcher {
         // The reason why we cannot handle the ^/$ cases here is because we
         // can't assume anything about the original pattern. (Try commenting
         // out the checks for ^/$ below and run the tests to see examples.)
-        let mut cand = match self.regex.find_at(haystack, at) {
+        let input = Input::new(haystack).span(at..haystack.len());
+        let mut cand = match self.regex.find(input) {
             None => return Ok(None),
             Some(m) => Match::new(m.start(), m.end()),
         };
@@ -154,14 +161,15 @@ impl Matcher for WordMatcher {
         }
 
         let cell =
-            self.locs.get_or(|| RefCell::new(self.regex.capture_locations()));
+            self.locs.get_or(|| RefCell::new(self.regex.create_captures()));
+        let input = Input::new(haystack).span(at..haystack.len());
         let mut caps = cell.borrow_mut();
-        self.regex.captures_read_at(&mut caps, haystack, at);
-        Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
+        self.regex.search_captures(&input, &mut caps);
+        Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
     }
 
     fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
+        Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1))
     }
 
     fn capture_count(&self) -> usize {
@@ -178,9 +186,10 @@ impl Matcher for WordMatcher {
         at: usize,
         caps: &mut RegexCaptures,
     ) -> Result<bool, NoError> {
-        let r =
-            self.regex.captures_read_at(caps.locations_mut(), haystack, at);
-        Ok(r.is_some())
+        let input = Input::new(haystack).span(at..haystack.len());
+        let caps = caps.locations_mut();
+        self.regex.search_captures(&input, caps);
+        Ok(caps.is_match())
     }
 
     // We specifically do not implement other methods like find_iter or
author	Andrew Gallant <jamslam@gmail.com>	2023-06-15 15:05:07 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2023-07-05 14:04:29 -0400
commit	e028ea37928930c80e5c3172d1df306b85a86758 (patch)
tree	fed757294923a7a44bfd6e8a8fe14955ea5f4de2
parent	1035f6b1ff502eb5b1a5fc49a79f45971c772d47 (diff)