progress

author: Andrew Gallant <jamslam@gmail.com> 2023-10-09 18:23:36 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2023-10-09 20:29:52 -0400
commit: 9626f167573527858f9736a3054882de87d6cd79 (patch)
tree: 83a5bd062eff724bc857d8b83b7c008cba882ad9 /crates
parent: f7ff34fdf9d2853f9763aceb28f5dcb014728045 (diff)
10 files changed, 80 insertions, 779 deletions
diff --git a/crates/globset/Cargo.toml b/crates/globset/Cargo.toml
index b0602239..decc7804 100644
--- a/crates/globset/Cargo.toml
+++ b/crates/globset/Cargo.toml
@@ -26,12 +26,12 @@ log = { version = "0.4.20", optional = true }
 serde = { version = "1.0.188", optional = true }
 
 [dependencies.regex-syntax]
-version = "0.7.5"
+version = "0.8.0"
 default-features = false
 features = ["std"]
 
 [dependencies.regex-automata]
-version = "0.3.8"
+version = "0.4.0"
 default-features = false
 features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"]
 
diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml
index 31771f17..81dc9284 100644
--- a/crates/ignore/Cargo.toml
+++ b/crates/ignore/Cargo.toml
@@ -27,7 +27,7 @@ same-file = "1.0.6"
 walkdir = "2.4.0"
 
 [dependencies.regex-automata]
-version = "0.3.8"
+version = "0.4.0"
 default-features = false
 features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"]
 
diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml
index f0ca8394..f3266081 100644
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -17,5 +17,5 @@ edition = "2021"
 bstr = "1.6.2"
 grep-matcher = { version = "0.1.6", path = "../matcher" }
 log = "0.4.20"
-regex-automata = { version = "0.3.8" }
-regex-syntax = "0.7.5"
+regex-automata = { version = "0.4.0" }
+regex-syntax = "0.8.0"
diff --git a/crates/regex/src/ast.rs b/crates/regex/src/ast.rs
index 4d170565..a5a0573a 100644
--- a/crates/regex/src/ast.rs
+++ b/crates/regex/src/ast.rs
@@ -62,12 +62,12 @@ impl AstAnalysis {
             Ast::Flags(_)
             | Ast::Dot(_)
             | Ast::Assertion(_)
-            | Ast::Class(ast::Class::Unicode(_))
-            | Ast::Class(ast::Class::Perl(_)) => {}
+            | Ast::ClassUnicode(_)
+            | Ast::ClassPerl(_) => {}
             Ast::Literal(ref x) => {
                 self.from_ast_literal(x);
             }
-            Ast::Class(ast::Class::Bracketed(ref x)) => {
+            Ast::ClassBracketed(ref x) => {
                 self.from_ast_class_set(&x.kind);
             }
             Ast::Repetition(ref x) => {
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index 79642580..8c69ef54 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -3,7 +3,7 @@ use {
     regex_automata::meta::Regex,
     regex_syntax::{
         ast,
-        hir::{self, Hir, HirKind},
+        hir::{self, Hir},
     },
 };
 
@@ -296,35 +296,6 @@ impl ConfiguredHIR {
         }
     }
 
-    /// Turns this configured HIR into one that only matches when both sides of
-    /// the match correspond to a word boundary.
-    ///
-    /// Note that the HIR returned is like turning `pat` into
-    /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group
-    /// `1` and not `0`.
-    pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> {
-        // In theory building the HIR for \W should never fail, but there are
-        // likely some pathological cases (particularly with respect to certain
-        // values of limits) where it could in theory fail.
-        let non_word = {
-            let mut config = self.config.clone();
-            config.fixed_strings = false;
-            ConfiguredHIR::new(config, &[r"\W"])?
-        };
-        let line_anchor_start = Hir::look(self.line_anchor_start());
-        let line_anchor_end = Hir::look(self.line_anchor_end());
-        let hir = Hir::concat(vec![
-            Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]),
-            Hir::capture(hir::Capture {
-                index: 1,
-                name: None,
-                sub: Box::new(renumber_capture_indices(self.hir)?),
-            }),
-            Hir::alternation(vec![non_word.hir, line_anchor_end]),
-        ]);
-        Ok(ConfiguredHIR { config: self.config, hir })
-    }
-
     /// Turns this configured HIR into an equivalent one, but where it must
     /// match at the start and end of a line.
     pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
@@ -336,12 +307,20 @@ impl ConfiguredHIR {
     }
 
     /// Turns this configured HIR into an equivalent one, but where it must
-    /// match at the start and end of the haystack.
-    pub(crate) fn into_anchored(self) -> ConfiguredHIR {
+    /// match at word boundaries.
+    pub(crate) fn into_word(self) -> ConfiguredHIR {
         let hir = Hir::concat(vec![
-            Hir::look(hir::Look::Start),
+            Hir::look(if self.config.unicode {
+                hir::Look::WordStartHalfUnicode
+            } else {
+                hir::Look::WordStartHalfAscii
+            }),
             self.hir,
-            Hir::look(hir::Look::End),
+            Hir::look(if self.config.unicode {
+                hir::Look::WordEndHalfUnicode
+            } else {
+                hir::Look::WordEndHalfAscii
+            }),
         ]);
         ConfiguredHIR { config: self.config, hir }
     }
@@ -365,50 +344,6 @@ impl ConfiguredHIR {
     }
 }
 
-/// This increments the index of every capture group in the given hir by 1. If
-/// any increment results in an overflow, then an error is returned.
-fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> {
-    Ok(match hir.into_kind() {
-        HirKind::Empty => Hir::empty(),
-        HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit),
-        HirKind::Class(cls) => Hir::class(cls),
-        HirKind::Look(x) => Hir::look(x),
-        HirKind::Repetition(mut x) => {
-            x.sub = Box::new(renumber_capture_indices(*x.sub)?);
-            Hir::repetition(x)
-        }
-        HirKind::Capture(mut cap) => {
-            cap.index = match cap.index.checked_add(1) {
-                Some(index) => index,
-                None => {
-                    // This error message kind of sucks, but it's probably
-                    // impossible for it to happen. The only way a capture
-                    // index can overflow addition is if the regex is huge
-                    // (or something else has gone horribly wrong).
-                    let msg = "could not renumber capture index, too big";
-                    return Err(Error::any(msg));
-                }
-            };
-            cap.sub = Box::new(renumber_capture_indices(*cap.sub)?);
-            Hir::capture(cap)
-        }
-        HirKind::Concat(subs) => {
-            let subs = subs
-                .into_iter()
-                .map(|sub| renumber_capture_indices(sub))
-                .collect::<Result<Vec<Hir>, Error>>()?;
-            Hir::concat(subs)
-        }
-        HirKind::Alternation(subs) => {
-            let subs = subs
-                .into_iter()
-                .map(|sub| renumber_capture_indices(sub))
-                .collect::<Result<Vec<Hir>, Error>>()?;
-            Hir::alternation(subs)
-        }
-    })
-}
-
 /// Returns true if the given literal string contains any byte from the line
 /// terminator given.
 fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs
index 1c921773..88a8adbe 100644
--- a/crates/regex/src/error.rs
+++ b/crates/regex/src/error.rs
@@ -30,10 +30,6 @@ impl Error {
         Error { kind: ErrorKind::Regex(err.to_string()) }
     }
 
-    pub(crate) fn any<E: ToString>(msg: E) -> Error {
-        Error { kind: ErrorKind::Regex(msg.to_string()) }
-    }
-
     /// Return the kind of this error.
     pub fn kind(&self) -> &ErrorKind {
         &self.kind
diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs
index 068c7c71..4693bff1 100644
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@@ -15,4 +15,3 @@ mod literal;
 mod matcher;
 mod non_matching;
 mod strip;
-mod word;
diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs
index 65c61d27..f3f673ff 100644
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
 use {
     grep_matcher::{
         ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
@@ -11,12 +9,7 @@ use {
     },
 };
 
-use crate::{
-    config::{Config, ConfiguredHIR},
-    error::Error,
-    literal::InnerLiterals,
-    word::WordMatcher,
-};
+use crate::{config::Config, error::Error, literal::InnerLiterals};
 
 /// A builder for constructing a `Matcher` using regular expressions.
 ///
@@ -61,9 +54,15 @@ impl RegexMatcherBuilder {
         &self,
         patterns: &[P],
     ) -> Result<RegexMatcher, Error> {
-        let chir = self.config.build_many(patterns)?;
-        let matcher = RegexMatcherImpl::new(chir)?;
-        let (chir, re) = (matcher.chir(), matcher.regex());
+        let mut chir = self.config.build_many(patterns)?;
+        // 'whole_line' is a strict subset of 'word', so when it is enabled,
+        // we don't need to bother with anything specific to word matching.
+        if chir.config().whole_line {
+            chir = chir.into_whole_line();
+        } else if chir.config().word {
+            chir = chir.into_word();
+        }
+        let regex = chir.to_regex()?;
         log::trace!("final regex: {:?}", chir.hir().to_string());
 
         let non_matching_bytes = chir.non_matching_bytes();
@@ -76,18 +75,13 @@ impl RegexMatcherBuilder {
         // then run the original regex on only that line. (In this case, the
         // regex engine is likely to handle this case for us since it's so
         // simple, but the idea applies.)
-        let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?;
+        let fast_line_regex = InnerLiterals::new(&chir, &regex).one_regex()?;
 
         // We override the line terminator in case the configured HIR doesn't
         // support it.
         let mut config = self.config.clone();
         config.line_terminator = chir.line_terminator();
-        Ok(RegexMatcher {
-            config,
-            matcher,
-            fast_line_regex,
-            non_matching_bytes,
-        })
+        Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes })
     }
 
     /// Build a new matcher from a plain alternation of literals.
@@ -357,8 +351,9 @@ impl RegexMatcherBuilder {
 pub struct RegexMatcher {
     /// The configuration specified by the caller.
     config: Config,
-    /// The underlying matcher implementation.
-    matcher: RegexMatcherImpl,
+    /// The regular expression compiled from the pattern provided by the
+    /// caller.
+    regex: Regex,
     /// A regex that never reports false negatives but may report false
     /// positives that is believed to be capable of being matched more quickly
     /// than `regex`. Typically, this is a single literal or an alternation
@@ -392,53 +387,6 @@ impl RegexMatcher {
     }
 }
 
-/// An encapsulation of the type of matcher we use in `RegexMatcher`.
-#[derive(Clone, Debug)]
-enum RegexMatcherImpl {
-    /// The standard matcher used for all regular expressions.
-    Standard(StandardMatcher),
-    /// A matcher that only matches at word boundaries. This transforms the
-    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
-    /// Because of this, the WordMatcher provides its own implementation of
-    /// `Matcher` to encapsulate its use of capture groups to make them
-    /// invisible to the caller.
-    Word(WordMatcher),
-}
-
-impl RegexMatcherImpl {
-    /// Based on the configuration, create a new implementation of the
-    /// `Matcher` trait.
-    fn new(mut chir: ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
-        // When whole_line is set, we don't use a word matcher even if word
-        // matching was requested. Why? Because `(?m:^)(pat)(?m:$)` implies
-        // word matching.
-        Ok(if chir.config().word && !chir.config().whole_line {
-            RegexMatcherImpl::Word(WordMatcher::new(chir)?)
-        } else {
-            if chir.config().whole_line {
-                chir = chir.into_whole_line();
-            }
-            RegexMatcherImpl::Standard(StandardMatcher::new(chir)?)
-        })
-    }
-
-    /// Return the underlying regex object used.
-    fn regex(&self) -> &Regex {
-        match *self {
-            RegexMatcherImpl::Word(ref x) => x.regex(),
-            RegexMatcherImpl::Standard(ref x) => &x.regex,
-        }
-    }
-
-    /// Return the underlying HIR of the regex used for searching.
-    fn chir(&self) -> &ConfiguredHIR {
-        match *self {
-            RegexMatcherImpl::Word(ref x) => x.chir(),
-            RegexMatcherImpl::Standard(ref x) => &x.chir,
-        }
-    }
-}
-
 // This implementation just dispatches on the internal matcher impl except
 // for the line terminator optimization, which is possibly executed via
 // `fast_line_regex`.
@@ -446,221 +394,84 @@ impl Matcher for RegexMatcher {
     type Captures = RegexCaptures;
     type Error = NoError;
 
+    #[inline]
     fn find_at(
         &self,
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<Match>, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.find_at(haystack, at),
-            Word(ref m) => m.find_at(haystack, at),
-        }
+        let input = Input::new(haystack).span(at..haystack.len());
+        Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
     }
 
+    #[inline]
     fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.new_captures(),
-            Word(ref m) => m.new_captures(),
-        }
+        Ok(RegexCaptures::new(self.regex.create_captures()))
     }
 
+    #[inline]
     fn capture_count(&self) -> usize {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.capture_count(),
-            Word(ref m) => m.capture_count(),
-        }
+        self.regex.captures_len()
     }
 
+    #[inline]
     fn capture_index(&self, name: &str) -> Option<usize> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.capture_index(name),
-            Word(ref m) => m.capture_index(name),
-        }
-    }
-
-    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.find(haystack),
-            Word(ref m) => m.find(haystack),
-        }
-    }
-
-    fn find_iter<F>(&self, haystack: &[u8], matched: F) -> Result<(), NoError>
-    where
-        F: FnMut(Match) -> bool,
-    {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.find_iter(haystack, matched),
-            Word(ref m) => m.find_iter(haystack, matched),
-        }
+        self.regex.group_info().to_index(PatternID::ZERO, name)
     }
 
+    #[inline]
     fn try_find_iter<F, E>(
         &self,
         haystack: &[u8],
-        matched: F,
+        mut matched: F,
     ) -> Result<Result<(), E>, NoError>
     where
         F: FnMut(Match) -> Result<bool, E>,
     {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.try_find_iter(haystack, matched),
-            Word(ref m) => m.try_find_iter(haystack, matched),
-        }
-    }
-
-    fn captures(
-        &self,
-        haystack: &[u8],
-        caps: &mut RegexCaptures,
-    ) -> Result<bool, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.captures(haystack, caps),
-            Word(ref m) => m.captures(haystack, caps),
-        }
-    }
-
-    fn captures_iter<F>(
-        &self,
-        haystack: &[u8],
-        caps: &mut RegexCaptures,
-        matched: F,
-    ) -> Result<(), NoError>
-    where
-        F: FnMut(&RegexCaptures) -> bool,
-    {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.captures_iter(haystack, caps, matched),
-            Word(ref m) => m.captures_iter(haystack, caps, matched),
-        }
-    }
-
-    fn try_captures_iter<F, E>(
-        &self,
-        haystack: &[u8],
-        caps: &mut RegexCaptures,
-        matched: F,
-    ) -> Result<Result<(), E>, NoError>
-    where
-        F: FnMut(&RegexCaptures) -> Result<bool, E>,
-    {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
-            Word(ref m) => m.try_captures_iter(haystack, caps, matched),
+        for m in self.regex.find_iter(haystack) {
+            match matched(Match::new(m.start(), m.end())) {
+                Ok(true) => continue,
+                Ok(false) => return Ok(Ok(())),
+                Err(err) => return Ok(Err(err)),
+            }
         }
+        Ok(Ok(()))
     }
 
+    #[inline]
     fn captures_at(
         &self,
         haystack: &[u8],
         at: usize,
         caps: &mut RegexCaptures,
     ) -> Result<bool, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.captures_at(haystack, at, caps),
-            Word(ref m) => m.captures_at(haystack, at, caps),
-        }
-    }
-
-    fn replace<F>(
-        &self,
-        haystack: &[u8],
-        dst: &mut Vec<u8>,
-        append: F,
-    ) -> Result<(), NoError>
-    where
-        F: FnMut(Match, &mut Vec<u8>) -> bool,
-    {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.replace(haystack, dst, append),
-            Word(ref m) => m.replace(haystack, dst, append),
-        }
-    }
-
-    fn replace_with_captures<F>(
-        &self,
-        haystack: &[u8],
-        caps: &mut RegexCaptures,
-        dst: &mut Vec<u8>,
-        append: F,
-    ) -> Result<(), NoError>
-    where
-        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
-    {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => {
-                m.replace_with_captures(haystack, caps, dst, append)
-            }
-            Word(ref m) => {
-                m.replace_with_captures(haystack, caps, dst, append)
-            }
-        }
-    }
-
-    fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.is_match(haystack),
-            Word(ref m) => m.is_match(haystack),
-        }
-    }
-
-    fn is_match_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-    ) -> Result<bool, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.is_match_at(haystack, at),
-            Word(ref m) => m.is_match_at(haystack, at),
-        }
-    }
-
-    fn shortest_match(
-        &self,
-        haystack: &[u8],
-    ) -> Result<Option<usize>, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.shortest_match(haystack),
-            Word(ref m) => m.shortest_match(haystack),
-        }
+        let input = Input::new(haystack).span(at..haystack.len());
+        let caps = caps.captures_mut();
+        self.regex.search_captures(&input, caps);
+        Ok(caps.is_match())
     }
 
+    #[inline]
     fn shortest_match_at(
         &self,
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<usize>, NoError> {
-        use self::RegexMatcherImpl::*;
-        match self.matcher {
-            Standard(ref m) => m.shortest_match_at(haystack, at),
-            Word(ref m) => m.shortest_match_at(haystack, at),
-        }
+        let input = Input::new(haystack).span(at..haystack.len());
+        Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
     }
 
+    #[inline]
     fn non_matching_bytes(&self) -> Option<&ByteSet> {
         Some(&self.non_matching_bytes)
     }
 
+    #[inline]
     fn line_terminator(&self) -> Option<LineTerminator> {
         self.config.line_terminator
     }
 
+    #[inline]
     fn find_candidate_line(
         &self,
         haystack: &[u8],
@@ -679,93 +490,6 @@ impl Matcher for RegexMatcher {
     }
 }
 
-/// The implementation of the standard regex matcher.
-#[derive(Clone, Debug)]
-struct StandardMatcher {
-    /// The regular expression compiled from the pattern provided by the
-    /// caller.
-    regex: Regex,
-    /// The HIR that produced this regex.
-    ///
-    /// We put this in an `Arc` because by the time it gets here, it won't
-    /// change. And because cloning and dropping an `Hir` is somewhat expensive
-    /// due to its deep recursive representation.
-    chir: Arc<ConfiguredHIR>,
-}
-
-impl StandardMatcher {
-    fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
-        let chir = Arc::new(chir);
-        let regex = chir.to_regex()?;
-        Ok(StandardMatcher { regex, chir })
-    }
-}
-
-impl Matcher for StandardMatcher {
-    type Captures = RegexCaptures;
-    type Error = NoError;
-
-    fn find_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-    ) -> Result<Option<Match>, NoError> {
-        let input = Input::new(haystack).span(at..haystack.len());
-        Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
-    }
-
-    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
-        Ok(RegexCaptures::new(self.regex.create_captures()))
-    }
-
-    fn capture_count(&self) -> usize {
-        self.regex.captures_len()
-    }
-
-    fn capture_index(&self, name: &str) -> Option<usize> {
-        self.regex.group_info().to_index(PatternID::ZERO, name)
-    }
-
-    fn try_find_iter<F, E>(
-        &self,
-        haystack: &[u8],
-        mut matched: F,
-    ) -> Result<Result<(), E>, NoError>
-    where
-        F: FnMut(Match) -> Result<bool, E>,
-    {
-        for m in self.regex.find_iter(haystack) {
-            match matched(Match::new(m.start(), m.end())) {
-                Ok(true) => continue,
-                Ok(false) => return Ok(Ok(())),
-                Err(err) => return Ok(Err(err)),
-            }
-        }
-        Ok(Ok(()))
-    }
-
-    fn captures_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-        caps: &mut RegexCaptures,
-    ) -> Result<bool, NoError> {
-        let input = Input::new(haystack).span(at..haystack.len());
-        let caps = caps.captures_mut();
-        self.regex.search_captures(&input, caps);
-        Ok(caps.is_match())
-    }
-
-    fn shortest_match_at(
-        &self,
-        haystack: &[u8],
-        at: usize,
-    ) -> Result<Option<usize>, NoError> {
-        let input = Input::new(haystack).span(at..haystack.len());
-        Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
-    }
-}
-
 /// Represents the match offsets of each capturing group in a match.
 ///
 /// The first, or `0`th capture group, always corresponds to the entire match
@@ -784,46 +508,27 @@ impl Matcher for StandardMatcher {
 pub struct RegexCaptures {
     /// Where the captures are stored.
     caps: AutomataCaptures,
-    /// These captures behave as if the capturing groups begin at the given
-    /// offset. When set to `0`, this has no affect and capture groups are
-    /// indexed like normal.
-    ///
-    /// This is useful when building matchers that wrap arbitrary regular
-    /// expressions. For example, `WordMatcher` takes an existing regex
-    /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that
-    /// the regex has been wrapped from the caller. In order to do this,
-    /// the matcher and the capturing groups must behave as if `(re)` is
-    /// the `0`th capture group.
-    offset: usize,
 }
 
 impl Captures for RegexCaptures {
+    #[inline]
     fn len(&self) -> usize {
-        self.caps
-            .group_info()
-            .all_group_len()
-            .checked_sub(self.offset)
-            .unwrap()
+        self.caps.group_info().all_group_len()
     }
 
+    #[inline]
     fn get(&self, i: usize) -> Option<Match> {
-        let actual = i.checked_add(self.offset).unwrap();
-        self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end))
+        self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end))
     }
 }
 
 impl RegexCaptures {
+    #[inline]
     pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures {
-        RegexCaptures::with_offset(caps, 0)
-    }
-
-    pub(crate) fn with_offset(
-        caps: AutomataCaptures,
-        offset: usize,
-    ) -> RegexCaptures {
-        RegexCaptures { caps, offset }
+        RegexCaptures { caps }
     }
 
+    #[inline]
     pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures {
         &mut self.caps
     }
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index 7fde6c46..f93ed13b 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -19,7 +19,14 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
     match *expr.kind() {
         HirKind::Empty
         | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
-        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
+        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate)
+        | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode)
+        | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode)
+        | HirKind::Look(
+            Look::WordStartHalfAscii | Look::WordStartHalfUnicode,
+        )
+        | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => {
+        }
         HirKind::Look(Look::Start | Look::End) => {
             // FIXME: This is wrong, but not doing this leads to incorrect
             // results because of how anchored searches are implemented in
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
deleted file mode 100644
index 52fb61ce..00000000
--- a/crates/regex/src/word.rs
+++ /dev/null
@@ -1,341 +0,0 @@
-use std::{
-    collections::HashMap,
-    panic::{RefUnwindSafe, UnwindSafe},
-    sync::Arc,
-};
-
-use {
-    grep_matcher::{Match, Matcher, NoError},
-    regex_automata::{
-        meta::Regex, util::captures::Captures, util::pool::Pool, Input,
-        PatternID,
-    },
-};
-
-use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
-
-type PoolFn =
-    Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;
-
-/// A matcher for implementing "word match" semantics.
-#[derive(Debug)]
-pub(crate) struct WordMatcher {
-    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
-    regex: Regex,
-    /// The HIR that produced the regex above. We don't keep the HIR for the
-    /// `original` regex.
-    ///
-    /// We put this in an `Arc` because by the time it gets here, it won't
-    /// change. And because cloning and dropping an `Hir` is somewhat expensive
-    /// due to its deep recursive representation.
-    chir: Arc<ConfiguredHIR>,
-    /// The original regex supplied by the user, which we use in a fast path
-    /// to try and detect matches before deferring to slower engines.
-    original: Regex,
-    /// A map from capture group name to capture group index.
-    names: HashMap<String, usize>,
-    /// A thread-safe pool of reusable buffers for finding the match offset of
-    /// the inner group.
-    caps: Arc<Pool<Captures, PoolFn>>,
-}
-
-impl Clone for WordMatcher {
-    fn clone(&self) -> WordMatcher {
-        // We implement Clone manually so that we get a fresh Pool such that it
-        // can set its own thread owner. This permits each thread usings `caps`
-        // to hit the fast path.
-        //
-        // Note that cloning a regex is "cheap" since it uses reference
-        // counting internally.
-        let re = self.regex.clone();
-        WordMatcher {
-            regex: self.regex.clone(),
-            chir: Arc::clone(&self.chir),
-            original: self.original.clone(),
-            names: self.names.clone(),
-            caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
-        }
-    }
-}
-
-impl WordMatcher {
-    /// Create a new matcher from the given pattern that only produces matches
-    /// that are considered "words."
-    ///
-    /// The given options are used to construct the regular expression
-    /// internally.
-    pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
-        let original = chir.clone().into_anchored().to_regex()?;
-        let chir = Arc::new(chir.into_word()?);
-        let regex = chir.to_regex()?;
-        let caps = Arc::new(Pool::new({
-            let regex = regex.clone();
-            Box::new(move || regex.create_captures()) as PoolFn
-        }));
-
-        let mut names = HashMap::new();
-        let it = regex.group_info().pattern_names(PatternID::ZERO);
-        for (i, optional_name) in it.enumerate() {
-            if let Some(name) = optional_name {
-                names.insert(name.to_string(), i.checked_sub(1).unwrap());
-            }
-        }
-        Ok(WordMatcher { regex, chir, original, names, caps })
-    }
-
-    /// Return the underlying regex used to match at word boundaries.
-    ///
-    /// The original regex is in the capture group at index 1.
-    pub(crate) fn regex(&self) -> &Regex {
-        &self.regex
-    }
-
-    /// Return the underlying HIR for the regex used to match at word
-    /// boundaries.
-    pub(crate) fn chir(&self) -> &ConfiguredHIR {
-        &self.chir
-    }
-
-    /// Attempt to do a fast confirmation of a word match that covers a subset
-    /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
-    /// when a match is found. Ok(None) is returned when there is definitively
-    /// no match. Err(()) is returned when this routine could not detect
-    /// whether there was a match or not.
-    fn fast_find(
-        &self,
-        haystack: &[u8],
-        at: usize,
-    ) -> Result<Option<Match>, ()> {
-        // This is a bit hairy. The whole point here is to avoid running a
-        // slower regex engine to extract capture groups. Remember, our word
-        // regex looks like this:
-        //
-        //     (^|\W)(<original regex>)(\W|$)
-        //
-        // What we want are the match offsets of <original regex>. So in the
-        // easy/common case, the original regex will be sandwiched between
-        // two codepoints that are in the \W class. So our approach here is to
-        // look for a match of the overall word regexp, strip the \W ends and
-        // then check whether the original regex matches what's left. If so,
-        // then we are guaranteed a correct match.
-        //
-        // This only works though if we know that the match is sandwiched
-        // between two \W codepoints. This only occurs when neither ^ nor $
-        // match. This in turn only occurs when the match is at either the
-        // beginning or end of the haystack. In either of those cases, we
-        // declare defeat and defer to the slower implementation.
-        //
-        // The reason why we cannot handle the ^/$ cases here is because we
-        // can't assume anything about the original pattern. (Try commenting
-        // out the checks for ^/$ below and run the tests to see examples.)
-        //
-        // NOTE(2023-07-31): After fixing #2574, this logic honestly still
-        // doesn't seem correct. Regex composition is hard.
-        let input = Input::new(haystack).span(at..haystack.len());
-        let mut cand = match self.regex.find(input) {
-            None => return Ok(None),
author	Andrew Gallant <jamslam@gmail.com>	2023-10-09 18:23:36 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2023-10-09 20:29:52 -0400
commit	9626f167573527858f9736a3054882de87d6cd79 (patch)
tree	83a5bd062eff724bc857d8b83b7c008cba882ad9 /crates
parent	f7ff34fdf9d2853f9763aceb28f5dcb014728045 (diff)