summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Cargo.lock24
-rw-r--r--Cargo.toml5
-rw-r--r--crates/globset/Cargo.toml4
-rw-r--r--crates/ignore/Cargo.toml2
-rw-r--r--crates/regex/Cargo.toml4
-rw-r--r--crates/regex/src/ast.rs6
-rw-r--r--crates/regex/src/config.rs91
-rw-r--r--crates/regex/src/error.rs4
-rw-r--r--crates/regex/src/lib.rs1
-rw-r--r--crates/regex/src/matcher.rs397
-rw-r--r--crates/regex/src/non_matching.rs9
-rw-r--r--crates/regex/src/word.rs341
-rw-r--r--tests/misc.rs12
-rw-r--r--tests/regression.rs9
14 files changed, 107 insertions, 802 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 4605c11e..73e6ad5f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
[[package]]
name = "aho-corasick"
-version = "1.1.1"
+version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
@@ -31,9 +31,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bstr"
-version = "1.6.2"
+version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
+checksum = "c79ad7fb2dd38f3dabd76b09c6a5a20c038fc0213ef1e9afd30eb777f120f019"
dependencies = [
"memchr",
"regex-automata",
@@ -305,9 +305,9 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "memchr"
-version = "2.6.3"
+version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memmap2"
@@ -395,9 +395,7 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.9.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff"
+version = "1.10.0"
dependencies = [
"aho-corasick",
"memchr",
@@ -407,9 +405,7 @@ dependencies = [
[[package]]
name = "regex-automata"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
+version = "0.4.1"
dependencies = [
"aho-corasick",
"memchr",
@@ -418,9 +414,7 @@ dependencies = [
[[package]]
name = "regex-syntax"
-version = "0.7.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
+version = "0.8.0"
[[package]]
name = "ripgrep"
diff --git a/Cargo.toml b/Cargo.toml
index 3a905569..1a3a70c9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -48,6 +48,11 @@ members = [
"crates/ignore",
]
+[patch.crates-io]
+regex = { path = "/home/andrew/rust/regex" }
+regex-automata = { path = "/home/andrew/rust/regex/regex-automata" }
+regex-syntax = { path = "/home/andrew/rust/regex/regex-syntax" }
+
[dependencies]
bstr = "1.6.0"
grep = { version = "0.2.12", path = "crates/grep" }
diff --git a/crates/globset/Cargo.toml b/crates/globset/Cargo.toml
index b0602239..decc7804 100644
--- a/crates/globset/Cargo.toml
+++ b/crates/globset/Cargo.toml
@@ -26,12 +26,12 @@ log = { version = "0.4.20", optional = true }
serde = { version = "1.0.188", optional = true }
[dependencies.regex-syntax]
-version = "0.7.5"
+version = "0.8.0"
default-features = false
features = ["std"]
[dependencies.regex-automata]
-version = "0.3.8"
+version = "0.4.0"
default-features = false
features = ["std", "perf", "syntax", "meta", "nfa", "hybrid"]
diff --git a/crates/ignore/Cargo.toml b/crates/ignore/Cargo.toml
index 31771f17..81dc9284 100644
--- a/crates/ignore/Cargo.toml
+++ b/crates/ignore/Cargo.toml
@@ -27,7 +27,7 @@ same-file = "1.0.6"
walkdir = "2.4.0"
[dependencies.regex-automata]
-version = "0.3.8"
+version = "0.4.0"
default-features = false
features = ["std", "perf", "syntax", "meta", "nfa", "hybrid", "dfa-onepass"]
diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml
index f0ca8394..f3266081 100644
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -17,5 +17,5 @@ edition = "2021"
bstr = "1.6.2"
grep-matcher = { version = "0.1.6", path = "../matcher" }
log = "0.4.20"
-regex-automata = { version = "0.3.8" }
-regex-syntax = "0.7.5"
+regex-automata = { version = "0.4.0" }
+regex-syntax = "0.8.0"
diff --git a/crates/regex/src/ast.rs b/crates/regex/src/ast.rs
index 4d170565..a5a0573a 100644
--- a/crates/regex/src/ast.rs
+++ b/crates/regex/src/ast.rs
@@ -62,12 +62,12 @@ impl AstAnalysis {
Ast::Flags(_)
| Ast::Dot(_)
| Ast::Assertion(_)
- | Ast::Class(ast::Class::Unicode(_))
- | Ast::Class(ast::Class::Perl(_)) => {}
+ | Ast::ClassUnicode(_)
+ | Ast::ClassPerl(_) => {}
Ast::Literal(ref x) => {
self.from_ast_literal(x);
}
- Ast::Class(ast::Class::Bracketed(ref x)) => {
+ Ast::ClassBracketed(ref x) => {
self.from_ast_class_set(&x.kind);
}
Ast::Repetition(ref x) => {
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index 79642580..8c69ef54 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -3,7 +3,7 @@ use {
regex_automata::meta::Regex,
regex_syntax::{
ast,
- hir::{self, Hir, HirKind},
+ hir::{self, Hir},
},
};
@@ -296,35 +296,6 @@ impl ConfiguredHIR {
}
}
- /// Turns this configured HIR into one that only matches when both sides of
- /// the match correspond to a word boundary.
- ///
- /// Note that the HIR returned is like turning `pat` into
- /// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group
- /// `1` and not `0`.
- pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> {
- // In theory building the HIR for \W should never fail, but there are
- // likely some pathological cases (particularly with respect to certain
- // values of limits) where it could in theory fail.
- let non_word = {
- let mut config = self.config.clone();
- config.fixed_strings = false;
- ConfiguredHIR::new(config, &[r"\W"])?
- };
- let line_anchor_start = Hir::look(self.line_anchor_start());
- let line_anchor_end = Hir::look(self.line_anchor_end());
- let hir = Hir::concat(vec![
- Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]),
- Hir::capture(hir::Capture {
- index: 1,
- name: None,
- sub: Box::new(renumber_capture_indices(self.hir)?),
- }),
- Hir::alternation(vec![non_word.hir, line_anchor_end]),
- ]);
- Ok(ConfiguredHIR { config: self.config, hir })
- }
-
/// Turns this configured HIR into an equivalent one, but where it must
/// match at the start and end of a line.
pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
@@ -336,12 +307,20 @@ impl ConfiguredHIR {
}
/// Turns this configured HIR into an equivalent one, but where it must
- /// match at the start and end of the haystack.
- pub(crate) fn into_anchored(self) -> ConfiguredHIR {
+ /// match at word boundaries.
+ pub(crate) fn into_word(self) -> ConfiguredHIR {
let hir = Hir::concat(vec![
- Hir::look(hir::Look::Start),
+ Hir::look(if self.config.unicode {
+ hir::Look::WordStartHalfUnicode
+ } else {
+ hir::Look::WordStartHalfAscii
+ }),
self.hir,
- Hir::look(hir::Look::End),
+ Hir::look(if self.config.unicode {
+ hir::Look::WordEndHalfUnicode
+ } else {
+ hir::Look::WordEndHalfAscii
+ }),
]);
ConfiguredHIR { config: self.config, hir }
}
@@ -365,50 +344,6 @@ impl ConfiguredHIR {
}
}
-/// This increments the index of every capture group in the given hir by 1. If
-/// any increment results in an overflow, then an error is returned.
-fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> {
- Ok(match hir.into_kind() {
- HirKind::Empty => Hir::empty(),
- HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit),
- HirKind::Class(cls) => Hir::class(cls),
- HirKind::Look(x) => Hir::look(x),
- HirKind::Repetition(mut x) => {
- x.sub = Box::new(renumber_capture_indices(*x.sub)?);
- Hir::repetition(x)
- }
- HirKind::Capture(mut cap) => {
- cap.index = match cap.index.checked_add(1) {
- Some(index) => index,
- None => {
- // This error message kind of sucks, but it's probably
- // impossible for it to happen. The only way a capture
- // index can overflow addition is if the regex is huge
- // (or something else has gone horribly wrong).
- let msg = "could not renumber capture index, too big";
- return Err(Error::any(msg));
- }
- };
- cap.sub = Box::new(renumber_capture_indices(*cap.sub)?);
- Hir::capture(cap)
- }
- HirKind::Concat(subs) => {
- let subs = subs
- .into_iter()
- .map(|sub| renumber_capture_indices(sub))
- .collect::<Result<Vec<Hir>, Error>>()?;
- Hir::concat(subs)
- }
- HirKind::Alternation(subs) => {
- let subs = subs
- .into_iter()
- .map(|sub| renumber_capture_indices(sub))
- .collect::<Result<Vec<Hir>, Error>>()?;
- Hir::alternation(subs)
- }
- })
-}
-
/// Returns true if the given literal string contains any byte from the line
/// terminator given.
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {
diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs
index 1c921773..88a8adbe 100644
--- a/crates/regex/src/error.rs
+++ b/crates/regex/src/error.rs
@@ -30,10 +30,6 @@ impl Error {
Error { kind: ErrorKind::Regex(err.to_string()) }
}
- pub(crate) fn any<E: ToString>(msg: E) -> Error {
- Error { kind: ErrorKind::Regex(msg.to_string()) }
- }
-
/// Return the kind of this error.
pub fn kind(&self) -> &ErrorKind {
&self.kind
diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs
index 068c7c71..4693bff1 100644
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@@ -15,4 +15,3 @@ mod literal;
mod matcher;
mod non_matching;
mod strip;
-mod word;
diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs
index 65c61d27..f3f673ff 100644
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
use {
grep_matcher::{
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
@@ -11,12 +9,7 @@ use {
},
};
-use crate::{
- config::{Config, ConfiguredHIR},
- error::Error,
- literal::InnerLiterals,
- word::WordMatcher,
-};
+use crate::{config::Config, error::Error, literal::InnerLiterals};
/// A builder for constructing a `Matcher` using regular expressions.
///
@@ -61,9 +54,15 @@ impl RegexMatcherBuilder {
&self,
patterns: &[P],
) -> Result<RegexMatcher, Error> {
- let chir = self.config.build_many(patterns)?;
- let matcher = RegexMatcherImpl::new(chir)?;
- let (chir, re) = (matcher.chir(), matcher.regex());
+ let mut chir = self.config.build_many(patterns)?;
+ // 'whole_line' is a strict subset of 'word', so when it is enabled,
+ // we don't need to bother with anything specific to word matching.
+ if chir.config().whole_line {
+ chir = chir.into_whole_line();
+ } else if chir.config().word {
+ chir = chir.into_word();
+ }
+ let regex = chir.to_regex()?;
log::trace!("final regex: {:?}", chir.hir().to_string());
let non_matching_bytes = chir.non_matching_bytes();
@@ -76,18 +75,13 @@ impl RegexMatcherBuilder {
// then run the original regex on only that line. (In this case, the
// regex engine is likely to handle this case for us since it's so
// simple, but the idea applies.)
- let fast_line_regex = InnerLiterals::new(chir, re).one_regex()?;
+ let fast_line_regex = InnerLiterals::new(&chir, &regex).one_regex()?;
// We override the line terminator in case the configured HIR doesn't
// support it.
let mut config = self.config.clone();
config.line_terminator = chir.line_terminator();
- Ok(RegexMatcher {
- config,
- matcher,
- fast_line_regex,
- non_matching_bytes,
- })
+ Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes })
}
/// Build a new matcher from a plain alternation of literals.
@@ -357,8 +351,9 @@ impl RegexMatcherBuilder {
pub struct RegexMatcher {
/// The configuration specified by the caller.
config: Config,
- /// The underlying matcher implementation.
- matcher: RegexMatcherImpl,
+ /// The regular expression compiled from the pattern provided by the
+ /// caller.
+ regex: Regex,
/// A regex that never reports false negatives but may report false
/// positives that is believed to be capable of being matched more quickly
/// than `regex`. Typically, this is a single literal or an alternation
@@ -392,53 +387,6 @@ impl RegexMatcher {
}
}
-/// An encapsulation of the type of matcher we use in `RegexMatcher`.
-#[derive(Clone, Debug)]
-enum RegexMatcherImpl {
- /// The standard matcher used for all regular expressions.
- Standard(StandardMatcher),
- /// A matcher that only matches at word boundaries. This transforms the
- /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
- /// Because of this, the WordMatcher provides its own implementation of
- /// `Matcher` to encapsulate its use of capture groups to make them
- /// invisible to the caller.
- Word(WordMatcher),
-}
-
-impl RegexMatcherImpl {
- /// Based on the configuration, create a new implementation of the
- /// `Matcher` trait.
- fn new(mut chir: ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
- // When whole_line is set, we don't use a word matcher even if word
- // matching was requested. Why? Because `(?m:^)(pat)(?m:$)` implies
- // word matching.
- Ok(if chir.config().word && !chir.config().whole_line {
- RegexMatcherImpl::Word(WordMatcher::new(chir)?)
- } else {
- if chir.config().whole_line {
- chir = chir.into_whole_line();
- }
- RegexMatcherImpl::Standard(StandardMatcher::new(chir)?)
- })
- }
-
- /// Return the underlying regex object used.
- fn regex(&self) -> &Regex {
- match *self {
- RegexMatcherImpl::Word(ref x) => x.regex(),
- RegexMatcherImpl::Standard(ref x) => &x.regex,
- }
- }
-
- /// Return the underlying HIR of the regex used for searching.
- fn chir(&self) -> &ConfiguredHIR {
- match *self {
- RegexMatcherImpl::Word(ref x) => x.chir(),
- RegexMatcherImpl::Standard(ref x) => &x.chir,
- }
- }
-}
-
// This implementation just dispatches on the internal matcher impl except
// for the line terminator optimization, which is possibly executed via
// `fast_line_regex`.
@@ -446,221 +394,84 @@ impl Matcher for RegexMatcher {
type Captures = RegexCaptures;
type Error = NoError;
+ #[inline]
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.find_at(haystack, at),
- Word(ref m) => m.find_at(haystack, at),
- }
+ let input = Input::new(haystack).span(at..haystack.len());
+ Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
}
+ #[inline]
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.new_captures(),
- Word(ref m) => m.new_captures(),
- }
+ Ok(RegexCaptures::new(self.regex.create_captures()))
}
+ #[inline]
fn capture_count(&self) -> usize {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.capture_count(),
- Word(ref m) => m.capture_count(),
- }
+ self.regex.captures_len()
}
+ #[inline]
fn capture_index(&self, name: &str) -> Option<usize> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.capture_index(name),
- Word(ref m) => m.capture_index(name),
- }
- }
-
- fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.find(haystack),
- Word(ref m) => m.find(haystack),
- }
- }
-
- fn find_iter<F>(&self, haystack: &[u8], matched: F) -> Result<(), NoError>
- where
- F: FnMut(Match) -> bool,
- {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.find_iter(haystack, matched),
- Word(ref m) => m.find_iter(haystack, matched),
- }
+ self.regex.group_info().to_index(PatternID::ZERO, name)
}
+ #[inline]
fn try_find_iter<F, E>(
&self,
haystack: &[u8],
- matched: F,
+ mut matched: F,
) -> Result<Result<(), E>, NoError>
where
F: FnMut(Match) -> Result<bool, E>,
{
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.try_find_iter(haystack, matched),
- Word(ref m) => m.try_find_iter(haystack, matched),
- }
- }
-
- fn captures(
- &self,
- haystack: &[u8],
- caps: &mut RegexCaptures,
- ) -> Result<bool, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.captures(haystack, caps),
- Word(ref m) => m.captures(haystack, caps),
- }
- }
-
- fn captures_iter<F>(
- &self,
- haystack: &[u8],
- caps: &mut RegexCaptures,
- matched: F,
- ) -> Result<(), NoError>
- where
- F: FnMut(&RegexCaptures) -> bool,
- {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.captures_iter(haystack, caps, matched),
- Word(ref m) => m.captures_iter(haystack, caps, matched),
- }
- }
-
- fn try_captures_iter<F, E>(
- &self,
- haystack: &[u8],
- caps: &mut RegexCaptures,
- matched: F,
- ) -> Result<Result<(), E>, NoError>
- where
- F: FnMut(&RegexCaptures) -> Result<bool, E>,
- {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
- Word(ref m) => m.try_captures_iter(haystack, caps, matched),
+ for m in self.regex.find_iter(haystack) {
+ match matched(Match::new(m.start(), m.end())) {
+ Ok(true) => continue,
+ Ok(false) => return Ok(Ok(())),
+ Err(err) => return Ok(Err(err)),
+ }
}
+ Ok(Ok(()))
}
+ #[inline]
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.captures_at(haystack, at, caps),
- Word(ref m) => m.captures_at(haystack, at, caps),
- }
- }
-
- fn replace<F>(
- &self,
- haystack: &[u8],
- dst: &mut Vec<u8>,
- append: F,
- ) -> Result<(), NoError>
- where
- F: FnMut(Match, &mut Vec<u8>) -> bool,
- {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.replace(haystack, dst, append),
- Word(ref m) => m.replace(haystack, dst, append),
- }
- }
-
- fn replace_with_captures<F>(
- &self,
- haystack: &[u8],
- caps: &mut RegexCaptures,
- dst: &mut Vec<u8>,
- append: F,
- ) -> Result<(), NoError>
- where
- F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
- {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => {
- m.replace_with_captures(haystack, caps, dst, append)
- }
- Word(ref m) => {
- m.replace_with_captures(haystack, caps, dst, append)
- }
- }
- }
-
- fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.is_match(haystack),
- Word(ref m) => m.is_match(haystack),
- }
- }
-
- fn is_match_at(
- &self,
- haystack: &[u8],
- at: usize,
- ) -> Result<bool, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.is_match_at(haystack, at),
- Word(ref m) => m.is_match_at(haystack, at),
- }
- }
-
- fn shortest_match(
- &self,
- haystack: &[u8],
- ) -> Result<Option<usize>, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.shortest_match(haystack),
- Word(ref m) => m.shortest_match(haystack),
- }
+ let input = Input::new(haystack).span(at..haystack.len());
+ let caps = caps.captures_mut();
+ self.regex.search_captures(&input, caps);
+ Ok(caps.is_match())
}
+ #[inline]
fn shortest_match_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<usize>, NoError> {
- use self::RegexMatcherImpl::*;
- match self.matcher {
- Standard(ref m) => m.shortest_match_at(haystack, at),
- Word(ref m) => m.shortest_match_at(haystack, at),
- }
+ let input = Input::new(haystack).span(at..haystack.len());
+ Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
}
+ #[inline]
fn non_matching_bytes(&self) -> Option<&ByteSet> {
Some(&self.non_matching_bytes)
}
+ #[inline]
fn line_terminator(&self) -> Option<LineTerminator> {
self.config.line_terminator
}
+ #[inline]
fn find_candidate_line(
&self,
haystack: &[u8],
@@ -679,93 +490,6 @@ impl Matcher for RegexMatcher {
}
}
-/// The implementation of the standard regex matcher.
-#[derive(Clone, Debug)]
-struct StandardMatcher {
- /// The regular expression compiled from the pattern provided by the
- /// caller.
- regex: Regex,
- /// The HIR that produced this regex.
- ///
- /// We put this in an `Arc` because by the time it gets here, it won't
- /// change. And because cloning and dropping an `Hir` is somewhat expensive
- /// due to its deep recursive representation.
- chir: Arc<ConfiguredHIR>,
-}
-
-impl StandardMatcher {
- fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
- let chir = Arc::new(chir);
- let regex = chir.to_regex()?;
- Ok(StandardMatcher { regex, chir })
- }
-}
-
-impl Matcher for StandardMatcher {
- type Captures = RegexCaptures;
- type Error = NoError;
-
- fn find_at(
- &self,
- haystack: &[u8],
- at: usize,
- ) -> Result<Option<Match>, NoError> {
- let input = Input::new(haystack).span(at..haystack.len());
- Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
- }
-
- fn new_captures(&self) -> Result<RegexCaptures, NoError> {
- Ok(RegexCaptures::new(self.regex.create_captures()))
- }
-
- fn capture_count(&self) -> usize {
- self.regex.captures_len()
- }
-
- fn capture_index(&self, name: &str) -> Option<usize> {
- self.regex.group_info().to_index(PatternID::ZERO, name)
- }
-
- fn try_find_iter<F, E>(
- &self,
- haystack: &[u8],
- mut matched: F,
- ) -> Result<Result<(), E>, NoError>
- where
- F: FnMut(Match) -> Result<bool, E>,
- {
- for m in self.regex.find_iter(haystack) {
- match matched(Match::new(m.start(), m.end())) {
- Ok(true) => continue,
- Ok(false) => return Ok(Ok(())),
- Err(err) => return Ok(Err(err)),
- }
- }
- Ok(Ok(()))
- }
-
- fn captures_at(
- &self,
- haystack: &[u8],
- at: usize,
- caps: &mut RegexCaptures,
- ) -> Result<bool, NoError> {
- let input = Input::new(haystack).span(at..haystack.len());
- let caps = caps.captures_mut();
- self.regex.search_captures(&input, caps);
- Ok(caps.is_match())
- }
-
- fn shortest_match_at(
- &self,
- haystack: &[u8],
- at: usize,
- ) -> Result<Option<usize>, NoError> {
- let input = Input::new(haystack).span(at..haystack.len());
- Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
- }
-}
-
/// Represents the match offsets of each capturing group in a match.
///
/// The first, or `0`th capture group, always corresponds to the entire match
@@ -784,46 +508,27 @@ impl Matcher for StandardMatcher {
pub struct RegexCaptures {
/// Where the captures are stored.
caps: AutomataCaptures,
- /// These captures behave as if the capturing groups begin at the given
- /// offset. When set to `0`, this has no affect and capture groups are
- /// indexed like normal.
- ///
- /// This is useful when building matchers that wrap arbitrary regular
- /// expressions. For example, `WordMatcher` takes an existing regex
- /// `re` and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that
- /// the regex has been wrapped from the caller. In order to do this,
- /// the matcher and the capturing groups must behave as if `(re)` is
- /// the `0`th capture group.
- offset: usize,
}
impl Captures for RegexCaptures {
+ #[inline]
fn len(&self) -> usize {
- self.caps
- .group_info()
- .all_group_len()
- .checked_sub(self.offset)
- .unwrap()
+ self.caps.group_info().all_group_len()
}
+ #[inline]
fn get(&self, i: usize) -> Option<Match> {
- let actual = i.checked_add(self.offset).unwrap();
- self.caps.get_group(actual).map(|sp| Match::new(sp.start, sp.end))
+ self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end))
}
}
impl RegexCaptures {
+ #[inline]
pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures {
- RegexCaptures::with_offset(caps, 0)
- }
-
- pub(crate) fn with_offset(
- caps: AutomataCaptures,
- offset: usize,
- ) -> RegexCaptures {
- RegexCaptures { caps, offset }
+ RegexCaptures { caps }
}
+ #[inline]
pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures {
&mut self.caps
}
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index 7fde6c46..f93ed13b 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -19,7 +19,14 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
match *expr.kind() {
HirKind::Empty
| HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
- | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
+ | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate)
+ | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode)
+ | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode)
+ | HirKind::Look(
+ Look::WordStartHalfAscii | Look::WordStartHalfUnicode,
+ )
+ | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => {
+ }
HirKind::Look(Look::Start | Look::End) => {
// FIXME: This is wrong, but not doing this leads to incorrect
// results because of how anchored searches are implemented in
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
deleted file mode 100644
index 52fb61ce..00000000
--- a/crates/regex/src/word.rs
+++ /dev/null
@@ -1,341 +0,0 @@
-use std::{
- collections::HashMap,
- panic::{RefUnwindSafe, UnwindSafe},
- sync::Arc,
-};
-
-use {
- grep_matcher::{Match, Matcher, NoError},
- regex_automata::{
- meta::Regex, util::captures::Captures, util::pool::Pool, Input,
- PatternID,
- },
-};
-
-use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
-
-type PoolFn =
- Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;
-
-/// A matcher for implementing "word match" semantics.
-#[derive(Debug)]
-pub(crate) struct WordMatcher {
- /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
- regex: Regex,
- /// The HIR that produced the regex above. We don't keep the HIR for the
- /// `original` regex.
- ///
- /// We put this in an `Arc` because by the time it gets here, it won't
- /// change. And because cloning and dropping an `Hir` is somewhat expensive
- /// due to its deep recursive representation.
- chir: Arc<ConfiguredHIR>,
- /// The original regex supplied by the user, which we use in a fast path
- /// to try and detect matches before deferring to slower engines.
- original: Regex,
- /// A map from capture group name to capture group index.
- names: HashMap<String, usize>,
- /// A thread-safe pool of reusable buffers for finding the match offset of
- /// the inner group.
- caps: Arc<Pool<Captures, PoolFn>>,
-}
-
-impl Clone for WordMatcher {
- fn clone(&self) -> WordMatcher {
- // We implement Clone manually so that we get a fresh Pool such that it
- // can set its own thread owner. This permits each thread usings `caps`
- // to hit the fast path.
- //
- // Note that cloning a regex is "cheap" since it uses reference
- // counting internally.
- let re = self.regex.clone();
- WordMatcher {
- regex: self.regex.clone(),
- chir: Arc::clone(&self.chir),
- original: self.original.clone(),
- names: self.names.clone(),
- caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
- }
- }
-}
-
-impl WordMatcher {
- /// Create a new matcher from the given pattern that only produces matches
- /// that are considered "words."
- ///
- /// The given options are used to construct the regular expression
- /// internally.
- pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
- let original = chir.clone().into_anchored().to_regex()?;
- let chir = Arc::new(chir.into_word()?);
- let regex = chir.to_regex()?;
- let caps = Arc::new(Pool::new({
- let regex = regex.clone();
- Box::new(move || regex.create_captures()) as PoolFn
- }));
-