use std::cell::RefCell; use std::collections::HashMap; use std::sync::Arc; use grep_matcher::{Match, Matcher, NoError}; use regex::bytes::{CaptureLocations, Regex}; use thread_local::ThreadLocal; use config::ConfiguredHIR; use error::Error; use matcher::RegexCaptures; /// A matcher for implementing "word match" semantics. #[derive(Debug)] pub struct WordMatcher { /// The regex which is roughly `(?:^|\W)()(?:$|\W)`. regex: Regex, /// The original regex supplied by the user, which we use in a fast path /// to try and detect matches before deferring to slower engines. original: Regex, /// A map from capture group name to capture group index. names: HashMap, /// A reusable buffer for finding the match location of the inner group. locs: Arc>>, } impl Clone for WordMatcher { fn clone(&self) -> WordMatcher { // We implement Clone manually so that we get a fresh ThreadLocal such // that it can set its own thread owner. This permits each thread // usings `locs` to hit the fast path. WordMatcher { regex: self.regex.clone(), original: self.original.clone(), names: self.names.clone(), locs: Arc::new(ThreadLocal::new()), } } } impl WordMatcher { /// Create a new matcher from the given pattern that only produces matches /// that are considered "words." /// /// The given options are used to construct the regular expression /// internally. pub fn new(expr: &ConfiguredHIR) -> Result { let original = expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?; let word_expr = expr.with_pattern(|pat| { let pat = format!(r"(?:(?m:^)|\W)({})(?:\W|(?m:$))", pat); debug!("word regex: {:?}", pat); pat })?; let regex = word_expr.regex()?; let locs = Arc::new(ThreadLocal::new()); let mut names = HashMap::new(); for (i, optional_name) in regex.capture_names().enumerate() { if let Some(name) = optional_name { names.insert(name.to_string(), i.checked_sub(1).unwrap()); } } Ok(WordMatcher { regex, original, names, locs }) } /// Return the underlying regex used by this matcher. pub fn regex(&self) -> &Regex { &self.regex } /// Attempt to do a fast confirmation of a word match that covers a subset /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned /// when a match is found. Ok(None) is returned when there is definitively /// no match. Err(()) is returned when this routine could not detect /// whether there was a match or not. fn fast_find( &self, haystack: &[u8], at: usize, ) -> Result, ()> { // This is a bit hairy. The whole point here is to avoid running an // NFA simulation in the regex engine. Remember, our word regex looks // like this: // // (^|\W)()($|\W) // where ^ and $ have multiline mode DISABLED // // What we want are the match offsets of . So in the // easy/common case, the original regex will be sandwiched between // two codepoints that are in the \W class. So our approach here is to // look for a match of the overall word regexp, strip the \W ends and // then check whether the original regex matches what's left. If so, // then we are guaranteed a correct match. // // This only works though if we know that the match is sandwiched // between two \W codepoints. This only occurs when neither ^ nor $ // match. This in turn only occurs when the match is at either the // beginning or end of the haystack. In either of those cases, we // declare defeat and defer to the slower implementation. // // The reason why we cannot handle the ^/$ cases here is because we // can't assume anything about the original pattern. (Try commenting // out the checks for ^/$ below and run the tests to see examples.) let mut cand = match self.regex.find_at(haystack, at) { None => return Ok(None), Some(m) => Match::new(m.start(), m.end()), }; if cand.start() == 0 || cand.end() == haystack.len() { return Err(()); } let (_, slen) = bstr::decode_utf8(&haystack[cand]); let (_, elen) = bstr::decode_last_utf8(&haystack[cand]); cand = cand.with_start(cand.start() + slen).with_end(cand.end() - elen); if self.original.is_match(&haystack[cand]) { Ok(Some(cand)) } else { Err(()) } } } impl Matcher for WordMatcher { type Captures = RegexCaptures; type Error = NoError; fn find_at( &self, haystack: &[u8], at: usize, ) -> Result, NoError> { // To make this easy to get right, we extract captures here instead of // calling `find_at`. The actual match is at capture group `1` instead // of `0`. We *could* use `find_at` here and then trim the match after // the fact, but that's a bit harder to get right, and it's not clear // if it's worth it. // // OK, well, it turns out that it is worth it! But it is quite tricky. // See `fast_find` for details. Effectively, this lets us skip running // the NFA simulation in the regex engine in the vast majority of // cases. However, the NFA simulation is required for full correctness. match self.fast_find(haystack, at) { Ok(Some(m)) => return Ok(Some(m)), Ok(None) => return Ok(None), Err(()) => {} } let cell = self.locs.get_or(|| RefCell::new(self.regex.capture_locations())); let mut caps = cell.borrow_mut(); self.regex.captures_read_at(&mut caps, haystack, at); Ok(caps.get(1).map(|m| Match::new(m.0, m.1))) } fn new_captures(&self) -> Result { Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1)) } fn capture_count(&self) -> usize { self.regex.captures_len().checked_sub(1).unwrap() } fn capture_index(&self, name: &str) -> Option { self.names.get(name).map(|i| *i) } fn captures_at( &self, haystack: &[u8], at: usize, caps: &mut RegexCaptures, ) -> Result { let r = self.regex.captures_read_at(caps.locations_mut(), haystack, at); Ok(r.is_some()) } // We specifically do not implement other methods like find_iter or // captures_iter. Namely, the iter methods are guaranteed to be correct // by virtue of implementing find_at and captures_at above. } #[cfg(test)] mod tests { use super::WordMatcher; use config::Config; use grep_matcher::{Captures, Match, Matcher}; fn matcher(pattern: &str) -> WordMatcher { let chir = Config::default().hir(pattern).unwrap(); WordMatcher::new(&chir).unwrap() } fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> { matcher(pattern) .find(haystack.as_bytes()) .unwrap() .map(|m| (m.start(), m.end())) } fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> { let m = matcher(pattern); let mut caps = m.new_captures().unwrap(); if !m.captures(haystack.as_bytes(), &mut caps).unwrap() { None } else { caps.get(0).map(|m| (m.start(), m.end())) } } // Test that the standard `find` API reports offsets correctly. #[test] fn various_find() { assert_eq!(Some((0, 3)), find(r"foo", "foo")); assert_eq!(Some((0, 3)), find(r"foo", "foo(")); assert_eq!(Some((1, 4)), find(r"foo", "!foo(")); assert_eq!(None, find(r"foo", "!afoo(")); assert_eq!(Some((0, 3)), find(r"foo", "foo☃")); assert_eq!(None, find(r"foo", "fooб")); assert_eq!(Some((0, 4)), find(r"foo5", "foo5")); assert_eq!(None, find(r"foo", "foo5")); assert_eq!(Some((1, 4)), find(r"foo", "!foo!")); assert_eq!(Some((1, 5)), find(r"foo!", "!foo!")); assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!")); assert_eq!(Some((0, 3)), find(r"foo", "foo\n")); assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n")); assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n")); assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n")); assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!")); assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!")); assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a")); assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n")); assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##")); assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##")); assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##")); assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa")); } // See: https://github.com/BurntSushi/ripgrep/issues/389 #[test] fn regression_dash() { assert_eq!(Some((0, 2)), find(r"-2", "-2")); } // Test that the captures API also reports offsets correctly, just as // find does. This exercises a different path in the code since captures // are handled differently. #[test] fn various_captures() { assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo")); assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo(")); assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo(")); assert_eq!(None, find_by_caps(r"foo", "!afoo(")); assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃")); assert_eq!(None, find_by_caps(r"foo", "fooб")); // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб")); // See: https://github.com/BurntSushi/ripgrep/issues/389 assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2")); } // Test that the capture reporting methods work as advertised. #[test] fn capture_indexing() { let m = matcher(r"(a)(?Pb)(c)"); assert_eq!(4, m.capture_count()); assert_eq!(Some(2), m.capture_index("foo")); let mut caps = m.new_captures().unwrap(); assert_eq!(4, caps.len()); assert!(m.captures(b"abc", &mut caps).unwrap()); assert_eq!(caps.get(0), Some(Match::new(0, 3))); assert_eq!(caps.get(1), Some(Match::new(0, 1))); assert_eq!(caps.get(2), Some(Match::new(1, 2))); assert_eq!(caps.get(3), Some(Match::new(2, 3))); assert_eq!(caps.get(4), None); assert!(m.captures(b"#abc#", &mut caps).unwrap()); assert_eq!(caps.get(0), Some(Match::new(1, 4))); assert_eq!(caps.get(1), Some(Match::new(1, 2))); assert_eq!(caps.get(2), Some(Match::new(2, 3))); assert_eq!(caps.get(3), Some(Match::new(3, 4))); assert_eq!(caps.get(4), None); } }