diff options
Diffstat (limited to 'crates/printer')
-rw-r--r-- | crates/printer/src/json.rs | 34 | ||||
-rw-r--r-- | crates/printer/src/lib.rs | 11 | ||||
-rw-r--r-- | crates/printer/src/standard.rs | 59 | ||||
-rw-r--r-- | crates/printer/src/summary.rs | 66 | ||||
-rw-r--r-- | crates/printer/src/util.rs | 100 |
5 files changed, 206 insertions, 64 deletions
diff --git a/crates/printer/src/json.rs b/crates/printer/src/json.rs index b1a6fa2e..8500e6a1 100644 --- a/crates/printer/src/json.rs +++ b/crates/printer/src/json.rs @@ -4,14 +4,14 @@ use std::time::Instant; use grep_matcher::{Match, Matcher}; use grep_searcher::{ - Searcher, Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, - SinkMatch, + Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, SinkMatch, }; use serde_json as json; use counter::CounterWriter; use jsont; use stats::Stats; +use util::find_iter_at_in_context; /// The configuration for the JSON printer. /// @@ -603,7 +603,12 @@ impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { /// Execute the matcher over the given bytes and record the match /// locations if the current configuration demands match granularity. - fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + fn record_matches( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range<usize>, + ) -> io::Result<()> { self.json.matches.clear(); // If printing requires knowing the location of each individual match, // then compute and stored those right now for use later. While this @@ -612,12 +617,17 @@ impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { // the extent that it's easy to ensure that we never do more than // one search to find the matches. let matches = &mut self.json.matches; - self.matcher - .find_iter(bytes, |m| { - matches.push(m); + find_iter_at_in_context( + searcher, + &self.matcher, + bytes, + range.clone(), + |m| { + let (s, e) = (m.start() - range.start, m.end() - range.start); + matches.push(Match::new(s, e)); true - }) - .map_err(io::Error::error_message)?; + }, + )?; // Don't report empty matches appearing at the end of the bytes. if !matches.is_empty() && matches.last().unwrap().is_empty() @@ -691,7 +701,11 @@ impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { self.after_context_remaining = searcher.after_context() as u64; } - self.record_matches(mat.bytes())?; + self.record_matches( + searcher, + mat.buffer(), + mat.bytes_range_in_buffer(), + )?; self.stats.add_matches(self.json.matches.len() as u64); self.stats.add_matched_lines(mat.lines().count() as u64); @@ -720,7 +734,7 @@ impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { self.after_context_remaining.saturating_sub(1); } let submatches = if searcher.invert_match() { - self.record_matches(ctx.bytes())?; + self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?; SubMatches::new(ctx.bytes(), &self.json.matches) } else { SubMatches::empty() diff --git a/crates/printer/src/lib.rs b/crates/printer/src/lib.rs index fa321dcd..abb1087f 100644 --- a/crates/printer/src/lib.rs +++ b/crates/printer/src/lib.rs @@ -92,6 +92,17 @@ pub use stats::Stats; pub use summary::{Summary, SummaryBuilder, SummaryKind, SummarySink}; pub use util::PrinterPath; +// The maximum number of bytes to execute a search to account for look-ahead. +// +// This is an unfortunate kludge since PCRE2 doesn't provide a way to search +// a substring of some input while accounting for look-ahead. In theory, we +// could refactor the various 'grep' interfaces to account for it, but it would +// be a large change. So for now, we just let PCRE2 go looking a bit for a +// match without searching the entire rest of the contents. +// +// Note that this kludge is only active in multi-line mode. +const MAX_LOOK_AHEAD: usize = 128; + #[macro_use] mod macros; diff --git a/crates/printer/src/standard.rs b/crates/printer/src/standard.rs index e7fe7c38..0c853f1b 100644 --- a/crates/printer/src/standard.rs +++ b/crates/printer/src/standard.rs @@ -8,15 +8,17 @@ use std::time::Instant; use bstr::ByteSlice; use grep_matcher::{Match, Matcher}; use grep_searcher::{ - LineStep, Searcher, Sink, SinkContext, SinkContextKind, SinkError, - SinkFinish, SinkMatch, + LineStep, Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, + SinkMatch, }; use termcolor::{ColorSpec, NoColor, WriteColor}; use color::ColorSpecs; use counter::CounterWriter; use stats::Stats; -use util::{trim_ascii_prefix, PrinterPath, Replacer, Sunk}; +use util::{ + find_iter_at_in_context, trim_ascii_prefix, PrinterPath, Replacer, Sunk, +}; /// The configuration for the standard printer. /// @@ -682,7 +684,12 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { /// Execute the matcher over the given bytes and record the match /// locations if the current configuration demands match granularity. - fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + fn record_matches( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range<usize>, + ) -> io::Result<()> { self.standard.matches.clear(); if !self.needs_match_granularity { return Ok(()); @@ -695,16 +702,21 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { // one search to find the matches (well, for replacements, we do one // additional search to perform the actual replacement). let matches = &mut self.standard.matches; - self.matcher - .find_iter(bytes, |m| { - matches.push(m); + find_iter_at_in_context( + searcher, + &self.matcher, + bytes, + range.clone(), + |m| { + let (s, e) = (m.start() - range.start, m.end() - range.start); + matches.push(Match::new(s, e)); true - }) - .map_err(io::Error::error_message)?; + }, + )?; // Don't report empty matches appearing at the end of the bytes. if !matches.is_empty() && matches.last().unwrap().is_empty() - && matches.last().unwrap().start() >= bytes.len() + && matches.last().unwrap().start() >= range.end { matches.pop().unwrap(); } @@ -715,14 +727,25 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { /// replacement, lazily allocating memory if necessary. /// /// To access the result of a replacement, use `replacer.replacement()`. - fn replace(&mut self, bytes: &[u8]) -> io::Result<()> { + fn replace( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range<usize>, + ) -> io::Result<()> { self.replacer.clear(); if self.standard.config.replacement.is_some() { let replacement = (*self.standard.config.replacement) .as_ref() .map(|r| &*r) .unwrap(); - self.replacer.replace_all(&self.matcher, bytes, replacement)?; + self.replacer.replace_all( + searcher, + &self.matcher, + bytes, + range, + replacement, + )?; } Ok(()) } @@ -777,8 +800,12 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { self.after_context_remaining = searcher.after_context() as u64; } - self.record_matches(mat.bytes())?; - self.replace(mat.bytes())?; + self.record_matches( + searcher, + mat.buffer(), + mat.bytes_range_in_buffer(), + )?; + self.replace(searcher, mat.buffer(), mat.bytes_range_in_buffer())?; if let Some(ref mut stats) = self.stats { stats.add_matches(self.standard.matches.len() as u64); @@ -807,8 +834,8 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { self.after_context_remaining.saturating_sub(1); } if searcher.invert_match() { - self.record_matches(ctx.bytes())?; - self.replace(ctx.bytes())?; + self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?; + self.replace(searcher, ctx.bytes(), 0..ctx.bytes().len())?; } if searcher.binary_detection().convert_byte().is_some() { if self.binary_byte_offset.is_some() { diff --git a/crates/printer/src/summary.rs b/crates/printer/src/summary.rs index 5ce087ef..e062662d 100644 --- a/crates/printer/src/summary.rs +++ b/crates/printer/src/summary.rs @@ -11,7 +11,7 @@ use termcolor::{ColorSpec, NoColor, WriteColor}; use color::ColorSpecs; use counter::CounterWriter; use stats::Stats; -use util::PrinterPath; +use util::{find_iter_at_in_context, PrinterPath}; /// The configuration for the summary printer. /// @@ -504,6 +504,17 @@ impl<'p, 's, M: Matcher, W: WriteColor> SummarySink<'p, 's, M, W> { self.stats.as_ref() } + /// Returns true if and only if the searcher may report matches over + /// multiple lines. + /// + /// Note that this doesn't just return whether the searcher is in multi + /// line mode, but also checks if the mater can match over multiple lines. + /// If it can't, then we don't need multi line handling, even if the + /// searcher has multi line mode enabled. + fn multi_line(&self, searcher: &Searcher) -> bool { + searcher.multi_line_with_matcher(&self.matcher) + } + /// Returns true if this printer should quit. /// /// This implements the logic for handling quitting after seeing a certain @@ -579,32 +590,39 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for SummarySink<'p, 's, M, W> { fn matched( &mut self, - _searcher: &Searcher, + searcher: &Searcher, mat: &SinkMatch, ) -> Result<bool, io::Error> { - self.match_count += 1; - if let Some(ref mut stats) = self.stats { - let mut match_count = 0; - self.matcher - .find_iter(mat.bytes(), |_| { - match_count += 1; + let is_multi_line = self.multi_line(searcher); + let sink_match_count = if self.stats.is_none() && !is_multi_line { + 1 + } else { + // This gives us as many bytes as the searcher can offer. This + // isn't guaranteed to hold the necessary context to get match + // detection correct (because of look-around), but it does in + // practice. + let buf = mat.buffer(); + let range = mat.bytes_range_in_buffer(); + let mut count = 0; + find_iter_at_in_context( + searcher, + &self.matcher, + buf, + range, + |_| { + count += 1; true - }) - .map_err(io::Error::error_message)?; - if match_count == 0 { - // It is possible for the match count to be zero when - // look-around is used. Since `SinkMatch` won't necessarily - // contain the look-around in its match span, the search here - // could fail to find anything. - // - // It seems likely that setting match_count=1 here is probably - // wrong in some cases, but I don't think we can do any - // better. (Because this printer cannot assume that subsequent - // contents have been loaded into memory, so we have no way of - // increasing the search span here.) - match_count = 1; - } - stats.add_matches(match_count); + }, + )?; + count + }; + if is_multi_line { + self.match_count += sink_match_count; + } else { + self.match_count += 1; + } + if let Some(ref mut stats) = self.stats { + stats.add_matches(sink_match_count); stats.add_matched_lines(mat.lines().count() as u64); } else if self.summary.config.kind.quit_early() { return Ok(false); diff --git a/crates/printer/src/util.rs b/crates/printer/src/util.rs index 3948d970..37e56529 100644 --- a/crates/printer/src/util.rs +++ b/crates/printer/src/util.rs @@ -7,11 +7,13 @@ use std::time; use bstr::{ByteSlice, ByteVec}; use grep_matcher::{Captures, LineTerminator, Match, Matcher}; use grep_searcher::{ - LineIter, SinkContext, SinkContextKind, SinkError, SinkMatch, + LineIter, Searcher, SinkContext, SinkContextKind, SinkError, SinkMatch, }; #[cfg(feature = "serde1")] use serde::{Serialize, Serializer}; +use MAX_LOOK_AHEAD; + /// A type for handling replacements while amortizing allocation. pub struct Replacer<M: Matcher> { space: Option<Space<M>>, @@ -52,10 +54,22 @@ impl<M: Matcher> Replacer<M> { /// This can fail if the underlying matcher reports an error. pub fn replace_all<'a>( &'a mut self, + searcher: &Searcher, matcher: &M, - subject: &[u8], + mut subject: &[u8], + range: std::ops::Range<usize>, replacement: &[u8], ) -> io::Result<()> { + // See the giant comment in 'find_iter_at_in_context' below for why we + // do this dance. + let is_multi_line = searcher.multi_line_with_matcher(&matcher); + if is_multi_line { + if subject[range.end..].len() >= MAX_LOOK_AHEAD { + subject = &subject[..range.end + MAX_LOOK_AHEAD]; + } + } else { + subject = &subject[..range.end]; + } { let &mut Space { ref mut dst, ref mut caps, ref mut matches } = self.allocate(matcher)?; @@ -63,18 +77,24 @@ impl<M: Matcher> Replacer<M> { matches.clear(); matcher - .replace_with_captures(subject, caps, dst, |caps, dst| { - let start = dst.len(); - caps.interpolate( - |name| matcher.capture_index(name), - subject, - replacement, - dst, - ); - let end = dst.len(); - matches.push(Match::new(start, end)); - true - }) + .replace_with_captures_at( + subject, + range.start, + caps, + dst, + |caps, dst| { + let start = dst.len(); + caps.interpolate( + |name| matcher.capture_index(name), + subject, + replacement, + dst, + ); + let end = dst.len(); + matches.push(Match::new(start, end)); + true + }, + ) .map_err(io::Error::error_message)?; } Ok(()) @@ -357,3 +377,55 @@ pub fn trim_ascii_prefix( .count(); range.with_start(range.start() + count) } + +pub fn find_iter_at_in_context<M, F>( + searcher: &Searcher, + matcher: M, + mut bytes: &[u8], + range: std::ops::Range<usize>, + mut matched: F, +) -> io::Result<()> +where + M: Matcher, + F: FnMut(Match) -> bool, +{ + // This strange dance is to account for the possibility of look-ahead in + // the regex. The problem here is that mat.bytes() doesn't include the + // lines beyond the match boundaries in mulit-line mode, which means that + // when we try to rediscover the full set of matches here, the regex may no + // longer match if it required some look-ahead beyond the matching lines. + // + // PCRE2 (and the grep-matcher interfaces) has no way of specifying an end + // bound of the search. So we kludge it and let the regex engine search the + // rest of the buffer... But to avoid things getting too crazy, we cap the + // buffer. + // + // If it weren't for multi-line mode, then none of this would be needed. + // Alternatively, if we refactored the grep interfaces to pass along the + // full set of matches (if available) from the searcher, then that might + // also help here. But that winds up paying an upfront unavoidable cost for + // the case where matches don't need to be counted. So then you'd have to + // introduce a way to pass along matches conditionally, only when needed. + // Yikes. + // + // Maybe the bigger picture thing here is that the searcher should be + // responsible for finding matches when necessary, and the printer + // shouldn't be involved in this business in the first place. Sigh. Live + // and learn. Abstraction boundaries are hard. + let is_multi_line = searcher.multi_line_with_matcher(&matcher); + if is_multi_line { + if bytes[range.end..].len() >= MAX_LOOK_AHEAD { + bytes = &bytes[..range.end + MAX_LOOK_AHEAD]; + } + } else { + bytes = &bytes[..range.end]; + } + matcher + .find_iter_at(bytes, range.start, |m| { + if m.start() >= range.end { + return false; + } + matched(m) + }) + .map_err(io::Error::error_message) +} |