summaryrefslogtreecommitdiffstats
path: root/grep-searcher
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2019-04-08 19:28:38 -0400
committerAndrew Gallant <jamslam@gmail.com>2019-04-14 19:29:27 -0400
commita7d26c8f144a4957b75f71087a66692d0b25759a (patch)
tree4888ac5ea66643ac919d4e12c60cc51992bef11a /grep-searcher
parentbd222ae93fa0cabe7d51ba8db40ece99579bdaed (diff)
binary: rejigger ripgrep's handling of binary files
This commit attempts to surface binary filtering in a slightly more user friendly way. Namely, before, ripgrep would silently stop searching a file if it detected a NUL byte, even if it had previously printed a match. This can lead to the user quite reasonably assuming that there are no more matches, since a partial search is fairly unintuitive. (ripgrep has this behavior by default because it really wants to NOT search binary files at all, just like it doesn't search gitignored or hidden files.) With this commit, if a match has already been printed and ripgrep detects a NUL byte, then it will print a warning message indicating that the search stopped prematurely. Moreover, this commit adds a new flag, --binary, which causes ripgrep to stop filtering binary files, but in a way that still avoids dumping binary data into terminals. That is, the --binary flag makes ripgrep behave more like grep's default behavior. For files explicitly specified in a search, e.g., `rg foo some-file`, then no binary filtering is applied (just like no gitignore and no hidden file filtering is applied). Instead, ripgrep behaves as if you gave the --binary flag for all explicitly given files. This was a fairly invasive change, and potentially increases the UX complexity of ripgrep around binary files. (Before, there were two binary modes, where as now there are three.) However, ripgrep is now a bit louder with warning messages when binary file detection might otherwise be hiding potential matches, so hopefully this is a net improvement. Finally, the `-uuu` convenience now maps to `--no-ignore --hidden --binary`, since this is closer to the actualy intent of the `--unrestricted` flag, i.e., to reduce ripgrep's smart filtering. As a consequence, `rg -uuu foo` should now search roughly the same number of bytes as `grep -r foo`, and `rg -uuua foo` should search roughly the same number of bytes as `grep -ra foo`. (The "roughly" weasel word is used because grep's and ripgrep's binary file detection might differ somewhat---perhaps based on buffer sizes---which can impact exactly what is and isn't searched.) See the numerous tests in tests/binary.rs for intended behavior. Fixes #306, Fixes #855
Diffstat (limited to 'grep-searcher')
-rw-r--r--grep-searcher/src/line_buffer.rs8
-rw-r--r--grep-searcher/src/searcher/core.rs36
-rw-r--r--grep-searcher/src/searcher/glue.rs19
-rw-r--r--grep-searcher/src/searcher/mod.rs62
-rw-r--r--grep-searcher/src/sink.rs40
5 files changed, 135 insertions, 30 deletions
diff --git a/grep-searcher/src/line_buffer.rs b/grep-searcher/src/line_buffer.rs
index c2e54a9e..cc7dd578 100644
--- a/grep-searcher/src/line_buffer.rs
+++ b/grep-searcher/src/line_buffer.rs
@@ -317,6 +317,14 @@ pub struct LineBuffer {
}
impl LineBuffer {
+ /// Set the binary detection method used on this line buffer.
+ ///
+ /// This permits dynamically changing the binary detection strategy on
+ /// an existing line buffer without needing to create a new one.
+ pub fn set_binary_detection(&mut self, binary: BinaryDetection) {
+ self.config.binary = binary;
+ }
+
/// Reset this buffer, such that it can be used with a new reader.
fn clear(&mut self) {
self.pos = 0;
diff --git a/grep-searcher/src/searcher/core.rs b/grep-searcher/src/searcher/core.rs
index ff2cd18d..dd621bba 100644
--- a/grep-searcher/src/searcher/core.rs
+++ b/grep-searcher/src/searcher/core.rs
@@ -90,6 +90,13 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
self.sink_matched(buf, range)
}
+ pub fn binary_data(
+ &mut self,
+ binary_byte_offset: u64,
+ ) -> Result<bool, S::Error> {
+ self.sink.binary_data(&self.searcher, binary_byte_offset)
+ }
+
pub fn begin(&mut self) -> Result<bool, S::Error> {
self.sink.begin(&self.searcher)
}
@@ -141,19 +148,28 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
consumed
}
- pub fn detect_binary(&mut self, buf: &[u8], range: &Range) -> bool {
+ pub fn detect_binary(
+ &mut self,
+ buf: &[u8],
+ range: &Range,
+ ) -> Result<bool, S::Error> {
if self.binary_byte_offset.is_some() {
- return true;
+ return Ok(self.config.binary.quit_byte().is_some());
}
let binary_byte = match self.config.binary.0 {
BinaryDetection::Quit(b) => b,
- _ => return false,
+ BinaryDetection::Convert(b) => b,
+ _ => return Ok(false),
};
if let Some(i) = B(&buf[*range]).find_byte(binary_byte) {
- self.binary_byte_offset = Some(range.start() + i);
- true
+ let offset = range.start() + i;
+ self.binary_byte_offset = Some(offset);
+ if !self.binary_data(offset as u64)? {
+ return Ok(true);
+ }
+ Ok(self.config.binary.quit_byte().is_some())
} else {
- false
+ Ok(false)
}
}
@@ -416,7 +432,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
buf: &[u8],
range: &Range,
) -> Result<bool, S::Error> {
- if self.binary && self.detect_binary(buf, range) {
+ if self.binary && self.detect_binary(buf, range)? {
return Ok(false);
}
if !self.sink_break_context(range.start())? {
@@ -448,7 +464,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
buf: &[u8],
range: &Range,
) -> Result<bool, S::Error> {
- if self.binary && self.detect_binary(buf, range) {
+ if self.binary && self.detect_binary(buf, range)? {
return Ok(false);
}
self.count_lines(buf, range.start());
@@ -478,7 +494,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
) -> Result<bool, S::Error> {
assert!(self.after_context_left >= 1);
- if self.binary && self.detect_binary(buf, range) {
+ if self.binary && self.detect_binary(buf, range)? {
return Ok(false);
}
self.count_lines(buf, range.start());
@@ -507,7 +523,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
buf: &[u8],
range: &Range,
) -> Result<bool, S::Error> {
- if self.binary && self.detect_binary(buf, range) {
+ if self.binary && self.detect_binary(buf, range)? {
return Ok(false);
}
self.count_lines(buf, range.start());
diff --git a/grep-searcher/src/searcher/glue.rs b/grep-searcher/src/searcher/glue.rs
index 3a5d4291..4f362dab 100644
--- a/grep-searcher/src/searcher/glue.rs
+++ b/grep-searcher/src/searcher/glue.rs
@@ -51,6 +51,7 @@ where M: Matcher,
fn fill(&mut self) -> Result<bool, S::Error> {
assert!(self.rdr.buffer()[self.core.pos()..].is_empty());
+ let already_binary = self.rdr.binary_byte_offset().is_some();
let old_buf_len = self.rdr.buffer().len();
let consumed = self.core.roll(self.rdr.buffer());
self.rdr.consume(consumed);
@@ -58,7 +59,14 @@ where M: Matcher,
Err(err) => return Err(S::Error::error_io(err)),
Ok(didread) => didread,
};
- if !didread || self.rdr.binary_byte_offset().is_some() {
+ if !already_binary {
+ if let Some(offset) = self.rdr.binary_byte_offset() {
+ if !self.core.binary_data(offset)? {
+ return Ok(false);
+ }
+ }
+ }
+ if !didread || self.should_binary_quit() {
return Ok(false);
}
// If rolling the buffer didn't result in consuming anything and if
@@ -71,6 +79,11 @@ where M: Matcher,
}
Ok(true)
}
+
+ fn should_binary_quit(&self) -> bool {
+ self.rdr.binary_byte_offset().is_some()
+ && self.config.binary.quit_byte().is_some()
+ }
}
#[derive(Debug)]
@@ -103,7 +116,7 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> {
DEFAULT_BUFFER_CAPACITY,
);
let binary_range = Range::new(0, binary_upto);
- if !self.core.detect_binary(self.slice, &binary_range) {
+ if !self.core.detect_binary(self.slice, &binary_range)? {
while
!self.slice[self.core.pos()..].is_empty()
&& self.core.match_by_line(self.slice)?
@@ -155,7 +168,7 @@ impl<'s, M: Matcher, S: Sink> MultiLine<'s, M, S> {
DEFAULT_BUFFER_CAPACITY,
);
let binary_range = Range::new(0, binary_upto);
- if !self.core.detect_binary(self.slice, &binary_range) {
+ if !self.core.detect_binary(self.slice, &binary_range)? {
let mut keepgoing = true;
while !self.slice[self.core.pos()..].is_empty() && keepgoing {
keepgoing = self.sink()?;
diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs
index 729b491b..e20e04a3 100644
--- a/grep-searcher/src/searcher/mod.rs
+++ b/grep-searcher/src/searcher/mod.rs
@@ -75,25 +75,41 @@ impl BinaryDetection {
BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
}
- // TODO(burntsushi): Figure out how to make binary conversion work. This
- // permits implementing GNU grep's default behavior, which is to zap NUL
- // bytes but still execute a search (if a match is detected, then GNU grep
- // stops and reports that a match was found but doesn't print the matching
- // line itself).
- //
- // This behavior is pretty simple to implement using the line buffer (and
- // in fact, it is already implemented and tested), since there's a fixed
- // size buffer that we can easily write to. The issue arises when searching
- // a `&[u8]` (whether on the heap or via a memory map), since this isn't
- // something we can easily write to.
-
- /// The given byte is searched in all contents read by the line buffer. If
- /// it occurs, then it is replaced by the line terminator. The line buffer
- /// guarantees that this byte will never be observable by callers.
- #[allow(dead_code)]
- fn convert(binary_byte: u8) -> BinaryDetection {
+ /// Binary detection is performed by looking for the given byte, and
+ /// replacing it with the line terminator configured on the searcher.
+ /// (If the searcher is configured to use `CRLF` as the line terminator,
+ /// then this byte is replaced by just `LF`.)
+ ///
+ /// When searching is performed using a fixed size buffer, then the
+ /// contents of that buffer are always searched for the presence of this
+ /// byte and replaced with the line terminator. In effect, the caller is
+ /// guaranteed to never observe this byte while searching.
+ ///
+ /// When searching is performed with the entire contents mapped into
+ /// memory, then this setting has no effect and is ignored.
+ pub fn convert(binary_byte: u8) -> BinaryDetection {
BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
}
+
+ /// If this binary detection uses the "quit" strategy, then this returns
+ /// the byte that will cause a search to quit. In any other case, this
+ /// returns `None`.
+ pub fn quit_byte(&self) -> Option<u8> {
+ match self.0 {
+ line_buffer::BinaryDetection::Quit(b) => Some(b),
+ _ => None,
+ }
+ }
+
+ /// If this binary detection uses the "convert" strategy, then this returns
+ /// the byte that will be replaced by the line terminator. In any other
+ /// case, this returns `None`.
+ pub fn convert_byte(&self) -> Option<u8> {
+ match self.0 {
+ line_buffer::BinaryDetection::Convert(b) => Some(b),
+ _ => None,
+ }
+ }
}
/// An encoding to use when searching.
@@ -739,6 +755,12 @@ impl Searcher {
}
}
+ /// Set the binary detection method used on this searcher.
+ pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
+ self.config.binary = detection.clone();
+ self.line_buffer.borrow_mut().set_binary_detection(detection.0);
+ }
+
/// Check that the searcher's configuration and the matcher are consistent
/// with each other.
fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
@@ -778,6 +800,12 @@ impl Searcher {
self.config.line_term
}
+ /// Returns the type of binary detection configured on this searcher.
+ #[inline]
+ pub fn binary_detection(&self) -> &BinaryDetection {
+ &self.config.binary
+ }
+
/// Returns true if and only if this searcher is configured to invert its
/// search results. That is, matching lines are lines that do **not** match
/// the searcher's matcher.
diff --git a/grep-searcher/src/sink.rs b/grep-searcher/src/sink.rs
index bf2316f7..63a8ae24 100644
--- a/grep-searcher/src/sink.rs
+++ b/grep-searcher/src/sink.rs
@@ -167,6 +167,28 @@ pub trait Sink {
Ok(true)
}
+ /// This method is called whenever binary detection is enabled and binary
+ /// data is found. If binary data is found, then this is called at least
+ /// once for the first occurrence with the absolute byte offset at which
+ /// the binary data begins.
+ ///
+ /// If this returns `true`, then searching continues. If this returns
+ /// `false`, then searching is stopped immediately and `finish` is called.
+ ///
+ /// If this returns an error, then searching is stopped immediately,
+ /// `finish` is not called and the error is bubbled back up to the caller
+ /// of the searcher.
+ ///
+ /// By default, it does nothing and returns `true`.
+ #[inline]
+ fn binary_data(
+ &mut self,
+ _searcher: &Searcher,
+ _binary_byte_offset: u64,
+ ) -> Result<bool, Self::Error> {
+ Ok(true)
+ }
+
/// This method is called when a search has begun, before any search is
/// executed. By default, this does nothing.
///
@@ -229,6 +251,15 @@ impl<'a, S: Sink> Sink for &'a mut S {
}
#[inline]
+ fn binary_data(
+ &mut self,
+ searcher: &Searcher,
+ binary_byte_offset: u64,
+ ) -> Result<bool, S::Error> {
+ (**self).binary_data(searcher, binary_byte_offset)
+ }
+
+ #[inline]
fn begin(
&mut self,
searcher: &Searcher,
@@ -276,6 +307,15 @@ impl<S: Sink + ?Sized> Sink for Box<S> {
}
#[inline]
+ fn binary_data(
+ &mut self,
+ searcher: &Searcher,
+ binary_byte_offset: u64,
+ ) -> Result<bool, S::Error> {
+ (**self).binary_data(searcher, binary_byte_offset)
+ }
+
+ #[inline]
fn begin(
&mut self,
searcher: &Searcher,