summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2023-11-25 10:39:08 -0500
committerAndrew Gallant <jamslam@gmail.com>2023-11-25 15:03:53 -0500
commit9ed7565fcb47d02e76398d41f24ba0817f6b9c2f (patch)
tree847118ef328de5b61e62ee445de9bf20d689da73
parent7bb9f35d2d0b02952e20c082b17be856788e5f44 (diff)
cli: error when searching for NUL
Basically, unless the -a/--text flag is given, it is generally always an error to search for an explicit NUL byte because the binary detection will prevent it from matching. Fixes #1838
-rw-r--r--CHANGELOG.md2
-rw-r--r--crates/core/flags/hiargs.rs28
-rw-r--r--crates/regex/src/ban.rs83
-rw-r--r--crates/regex/src/config.rs7
-rw-r--r--crates/regex/src/error.rs13
-rw-r--r--crates/regex/src/lib.rs1
-rw-r--r--crates/regex/src/matcher.rs16
-rw-r--r--crates/searcher/src/line_buffer.rs2
-rw-r--r--crates/searcher/src/searcher/mod.rs2
-rw-r--r--tests/regression.rs14
10 files changed, 162 insertions, 6 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afdbab2c..feaac274 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -34,6 +34,8 @@ Feature enhancements:
Add new `--stop-on-nonmatch` flag.
* [FEATURE #1814](https://github.com/BurntSushi/ripgrep/issues/1814):
Flags are now categorized in `-h/--help` output and ripgrep's man page.
+* [FEATURE #1838](https://github.com/BurntSushi/ripgrep/issues/1838):
+ An error is shown when searching for NUL bytes with binary detection enabled.
* [FEATURE #2195](https://github.com/BurntSushi/ripgrep/issues/2195):
When `extra-verbose` mode is enabled in zsh, show extra file type info.
* [FEATURE #2298](https://github.com/BurntSushi/ripgrep/issues/2298):
diff --git a/crates/core/flags/hiargs.rs b/crates/core/flags/hiargs.rs
index 41a6bc29..59a178e4 100644
--- a/crates/core/flags/hiargs.rs
+++ b/crates/core/flags/hiargs.rs
@@ -499,9 +499,14 @@ impl HiArgs {
if let Some(limit) = self.dfa_size_limit {
builder.dfa_size_limit(limit);
}
+ if !self.binary.is_none() {
+ builder.ban_byte(Some(b'\x00'));
+ }
let m = match builder.build_many(&self.patterns.patterns) {
Ok(m) => m,
- Err(err) => anyhow::bail!(suggest_multiline(err.to_string())),
+ Err(err) => {
+ anyhow::bail!(suggest_text(suggest_multiline(err.to_string())))
+ }
};
Ok(PatternMatcher::RustRegex(m))
}
@@ -1144,6 +1149,13 @@ impl BinaryDetection {
};
BinaryDetection { explicit, implicit }
}
+
+ /// Returns true when both implicit and explicit binary detection is
+ /// disabled.
+ pub(crate) fn is_none(&self) -> bool {
+ let none = grep::searcher::BinaryDetection::none();
+ self.explicit == none && self.implicit == none
+ }
}
/// Builds the file type matcher from low level arguments.
@@ -1428,3 +1440,17 @@ When multiline mode is enabled, new line characters can be matched.",
msg
}
}
+
+/// Possibly suggest the `-a/--text` flag.
+fn suggest_text(msg: String) -> String {
+ if msg.contains("pattern contains \"\\0\"") {
+ format!(
+ "{msg}
+
+Consider enabling text mode with the --text flag (or -a for short). Otherwise,
+binary detection is enabled and matching a NUL byte is impossible.",
+ )
+ } else {
+ msg
+ }
+}
diff --git a/crates/regex/src/ban.rs b/crates/regex/src/ban.rs
new file mode 100644
index 00000000..1bd62e28
--- /dev/null
+++ b/crates/regex/src/ban.rs
@@ -0,0 +1,83 @@
+use regex_syntax::hir::{
+ self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind,
+};
+
+use crate::error::{Error, ErrorKind};
+
+/// Returns an error when a sub-expression in `expr` must match `byte`.
+pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> {
+ assert!(byte.is_ascii(), "ban byte must be ASCII");
+ let ch = char::from(byte);
+ let invalid = || Err(Error::new(ErrorKind::Banned(byte)));
+ match expr.kind() {
+ HirKind::Empty => {}
+ HirKind::Literal(hir::Literal(ref lit)) => {
+ if lit.iter().find(|&&b| b == byte).is_some() {
+ return invalid();
+ }
+ }
+ HirKind::Class(hir::Class::Unicode(ref cls)) => {
+ if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
+ let contains =
+ |r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end();
+ if cls.ranges().iter().find(contains).is_some() {
+ return invalid();
+ }
+ }
+ }
+ HirKind::Class(hir::Class::Bytes(ref cls)) => {
+ if cls.ranges().iter().map(|r| r.len()).sum::<usize>() == 1 {
+ let contains = |r: &&ClassBytesRange| {
+ r.start() <= byte && byte <= r.end()
+ };
+ if cls.ranges().iter().find(contains).is_some() {
+ return invalid();
+ }
+ }
+ }
+ HirKind::Look(_) => {}
+ HirKind::Repetition(ref x) => check(&x.sub, byte)?,
+ HirKind::Capture(ref x) => check(&x.sub, byte)?,
+ HirKind::Concat(ref xs) => {
+ for x in xs.iter() {
+ check(x, byte)?;
+ }
+ }
+ HirKind::Alternation(ref xs) => {
+ for x in xs.iter() {
+ check(x, byte)?;
+ }
+ }
+ };
+ Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+ use regex_syntax::Parser;
+
+ /// Returns true when the given pattern is detected to contain the given
+ /// banned byte.
+ fn check(pattern: &str, byte: u8) -> bool {
+ let hir = Parser::new().parse(pattern).unwrap();
+ super::check(&hir, byte).is_err()
+ }
+
+ #[test]
+ fn various() {
+ assert!(check(r"\x00", 0));
+ assert!(check(r"a\x00", 0));
+ assert!(check(r"\x00b", 0));
+ assert!(check(r"a\x00b", 0));
+ assert!(check(r"\x00|ab", 0));
+ assert!(check(r"ab|\x00", 0));
+ assert!(check(r"\x00?", 0));
+ assert!(check(r"(\x00)", 0));
+
+ assert!(check(r"[\x00]", 0));
+ assert!(check(r"[^[^\x00]]", 0));
+
+ assert!(!check(r"[^\x00]", 0));
+ assert!(!check(r"[\x00a]", 0));
+ }
+}
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index 8c69ef54..7ebe035d 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -8,7 +8,7 @@ use {
};
use crate::{
- ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
+ ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes,
strip::strip_from_match,
};
@@ -35,6 +35,7 @@ pub(crate) struct Config {
pub(crate) dfa_size_limit: usize,
pub(crate) nest_limit: u32,
pub(crate) line_terminator: Option<LineTerminator>,
+ pub(crate) ban: Option<u8>,
pub(crate) crlf: bool,
pub(crate) word: bool,
pub(crate) fixed_strings: bool,
@@ -58,6 +59,7 @@ impl Default for Config {
dfa_size_limit: 1000 * (1 << 20),
nest_limit: 250,
line_terminator: None,
+ ban: None,
crlf: false,
word: false,
fixed_strings: false,
@@ -205,6 +207,9 @@ impl ConfiguredHIR {
.build()
.translate(&pattern, &ast)
.map_err(Error::generic)?;
+ if let Some(byte) = config.ban {
+ ban::check(&hir, byte)?;
+ }
// We don't need to do this for the fixed-strings case above
// because is_fixed_strings will return false if any pattern
// contains a line terminator. Therefore, we don't need to strip
diff --git a/crates/regex/src/error.rs b/crates/regex/src/error.rs
index 88a8adbe..a75f1f5d 100644
--- a/crates/regex/src/error.rs
+++ b/crates/regex/src/error.rs
@@ -60,6 +60,8 @@ pub enum ErrorKind {
///
/// The invalid byte is included in this error.
InvalidLineTerminator(u8),
+ /// Occurs when a banned byte was found in a pattern.
+ Banned(u8),
}
impl std::error::Error for Error {}
@@ -76,8 +78,15 @@ impl std::fmt::Display for Error {
ErrorKind::InvalidLineTerminator(byte) => {
write!(
f,
- "line terminators must be ASCII, but {} is not",
- [byte].as_bstr()
+ "line terminators must be ASCII, but {byte:?} is not",
+ byte = [byte].as_bstr(),
+ )
+ }
+ ErrorKind::Banned(byte) => {
+ write!(
+ f,
+ "pattern contains {byte:?} but it is impossible to match",
+ byte = [byte].as_bstr(),
)
}
}
diff --git a/crates/regex/src/lib.rs b/crates/regex/src/lib.rs
index 4693bff1..93e477b6 100644
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@@ -9,6 +9,7 @@ pub use crate::{
};
mod ast;
+mod ban;
mod config;
mod error;
mod literal;
diff --git a/crates/regex/src/matcher.rs b/crates/regex/src/matcher.rs
index f3f673ff..003d415c 100644
--- a/crates/regex/src/matcher.rs
+++ b/crates/regex/src/matcher.rs
@@ -286,6 +286,22 @@ impl RegexMatcherBuilder {
self
}
+ /// Ban a byte from occurring in a regular expression pattern.
+ ///
+ /// If this byte is found in the regex pattern, then an error will be
+ /// returned at construction time.
+ ///
+ /// This is useful when binary detection is enabled. Callers will likely
+ /// want to ban the same byte that is used to detect binary data, i.e.,
+ /// the NUL byte. The reason for this is that when binary detection is
+ /// enabled, it's impossible to match a NUL byte because binary detection
+ /// will either quit when one is found, or will convert NUL bytes to line
+ /// terminators to avoid exorbitant heap usage.
+ pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
+ self.config.ban = byte;
+ self
+ }
+
/// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
/// regex patterns.
///
diff --git a/crates/searcher/src/line_buffer.rs b/crates/searcher/src/line_buffer.rs
index badcc438..822bdee4 100644
--- a/crates/searcher/src/line_buffer.rs
+++ b/crates/searcher/src/line_buffer.rs
@@ -47,7 +47,7 @@ pub(crate) fn alloc_error(limit: usize) -> io::Error {
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum BinaryDetection {
/// No binary detection is performed. Data reported by the line buffer may
/// contain arbitrary bytes.
diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs
index ff1bea59..d4bd1563 100644
--- a/crates/searcher/src/searcher/mod.rs
+++ b/crates/searcher/src/searcher/mod.rs
@@ -52,7 +52,7 @@ type Range = Match;
/// heap, then binary detection is only guaranteed to be applied to the
/// parts corresponding to a match. When `Quit` is enabled, then the first
/// few KB of the data are searched for binary data.
-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct BinaryDetection(line_buffer::BinaryDetection);
impl BinaryDetection {
diff --git a/tests/regression.rs b/tests/regression.rs
index 04138fdf..54490b98 100644
--- a/tests/regression.rs
+++ b/tests/regression.rs
@@ -977,6 +977,20 @@ rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
assert!(!cmd.stdout().is_empty());
});
+// See: https://github.com/BurntSushi/ripgrep/issues/1838
+rgtest!(r1838_nul_error_with_binary_detection, |dir: Dir, _: TestCommand| {
+ // We don't support this error reporting with PCRE2 since we can't parse
+ // the pattern (easily) to give a good error message.
+ if dir.is_pcre2() {
+ return;
+ }
+ dir.create("test", "foo\n");
+
+ dir.command().args(&[r"foo\x00?"]).assert_err();
+ eqnice!("test:foo\n", dir.command().args(&["-a", r"foo\x00?"]).stdout());
+});
+
+// See: https://github.com/BurntSushi/ripgrep/issues/1866
rgtest!(r1866, |dir: Dir, mut cmd: TestCommand| {
dir.create("test", "foobar\nfoobar\nfoo quux");
cmd.args(&[