summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCharles Blake <cb@cblake.net>2018-07-13 09:54:51 -0400
committerAndrew Gallant <jamslam@gmail.com>2018-07-21 17:25:12 -0400
commit231456c409ff38c75c39d01b781b569965ddf808 (patch)
tree592b89acd3172b0c23acdfe6113bcd70ea11aa3a
parent1d09d4d31ba3ac2eb09edf31e8ec46b2b5cec388 (diff)
ripgrep: add --pre flag
The preprocessor flag accepts a command program and executes this program for every input file that is searched. Instead of searching the file directly, ripgrep will instead search the stdout contents of the program. Closes #978, Closes #981
-rw-r--r--CHANGELOG.md2
-rw-r--r--README.md3
-rw-r--r--complete/_rg3
-rw-r--r--src/app.rs55
-rw-r--r--src/args.rs16
-rw-r--r--src/main.rs1
-rw-r--r--src/preprocessor.rs92
-rw-r--r--src/worker.rs24
-rw-r--r--tests/tests.rs20
9 files changed, 211 insertions, 5 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2130854..99303c38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,8 @@ Feature enhancements:
* [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
as a synonym for backwards compatibility.
+* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978):
+ Add a `--pre` option to filter inputs with an arbitrary program.
* [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
Improve zsh completion.
diff --git a/README.md b/README.md
index 57ad0d7c..186e2fe7 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
specifically specified with the `-E/--encoding` flag.)
* ripgrep supports searching files compressed in a common format (gzip, xz,
lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
+* ripgrep supports arbitrary input preprocessing filters, which could perform
+ PDF text extraction, decompression of less common formats, decryption,
+ automatic encoding detection and so on.
In other words, use ripgrep if you like speed, filtering by default, fewer
bugs, and Unicode support.
diff --git a/complete/_rg b/complete/_rg
index 586c90b4..b943484d 100644
--- a/complete/_rg
+++ b/complete/_rg
@@ -170,7 +170,8 @@ _rg() {
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
- + '(zip)' # Compressed-file options
+ + '(input-decoding)' # Input decoding options
+ '--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
{-z,--search-zip}'[search in compressed files]'
$no"--no-search-zip[don't search in compressed files]"
diff --git a/src/app.rs b/src/app.rs
index a0fdf946..67b7295e 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
flag_only_matching(&mut args);
flag_path_separator(&mut args);
flag_passthru(&mut args);
+ flag_pre(&mut args);
flag_pretty(&mut args);
flag_quiet(&mut args);
flag_regex_size_limit(&mut args);
@@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip.
");
let arg = RGArg::switch("search-zip").short("z")
.help(SHORT).long_help(LONG)
- .overrides("no-search-zip");
+ .overrides("no-search-zip")
+ .overrides("pre");
args.push(arg);
let arg = RGArg::switch("no-search-zip")
.hidden()
- .overrides("search-zip");
+ .overrides("search-zip")
+ .overrides("pre");
+ args.push(arg);
+}
+
+fn flag_pre(args: &mut Vec<RGArg>) {
+ const SHORT: &str = "search outputs of COMMAND FILE for each FILE";
+ const LONG: &str = long!("\
+For each input FILE, search the standard output of COMMAND FILE rather than the
+contents of FILE. This option expects the COMMAND program to either be an
+absolute path or to be available in your PATH. An empty string COMMAND
+deactivates this feature.
+
+A preprocessor is not run when ripgrep is searching stdin.
+
+When searching over sets of files that may require one of several decoders
+as preprocessors, COMMAND should be a wrapper program or script which first
+classifies FILE based on magic numbers/content or based on the FILE name and
+then dispatches to an appropriate preprocessor. Each COMMAND also has its
+standard input connected to FILE for convenience.
+
+For example, a shell script for COMMAND might look like:
+
+ case \"$1\" in
+ *.pdf)
+ exec pdftotext \"$1\" -
+ ;;
+ *)
+ case $(file \"$1\") in
+ *Zstandard*)
+ exec pzstd -cdq
+ ;;
+ *)
+ exec cat
+ ;;
+ esac
+ ;;
+ esac
+
+The above script uses `pdftotext` to convert a PDF file to plain text. For
+all other files, the script uses the `file` utility to sniff the type of the
+file based on its contents. If it is a compressed file in the Zstandard format,
+then `pzstd` is used to decompress the contents to stdout.
+
+This overrides the -z/--search-zip flag.
+");
+ let arg = RGArg::flag("pre", "COMMAND")
+ .help(SHORT).long_help(LONG)
+ .overrides("search-zip")
+ .overrides("no-search-zip");
args.push(arg);
}
diff --git a/src/args.rs b/src/args.rs
index aca9bcd5..302e330e 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -80,6 +80,7 @@ pub struct Args {
types: Types,
with_filename: bool,
search_zip_files: bool,
+ preprocessor: Option<PathBuf>,
stats: bool
}
@@ -288,6 +289,7 @@ impl Args {
.quiet(self.quiet)
.text(self.text)
.search_zip_files(self.search_zip_files)
+ .preprocessor(self.preprocessor.clone())
.build()
}
@@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> {
types: self.types()?,
with_filename: with_filename,
search_zip_files: self.is_present("search-zip"),
+ preprocessor: self.preprocessor(),
stats: self.stats()
};
if args.mmap {
@@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> {
}
}
+ /// Returns the preprocessor command, if `--pre` was given a non-empty value.
+ fn preprocessor(&self) -> Option<PathBuf> {
+ if let Some(path) = self.value_of_os("pre") {
+ if path.is_empty() {
+ None
+ } else {
+ Some(Path::new(path).to_path_buf())
+ }
+ } else {
+ None
+ }
+ }
+
/// Returns the unescaped path separator in UTF-8 bytes.
fn path_separator(&self) -> Result<Option<u8>> {
match self.value_of_lossy("path-separator") {
diff --git a/src/main.rs b/src/main.rs
index 6f010135..ab0e4118 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -43,6 +43,7 @@ mod args;
mod config;
mod decoder;
mod decompressor;
+mod preprocessor;
mod logger;
mod pathutil;
mod printer;
diff --git a/src/preprocessor.rs b/src/preprocessor.rs
new file mode 100644
index 00000000..bb464f86
--- /dev/null
+++ b/src/preprocessor.rs
@@ -0,0 +1,92 @@
+use std::fs::File;
+use std::io::{self, Read};
+use std::path::{Path, PathBuf};
+use std::process::{self, Stdio};
+
+use Result;
+
+/// PreprocessorReader provides an `io::Read` impl over the child's stdout.
+#[derive(Debug)]
+pub struct PreprocessorReader {
+ cmd: PathBuf,
+ path: PathBuf,
+ child: process::Child,
+ done: bool,
+}
+
+impl PreprocessorReader {
+ /// Returns a handle to the stdout of the spawned preprocessor process for
+ /// `path`, which can be directly searched in the worker. When the returned
+ /// value is exhausted, the underlying process is reaped. If the underlying
+ /// process fails, then its stderr is read and converted into a normal
+ /// io::Error.
+ ///
+ /// If there is any error in spawning the preprocessor command, then
+ /// return the corresponding error.
+ pub fn from_cmd_path(
+ cmd: PathBuf,
+ path: &Path,
+ ) -> Result<PreprocessorReader> {
+ let child = process::Command::new(&cmd)
+ .arg(path)
+ .stdin(Stdio::from(File::open(path)?))
+ .stdout(Stdio::piped())
+ .stderr(Stdio::piped())
+ .spawn()
+ .map_err(|err| {
+ format!(
+ "error running preprocessor command '{}': {}",
+ cmd.display(),
+ err,
+ )
+ })?;
+ Ok(PreprocessorReader {
+ cmd: cmd,
+ path: path.to_path_buf(),
+ child: child,
+ done: false,
+ })
+ }
+
+ fn read_error(&mut self) -> io::Result<io::Error> {
+ let mut errbytes = vec![];
+ self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
+ let errstr = String::from_utf8_lossy(&errbytes);
+ let errstr = errstr.trim();
+
+ Ok(if errstr.is_empty() {
+ let msg = format!(
+ "preprocessor command failed: '{} {}'",
+ self.cmd.display(),
+ self.path.display(),
+ );
+ io::Error::new(io::ErrorKind::Other, msg)
+ } else {
+ let msg = format!(
+ "preprocessor command failed: '{} {}': {}",
+ self.cmd.display(),
+ self.path.display(),
+ errstr,
+ );
+ io::Error::new(io::ErrorKind::Other, msg)
+ })
+ }
+}
+
+impl io::Read for PreprocessorReader {
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+ if self.done {
+ return Ok(0);
+ }
+ let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
+ if nread == 0 {
+ self.done = true;
+ // Reap the child now that we're done reading.
+ // If the command failed, report stderr as an error.
+ if !self.child.wait()?.success() {
+ return Err(self.read_error()?);
+ }
+ }
+ Ok(nread)
+ }
+}
diff --git a/src/worker.rs b/src/worker.rs
index a8327cda..5b7ef0a4 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -1,6 +1,6 @@
use std::fs::File;
use std::io;
-use std::path::Path;
+use std::path::{Path, PathBuf};
use encoding_rs::Encoding;
use grep::Grep;
@@ -10,6 +10,7 @@ use termcolor::WriteColor;
use decoder::DecodeReader;
use decompressor::{self, DecompressionReader};
+use preprocessor::PreprocessorReader;
use pathutil::strip_prefix;
use printer::Printer;
use search_buffer::BufferSearcher;
@@ -45,6 +46,7 @@ struct Options {
no_messages: bool,
quiet: bool,
text: bool,
+ preprocessor: Option<PathBuf>,
search_zip_files: bool
}
@@ -68,6 +70,7 @@ impl Default for Options {
quiet: false,
text: false,
search_zip_files: false,
+ preprocessor: None,
}
}
}
@@ -222,6 +225,12 @@ impl WorkerBuilder {
self.opts.search_zip_files = yes;
self
}
+
+ /// If set, search the output of the preprocessor command run on each file.
+ pub fn preprocessor(mut self, command: Option<PathBuf>) -> Self {
+ self.opts.preprocessor = command;
+ self
+ }
}
/// Worker is responsible for executing searches on file paths, while choosing
@@ -250,7 +259,18 @@ impl Worker {
}
Work::DirEntry(dent) => {
let mut path = dent.path();
- if self.opts.search_zip_files
+ if self.opts.preprocessor.is_some() {
+ let cmd = self.opts.preprocessor.clone().unwrap();
+ match PreprocessorReader::from_cmd_path(cmd, path) {
+ Ok(reader) => self.search(printer, path, reader),
+ Err(err) => {
+ if !self.opts.no_messages {
+ eprintln!("{}", err);
+ }
+ return 0;
+ }
+ }
+ } else if self.opts.search_zip_files
&& decompressor::is_compressed(path)
{
match DecompressionReader::from_path(path) {
diff --git a/tests/tests.rs b/tests/tests.rs
index 9920c118..6a5bf73f 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -1733,6 +1733,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
});
#[test]
+fn preprocessing() {
+ if !cmd_exists("xzcat") {
+ return;
+ }
+ let xz_file = include_bytes!("./data/sherlock.xz");
+
+ let wd = WorkDir::new("feature_preprocessing");
+ wd.create_bytes("sherlock.xz", xz_file);
+
+ let mut cmd = wd.command();
+ cmd.arg("--pre").arg("xzcat").arg("Sherlock").arg("sherlock.xz");
+ let lines: String = wd.stdout(&mut cmd);
+ let expected = "\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+be, to a very large extent, the result of luck. Sherlock Holmes
+";
+ assert_eq!(lines, expected);
+}
+
+#[test]
fn compressed_gzip() {
if !cmd_exists("gzip") {
return;