ripgrep: add --pre flag

The preprocessor flag accepts a command and executes it for every input file that is searched. Rather than searching the file directly, ripgrep searches the contents of the command's stdout. Closes #978, Closes #981
author: Charles Blake <cb@cblake.net> 2018-07-13 09:54:51 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2018-07-21 17:25:12 -0400
commit: 231456c409ff38c75c39d01b781b569965ddf808 (patch)
tree: 592b89acd3172b0c23acdfe6113bcd70ea11aa3a
parent: 1d09d4d31ba3ac2eb09edf31e8ec46b2b5cec388 (diff)
9 files changed, 211 insertions, 5 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2130854..99303c38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,8 @@ Feature enhancements:
 * [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
   Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
   as a synonym for backwards compatibility.
+* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978):
+  Add a `--pre` option to filter inputs with an arbitrary program.
 * [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
   Improve zsh completion.
 
diff --git a/README.md b/README.md
index 57ad0d7c..186e2fe7 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
   specifically specified with the `-E/--encoding` flag.)
 * ripgrep supports searching files compressed in a common format (gzip, xz,
   lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
+* ripgrep supports arbitrary input preprocessing filters (via the `--pre`
+  flag), which can be used for PDF text extraction, less commonly supported
+  decompression formats, decryption, automatic encoding detection and so on.
 
 In other words, use ripgrep if you like speed, filtering by default, fewer
 bugs, and Unicode support.
diff --git a/complete/_rg b/complete/_rg
index 586c90b4..b943484d 100644
--- a/complete/_rg
+++ b/complete/_rg
@@ -170,7 +170,8 @@ _rg() {
     {-w,--word-regexp}'[only show matches surrounded by word boundaries]'
     {-x,--line-regexp}'[only show matches surrounded by line boundaries]'
 
-    + '(zip)' # Compressed-file options
+    + '(input-decoding)' # Input decoding options
+    '--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
     {-z,--search-zip}'[search in compressed files]'
     $no"--no-search-zip[don't search in compressed files]"
 
diff --git a/src/app.rs b/src/app.rs
index a0fdf946..67b7295e 100644
--- a/src/app.rs
+++ b/src/app.rs
@@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
     flag_only_matching(&mut args);
     flag_path_separator(&mut args);
     flag_passthru(&mut args);
+    flag_pre(&mut args);
     flag_pretty(&mut args);
     flag_quiet(&mut args);
     flag_regex_size_limit(&mut args);
@@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip.
 ");
     let arg = RGArg::switch("search-zip").short("z")
         .help(SHORT).long_help(LONG)
-        .overrides("no-search-zip");
+        .overrides("no-search-zip")
+        .overrides("pre");
     args.push(arg);
 
     let arg = RGArg::switch("no-search-zip")
         .hidden()
-        .overrides("search-zip");
+        .overrides("search-zip")
+        .overrides("pre");
+    args.push(arg);
+}
+
+fn flag_pre(args: &mut Vec<RGArg>) {
+    const SHORT: &str = "search outputs of COMMAND FILE for each FILE";
+    const LONG: &str = long!("\
+For each input FILE, search the standard output of COMMAND FILE rather than the
+contents of FILE. This option expects the COMMAND program to either be an
+absolute path or to be available in your PATH. An empty string COMMAND
+deactivates this feature.
+
+A preprocessor is not run when ripgrep is searching stdin.
+
+When searching over sets of files that may require one of several decoders
+as preprocessors, COMMAND should be a wrapper program or script which first
+classifies FILE based on magic numbers/content or based on the FILE name and
+then dispatches to an appropriate preprocessor. Each COMMAND also has its
+standard input connected to FILE for convenience.
+
+For example, a shell script for COMMAND might look like:
+
+    case \"$1\" in
+    *.pdf)
+        exec pdftotext \"$1\" -
+        ;;
+    *)
+        case $(file \"$1\") in
+        *Zstandard*)
+            exec pzstd -cdq
+            ;;
+        *)
+            exec cat
+            ;;
+        esac
+        ;;
+    esac
+
+The above script uses `pdftotext` to convert a PDF file to plain text. For
+all other files, the script uses the `file` utility to sniff the type of the
+file based on its contents. If it is a compressed file in the Zstandard format,
+then `pzstd` is used to decompress the contents to stdout.
+
+This overrides the -z/--search-zip flag.
+");
+    let arg = RGArg::flag("pre", "COMMAND")
+        .help(SHORT).long_help(LONG)
+        .overrides("search-zip")
+        .overrides("no-search-zip");
     args.push(arg);
 }
 
diff --git a/src/args.rs b/src/args.rs
index aca9bcd5..302e330e 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -80,6 +80,7 @@ pub struct Args {
     types: Types,
     with_filename: bool,
     search_zip_files: bool,
+    preprocessor: Option<PathBuf>,
     stats: bool
 }
 
@@ -288,6 +289,7 @@ impl Args {
             .quiet(self.quiet)
             .text(self.text)
             .search_zip_files(self.search_zip_files)
+            .preprocessor(self.preprocessor.clone())
             .build()
     }
 
@@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> {
             types: self.types()?,
             with_filename: with_filename,
             search_zip_files: self.is_present("search-zip"),
+            preprocessor: self.preprocessor(),
             stats: self.stats()
         };
         if args.mmap {
@@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> {
         }
     }
 
+    /// Returns the --pre command, or `None` if the flag is absent or empty.
+    fn preprocessor(&self) -> Option<PathBuf> {
+        if let Some(path) = self.value_of_os("pre") {
+            if path.is_empty() {
+                None
+            } else {
+                Some(Path::new(path).to_path_buf())
+            }
+        } else {
+            None
+        }
+    }
+
     /// Returns the unescaped path separator in UTF-8 bytes.
     fn path_separator(&self) -> Result<Option<u8>> {
         match self.value_of_lossy("path-separator") {
diff --git a/src/main.rs b/src/main.rs
index 6f010135..ab0e4118 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -43,6 +43,7 @@ mod args;
 mod config;
 mod decoder;
 mod decompressor;
+mod preprocessor;
 mod logger;
 mod pathutil;
 mod printer;
diff --git a/src/preprocessor.rs b/src/preprocessor.rs
new file mode 100644
index 00000000..bb464f86
--- /dev/null
+++ b/src/preprocessor.rs
@@ -0,0 +1,92 @@
+use std::fs::File;
+use std::io::{self, Read};
+use std::path::{Path, PathBuf};
+use std::process::{self, Stdio};
+
+use Result;
+
+/// PreprocessorReader provides an `io::Read` impl over a child's stdout.
+#[derive(Debug)]
+pub struct PreprocessorReader {
+    cmd: PathBuf,
+    path: PathBuf,
+    child: process::Child,
+    done: bool,
+}
+
+impl PreprocessorReader {
+    /// Spawns `cmd` with `path` as its only argument and returns a reader
+    /// over the child's stdout. The child's stdin is connected to the file
+    /// at `path`. Once the reader reaches EOF the child is reaped, and a
+    /// non-success exit status is reported as an `io::Error` that includes
+    /// the child's stderr (if it wrote any).
+    ///
+    /// An error is returned if `path` cannot be opened or if the
+    /// preprocessor command cannot be spawned.
+    pub fn from_cmd_path(
+        cmd: PathBuf,
+        path: &Path,
+    ) -> Result<PreprocessorReader> {
+        let child = process::Command::new(&cmd)
+            .arg(path)
+            .stdin(Stdio::from(File::open(path)?))
+            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
+            .spawn()
+            .map_err(|err| {
+                format!(
+                    "error running preprocessor command '{}': {}",
+                    cmd.display(),
+                    err,
+                )
+            })?;
+        Ok(PreprocessorReader {
+            cmd: cmd,
+            path: path.to_path_buf(),
+            child: child,
+            done: false,
+        })
+    }
+
+    fn read_error(&mut self) -> io::Result<io::Error> {
+        let mut errbytes = vec![];
+        self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
+        let errstr = String::from_utf8_lossy(&errbytes);
+        let errstr = errstr.trim();
+
+        Ok(if errstr.is_empty() {
+            let msg = format!(
+                "preprocessor command failed: '{} {}'",
+                self.cmd.display(),
+                self.path.display(),
+            );
+            io::Error::new(io::ErrorKind::Other, msg)
+        } else {
+            let msg = format!(
+                "preprocessor command failed: '{} {}': {}",
+                self.cmd.display(),
+                self.path.display(),
+                errstr,
+            );
+            io::Error::new(io::ErrorKind::Other, msg)
+        })
+    }
+}
+
+impl io::Read for PreprocessorReader {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        if self.done {
+            return Ok(0);
+        }
+        let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
+        if nread == 0 {
+            self.done = true;
+            // EOF: reap the child and turn a failing exit into an error.
+            // NOTE(review): stderr is read only here; a full pipe may stall.
+            if !self.child.wait()?.success() {
+                return Err(self.read_error()?);
+            }
+        }
+        Ok(nread)
+    }
+}
diff --git a/src/worker.rs b/src/worker.rs
index a8327cda..5b7ef0a4 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -1,6 +1,6 @@
 use std::fs::File;
 use std::io;
-use std::path::Path;
+use std::path::{Path, PathBuf};
 
 use encoding_rs::Encoding;
 use grep::Grep;
@@ -10,6 +10,7 @@ use termcolor::WriteColor;
 
 use decoder::DecodeReader;
 use decompressor::{self, DecompressionReader};
+use preprocessor::PreprocessorReader;
 use pathutil::strip_prefix;
 use printer::Printer;
 use search_buffer::BufferSearcher;
@@ -45,6 +46,7 @@ struct Options {
     no_messages: bool,
     quiet: bool,
     text: bool,
+    preprocessor: Option<PathBuf>,
     search_zip_files: bool
 }
 
@@ -68,6 +70,7 @@ impl Default for Options {
             quiet: false,
             text: false,
             search_zip_files: false,
+            preprocessor: None,
         }
     }
 }
@@ -222,6 +225,12 @@ impl WorkerBuilder {
         self.opts.search_zip_files = yes;
         self
     }
+
+    /// Set the preprocessor command; `None` disables preprocessing.
+    pub fn preprocessor(mut self, command: Option<PathBuf>) -> Self {
+        self.opts.preprocessor = command;
+        self
+    }
 }
 
 /// Worker is responsible for executing searches on file paths, while choosing
@@ -250,7 +259,18 @@ impl Worker {
             }
             Work::DirEntry(dent) => {
                 let mut path = dent.path();
-                if self.opts.search_zip_files
+                if self.opts.preprocessor.is_some() {
+                    let cmd = self.opts.preprocessor.clone().unwrap();
+                    match PreprocessorReader::from_cmd_path(cmd, path) {
+                        Ok(reader) => self.search(printer, path, reader),
+                        Err(err) => {
+                            if !self.opts.no_messages {
+                                eprintln!("{}", err);
+                            }
+                            return 0;
+                        }
+                    }
+                } else if self.opts.search_zip_files
                      && decompressor::is_compressed(path)
                 {
                     match DecompressionReader::from_path(path) {
diff --git a/tests/tests.rs b/tests/tests.rs
index 9920c118..6a5bf73f 100644
--- a/tests/tests.rs
+++ b/tests/tests.rs
@@ -1733,6 +1733,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
 });
 
 #[test]
+fn preprocessing() {
+    if !cmd_exists("xzcat") {
+        return;
+    }
+    let xz_file = include_bytes!("./data/sherlock.xz");
+
+    let wd = WorkDir::new("feature_preprocessing");
+    wd.create_bytes("sherlock.xz", xz_file);
+
+    let mut cmd = wd.command();
+    cmd.arg("--pre").arg("xzcat").arg("Sherlock").arg("sherlock.xz");
+    let lines: String = wd.stdout(&mut cmd);
+    let expected = "\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+be, to a very large extent, the result of luck. Sherlock Holmes
+";
+    assert_eq!(lines, expected);
+}
+
+#[test]
 fn compressed_gzip() {
     if !cmd_exists("gzip") {
         return;
author	Charles Blake <cb@cblake.net>	2018-07-13 09:54:51 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2018-07-21 17:25:12 -0400
commit	231456c409ff38c75c39d01b781b569965ddf808 (patch)
tree	592b89acd3172b0c23acdfe6113bcd70ea11aa3a
parent	1d09d4d31ba3ac2eb09edf31e8ec46b2b5cec388 (diff)