From 231456c409ff38c75c39d01b781b569965ddf808 Mon Sep 17 00:00:00 2001 From: Charles Blake Date: Fri, 13 Jul 2018 09:54:51 -0400 Subject: ripgrep: add --pre flag The preprocessor flag accepts a command program and executes this program for every input file that is searched. Instead of searching the file directly, ripgrep will search the stdout contents of the program. Closes #978, Closes #981 --- CHANGELOG.md | 2 ++ README.md | 3 ++ complete/_rg | 3 +- src/app.rs | 55 ++++++++++++++++++++++++++++++-- src/args.rs | 16 ++++++++++ src/main.rs | 1 + src/preprocessor.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/worker.rs | 24 ++++++++++++-- tests/tests.rs | 20 ++++++++++++ 9 files changed, 211 insertions(+), 5 deletions(-) create mode 100644 src/preprocessor.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index b2130854..99303c38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,8 @@ Feature enhancements: * [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967): Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth` as a synonym for backwards compatibility. +* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978): + Add a `--pre` option to filter inputs with an arbitrary program. * [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d): Improve zsh completion. diff --git a/README.md b/README.md index 57ad0d7c..186e2fe7 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep. specifically specified with the `-E/--encoding` flag.) * ripgrep supports searching files compressed in a common format (gzip, xz, lzma, bzip2 or lz4) with the `-z/--search-zip` flag. +* ripgrep supports arbitrary input preprocessing filters which could be PDF + text extraction, less commonly supported decompression formats, decryption, automatic encoding + detection and so on. 
In other words, use ripgrep if you like speed, filtering by default, fewer bugs, and Unicode support. diff --git a/complete/_rg b/complete/_rg index 586c90b4..b943484d 100644 --- a/complete/_rg +++ b/complete/_rg @@ -170,7 +170,8 @@ _rg() { {-w,--word-regexp}'[only show matches surrounded by word boundaries]' {-x,--line-regexp}'[only show matches surrounded by line boundaries]' - + '(zip)' # Compressed-file options + + '(input-decoding)' # Input decoding options + '--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e' {-z,--search-zip}'[search in compressed files]' $no"--no-search-zip[don't search in compressed files]" diff --git a/src/app.rs b/src/app.rs index a0fdf946..67b7295e 100644 --- a/src/app.rs +++ b/src/app.rs @@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec { flag_only_matching(&mut args); flag_path_separator(&mut args); flag_passthru(&mut args); + flag_pre(&mut args); flag_pretty(&mut args); flag_quiet(&mut args); flag_regex_size_limit(&mut args); @@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip. "); let arg = RGArg::switch("search-zip").short("z") .help(SHORT).long_help(LONG) - .overrides("no-search-zip"); + .overrides("no-search-zip") + .overrides("pre"); args.push(arg); let arg = RGArg::switch("no-search-zip") .hidden() - .overrides("search-zip"); + .overrides("search-zip") + .overrides("pre"); + args.push(arg); +} + +fn flag_pre(args: &mut Vec) { + const SHORT: &str = "search outputs of COMMAND FILE for each FILE"; + const LONG: &str = long!("\ +For each input FILE, search the standard output of COMMAND FILE rather than the +contents of FILE. This option expects the COMMAND program to either be an +absolute path or to be available in your PATH. An empty string COMMAND +deactivates this feature. + +A preprocessor is not run when ripgrep is searching stdin. 
+ +When searching over sets of files that may require one of several decoders +as preprocessors, COMMAND should be a wrapper program or script which first +classifies FILE based on magic numbers/content or based on the FILE name and +then dispatches to an appropriate preprocessor. Each COMMAND also has its +standard input connected to FILE for convenience. + +For example, a shell script for COMMAND might look like: + + case \"$1\" in + *.pdf) + exec pdftotext \"$1\" - + ;; + *) + case $(file \"$1\") in + *Zstandard*) + exec pzstd -cdq + ;; + *) + exec cat + ;; + esac + ;; + esac + +The above script uses `pdftotext` to convert a PDF file to plain text. For +all other files, the script uses the `file` utility to sniff the type of the +file based on its contents. If it is a compressed file in the Zstandard format, +then `pzstd` is used to decompress the contents to stdout. + +This overrides the -z/--search-zip flag. +"); + let arg = RGArg::flag("pre", "COMMAND") + .help(SHORT).long_help(LONG) + .overrides("search-zip") + .overrides("no-search-zip"); args.push(arg); } diff --git a/src/args.rs b/src/args.rs index aca9bcd5..302e330e 100644 --- a/src/args.rs +++ b/src/args.rs @@ -80,6 +80,7 @@ pub struct Args { types: Types, with_filename: bool, search_zip_files: bool, + preprocessor: Option, stats: bool } @@ -288,6 +289,7 @@ impl Args { .quiet(self.quiet) .text(self.text) .search_zip_files(self.search_zip_files) + .preprocessor(self.preprocessor.clone()) .build() } @@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> { types: self.types()?, with_filename: with_filename, search_zip_files: self.is_present("search-zip"), + preprocessor: self.preprocessor(), stats: self.stats() }; if args.mmap { @@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> { } } + /// Returns the preprocessor command + fn preprocessor(&self) -> Option { + if let Some(path) = self.value_of_os("pre") { + if path.is_empty() { + None + } else { + Some(Path::new(path).to_path_buf()) + } + } else { + None + } + } + /// 
Returns the unescaped path separator in UTF-8 bytes. fn path_separator(&self) -> Result> { match self.value_of_lossy("path-separator") { diff --git a/src/main.rs b/src/main.rs index 6f010135..ab0e4118 100644 --- a/src/main.rs +++ b/src/main.rs @@ -43,6 +43,7 @@ mod args; mod config; mod decoder; mod decompressor; +mod preprocessor; mod logger; mod pathutil; mod printer; diff --git a/src/preprocessor.rs b/src/preprocessor.rs new file mode 100644 index 00000000..bb464f86 --- /dev/null +++ b/src/preprocessor.rs @@ -0,0 +1,92 @@ +use std::fs::File; +use std::io::{self, Read}; +use std::path::{Path, PathBuf}; +use std::process::{self, Stdio}; + +use Result; + +/// PreprocessorReader provides an `io::Read` impl to read the child process's output. +#[derive(Debug)] +pub struct PreprocessorReader { + cmd: PathBuf, + path: PathBuf, + child: process::Child, + done: bool, +} + +impl PreprocessorReader { + /// Returns a handle to the stdout of the spawned preprocessor process for + /// `path`, which can be directly searched in the worker. When the returned + /// value is exhausted, the underlying process is reaped. If the underlying + /// process fails, then its stderr is read and converted into a normal + /// io::Error. + /// + /// If there is any error in spawning the preprocessor command, then + /// return the corresponding error. 
+ pub fn from_cmd_path( + cmd: PathBuf, + path: &Path, + ) -> Result { + let child = process::Command::new(&cmd) + .arg(path) + .stdin(Stdio::from(File::open(path)?)) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|err| { + format!( + "error running preprocessor command '{}': {}", + cmd.display(), + err, + ) + })?; + Ok(PreprocessorReader { + cmd: cmd, + path: path.to_path_buf(), + child: child, + done: false, + }) + } + + fn read_error(&mut self) -> io::Result { + let mut errbytes = vec![]; + self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?; + let errstr = String::from_utf8_lossy(&errbytes); + let errstr = errstr.trim(); + + Ok(if errstr.is_empty() { + let msg = format!( + "preprocessor command failed: '{} {}'", + self.cmd.display(), + self.path.display(), + ); + io::Error::new(io::ErrorKind::Other, msg) + } else { + let msg = format!( + "preprocessor command failed: '{} {}': {}", + self.cmd.display(), + self.path.display(), + errstr, + ); + io::Error::new(io::ErrorKind::Other, msg) + }) + } +} + +impl io::Read for PreprocessorReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.done { + return Ok(0); + } + let nread = self.child.stdout.as_mut().unwrap().read(buf)?; + if nread == 0 { + self.done = true; + // Reap the child now that we're done reading. + // If the command failed, report stderr as an error. 
+ if !self.child.wait()?.success() { + return Err(self.read_error()?); + } + } + Ok(nread) + } +} diff --git a/src/worker.rs b/src/worker.rs index a8327cda..5b7ef0a4 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -1,6 +1,6 @@ use std::fs::File; use std::io; -use std::path::Path; +use std::path::{Path, PathBuf}; use encoding_rs::Encoding; use grep::Grep; @@ -10,6 +10,7 @@ use termcolor::WriteColor; use decoder::DecodeReader; use decompressor::{self, DecompressionReader}; +use preprocessor::PreprocessorReader; use pathutil::strip_prefix; use printer::Printer; use search_buffer::BufferSearcher; @@ -45,6 +46,7 @@ struct Options { no_messages: bool, quiet: bool, text: bool, + preprocessor: Option, search_zip_files: bool } @@ -68,6 +70,7 @@ impl Default for Options { quiet: false, text: false, search_zip_files: false, + preprocessor: None, } } } @@ -222,6 +225,12 @@ impl WorkerBuilder { self.opts.search_zip_files = yes; self } + + /// If non-empty, search output of preprocessor run on each file + pub fn preprocessor(mut self, command: Option) -> Self { + self.opts.preprocessor = command; + self + } } /// Worker is responsible for executing searches on file paths, while choosing @@ -250,7 +259,18 @@ impl Worker { } Work::DirEntry(dent) => { let mut path = dent.path(); - if self.opts.search_zip_files + if self.opts.preprocessor.is_some() { + let cmd = self.opts.preprocessor.clone().unwrap(); + match PreprocessorReader::from_cmd_path(cmd, path) { + Ok(reader) => self.search(printer, path, reader), + Err(err) => { + if !self.opts.no_messages { + eprintln!("{}", err); + } + return 0; + } + } + } else if self.opts.search_zip_files && decompressor::is_compressed(path) { match DecompressionReader::from_path(path) { diff --git a/tests/tests.rs b/tests/tests.rs index 9920c118..6a5bf73f 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1732,6 +1732,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".", assert_eq!(lines, "sherlock\x002\n"); }); +#[test] +fn 
preprocessing() { + if !cmd_exists("xzcat") { + return; + } + let xz_file = include_bytes!("./data/sherlock.xz"); + + let wd = WorkDir::new("feature_preprocessing"); + wd.create_bytes("sherlock.xz", xz_file); + + let mut cmd = wd.command(); + cmd.arg("--pre").arg("xzcat").arg("Sherlock").arg("sherlock.xz"); + let lines: String = wd.stdout(&mut cmd); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +be, to a very large extent, the result of luck. Sherlock Holmes +"; + assert_eq!(lines, expected); +} + #[test] fn compressed_gzip() { if !cmd_exists("gzip") { -- cgit v1.2.3