From f007f940c53a4818ead58f2fe2e0fac95cc3a40a Mon Sep 17 00:00:00 2001 From: Balaji Sivaraman Date: Sun, 7 Jan 2018 21:35:58 +0530 Subject: search: add support for searching compressed files This commit adds opt-in support for searching compressed files during recursive search. This behavior is only enabled when the `-z/--search-zip` flag is passed to ripgrep. When enabled, a limited set of common compression formats are recognized via file extension, and a new process is spawned to perform the decompression. ripgrep then searches the stdout of that spawned process. Closes #539 --- src/decompressor.rs | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 src/decompressor.rs (limited to 'src/decompressor.rs') diff --git a/src/decompressor.rs b/src/decompressor.rs new file mode 100644 index 00000000..a94948af --- /dev/null +++ b/src/decompressor.rs @@ -0,0 +1,191 @@ +use std::collections::HashMap; +use std::ffi::OsStr; +use std::fmt; +use std::io::{self, Read}; +use std::path::Path; +use std::process::{self, Stdio}; + +use globset::{Glob, GlobSet, GlobSetBuilder}; + +/// A decompression command, contains the command to be spawned as well as any +/// necessary CLI args. +#[derive(Clone, Copy, Debug)] +struct DecompressionCommand { + cmd: &'static str, + args: &'static [&'static str], +} + +impl DecompressionCommand { + /// Create a new decompress command + fn new( + cmd: &'static str, + args: &'static [&'static str], + ) -> DecompressionCommand { + DecompressionCommand { + cmd, args + } + } +} + +impl fmt::Display for DecompressionCommand { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} {}", self.cmd, self.args.join(" ")) + } +} + +lazy_static! { + static ref DECOMPRESSION_COMMANDS: HashMap< + &'static str, + DecompressionCommand, + > = { + let mut m = HashMap::new(); + + const ARGS: &[&str] = &["-d", "-c"]; + m.insert("gz", DecompressionCommand::new("gzip", ARGS)); + m.insert("bz2", DecompressionCommand::new("bzip2", ARGS)); + m.insert("xz", DecompressionCommand::new("xz", ARGS)); + + const LZMA_ARGS: &[&str] = &["--format=lzma", "-d", "-c"]; + m.insert("lzma", DecompressionCommand::new("xz", LZMA_ARGS)); + + m + }; + static ref SUPPORTED_COMPRESSION_FORMATS: GlobSet = { + let mut builder = GlobSetBuilder::new(); + builder.add(Glob::new("*.gz").unwrap()); + builder.add(Glob::new("*.bz2").unwrap()); + builder.add(Glob::new("*.xz").unwrap()); + builder.add(Glob::new("*.lzma").unwrap()); + builder.build().unwrap() + }; + static ref TAR_ARCHIVE_FORMATS: GlobSet = { + let mut builder = GlobSetBuilder::new(); + builder.add(Glob::new("*.tar.gz").unwrap()); + builder.add(Glob::new("*.tar.xz").unwrap()); + builder.add(Glob::new("*.tar.bz2").unwrap()); + builder.add(Glob::new("*.tgz").unwrap()); + builder.add(Glob::new("*.txz").unwrap()); + builder.add(Glob::new("*.tbz2").unwrap()); + builder.build().unwrap() + }; +} + +/// DecompressionReader provides an `io::Read` implementation for a limited +/// set of compression formats. +#[derive(Debug)] +pub struct DecompressionReader { + cmd: DecompressionCommand, + child: process::Child, + done: bool, +} + +impl DecompressionReader { + /// Returns a handle to the stdout of the spawned decompression process for + /// `path`, which can be directly searched in the worker. When the returned + /// value is exhausted, the underlying process is reaped. If the underlying + /// process fails, then its stderr is read and converted into a normal + /// io::Error. + /// + /// If there is any error in spawning the decompression command, then + /// return `None`, after outputting any necessary debug or error messages. + pub fn from_path(path: &Path) -> Option { + if is_tar_archive(path) { + debug!("{}: skipping tar archive", path.display()); + return None; + } + let extension = match path.extension().and_then(OsStr::to_str) { + Some(extension) => extension, + None => { + debug!( + "{}: failed to get compresson extension", path.display()); + return None; + } + }; + let decompression_cmd = match DECOMPRESSION_COMMANDS.get(extension) { + Some(cmd) => cmd, + None => { + debug!( + "{}: failed to get decompression command", path.display()); + return None; + } + }; + let cmd = process::Command::new(decompression_cmd.cmd) + .args(decompression_cmd.args) + .arg(path) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn(); + let child = match cmd { + Ok(process) => process, + Err(_) => { + debug!( + "{}: decompression command '{}' not found", + path.display(), decompression_cmd.cmd); + return None; + } + }; + Some(DecompressionReader::new(*decompression_cmd, child)) + } + + fn new( + cmd: DecompressionCommand, + child: process::Child, + ) -> DecompressionReader { + DecompressionReader { + cmd: cmd, + child: child, + done: false, + } + } + + fn read_error(&mut self) -> io::Result { + let mut errbytes = vec![]; + self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?; + let errstr = String::from_utf8_lossy(&errbytes); + let errstr = errstr.trim(); + + Ok(if errstr.is_empty() { + let msg = format!("decompression command failed: '{}'", self.cmd); + io::Error::new(io::ErrorKind::Other, msg) + } else { + let msg = format!( + "decompression command '{}' failed: {}", self.cmd, errstr); + io::Error::new(io::ErrorKind::Other, msg) + }) + } +} + +impl io::Read for DecompressionReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.done { + return Ok(0); + } + let nread = self.child.stdout.as_mut().unwrap().read(buf)?; + if nread == 0 { + self.done = true; + // Reap the child now that we're done reading. + // If the command failed, report stderr as an error. + if !self.child.wait()?.success() { + return Err(self.read_error()?); + } + } + Ok(nread) + } +} + +/// Returns true if the given path contains a supported compression format or +/// is a TAR archive. +pub fn is_compressed(path: &Path) -> bool { + is_supported_compression_format(path) || is_tar_archive(path) +} + +/// Returns true if the given path matches any one of the supported compression +/// formats +fn is_supported_compression_format(path: &Path) -> bool { + SUPPORTED_COMPRESSION_FORMATS.is_match(path) +} + +/// Returns true if the given path matches any of the known TAR file formats. +fn is_tar_archive(path: &Path) -> bool { + TAR_ARCHIVE_FORMATS.is_match(path) +} -- cgit v1.2.3