From 4846d63539690047fa58ec582d94bcba16da1c09 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 29 Aug 2018 20:53:52 -0400 Subject: grep-cli: introduce new grep-cli crate This commit moves a lot of "utility" code from ripgrep core into grep-cli. Any one of these things might not be worth creating a new crate, but combining everything together results in a fair number of convenience routines that make up a decent sized crate. There is potentially more we could move into the crate, but much of what remains in ripgrep core is almost entirely dealing with the number of flags we support. In the course of moving things to the grep-cli crate, we clean up a lot of gunk and improve failure modes in a number of cases. In particular, we've fixed a bug where other processes could deadlock if they write too much to stderr. Fixes #990 --- CHANGELOG.md | 2 + Cargo.lock | 19 ++- Cargo.toml | 7 +- grep-cli/Cargo.toml | 25 +++ grep-cli/LICENSE-MIT | 21 +++ grep-cli/README.md | 38 +++++ grep-cli/UNLICENSE | 24 +++ grep-cli/src/decompress.rs | 381 ++++++++++++++++++++++++++++++++++++++++++++ grep-cli/src/escape.rs | 315 ++++++++++++++++++++++++++++++++++++ grep-cli/src/human.rs | 171 ++++++++++++++++++++ grep-cli/src/lib.rs | 251 +++++++++++++++++++++++++++++ grep-cli/src/pattern.rs | 205 ++++++++++++++++++++++++ grep-cli/src/process.rs | 267 +++++++++++++++++++++++++++++++ grep-cli/src/wtr.rs | 133 ++++++++++++++++ grep-printer/src/color.rs | 28 ++++ grep-printer/src/lib.rs | 2 +- grep/Cargo.toml | 1 + grep/examples/simplegrep.rs | 34 +--- grep/src/lib.rs | 1 + src/app.rs | 8 +- src/args.rs | 149 ++++------------- src/decompressor.rs | 190 ---------------------- src/main.rs | 10 +- src/preprocessor.rs | 93 ----------- src/search.rs | 73 +++++++-- src/unescape.rs | 137 ---------------- 26 files changed, 1987 insertions(+), 598 deletions(-) create mode 100644 grep-cli/Cargo.toml create mode 100644 grep-cli/LICENSE-MIT create mode 100644 grep-cli/README.md create mode 
100644 grep-cli/UNLICENSE create mode 100644 grep-cli/src/decompress.rs create mode 100644 grep-cli/src/escape.rs create mode 100644 grep-cli/src/human.rs create mode 100644 grep-cli/src/lib.rs create mode 100644 grep-cli/src/pattern.rs create mode 100644 grep-cli/src/process.rs create mode 100644 grep-cli/src/wtr.rs delete mode 100644 src/decompressor.rs delete mode 100644 src/preprocessor.rs delete mode 100644 src/unescape.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index c0fb04b9..73406762 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,8 @@ Bug fixes: Context from the `--passthru` flag should not impact process exit status. * [BUG #984](https://github.com/BurntSushi/ripgrep/issues/984): Fixes bug in `ignore` crate where first path was always treated as a symlink. +* [BUG #990](https://github.com/BurntSushi/ripgrep/issues/990): + Read stderr asynchronously when running a process. * [BUG #1013](https://github.com/BurntSushi/ripgrep/issues/1013): Add compile time and runtime CPU features to `--version` output. 
* [BUG #1028](https://github.com/BurntSushi/ripgrep/pull/1028): diff --git a/Cargo.lock b/Cargo.lock index 7ddb3f2b..b1ee1723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,6 +168,7 @@ name = "grep" version = "0.2.0" dependencies = [ "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "grep-cli 0.1.0", "grep-matcher 0.1.0", "grep-pcre2 0.1.0", "grep-printer 0.1.0", @@ -177,6 +178,20 @@ dependencies = [ "walkdir 2.2.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "grep-cli" +version = "0.1.0" +dependencies = [ + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.1", + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "grep-matcher" version = "0.1.0" @@ -464,21 +479,17 @@ dependencies = [ name = "ripgrep" version = "0.9.0" dependencies = [ - "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", - "globset 0.4.1", "grep 0.2.0", "ignore 0.4.3", "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.26 
(registry+https://github.com/rust-lang/crates.io-index)", "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 3ff769c6..0c489c46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ path = "tests/tests.rs" members = [ "globset", "grep", + "grep-cli", "grep-matcher", "grep-pcre2", "grep-printer", @@ -44,15 +45,12 @@ members = [ ] [dependencies] -atty = "0.2.11" -globset = { version = "0.4.0", path = "globset" } grep = { version = "0.2.0", path = "grep" } ignore = { version = "0.4.0", path = "ignore" } lazy_static = "1" log = "0.4" num_cpus = "1" regex = "1" -same-file = "1" serde_json = "1" termcolor = "1" @@ -61,9 +59,6 @@ version = "2.32.0" default-features = false features = ["suggestions"] -[target.'cfg(windows)'.dependencies.winapi-util] -version = "0.1.1" - [build-dependencies] lazy_static = "1" diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml new file mode 100644 index 00000000..1d5fda22 --- /dev/null +++ b/grep-cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "grep-cli" +version = "0.1.0" #:version +authors = ["Andrew Gallant "] +description = """ +Utilities for search oriented command line applications. 
+""" +documentation = "https://docs.rs/grep-cli" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "cli", "utility", "util"] +license = "Unlicense/MIT" + +[dependencies] +atty = "0.2.11" +globset = { version = "0.4.1", path = "../globset" } +lazy_static = "1.1" +log = "0.4" +regex = "1" +same-file = "1" +termcolor = "1" + +[target.'cfg(windows)'.dependencies.winapi-util] +version = "0.1.1" diff --git a/grep-cli/LICENSE-MIT b/grep-cli/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-cli/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep-cli/README.md b/grep-cli/README.md new file mode 100644 index 00000000..e78317c6 --- /dev/null +++ b/grep-cli/README.md @@ -0,0 +1,38 @@ +grep-cli +-------- +A utility library that provides common routines desired in search oriented +command line applications. This includes, but is not limited to, parsing hex +escapes, detecting whether stdin is readable and more. To the extent possible, +this crate strives for compatibility across Windows, macOS and Linux. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-cli.svg)](https://crates.io/crates/grep-cli) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + + +### Documentation + +[https://docs.rs/grep-cli](https://docs.rs/grep-cli) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-cli = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_cli; +``` diff --git a/grep-cli/UNLICENSE b/grep-cli/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-cli/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. 
We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-cli/src/decompress.rs b/grep-cli/src/decompress.rs new file mode 100644 index 00000000..ad108ea0 --- /dev/null +++ b/grep-cli/src/decompress.rs @@ -0,0 +1,381 @@ +use std::ffi::{OsStr, OsString}; +use std::fs::File; +use std::io; +use std::path::Path; +use std::process::Command; + +use globset::{Glob, GlobSet, GlobSetBuilder}; + +use process::{CommandError, CommandReader, CommandReaderBuilder}; + +/// A builder for a matcher that determines which files get decompressed. +#[derive(Clone, Debug)] +pub struct DecompressionMatcherBuilder { + /// The commands for each matching glob. + commands: Vec, + /// Whether to include the default matching rules. + defaults: bool, +} + +/// A representation of a single command for decompressing data +/// out-of-process. +#[derive(Clone, Debug)] +struct DecompressionCommand { + /// The glob that matches this command. + glob: String, + /// The command or binary name. + bin: OsString, + /// The arguments to invoke with the command. 
+ args: Vec, +} + +impl Default for DecompressionMatcherBuilder { + fn default() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder::new() + } +} + +impl DecompressionMatcherBuilder { + /// Create a new builder for configuring a decompression matcher. + pub fn new() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder { + commands: vec![], + defaults: true, + } + } + + /// Build a matcher for determining how to decompress files. + /// + /// If there was a problem compiling the matcher, then an error is + /// returned. + pub fn build(&self) -> Result { + let defaults = + if !self.defaults { + vec![] + } else { + default_decompression_commands() + }; + let mut glob_builder = GlobSetBuilder::new(); + let mut commands = vec![]; + for decomp_cmd in defaults.iter().chain(&self.commands) { + let glob = Glob::new(&decomp_cmd.glob).map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + glob_builder.add(glob); + commands.push(decomp_cmd.clone()); + } + let globs = glob_builder.build().map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + Ok(DecompressionMatcher { globs, commands }) + } + + /// When enabled, the default matching rules will be compiled into this + /// matcher before any other associations. When disabled, only the + /// rules explicitly given to this builder will be used. + /// + /// This is enabled by default. + pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder { + self.defaults = yes; + self + } + + /// Associates a glob with a command to decompress files matching the glob. + /// + /// If multiple globs match the same file, then the most recently added + /// glob takes precedence. + /// + /// The syntax for the glob is documented in the + /// [`globset` crate](https://docs.rs/globset/#syntax). 
+ pub fn associate( + &mut self, + glob: &str, + program: P, + args: I, + ) -> &mut DecompressionMatcherBuilder + where P: AsRef, + I: IntoIterator, + A: AsRef, + { + + let glob = glob.to_string(); + let bin = program.as_ref().to_os_string(); + let args = args + .into_iter() + .map(|a| a.as_ref().to_os_string()) + .collect(); + self.commands.push(DecompressionCommand { glob, bin, args }); + self + } +} + +/// A matcher for determining how to decompress files. +#[derive(Clone, Debug)] +pub struct DecompressionMatcher { + /// The set of globs to match. Each glob has a corresponding entry in + /// `commands`. When a glob matches, the corresponding command should be + /// used to perform out-of-process decompression. + globs: GlobSet, + /// The commands for each matching glob. + commands: Vec, +} + +impl Default for DecompressionMatcher { + fn default() -> DecompressionMatcher { + DecompressionMatcher::new() + } +} + +impl DecompressionMatcher { + /// Create a new matcher with default rules. + /// + /// To add more matching rules, build a matcher with + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + pub fn new() -> DecompressionMatcher { + DecompressionMatcherBuilder::new() + .build() + .expect("built-in matching rules should always compile") + } + + /// Return a pre-built command based on the given file path that can + /// decompress its contents. If no such decompressor is known, then this + /// returns `None`. + /// + /// If there are multiple possible commands matching the given path, then + /// the command added last takes precedence. + pub fn command>(&self, path: P) -> Option { + for i in self.globs.matches(path).into_iter().rev() { + let decomp_cmd = &self.commands[i]; + let mut cmd = Command::new(&decomp_cmd.bin); + cmd.args(&decomp_cmd.args); + return Some(cmd); + } + None + } + + /// Returns true if and only if the given file path has at least one + /// matching command to perform decompression on. 
+ pub fn has_command>(&self, path: P) -> bool { + self.globs.is_match(path) + } +} + +/// Configures and builds a streaming reader for decompressing data. +#[derive(Clone, Debug, Default)] +pub struct DecompressionReaderBuilder { + matcher: DecompressionMatcher, + command_builder: CommandReaderBuilder, +} + +impl DecompressionReaderBuilder { + /// Create a new builder with the default configuration. + pub fn new() -> DecompressionReaderBuilder { + DecompressionReaderBuilder::default() + } + + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is logged at the debug level and a + /// passthru reader is returned that does no decompression. This behavior + /// typically occurs when the given file path matches a decompression + /// command, but is executing in an environment where the decompression + /// command is not available. + /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. + pub fn build>( + &self, + path: P, + ) -> Result { + let path = path.as_ref(); + let mut cmd = match self.matcher.command(path) { + None => return DecompressionReader::new_passthru(path), + Some(cmd) => cmd, + }; + cmd.arg(path); + + match self.command_builder.build(&mut cmd) { + Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }), + Err(err) => { + debug!( + "{}: error spawning command '{:?}': {} \ + (falling back to uncompressed reader)", + path.display(), + cmd, + err, + ); + DecompressionReader::new_passthru(path) + } + } + } + + /// Set the matcher to use to look up the decompression command for each + /// file path. + /// + /// A set of sensible rules is enabled by default. Setting this will + /// completely replace the current rules. 
+ pub fn matcher( + &mut self, + matcher: DecompressionMatcher, + ) -> &mut DecompressionReaderBuilder { + self.matcher = matcher; + self + } + + /// Get the underlying matcher currently used by this builder. + pub fn get_matcher(&self) -> &DecompressionMatcher { + &self.matcher + } + + /// When enabled, the reader will asynchronously read the contents of the + /// command's stderr output. When disabled, stderr is only read after the + /// stdout stream has been exhausted (or if the process quits with an error + /// code). + /// + /// Note that when enabled, this may require launching an additional + /// thread in order to read stderr. This is done so that the process being + /// executed is never blocked from writing to stdout or stderr. If this is + /// disabled, then it is possible for the process to fill up the stderr + /// buffer and deadlock. + /// + /// This is enabled by default. + pub fn async_stderr( + &mut self, + yes: bool, + ) -> &mut DecompressionReaderBuilder { + self.command_builder.async_stderr(yes); + self + } +} + +/// A streaming reader for decompressing the contents of a file. +/// +/// The purpose of this reader is to provide a seamless way to decompress the +/// contents of file using existing tools in the current environment. This is +/// meant to be an alternative to using decompression libraries in favor of the +/// simplicity and portability of using external commands such as `gzip` and +/// `xz`. This does impose the overhead of spawning a process, so other means +/// for performing decompression should be sought if this overhead isn't +/// acceptable. +/// +/// A decompression reader comes with a default set of matching rules that are +/// meant to associate file paths with the corresponding command to use to +/// decompress them. For example, a glob like `*.gz` matches gzip compressed +/// files with the command `gzip -d -c`. 
If a file path does not match any +/// existing rules, or if it matches a rule whose command does not exist in the +/// current environment, then the decompression reader passes through the +/// contents of the underlying file without doing any decompression. +/// +/// The default matching rules are probably good enough for most cases, and if +/// they require revision, pull requests are welcome. In cases where they must +/// be changed or extended, they can be customized through the use of +/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html) +/// and +/// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html). +/// +/// By default, this reader will asynchronously read the process's stderr. +/// This prevents subtle deadlocking bugs for noisy processes that write a lot +/// to stderr. Currently, the entire contents of stderr is read onto the heap. +/// +/// # Example +/// +/// This example shows how to read the decompressed contents of a file without +/// needing to explicitly choose the decompression command to run. +/// +/// Note that if you need to decompress multiple files, it is better to use +/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the +/// matcher. +/// +/// ```no_run +/// use std::io::Read; +/// use std::process::Command; +/// use grep_cli::DecompressionReader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?; +/// let mut contents = vec![]; +/// rdr.read_to_end(&mut contents)?; +/// # Ok(()) } +/// ``` +#[derive(Debug)] +pub struct DecompressionReader { + rdr: Result, +} + +impl DecompressionReader { + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is logged at the debug level and + /// a passthru reader is returned that does no decompression. 
+ /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. + /// + /// This uses the default matching rules for determining how to decompress + /// the given file. To change those matching rules, use + /// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html) + /// and + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + /// + /// When creating readers for many paths, it is better to use the builder + /// since it will amortize the cost of constructing the matcher. + pub fn new>( + path: P, + ) -> Result { + DecompressionReaderBuilder::new().build(path) + } + + /// Creates a new "passthru" decompression reader that reads from the file + /// corresponding to the given path without doing decompression and without + /// executing another process. + fn new_passthru(path: &Path) -> Result { + let file = File::open(path)?; + Ok(DecompressionReader { rdr: Err(file) }) + } +} + +impl io::Read for DecompressionReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self.rdr { + Ok(ref mut rdr) => rdr.read(buf), + Err(ref mut rdr) => rdr.read(buf), + } + } +} + +fn default_decompression_commands() -> Vec { + const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"]; + const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"]; + const ARGS_XZ: &[&str] = &["xz", "-d", "-c"]; + const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"]; + const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"]; + + fn cmd(glob: &str, args: &[&str]) -> DecompressionCommand { + DecompressionCommand { + glob: glob.to_string(), + bin: OsStr::new(&args[0]).to_os_string(), + args: args + .iter() + .skip(1) + .map(|s| OsStr::new(s).to_os_string()) + .collect(), + } + } + vec![ + cmd("*.gz", ARGS_GZIP), + cmd("*.tgz", ARGS_GZIP), + + cmd("*.bz2", ARGS_BZIP), + cmd("*.tbz2", ARGS_BZIP), + + cmd("*.xz", ARGS_XZ), + cmd("*.txz", ARGS_XZ), + + cmd("*.lz4", ARGS_LZ4), 
+ + cmd("*.lzma", ARGS_LZMA), + ] +} diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs new file mode 100644 index 00000000..9b350a93 --- /dev/null +++ b/grep-cli/src/escape.rs @@ -0,0 +1,315 @@ +use std::ffi::OsStr; +use std::str; + +/// A single state in the state machine used by `unescape`. +#[derive(Clone, Copy, Eq, PartialEq)] +enum State { + /// The state after seeing a `\`. + Escape, + /// The state after seeing a `\x`. + HexFirst, + /// The state after seeing a `\x[0-9A-Fa-f]`. + HexSecond(char), + /// Default state. + Literal, +} + +/// Escapes arbitrary bytes into a human readable string. +/// +/// This converts `\t`, `\r` and `\n` into their escaped forms. It also +/// converts the non-printable subset of ASCII in addition to invalid UTF-8 +/// bytes to hexadecimal escape sequences. Everything else is left as is. +/// +/// The dual of this routine is [`unescape`](fn.unescape.html). +/// +/// # Example +/// +/// This example shows how to convert a byte string that contains a `\n` and +/// invalid UTF-8 bytes into a `String`. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. +/// +/// ``` +/// use grep_cli::escape; +/// +/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz")); +/// ``` +pub fn escape(mut bytes: &[u8]) -> String { + let mut escaped = String::new(); + while let Some(result) = decode_utf8(bytes) { + match result { + Ok(cp) => { + escape_char(cp, &mut escaped); + bytes = &bytes[cp.len_utf8()..]; + } + Err(byte) => { + escape_byte(byte, &mut escaped); + bytes = &bytes[1..]; + } + } + } + escaped +} + +/// Escapes an OS string into a human readable string. +/// +/// This is like [`escape`](fn.escape.html), but accepts an OS string. 
+pub fn escape_os(string: &OsStr) -> String { + #[cfg(unix)] + fn imp(string: &OsStr) -> String { + use std::os::unix::ffi::OsStrExt; + + escape(string.as_bytes()) + } + + #[cfg(not(unix))] + fn imp(string: &OsStr) -> String { + escape(string.to_string_lossy().as_bytes()) + } + + imp(string) +} + +/// Unescapes a string. +/// +/// It supports a limited set of escape sequences: +/// +/// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes. +/// * `\xZZ` hexadecimal escapes are mapped to their byte. +/// +/// Everything else is left as is, including non-hexadecimal escapes like +/// `\xGG`. +/// +/// This is useful when it is desirable for a command line argument to be +/// capable of specifying arbitrary bytes or otherwise make it easier to +/// specify non-printable characters. +/// +/// The dual of this routine is [`escape`](fn.escape.html). +/// +/// # Example +/// +/// This example shows how to convert an escaped string (which is valid UTF-8) +/// into a corresponding sequence of bytes. Each escape sequence is mapped to +/// its bytes, which may include invalid UTF-8. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. 
+/// +/// ``` +/// use grep_cli::unescape; +/// +/// assert_eq!(&b"foo\nbar\xFFbaz"[..], &*unescape(r"foo\nbar\xFFbaz")); +/// ``` +pub fn unescape(s: &str) -> Vec { + use self::State::*; + + let mut bytes = vec![]; + let mut state = Literal; + for c in s.chars() { + match state { + Escape => { + match c { + '\\' => { bytes.push(b'\\'); state = Literal; } + 'n' => { bytes.push(b'\n'); state = Literal; } + 'r' => { bytes.push(b'\r'); state = Literal; } + 't' => { bytes.push(b'\t'); state = Literal; } + 'x' => { state = HexFirst; } + c => { + bytes.extend(format!(r"\{}", c).into_bytes()); + state = Literal; + } + } + } + HexFirst => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + state = HexSecond(c); + } + c => { + bytes.extend(format!(r"\x{}", c).into_bytes()); + state = Literal; + } + } + } + HexSecond(first) => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + let ordinal = format!("{}{}", first, c); + let byte = u8::from_str_radix(&ordinal, 16).unwrap(); + bytes.push(byte); + state = Literal; + } + c => { + let original = format!(r"\x{}{}", first, c); + bytes.extend(original.into_bytes()); + state = Literal; + } + } + } + Literal => { + match c { + '\\' => { state = Escape; } + c => { bytes.extend(c.to_string().as_bytes()); } + } + } + } + } + match state { + Escape => bytes.push(b'\\'), + HexFirst => bytes.extend(b"\\x"), + HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()), + Literal => {} + } + bytes +} + +/// Unescapes an OS string. +/// +/// This is like [`unescape`](fn.unescape.html), but accepts an OS string. +/// +/// Note that this first lossily decodes the given OS string as UTF-8. That +/// is, an escaped string (the thing given) should be valid UTF-8. +pub fn unescape_os(string: &OsStr) -> Vec { + unescape(&string.to_string_lossy()) +} + +/// Adds the given codepoint to the given string, escaping it if necessary. 
+fn escape_char(cp: char, into: &mut String) { + if cp.is_ascii() { + escape_byte(cp as u8, into); + } else { + into.push(cp); + } +} + +/// Adds the given byte to the given string, escaping it if necessary. +fn escape_byte(byte: u8, into: &mut String) { + match byte { + 0x21...0x5B | 0x5D...0x7D => into.push(byte as char), + b'\n' => into.push_str(r"\n"), + b'\r' => into.push_str(r"\r"), + b'\t' => into.push_str(r"\t"), + b'\\' => into.push_str(r"\\"), + _ => into.push_str(&format!(r"\x{:02X}", byte)), + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +fn decode_utf8(bytes: &[u8]) -> Option> { + if bytes.is_empty() { + return None; + } + let len = match utf8_len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(len) => len, + }; + match str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. 
+fn utf8_len(byte: u8) -> Option { + if byte <= 0x7F { + Some(1) + } else if byte <= 0b110_11111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::{escape, unescape}; + + fn b(bytes: &'static [u8]) -> Vec { + bytes.to_vec() + } + + #[test] + fn empty() { + assert_eq!(b(b""), unescape(r"")); + assert_eq!(r"", escape(b"")); + } + + #[test] + fn backslash() { + assert_eq!(b(b"\\"), unescape(r"\\")); + assert_eq!(r"\\", escape(b"\\")); + } + + #[test] + fn nul() { + assert_eq!(b(b"\x00"), unescape(r"\x00")); + assert_eq!(r"\x00", escape(b"\x00")); + } + + #[test] + fn nl() { + assert_eq!(b(b"\n"), unescape(r"\n")); + assert_eq!(r"\n", escape(b"\n")); + } + + #[test] + fn tab() { + assert_eq!(b(b"\t"), unescape(r"\t")); + assert_eq!(r"\t", escape(b"\t")); + } + + #[test] + fn carriage() { + assert_eq!(b(b"\r"), unescape(r"\r")); + assert_eq!(r"\r", escape(b"\r")); + } + + #[test] + fn nothing_simple() { + assert_eq!(b(b"\\a"), unescape(r"\a")); + assert_eq!(b(b"\\a"), unescape(r"\\a")); + assert_eq!(r"\\a", escape(b"\\a")); + } + + #[test] + fn nothing_hex0() { + assert_eq!(b(b"\\x"), unescape(r"\x")); + assert_eq!(b(b"\\x"), unescape(r"\\x")); + assert_eq!(r"\\x", escape(b"\\x")); + } + + #[test] + fn nothing_hex1() { + assert_eq!(b(b"\\xz"), unescape(r"\xz")); + assert_eq!(b(b"\\xz"), unescape(r"\\xz")); + assert_eq!(r"\\xz", escape(b"\\xz")); + } + + #[test] + fn nothing_hex2() { + assert_eq!(b(b"\\xzz"), unescape(r"\xzz")); + assert_eq!(b(b"\\xzz"), unescape(r"\\xzz")); + assert_eq!(r"\\xzz", escape(b"\\xzz")); + } + + #[test] + fn invalid_utf8() { + assert_eq!(r"\xFF", escape(b"\xFF")); + assert_eq!(r"a\xFFb", escape(b"a\xFFb")); + } +} diff --git a/grep-cli/src/human.rs b/grep-cli/src/human.rs new file mode 100644 index 00000000..a69fd376 --- /dev/null +++ b/grep-cli/src/human.rs @@ -0,0 +1,171 @@ +use std::error; +use std::fmt; +use std::io; +use 
std::num::ParseIntError; + +use regex::Regex; + +/// An error that occurs when parsing a human readable size description. +/// +/// This error provides an end user friendly message describing why the +/// description couldn't be parsed and what the expected format is. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ParseSizeError { + original: String, + kind: ParseSizeErrorKind, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum ParseSizeErrorKind { + InvalidFormat, + InvalidInt(ParseIntError), + Overflow, +} + +impl ParseSizeError { + fn format(original: &str) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::InvalidFormat, + } + } + + fn int(original: &str, err: ParseIntError) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::InvalidInt(err), + } + } + + fn overflow(original: &str) -> ParseSizeError { + ParseSizeError { + original: original.to_string(), + kind: ParseSizeErrorKind::Overflow, + } + } +} + +impl error::Error for ParseSizeError { + fn description(&self) -> &str { "invalid size" } +} + +impl fmt::Display for ParseSizeError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::ParseSizeErrorKind::*; + + match self.kind { + InvalidFormat => { + write!( + f, + "invalid format for size '{}', which should be a sequence \ + of digits followed by an optional 'K', 'M' or 'G' \ + suffix", + self.original + ) + } + InvalidInt(ref err) => { + write!( + f, + "invalid integer found in size '{}': {}", + self.original, + err + ) + } + Overflow => { + write!(f, "size too big in '{}'", self.original) + } + } + } +} + +impl From for io::Error { + fn from(size_err: ParseSizeError) -> io::Error { + io::Error::new(io::ErrorKind::Other, size_err) + } +} + +/// Parse a human readable size like `2M` into a corresponding number of bytes. +/// +/// Supported size suffixes are `K` (for kilobyte), `M` (for megabyte) and `G` +/// (for gigabyte). 
If a size suffix is missing, then the size is interpreted +/// as bytes. If the size is too big to fit into a `u64`, then this returns an +/// error. +/// +/// Additional suffixes may be added over time. +pub fn parse_human_readable_size(size: &str) -> Result { + lazy_static! { + // Normally I'd just parse something this simple by hand to avoid the + // regex dep, but we bring regex in any way for glob matching, so might + // as well use it. + static ref RE: Regex = Regex::new(r"^([0-9]+)([KMG])?$").unwrap(); + } + + let caps = match RE.captures(size) { + Some(caps) => caps, + None => return Err(ParseSizeError::format(size)), + }; + let value: u64 = caps[1].parse().map_err(|err| { + ParseSizeError::int(size, err) + })?; + let suffix = match caps.get(2) { + None => return Ok(value), + Some(cap) => cap.as_str(), + }; + let bytes = match suffix { + "K" => value.checked_mul(1<<10), + "M" => value.checked_mul(1<<20), + "G" => value.checked_mul(1<<30), + // Because if the regex matches this group, it must be [KMG]. 
+ _ => unreachable!(), + }; + bytes.ok_or_else(|| ParseSizeError::overflow(size)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn suffix_none() { + let x = parse_human_readable_size("123").unwrap(); + assert_eq!(123, x); + } + + #[test] + fn suffix_k() { + let x = parse_human_readable_size("123K").unwrap(); + assert_eq!(123 * (1<<10), x); + } + + #[test] + fn suffix_m() { + let x = parse_human_readable_size("123M").unwrap(); + assert_eq!(123 * (1<<20), x); + } + + #[test] + fn suffix_g() { + let x = parse_human_readable_size("123G").unwrap(); + assert_eq!(123 * (1<<30), x); + } + + #[test] + fn invalid_empty() { + assert!(parse_human_readable_size("").is_err()); + } + + #[test] + fn invalid_non_digit() { + assert!(parse_human_readable_size("a").is_err()); + } + + #[test] + fn invalid_overflow() { + assert!(parse_human_readable_size("9999999999999999G").is_err()); + } + + #[test] + fn invalid_suffix() { + assert!(parse_human_readable_size("123T").is_err()); + } +} diff --git a/grep-cli/src/lib.rs b/grep-cli/src/lib.rs new file mode 100644 index 00000000..b9909c20 --- /dev/null +++ b/grep-cli/src/lib.rs @@ -0,0 +1,251 @@ +/*! +This crate provides common routines used in command line applications, with a +focus on routines useful for search oriented applications. As a utility +library, there is no central type or function. However, a key focus of this +crate is to improve failure modes and provide user friendly error messages +when things go wrong. + +To the best extent possible, everything in this crate works on Windows, macOS +and Linux. + + +# Standard I/O + +The +[`is_readable_stdin`](fn.is_readable_stdin.html), +[`is_tty_stderr`](fn.is_tty_stderr.html), +[`is_tty_stdin`](fn.is_tty_stdin.html) +and +[`is_tty_stdout`](fn.is_tty_stdout.html) +routines query aspects of standard I/O. `is_readable_stdin` determines whether +stdin can be usefully read from, while the `tty` methods determine whether a +tty is attached to stdin/stdout/stderr. 
+ +`is_readable_stdin` is useful when writing an application that changes behavior +based on whether the application was invoked with data on stdin. For example, +`rg foo` might recursively search the current working directory for +occurrences of `foo`, but `rg foo < file` might only search the contents of +`file`. + +The `tty` methods are useful for similar reasons. Namely, commands like `ls` +will change their output depending on whether they are printing to a terminal +or not. For example, `ls` shows a file on each line when stdout is redirected +to a file or a pipe, but condenses the output to show possibly many files on +each line when stdout is connected to a tty. + + +# Coloring and buffering + +The +[`stdout`](fn.stdout.html), +[`stdout_buffered_block`](fn.stdout_buffered_block.html) +and +[`stdout_buffered_line`](fn.stdout_buffered_line.html) +routines are alternative constructors for +[`StandardStream`](struct.StandardStream.html). +A `StandardStream` implements `termcolor::WriteColor`, which provides a way +to emit colors to terminals. Its key use is the encapsulation of buffering +style. Namely, `stdout` will return a line buffered `StandardStream` if and +only if stdout is connected to a tty, and will otherwise return a block +buffered `StandardStream`. Line buffering is important for use with a tty +because it typically decreases the latency at which the end user sees output. +Block buffering is used otherwise because it is faster, and redirecting stdout +to a file typically doesn't benefit from the decreased latency that line +buffering provides. + +The `stdout_buffered_block` and `stdout_buffered_line` can be used to +explicitly set the buffering strategy regardless of whether stdout is connected +to a tty or not. 
+ + +# Escaping + +The +[`escape`](fn.escape.html), +[`escape_os`](fn.escape_os.html), +[`unescape`](fn.unescape.html) +and +[`unescape_os`](fn.unescape_os.html) +routines provide a user friendly way of dealing with UTF-8 encoded strings that +can express arbitrary bytes. For example, you might want to accept a string +containing arbitrary bytes as a command line argument, but most interactive +shells make such strings difficult to type. Instead, we can ask users to use +escape sequences. + +For example, `a\xFFz` is itself a valid UTF-8 string corresponding to the +following bytes: + +```ignore +[b'a', b'\\', b'x', b'F', b'F', b'z'] +``` + +However, we can +interpret `\xFF` as an escape sequence with the `unescape`/`unescape_os` +routines, which will yield + +```ignore +[b'a', b'\xFF', b'z'] +``` + +instead. For example: + +``` +use grep_cli::unescape; + +// Note the use of a raw string! +assert_eq!(vec![b'a', b'\xFF', b'z'], unescape(r"a\xFFz")); +``` + +The `escape`/`escape_os` routines provide the reverse transformation, which +makes it easy to show user friendly error messages involving arbitrary bytes. + + +# Building patterns + +Typically, regular expression patterns must be valid UTF-8. However, command +line arguments aren't guaranteed to be valid UTF-8. Unfortunately, the +standard library's UTF-8 conversion functions from `OsStr`s do not provide +good error messages. However, the +[`pattern_from_bytes`](fn.pattern_from_bytes.html) +and +[`pattern_from_os`](fn.pattern_from_os.html) +do, including reporting exactly where the first invalid UTF-8 byte is seen. + +Additionally, it can be useful to read patterns from a file while reporting +good error messages that include line numbers. The +[`patterns_from_path`](fn.patterns_from_path.html), +[`patterns_from_reader`](fn.patterns_from_reader.html) +and +[`patterns_from_stdin`](fn.patterns_from_stdin.html) +routines do just that. 
If any pattern is found that is invalid UTF-8, then the +error includes the file path (if available) along with the line number and the +byte offset at which the first invalid UTF-8 byte was observed. + + +# Read process output + +Sometimes a command line application needs to execute another process and read +its stdout in a streaming fashion. The +[`CommandReader`](struct.CommandReader.html) +provides this functionality with an explicit goal of improving failure modes. +In particular, if the process exits with an error code, then stderr is read +and converted into a normal Rust error to show to end users. This makes the +underlying failure modes explicit and gives more information to end users for +debugging the problem. + +As a special case, +[`DecompressionReader`](struct.DecompressionReader.html) +provides a way to decompress arbitrary files by matching their file extensions +up with corresponding decompression programs (such as `gzip` and `xz`). This +is useful as a means of performing simplistic decompression in a portable +manner without binding to specific compression libraries. This does come with +some overhead though, so if you need to decompress lots of small files, this +may not be an appropriate convenience to use. + +Each reader has a corresponding builder for additional configuration, such as +whether to read stderr asynchronously in order to avoid deadlock (which is +enabled by default). + + +# Miscellaneous parsing + +The +[`parse_human_readable_size`](fn.parse_human_readable_size.html) +routine parses strings like `2M` and converts them to the corresponding number +of bytes (`2 * 1<<20` in this case). If an invalid size is found, then a good +error message is crafted that typically tells the user how to fix the problem.
+*/ + +#![deny(missing_docs)] + +extern crate atty; +extern crate globset; +#[macro_use] +extern crate lazy_static; +#[macro_use] +extern crate log; +extern crate regex; +extern crate same_file; +extern crate termcolor; +#[cfg(windows)] +extern crate winapi_util; + +mod decompress; +mod escape; +mod human; +mod pattern; +mod process; +mod wtr; + +pub use decompress::{ + DecompressionMatcher, DecompressionMatcherBuilder, + DecompressionReader, DecompressionReaderBuilder, +}; +pub use escape::{escape, escape_os, unescape, unescape_os}; +pub use human::{ParseSizeError, parse_human_readable_size}; +pub use pattern::{ + InvalidPatternError, + pattern_from_os, pattern_from_bytes, + patterns_from_path, patterns_from_reader, patterns_from_stdin, +}; +pub use process::{CommandError, CommandReader, CommandReaderBuilder}; +pub use wtr::{ + StandardStream, + stdout, stdout_buffered_line, stdout_buffered_block, +}; + +/// Returns true if and only if stdin is believed to be readable. +/// +/// When stdin is readable, command line programs may choose to behave +/// differently than when stdin is not readable. For example, `command foo` +/// might search the current directory for occurrences of `foo` where as +/// `command foo < some-file` or `cat some-file | command foo` might instead +/// only search stdin for occurrences of `foo`. +pub fn is_readable_stdin() -> bool { + #[cfg(unix)] + fn imp() -> bool { + use std::os::unix::fs::FileTypeExt; + use same_file::Handle; + + let ft = match Handle::stdin().and_then(|h| h.as_file().metadata()) { + Err(_) => return false, + Ok(md) => md.file_type(), + }; + ft.is_file() || ft.is_fifo() + } + + #[cfg(windows)] + fn imp() -> bool { + use winapi_util as winutil; + + winutil::file::typ(winutil::HandleRef::stdin()) + .map(|t| t.is_disk() || t.is_pipe()) + .unwrap_or(false) + } + + !is_tty_stdin() && imp() +} + +/// Returns true if and only if stdin is believed to be connectted to a tty +/// or a console. 
/// Returns true if and only if stdin is believed to be connected to a tty
/// or a console.
pub fn is_tty_stdin() -> bool {
    atty::is(atty::Stream::Stdin)
}

/// Returns true if and only if stdout is believed to be connected to a tty
/// or a console.
///
/// This is useful for when you want your command line program to produce
/// different output depending on whether it's printing directly to a user's
/// terminal or whether it's being redirected somewhere else. For example,
/// implementations of `ls` will often show one item per line when stdout is
/// redirected, but will condense output when printing to a tty.
pub fn is_tty_stdout() -> bool {
    atty::is(atty::Stream::Stdout)
}

/// Returns true if and only if stderr is believed to be connected to a tty
/// or a console.
pub fn is_tty_stderr() -> bool {
    atty::is(atty::Stream::Stderr)
}
+ pub fn valid_up_to(&self) -> usize { + self.valid_up_to + } +} + +impl error::Error for InvalidPatternError { + fn description(&self) -> &str { "invalid pattern" } +} + +impl fmt::Display for InvalidPatternError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "found invalid UTF-8 in pattern at byte offset {} \ + (use hex escape sequences to match arbitrary bytes \ + in a pattern, e.g., \\xFF): '{}'", + self.valid_up_to, + self.original, + ) + } +} + +impl From for io::Error { + fn from(paterr: InvalidPatternError) -> io::Error { + io::Error::new(io::ErrorKind::Other, paterr) + } +} + +/// Convert an OS string into a regular expression pattern. +/// +/// This conversion fails if the given pattern is not valid UTF-8, in which +/// case, a targeted error with more information about where the invalid UTF-8 +/// occurs is given. The error also suggests the use of hex escape sequences, +/// which are supported by many regex engines. +pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> { + pattern.to_str().ok_or_else(|| { + let valid_up_to = pattern + .to_string_lossy() + .find('\u{FFFD}') + .expect("a Unicode replacement codepoint for invalid UTF-8"); + InvalidPatternError { + original: escape_os(pattern), + valid_up_to: valid_up_to, + } + }) +} + +/// Convert arbitrary bytes into a regular expression pattern. +/// +/// This conversion fails if the given pattern is not valid UTF-8, in which +/// case, a targeted error with more information about where the invalid UTF-8 +/// occurs is given. The error also suggests the use of hex escape sequences, +/// which are supported by many regex engines. +pub fn pattern_from_bytes( + pattern: &[u8], +) -> Result<&str, InvalidPatternError> { + str::from_utf8(pattern).map_err(|err| { + InvalidPatternError { + original: escape(pattern), + valid_up_to: err.valid_up_to(), + } + }) +} + +/// Read patterns from a file path, one per line. 
+/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number and the file +/// path. +pub fn patterns_from_path>(path: P) -> io::Result> { + let path = path.as_ref(); + let file = File::open(path).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("{}: {}", path.display(), err), + ) + })?; + patterns_from_reader(file).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("{}:{}", path.display(), err), + ) + }) +} + +/// Read patterns from stdin, one per line. +/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number and the fact +/// that it came from stdin. +pub fn patterns_from_stdin() -> io::Result> { + let stdin = io::stdin(); + let locked = stdin.lock(); + patterns_from_reader(locked).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!(":{}", err), + ) + }) +} + +/// Read patterns from any reader, one per line. +/// +/// If there was a problem reading or if any of the patterns contain invalid +/// UTF-8, then an error is returned. If there was a problem with a specific +/// pattern, then the error message will include the line number. +/// +/// Note that this routine uses its own internal buffer, so the caller should +/// not provide their own buffered reader if possible. +/// +/// # Example +/// +/// This shows how to parse patterns, one per line. 
+/// +/// ``` +/// use grep_cli::patterns_from_reader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let patterns = "\ +/// foo +/// bar\\s+foo +/// [a-z]{3} +/// "; +/// +/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![ +/// r"foo", +/// r"bar\s+foo", +/// r"[a-z]{3}", +/// ]); +/// # Ok(()) } +/// ``` +pub fn patterns_from_reader(rdr: R) -> io::Result> { + let mut patterns = vec![]; + let mut bufrdr = io::BufReader::new(rdr); + let mut line = vec![]; + let mut line_number = 0; + while { + line.clear(); + line_number += 1; + bufrdr.read_until(b'\n', &mut line)? > 0 + } { + line.pop().unwrap(); // remove trailing '\n' + if line.last() == Some(&b'\r') { + line.pop().unwrap(); + } + match pattern_from_bytes(&line) { + Ok(pattern) => patterns.push(pattern.to_string()), + Err(err) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("{}: {}", line_number, err), + )); + } + } + } + Ok(patterns) +} + +#[cfg(test)] +mod tests { + use super::{pattern_from_bytes, pattern_from_os}; + + #[test] + fn bytes() { + let pat = b"abc\xFFxyz"; + let err = pattern_from_bytes(pat).unwrap_err(); + assert_eq!(3, err.valid_up_to()); + } + + #[test] + #[cfg(unix)] + fn os() { + use std::os::unix::ffi::OsStrExt; + use std::ffi::OsStr; + + let pat = OsStr::from_bytes(b"abc\xFFxyz"); + let err = pattern_from_os(pat).unwrap_err(); + assert_eq!(3, err.valid_up_to()); + } +} diff --git a/grep-cli/src/process.rs b/grep-cli/src/process.rs new file mode 100644 index 00000000..017dd0c3 --- /dev/null +++ b/grep-cli/src/process.rs @@ -0,0 +1,267 @@ +use std::error; +use std::fmt; +use std::io::{self, Read}; +use std::iter; +use std::process; +use std::thread::{self, JoinHandle}; + +/// An error that can occur while running a command and reading its output. +/// +/// This error can be seamlessly converted to an `io::Error` via a `From` +/// implementation. 
+#[derive(Debug)] +pub struct CommandError { + kind: CommandErrorKind, +} + +#[derive(Debug)] +enum CommandErrorKind { + Io(io::Error), + Stderr(Vec), +} + +impl CommandError { + /// Create an error from an I/O error. + pub(crate) fn io(ioerr: io::Error) -> CommandError { + CommandError { kind: CommandErrorKind::Io(ioerr) } + } + + /// Create an error from the contents of stderr (which may be empty). + pub(crate) fn stderr(bytes: Vec) -> CommandError { + CommandError { kind: CommandErrorKind::Stderr(bytes) } + } +} + +impl error::Error for CommandError { + fn description(&self) -> &str { "command error" } +} + +impl fmt::Display for CommandError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + CommandErrorKind::Io(ref e) => e.fmt(f), + CommandErrorKind::Stderr(ref bytes) => { + let msg = String::from_utf8_lossy(bytes); + if msg.trim().is_empty() { + write!(f, "") + } else { + let div = iter::repeat('-').take(79).collect::(); + write!(f, "\n{div}\n{msg}\n{div}", div=div, msg=msg.trim()) + } + } + } + } +} + +impl From for CommandError { + fn from(ioerr: io::Error) -> CommandError { + CommandError { kind: CommandErrorKind::Io(ioerr) } + } +} + +impl From for io::Error { + fn from(cmderr: CommandError) -> io::Error { + match cmderr.kind { + CommandErrorKind::Io(ioerr) => ioerr, + CommandErrorKind::Stderr(_) => { + io::Error::new(io::ErrorKind::Other, cmderr) + } + } + } +} + +/// Configures and builds a streaming reader for process output. +#[derive(Clone, Debug, Default)] +pub struct CommandReaderBuilder { + async_stderr: bool, +} + +impl CommandReaderBuilder { + /// Create a new builder with the default configuration. + pub fn new() -> CommandReaderBuilder { + CommandReaderBuilder::default() + } + + /// Build a new streaming reader for the given command's output. 
+ /// + /// The caller should set everything that's required on the given command + /// before building a reader, such as its arguments, environment and + /// current working directory. Settings such as the stdout and stderr (but + /// not stdin) pipes will be overridden so that they can be controlled by + /// the reader. + /// + /// If there was a problem spawning the given command, then its error is + /// returned. + pub fn build( + &self, + command: &mut process::Command, + ) -> Result { + let mut child = command + .stdout(process::Stdio::piped()) + .stderr(process::Stdio::piped()) + .spawn()?; + let stdout = child.stdout.take().unwrap(); + let stderr = + if self.async_stderr { + StderrReader::async(child.stderr.take().unwrap()) + } else { + StderrReader::sync(child.stderr.take().unwrap()) + }; + Ok(CommandReader { + child: child, + stdout: stdout, + stderr: stderr, + done: false, + }) + } + + /// When enabled, the reader will asynchronously read the contents of the + /// command's stderr output. When disabled, stderr is only read after the + /// stdout stream has been exhausted (or if the process quits with an error + /// code). + /// + /// Note that when enabled, this may require launching an additional + /// thread in order to read stderr. This is done so that the process being + /// executed is never blocked from writing to stdout or stderr. If this is + /// disabled, then it is possible for the process to fill up the stderr + /// buffer and deadlock. + /// + /// This is enabled by default. + pub fn async_stderr(&mut self, yes: bool) -> &mut CommandReaderBuilder { + self.async_stderr = yes; + self + } +} + +/// A streaming reader for a command's output. +/// +/// The purpose of this reader is to provide an easy way to execute processes +/// whose stdout is read in a streaming way while also making the processes' +/// stderr available when the process fails with an exit code. 
This makes it +/// possible to execute processes while surfacing the underlying failure mode +/// in the case of an error. +/// +/// Moreover, by default, this reader will asynchronously read the processes' +/// stderr. This prevents subtle deadlocking bugs for noisy processes that +/// write a lot to stderr. Currently, the entire contents of stderr is read +/// on to the heap. +/// +/// # Example +/// +/// This example shows how to invoke `gzip` to decompress the contents of a +/// file. If the `gzip` command reports a failing exit status, then its stderr +/// is returned as an error. +/// +/// ```no_run +/// use std::io::Read; +/// use std::process::Command; +/// use grep_cli::CommandReader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let mut cmd = Command::new("gzip"); +/// cmd.arg("-d").arg("-c").arg("/usr/share/man/man1/ls.1.gz"); +/// +/// let mut rdr = CommandReader::new(&mut cmd)?; +/// let mut contents = vec![]; +/// rdr.read_to_end(&mut contents)?; +/// # Ok(()) } +/// ``` +#[derive(Debug)] +pub struct CommandReader { + child: process::Child, + stdout: process::ChildStdout, + stderr: StderrReader, + done: bool, +} + +impl CommandReader { + /// Create a new streaming reader for the given command using the default + /// configuration. + /// + /// The caller should set everything that's required on the given command + /// before building a reader, such as its arguments, environment and + /// current working directory. Settings such as the stdout and stderr (but + /// not stdin) pipes will be overridden so that they can be controlled by + /// the reader. + /// + /// If there was a problem spawning the given command, then its error is + /// returned. + /// + /// If the caller requires additional configuration for the reader + /// returned, then use + /// [`CommandReaderBuilder`](struct.CommandReaderBuilder.html). 
+ pub fn new( + cmd: &mut process::Command, + ) -> Result { + CommandReaderBuilder::new().build(cmd) + } +} + +impl io::Read for CommandReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if self.done { + return Ok(0); + } + let nread = self.stdout.read(buf)?; + if nread == 0 { + self.done = true; + // Reap the child now that we're done reading. If the command + // failed, report stderr as an error. + if !self.child.wait()?.success() { + return Err(io::Error::from(self.stderr.read_to_end())); + } + } + Ok(nread) + } +} + +/// A reader that encapsulates the asynchronous or synchronous reading of +/// stderr. +#[derive(Debug)] +enum StderrReader { + Async(Option>), + Sync(process::ChildStderr), +} + +impl StderrReader { + /// Create a reader for stderr that reads contents asynchronously. + fn async(mut stderr: process::ChildStderr) -> StderrReader { + let handle = thread::spawn(move || { + stderr_to_command_error(&mut stderr) + }); + StderrReader::Async(Some(handle)) + } + + /// Create a reader for stderr that reads contents synchronously. + fn sync(stderr: process::ChildStderr) -> StderrReader { + StderrReader::Sync(stderr) + } + + /// Consumes all of stderr on to the heap and returns it as an error. + /// + /// If there was a problem reading stderr itself, then this returns an I/O + /// command error. + fn read_to_end(&mut self) -> CommandError { + match *self { + StderrReader::Async(ref mut handle) => { + let handle = handle + .take() + .expect("read_to_end cannot be called more than once"); + handle + .join() + .expect("stderr reading thread does not panic") + } + StderrReader::Sync(ref mut stderr) => { + stderr_to_command_error(stderr) + } + } + } +} + +fn stderr_to_command_error(stderr: &mut process::Chil