diff options
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | Cargo.lock | 19 | ||||
-rw-r--r-- | Cargo.toml | 7 | ||||
-rw-r--r-- | grep-cli/Cargo.toml | 25 | ||||
-rw-r--r-- | grep-cli/LICENSE-MIT | 21 | ||||
-rw-r--r-- | grep-cli/README.md | 38 | ||||
-rw-r--r-- | grep-cli/UNLICENSE | 24 | ||||
-rw-r--r-- | grep-cli/src/decompress.rs | 381 | ||||
-rw-r--r-- | grep-cli/src/escape.rs | 315 | ||||
-rw-r--r-- | grep-cli/src/human.rs | 171 | ||||
-rw-r--r-- | grep-cli/src/lib.rs | 251 | ||||
-rw-r--r-- | grep-cli/src/pattern.rs | 205 | ||||
-rw-r--r-- | grep-cli/src/process.rs | 267 | ||||
-rw-r--r-- | grep-cli/src/wtr.rs | 133 | ||||
-rw-r--r-- | grep-printer/src/color.rs | 28 | ||||
-rw-r--r-- | grep-printer/src/lib.rs | 2 | ||||
-rw-r--r-- | grep/Cargo.toml | 1 | ||||
-rw-r--r-- | grep/examples/simplegrep.rs | 34 | ||||
-rw-r--r-- | grep/src/lib.rs | 1 | ||||
-rw-r--r-- | src/app.rs | 8 | ||||
-rw-r--r-- | src/args.rs | 149 | ||||
-rw-r--r-- | src/decompressor.rs | 190 | ||||
-rw-r--r-- | src/main.rs | 10 | ||||
-rw-r--r-- | src/preprocessor.rs | 93 | ||||
-rw-r--r-- | src/search.rs | 73 | ||||
-rw-r--r-- | src/unescape.rs | 137 |
26 files changed, 1987 insertions, 598 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index c0fb04b9..73406762 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,8 @@ Bug fixes: Context from the `--passthru` flag should not impact process exit status. * [BUG #984](https://github.com/BurntSushi/ripgrep/issues/984): Fixes bug in `ignore` crate where first path was always treated as a symlink. +* [BUG #990](https://github.com/BurntSushi/ripgrep/issues/990): + Read stderr asynchronously when running a process. * [BUG #1013](https://github.com/BurntSushi/ripgrep/issues/1013): Add compile time and runtime CPU features to `--version` output. * [BUG #1028](https://github.com/BurntSushi/ripgrep/pull/1028): @@ -168,6 +168,7 @@ name = "grep" version = "0.2.0" dependencies = [ "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "grep-cli 0.1.0", "grep-matcher 0.1.0", "grep-pcre2 0.1.0", "grep-printer 0.1.0", @@ -178,6 +179,20 @@ dependencies = [ ] [[package]] +name = "grep-cli" +version = "0.1.0" +dependencies = [ + "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "globset 0.4.1", + "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", + "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "grep-matcher" version = "0.1.0" dependencies = [ @@ -464,21 +479,17 @@ dependencies = [ name = "ripgrep" version = "0.9.0" dependencies = [ - "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)", - "globset 0.4.1", "grep 0.2.0", "ignore 0.4.3", "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.4.4 
(registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)", - "same-file 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.75 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)", "termcolor 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", - "winapi-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -35,6 +35,7 @@ path = "tests/tests.rs" members = [ "globset", "grep", + "grep-cli", "grep-matcher", "grep-pcre2", "grep-printer", @@ -44,15 +45,12 @@ members = [ ] [dependencies] -atty = "0.2.11" -globset = { version = "0.4.0", path = "globset" } grep = { version = "0.2.0", path = "grep" } ignore = { version = "0.4.0", path = "ignore" } lazy_static = "1" log = "0.4" num_cpus = "1" regex = "1" -same-file = "1" serde_json = "1" termcolor = "1" @@ -61,9 +59,6 @@ version = "2.32.0" default-features = false features = ["suggestions"] -[target.'cfg(windows)'.dependencies.winapi-util] -version = "0.1.1" - [build-dependencies] lazy_static = "1" diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml new file mode 100644 index 00000000..1d5fda22 --- /dev/null +++ b/grep-cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "grep-cli" +version = "0.1.0" #:version +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = """ +Utilities for search oriented command line applications. 
+""" +documentation = "https://docs.rs/grep-cli" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "cli", "utility", "util"] +license = "Unlicense/MIT" + +[dependencies] +atty = "0.2.11" +globset = { version = "0.4.1", path = "../globset" } +lazy_static = "1.1" +log = "0.4" +regex = "1" +same-file = "1" +termcolor = "1" + +[target.'cfg(windows)'.dependencies.winapi-util] +version = "0.1.1" diff --git a/grep-cli/LICENSE-MIT b/grep-cli/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-cli/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep-cli/README.md b/grep-cli/README.md new file mode 100644 index 00000000..e78317c6 --- /dev/null +++ b/grep-cli/README.md @@ -0,0 +1,38 @@ +grep-cli +-------- +A utility library that provides common routines desired in search oriented +command line applications. This includes, but is not limited to, parsing hex +escapes, detecting whether stdin is readable and more. To the extent possible, +this crate strives for compatibility across Windows, macOS and Linux. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-cli.svg)](https://crates.io/crates/grep-cli) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + + +### Documentation + +[https://docs.rs/grep-cli](https://docs.rs/grep-cli) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-cli = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_cli; +``` diff --git a/grep-cli/UNLICENSE b/grep-cli/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-cli/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. 
We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-cli/src/decompress.rs b/grep-cli/src/decompress.rs new file mode 100644 index 00000000..ad108ea0 --- /dev/null +++ b/grep-cli/src/decompress.rs @@ -0,0 +1,381 @@ +use std::ffi::{OsStr, OsString}; +use std::fs::File; +use std::io; +use std::path::Path; +use std::process::Command; + +use globset::{Glob, GlobSet, GlobSetBuilder}; + +use process::{CommandError, CommandReader, CommandReaderBuilder}; + +/// A builder for a matcher that determines which files get decompressed. +#[derive(Clone, Debug)] +pub struct DecompressionMatcherBuilder { + /// The commands for each matching glob. + commands: Vec<DecompressionCommand>, + /// Whether to include the default matching rules. + defaults: bool, +} + +/// A representation of a single command for decompressing data +/// out-of-process. +#[derive(Clone, Debug)] +struct DecompressionCommand { + /// The glob that matches this command. + glob: String, + /// The command or binary name. + bin: OsString, + /// The arguments to invoke with the command. 
+ args: Vec<OsString>, +} + +impl Default for DecompressionMatcherBuilder { + fn default() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder::new() + } +} + +impl DecompressionMatcherBuilder { + /// Create a new builder for configuring a decompression matcher. + pub fn new() -> DecompressionMatcherBuilder { + DecompressionMatcherBuilder { + commands: vec![], + defaults: true, + } + } + + /// Build a matcher for determining how to decompress files. + /// + /// If there was a problem compiling the matcher, then an error is + /// returned. + pub fn build(&self) -> Result<DecompressionMatcher, CommandError> { + let defaults = + if !self.defaults { + vec![] + } else { + default_decompression_commands() + }; + let mut glob_builder = GlobSetBuilder::new(); + let mut commands = vec![]; + for decomp_cmd in defaults.iter().chain(&self.commands) { + let glob = Glob::new(&decomp_cmd.glob).map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + glob_builder.add(glob); + commands.push(decomp_cmd.clone()); + } + let globs = glob_builder.build().map_err(|err| { + CommandError::io(io::Error::new(io::ErrorKind::Other, err)) + })?; + Ok(DecompressionMatcher { globs, commands }) + } + + /// When enabled, the default matching rules will be compiled into this + /// matcher before any other associations. When disabled, only the + /// rules explicitly given to this builder will be used. + /// + /// This is enabled by default. + pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder { + self.defaults = yes; + self + } + + /// Associates a glob with a command to decompress files matching the glob. + /// + /// If multiple globs match the same file, then the most recently added + /// glob takes precedence. + /// + /// The syntax for the glob is documented in the + /// [`globset` crate](https://docs.rs/globset/#syntax). 
+ pub fn associate<P, I, A>( + &mut self, + glob: &str, + program: P, + args: I, + ) -> &mut DecompressionMatcherBuilder + where P: AsRef<OsStr>, + I: IntoIterator<Item=A>, + A: AsRef<OsStr>, + { + + let glob = glob.to_string(); + let bin = program.as_ref().to_os_string(); + let args = args + .into_iter() + .map(|a| a.as_ref().to_os_string()) + .collect(); + self.commands.push(DecompressionCommand { glob, bin, args }); + self + } +} + +/// A matcher for determining how to decompress files. +#[derive(Clone, Debug)] +pub struct DecompressionMatcher { + /// The set of globs to match. Each glob has a corresponding entry in + /// `commands`. When a glob matches, the corresponding command should be + /// used to perform out-of-process decompression. + globs: GlobSet, + /// The commands for each matching glob. + commands: Vec<DecompressionCommand>, +} + +impl Default for DecompressionMatcher { + fn default() -> DecompressionMatcher { + DecompressionMatcher::new() + } +} + +impl DecompressionMatcher { + /// Create a new matcher with default rules. + /// + /// To add more matching rules, build a matcher with + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + pub fn new() -> DecompressionMatcher { + DecompressionMatcherBuilder::new() + .build() + .expect("built-in matching rules should always compile") + } + + /// Return a pre-built command based on the given file path that can + /// decompress its contents. If no such decompressor is known, then this + /// returns `None`. + /// + /// If there are multiple possible commands matching the given path, then + /// the command added last takes precedence. 
+ pub fn command<P: AsRef<Path>>(&self, path: P) -> Option<Command> { + for i in self.globs.matches(path).into_iter().rev() { + let decomp_cmd = &self.commands[i]; + let mut cmd = Command::new(&decomp_cmd.bin); + cmd.args(&decomp_cmd.args); + return Some(cmd); + } + None + } + + /// Returns true if and only if the given file path has at least one + /// matching command to perform decompression on. + pub fn has_command<P: AsRef<Path>>(&self, path: P) -> bool { + self.globs.is_match(path) + } +} + +/// Configures and builds a streaming reader for decompressing data. +#[derive(Clone, Debug, Default)] +pub struct DecompressionReaderBuilder { + matcher: DecompressionMatcher, + command_builder: CommandReaderBuilder, +} + +impl DecompressionReaderBuilder { + /// Create a new builder with the default configuration. + pub fn new() -> DecompressionReaderBuilder { + DecompressionReaderBuilder::default() + } + + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is logged at the debug level and a + /// passthru reader is returned that does no decompression. This behavior + /// typically occurs when the given file path matches a decompression + /// command, but is executing in an environment where the decompression + /// command is not available. + /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. 
+ pub fn build<P: AsRef<Path>>( + &self, + path: P, + ) -> Result<DecompressionReader, CommandError> { + let path = path.as_ref(); + let mut cmd = match self.matcher.command(path) { + None => return DecompressionReader::new_passthru(path), + Some(cmd) => cmd, + }; + cmd.arg(path); + + match self.command_builder.build(&mut cmd) { + Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }), + Err(err) => { + debug!( + "{}: error spawning command '{:?}': {} \ + (falling back to uncompressed reader)", + path.display(), + cmd, + err, + ); + DecompressionReader::new_passthru(path) + } + } + } + + /// Set the matcher to use to look up the decompression command for each + /// file path. + /// + /// A set of sensible rules is enabled by default. Setting this will + /// completely replace the current rules. + pub fn matcher( + &mut self, + matcher: DecompressionMatcher, + ) -> &mut DecompressionReaderBuilder { + self.matcher = matcher; + self + } + + /// Get the underlying matcher currently used by this builder. + pub fn get_matcher(&self) -> &DecompressionMatcher { + &self.matcher + } + + /// When enabled, the reader will asynchronously read the contents of the + /// command's stderr output. When disabled, stderr is only read after the + /// stdout stream has been exhausted (or if the process quits with an error + /// code). + /// + /// Note that when enabled, this may require launching an additional + /// thread in order to read stderr. This is done so that the process being + /// executed is never blocked from writing to stdout or stderr. If this is + /// disabled, then it is possible for the process to fill up the stderr + /// buffer and deadlock. + /// + /// This is enabled by default. + pub fn async_stderr( + &mut self, + yes: bool, + ) -> &mut DecompressionReaderBuilder { + self.command_builder.async_stderr(yes); + self + } +} + +/// A streaming reader for decompressing the contents of a file. 
+/// +/// The purpose of this reader is to provide a seamless way to decompress the +/// contents of a file using existing tools in the current environment. This is +/// meant to be an alternative to using decompression libraries in favor of the +/// simplicity and portability of using external commands such as `gzip` and +/// `xz`. This does impose the overhead of spawning a process, so other means +/// for performing decompression should be sought if this overhead isn't +/// acceptable. +/// +/// A decompression reader comes with a default set of matching rules that are +/// meant to associate file paths with the corresponding command to use to +/// decompress them. For example, a glob like `*.gz` matches gzip compressed +/// files with the command `gzip -d -c`. If a file path does not match any +/// existing rules, or if it matches a rule whose command does not exist in the +/// current environment, then the decompression reader passes through the +/// contents of the underlying file without doing any decompression. +/// +/// The default matching rules are probably good enough for most cases, and if +/// they require revision, pull requests are welcome. In cases where they must +/// be changed or extended, they can be customized through the use of +/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html) +/// and +/// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html). +/// +/// By default, this reader will asynchronously read the process's stderr. +/// This prevents subtle deadlocking bugs for noisy processes that write a lot +/// to stderr. Currently, the entire contents of stderr is read onto the heap. +/// +/// # Example +/// +/// This example shows how to read the decompressed contents of a file without +/// needing to explicitly choose the decompression command to run. 
+/// +/// Note that if you need to decompress multiple files, it is better to use +/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the +/// matcher. +/// +/// ```no_run +/// use std::io::Read; +/// use std::process::Command; +/// use grep_cli::DecompressionReader; +/// +/// # fn example() -> Result<(), Box<::std::error::Error>> { +/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?; +/// let mut contents = vec![]; +/// rdr.read_to_end(&mut contents)?; +/// # Ok(()) } +/// ``` +#[derive(Debug)] +pub struct DecompressionReader { + rdr: Result<CommandReader, File>, +} + +impl DecompressionReader { + /// Build a new streaming reader for decompressing data. + /// + /// If decompression is done out-of-process and if there was a problem + /// spawning the process, then its error is returned. + /// + /// If the given file path could not be matched with a decompression + /// strategy, then a passthru reader is returned that does no + /// decompression. + /// + /// This uses the default matching rules for determining how to decompress + /// the given file. To change those matching rules, use + /// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html) + /// and + /// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html). + /// + /// When creating readers for many paths, it is better to use the builder + /// since it will amortize the cost of constructing the matcher. + pub fn new<P: AsRef<Path>>( + path: P, + ) -> Result<DecompressionReader, CommandError> { + DecompressionReaderBuilder::new().build(path) + } + + /// Creates a new "passthru" decompression reader that reads from the file + /// corresponding to the given path without doing decompression and without + /// executing another process. 
+ fn new_passthru(path: &Path) -> Result<DecompressionReader, CommandError> { + let file = File::open(path)?; + Ok(DecompressionReader { rdr: Err(file) }) + } +} + +impl io::Read for DecompressionReader { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + match self.rdr { + Ok(ref mut rdr) => rdr.read(buf), + Err(ref mut rdr) => rdr.read(buf), + } + } +} + +fn default_decompression_commands() -> Vec<DecompressionCommand> { + const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"]; + const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"]; + const ARGS_XZ: &[&str] = &["xz", "-d", "-c"]; + const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"]; + const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"]; + + fn cmd(glob: &str, args: &[&str]) -> DecompressionCommand { + DecompressionCommand { + glob: glob.to_string(), + bin: OsStr::new(&args[0]).to_os_string(), + args: args + .iter() + .skip(1) + .map(|s| OsStr::new(s).to_os_string()) + .collect(), + } + } + vec![ + cmd("*.gz", ARGS_GZIP), + cmd("*.tgz", ARGS_GZIP), + + cmd("*.bz2", ARGS_BZIP), + cmd("*.tbz2", ARGS_BZIP), + + cmd("*.xz", ARGS_XZ), + cmd("*.txz", ARGS_XZ), + + cmd("*.lz4", ARGS_LZ4), + + cmd("*.lzma", ARGS_LZMA), + ] +} diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs new file mode 100644 index 00000000..9b350a93 --- /dev/null +++ b/grep-cli/src/escape.rs @@ -0,0 +1,315 @@ +use std::ffi::OsStr; +use std::str; + +/// A single state in the state machine used by `unescape`. +#[derive(Clone, Copy, Eq, PartialEq)] +enum State { + /// The state after seeing a `\`. + Escape, + /// The state after seeing a `\x`. + HexFirst, + /// The state after seeing a `\x[0-9A-Fa-f]`. + HexSecond(char), + /// Default state. + Literal, +} + +/// Escapes arbitrary bytes into a human readable string. +/// +/// This converts `\t`, `\r` and `\n` into their escaped forms. It also +/// converts the non-printable subset of ASCII in addition to invalid UTF-8 +/// bytes to hexadecimal escape sequences. 
Everything else is left as is. +/// +/// The dual of this routine is [`unescape`](fn.unescape.html). +/// +/// # Example +/// +/// This example shows how to convert a byte string that contains a `\n` and +/// invalid UTF-8 bytes into a `String`. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. +/// +/// ``` +/// use grep_cli::escape; +/// +/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz")); +/// ``` +pub fn escape(mut bytes: &[u8]) -> String { + let mut escaped = String::new(); + while let Some(result) = decode_utf8(bytes) { + match result { + Ok(cp) => { + escape_char(cp, &mut escaped); + bytes = &bytes[cp.len_utf8()..]; + } + Err(byte) => { + escape_byte(byte, &mut escaped); + bytes = &bytes[1..]; + } + } + } + escaped +} + +/// Escapes an OS string into a human readable string. +/// +/// This is like [`escape`](fn.escape.html), but accepts an OS string. +pub fn escape_os(string: &OsStr) -> String { + #[cfg(unix)] + fn imp(string: &OsStr) -> String { + use std::os::unix::ffi::OsStrExt; + + escape(string.as_bytes()) + } + + #[cfg(not(unix))] + fn imp(string: &OsStr) -> String { + escape(string.to_string_lossy().as_bytes()) + } + + imp(string) +} + +/// Unescapes a string. +/// +/// It supports a limited set of escape sequences: +/// +/// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes. +/// * `\xZZ` hexadecimal escapes are mapped to their byte. +/// +/// Everything else is left as is, including non-hexadecimal escapes like +/// `\xGG`. +/// +/// This is useful when it is desirable for a command line argument to be +/// capable of specifying arbitrary bytes or otherwise make it easier to +/// specify non-printable characters. +/// +/// The dual of this routine is [`escape`](fn.escape.html). +/// +/// # Example +/// +/// This example shows how to convert an escaped string (which is valid UTF-8) +/// into a corresponding sequence of bytes. 
Each escape sequence is mapped to +/// its bytes, which may include invalid UTF-8. +/// +/// Pay special attention to the use of raw strings. That is, `r"\n"` is +/// equivalent to `"\\n"`. +/// +/// ``` +/// use grep_cli::unescape; +/// +/// assert_eq!(&b"foo\nbar\xFFbaz"[..], &*unescape(r"foo\nbar\xFFbaz")); +/// ``` +pub fn unescape(s: &str) -> Vec<u8> { + use self::State::*; + + let mut bytes = vec![]; + let mut state = Literal; + for c in s.chars() { + match state { + Escape => { + match c { + '\\' => { bytes.push(b'\\'); state = Literal; } + 'n' => { bytes.push(b'\n'); state = Literal; } + 'r' => { bytes.push(b'\r'); state = Literal; } + 't' => { bytes.push(b'\t'); state = Literal; } + 'x' => { state = HexFirst; } + c => { + bytes.extend(format!(r"\{}", c).into_bytes()); + state = Literal; + } + } + } + HexFirst => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + state = HexSecond(c); + } + c => { + bytes.extend(format!(r"\x{}", c).into_bytes()); + state = Literal; + } + } + } + HexSecond(first) => { + match c { + '0'...'9' | 'A'...'F' | 'a'...'f' => { + let ordinal = format!("{}{}", first, c); + let byte = u8::from_str_radix(&ordinal, 16).unwrap(); + bytes.push(byte); + state = Literal; + } + c => { + let original = format!(r"\x{}{}", first, c); + bytes.extend(original.into_bytes()); + state = Literal; + } + } + } + Literal => { + match c { + '\\' => { state = Escape; } + c => { bytes.extend(c.to_string().as_bytes()); } + } + } + } + } + match state { + Escape => bytes.push(b'\\'), + HexFirst => bytes.extend(b"\\x"), + HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()), + Literal => {} + } + bytes +} + +/// Unescapes an OS string. +/// +/// This is like [`unescape`](fn.unescape.html), but accepts an OS string. +/// +/// Note that this first lossily decodes the given OS string as UTF-8. That +/// is, an escaped string (the thing given) should be valid UTF-8. 
+pub fn unescape_os(string: &OsStr) -> Vec<u8> { + unescape(&string.to_string_lossy()) +} + +/// Adds the given codepoint to the given string, escaping it if necessary. +fn escape_char(cp: char, into: &mut String) { + if cp.is_ascii() { + escape_byte(cp as u8, into); + } else { + into.push(cp); + } +} + +/// Adds the given byte to the given string, escaping it if necessary. +fn escape_byte(byte: u8, into: &mut String) { + match byte { + 0x21...0x5B | 0x5D...0x7D => into.push(byte as char), + b'\n' => into.push_str(r"\n"), + b'\r' => into.push_str(r"\r"), + b'\t' => into.push_str(r"\t"), + b'\\' => into.push_str(r"\\"), + _ => into.push_str(&format!(r"\x{:02X}", byte)), + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { + if bytes.is_empty() { |