summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Andrew Gallant <jamslam@gmail.com> 2019-04-04 15:15:48 -0400
committer Andrew Gallant <jamslam@gmail.com> 2019-04-05 23:24:08 -0400
commit d968a27ed5298d99e46ff65b68a7f6c2c641105f (patch)
tree 855c8b454b380887037e669a5dc95eeec3043e2d
parent 9b8f5cbabab547904bdfcb333fca2771d43db561 (diff)
cli: use bstr
This uses bstr in the escaping logic. This lets us remove some platform-specific code, and also lets us remove a hacked UTF-8 decoder on raw bytes.
-rw-r--r-- grep-cli/Cargo.toml 1
-rw-r--r-- grep-cli/src/escape.rs 74
-rw-r--r-- grep-cli/src/lib.rs 1
3 files changed, 13 insertions, 63 deletions
diff --git a/grep-cli/Cargo.toml b/grep-cli/Cargo.toml
index 29e15b28..f143e401 100644
--- a/grep-cli/Cargo.toml
+++ b/grep-cli/Cargo.toml
@@ -14,6 +14,7 @@ license = "Unlicense/MIT"
[dependencies]
atty = "0.2.11"
+bstr = "0.1.2"
globset = { version = "0.4.2", path = "../globset" }
lazy_static = "1.1.0"
log = "0.4.5"
diff --git a/grep-cli/src/escape.rs b/grep-cli/src/escape.rs
index 9b350a93..7ea96788 100644
--- a/grep-cli/src/escape.rs
+++ b/grep-cli/src/escape.rs
@@ -1,6 +1,8 @@
use std::ffi::OsStr;
use std::str;
+use bstr::{BStr, BString};
+
/// A single state in the state machine used by `unescape`.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
@@ -35,18 +37,16 @@ enum State {
///
/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
/// ```
-pub fn escape(mut bytes: &[u8]) -> String {
+pub fn escape(bytes: &[u8]) -> String {
+ let bytes = BStr::new(bytes);
let mut escaped = String::new();
- while let Some(result) = decode_utf8(bytes) {
- match result {
- Ok(cp) => {
- escape_char(cp, &mut escaped);
- bytes = &bytes[cp.len_utf8()..];
- }
- Err(byte) => {
- escape_byte(byte, &mut escaped);
- bytes = &bytes[1..];
+ for (s, e, ch) in bytes.char_indices() {
+ if ch == '\u{FFFD}' {
+ for b in bytes[s..e].bytes() {
+ escape_byte(b, &mut escaped);
}
+ } else {
+ escape_char(ch, &mut escaped);
}
}
escaped
@@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String {
///
/// This is like [`escape`](fn.escape.html), but accepts an OS string.
pub fn escape_os(string: &OsStr) -> String {
- #[cfg(unix)]
- fn imp(string: &OsStr) -> String {
- use std::os::unix::ffi::OsStrExt;
-
- escape(string.as_bytes())
- }
-
- #[cfg(not(unix))]
- fn imp(string: &OsStr) -> String {
- escape(string.to_string_lossy().as_bytes())
- }
-
- imp(string)
+ escape(BString::from_os_str_lossy(string).as_bytes())
}
/// Unescapes a string.
@@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) {
}
}
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the given
-/// byte slice, then the first byte is returned instead.
-///
-/// This returns `None` if and only if `bytes` is empty.
-fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
- if bytes.is_empty() {
- return None;
- }
- let len = match utf8_len(bytes[0]) {
- None => return Some(Err(bytes[0])),
- Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
- Some(len) => len,
- };
- match str::from_utf8(&bytes[..len]) {
- Ok(s) => Some(Ok(s.chars().next().unwrap())),
- Err(_) => Some(Err(bytes[0])),
- }
-}
-
-/// Given a UTF-8 leading byte, this returns the total number of code units
-/// in the following encoded codepoint.
-///
-/// If the given byte is not a valid UTF-8 leading byte, then this returns
-/// `None`.
-fn utf8_len(byte: u8) -> Option<usize> {
- if byte <= 0x7F {
- Some(1)
- } else if byte <= 0b110_11111 {
- Some(2)
- } else if byte <= 0b1110_1111 {
- Some(3)
- } else if byte <= 0b1111_0111 {
- Some(4)
- } else {
- None
- }
-}
-
#[cfg(test)]
mod tests {
use super::{escape, unescape};
diff --git a/grep-cli/src/lib.rs b/grep-cli/src/lib.rs
index b9909c20..9c5d71ad 100644
--- a/grep-cli/src/lib.rs
+++ b/grep-cli/src/lib.rs
@@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem.
#![deny(missing_docs)]
extern crate atty;
+extern crate bstr;
extern crate globset;
#[macro_use]
extern crate lazy_static;