summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Andrew Gallant <jamslam@gmail.com> 2018-07-21 20:36:32 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2018-07-21 20:36:32 -0400
commit: 209a125ea25615e8fa605c844f8cbcca8c672208 (patch)
tree: 640f6abf39ad6cae599524bb529aa7fedd03c6c6
parent: 090216cf002a579c987df76125fcdcb97478d48e (diff)
ripgrep: replace decoder with encoding_rs_io
This commit mostly moves the transcoder implementation to its own crate: https://github.com/BurntSushi/encoding_rs_io

The new crate adds clear documentation and cleans up the implementation to fully implement the contract of io::Read.
-rw-r--r--  Cargo.lock      10
-rw-r--r--  Cargo.toml       1
-rw-r--r--  src/decoder.rs 456
-rw-r--r--  src/main.rs      2
-rw-r--r--  src/worker.rs    9
5 files changed, 18 insertions, 460 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 7aa50c3f..2b47012b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -70,6 +70,14 @@ dependencies = [
]
[[package]]
+name = "encoding_rs_io"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "fnv"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -234,6 +242,7 @@ dependencies = [
"bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"globset 0.4.0",
"grep 0.1.8",
"ignore 0.4.2",
@@ -385,6 +394,7 @@ dependencies = [
"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e"
"checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
"checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac"
+"checksum encoding_rs_io 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7bcd05bae9dfcb6d689427192bdf740d92daf53ff8e4d11ae46aad626353e48a"
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
diff --git a/Cargo.toml b/Cargo.toml
index c9b02d4c..8686d515 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,7 @@ members = ["grep", "globset", "ignore"]
atty = "0.2.10"
bytecount = "0.3.1"
encoding_rs = "0.8"
+encoding_rs_io = "0.1"
globset = { version = "0.4.0", path = "globset" }
grep = { version = "0.1.8", path = "grep" }
ignore = { version = "0.4.0", path = "ignore" }
diff --git a/src/decoder.rs b/src/decoder.rs
deleted file mode 100644
index 0842fb5c..00000000
--- a/src/decoder.rs
+++ /dev/null
@@ -1,456 +0,0 @@
-use std::cmp;
-use std::io::{self, Read};
-
-use encoding_rs::{Decoder, Encoding, UTF_8};
-
-/// A BOM is at least 2 bytes and at most 3 bytes.
-///
-/// If fewer than 2 bytes are available to be read at the beginning of a
-/// reader, then a BOM is `None`.
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-struct Bom {
- bytes: [u8; 3],
- len: usize,
-}
-
-impl Bom {
- fn as_slice(&self) -> &[u8] {
- &self.bytes[0..self.len]
- }
-
- fn decoder(&self) -> Option<Decoder> {
- let bom = self.as_slice();
- if bom.len() < 3 {
- return None;
- }
- if let Some((enc, _)) = Encoding::for_bom(bom) {
- if enc != UTF_8 {
- return Some(enc.new_decoder_with_bom_removal());
- }
- }
- None
- }
-}
-
-/// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
-/// providing a peek at the BOM if one exists. Peeking at the BOM does not
-/// advance the reader.
-struct BomPeeker<R> {
- rdr: R,
- bom: Option<Bom>,
- nread: usize,
-}
-
-impl<R: io::Read> BomPeeker<R> {
- /// Create a new BomPeeker.
- ///
- /// The first three bytes can be read using the `peek_bom` method, but
- /// will not advance the reader.
- fn new(rdr: R) -> BomPeeker<R> {
- BomPeeker { rdr: rdr, bom: None, nread: 0 }
- }
-
- /// Peek at the first three bytes of the underlying reader.
- ///
- /// This does not advance the reader provided by `BomPeeker`.
- ///
- /// If the underlying reader does not have at least two bytes available,
- /// then `None` is returned.
- fn peek_bom(&mut self) -> io::Result<Bom> {
- if let Some(bom) = self.bom {
- return Ok(bom);
- }
- self.bom = Some(Bom { bytes: [0; 3], len: 0 });
- let mut buf = [0u8; 3];
- let bom_len = read_full(&mut self.rdr, &mut buf)?;
- self.bom = Some(Bom { bytes: buf, len: bom_len });
- Ok(self.bom.unwrap())
- }
-}
-
-impl<R: io::Read> io::Read for BomPeeker<R> {
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
- if self.nread < 3 {
- let bom = self.peek_bom()?;
- let bom = bom.as_slice();
- if self.nread < bom.len() {
- let rest = &bom[self.nread..];
- let len = cmp::min(buf.len(), rest.len());
- buf[..len].copy_from_slice(&rest[..len]);
- self.nread += len;
- return Ok(len);
- }
- }
- let nread = self.rdr.read(buf)?;
- self.nread += nread;
- Ok(nread)
- }
-}
-
-/// Like `io::Read::read_exact`, except it never returns `UnexpectedEof` and
-/// instead returns the number of bytes read if EOF is seen before filling
-/// `buf`.
-fn read_full<R: io::Read>(
- mut rdr: R,
- mut buf: &mut [u8],
-) -> io::Result<usize> {
- let mut nread = 0;
- while !buf.is_empty() {
- match rdr.read(buf) {
- Ok(0) => break,
- Ok(n) => {
- nread += n;
- let tmp = buf;
- buf = &mut tmp[n..];
- }
- Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
- Err(e) => return Err(e),
- }
- }
- Ok(nread)
-}
-
-/// A reader that transcodes to UTF-8. The source encoding is determined by
-/// inspecting the BOM from the stream read from `R`, if one exists. If a
-/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
-/// invalid UTF-16 sequences translated to the Unicode replacement character.
-/// In all other cases, the underlying reader is passed through unchanged.
-///
-/// `R` is the type of the underlying reader and `B` is the type of an internal
-/// buffer used to store the results of transcoding.
-///
-/// Note that not all methods on `io::Read` work with this implementation.
-/// For example, the `bytes` adapter method attempts to read a single byte at
-/// a time, but this implementation requires a buffer of size at least `4`. If
-/// a buffer of size less than 4 is given, then an error is returned.
-pub struct DecodeReader<R, B> {
- /// The underlying reader, wrapped in a peeker for reading a BOM if one
- /// exists.
- rdr: BomPeeker<R>,
- /// The internal buffer to store transcoded bytes before they are read by
- /// callers.
- buf: B,
- /// The current position in `buf`. Subsequent reads start here.
- pos: usize,
- /// The number of transcoded bytes in `buf`. Subsequent reads end here.
- buflen: usize,
- /// Whether this is the first read or not (in which we inspect the BOM).
- first: bool,
- /// Whether a "last" read has occurred. After this point, EOF will always
- /// be returned.
- last: bool,
- /// The underlying text decoder derived from the BOM, if one exists.
- decoder: Option<Decoder>,
-}
-
-impl<R: io::Read, B: AsMut<[u8]>> DecodeReader<R, B> {
- /// Create a new transcoder that converts a source stream to valid UTF-8.
- ///
- /// If an encoding is specified, then it is used to transcode `rdr` to
- /// UTF-8. Otherwise, if no encoding is specified, and if a UTF-16 BOM is
- /// found, then the corresponding UTF-16 encoding is used to transcode
- /// `rdr` to UTF-8. In all other cases, `rdr` is assumed to be at least
- /// ASCII-compatible and passed through untouched.
- ///
- /// Errors in the encoding of `rdr` are handled with the Unicode
- /// replacement character. If no encoding of `rdr` is specified, then
- /// errors are not handled.
- pub fn new(
- rdr: R,
- buf: B,
- enc: Option<&'static Encoding>,
- ) -> DecodeReader<R, B> {
- DecodeReader {
- rdr: BomPeeker::new(rdr),
- buf: buf,
- buflen: 0,
- pos: 0,
- first: enc.is_none(),
- last: false,
- decoder: enc.map(|enc| enc.new_decoder_with_bom_removal()),
- }
- }
-
- /// Fill the internal buffer from the underlying reader.
- ///
- /// If there are unread bytes in the internal buffer, then we move them
- /// to the beginning of the internal buffer and fill the remainder.
- ///
- /// If the internal buffer is too small to read additional bytes, then an
- /// error is returned.
- #[inline(always)] // massive perf benefit (???)
- fn fill(&mut self) -> io::Result<()> {
- if self.pos < self.buflen {
- if self.buflen >= self.buf.as_mut().len() {
- return Err(io::Error::new(
- io::ErrorKind::Other,
- "DecodeReader: internal buffer exhausted"));
- }
- let newlen = self.buflen - self.pos;
- let mut tmp = Vec::with_capacity(newlen);
- tmp.extend_from_slice(&self.buf.as_mut()[self.pos..self.buflen]);
- self.buf.as_mut()[..newlen].copy_from_slice(&tmp);
- self.buflen = newlen;
- } else {
- self.buflen = 0;
- }
- self.pos = 0;
- self.buflen +=
- self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
- Ok(())
- }
-
- /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there
- /// is a decoder capable of transcoding the inner stream to UTF-8. This
- /// returns the number of bytes written to `buf`.
- ///
- /// When this function returns, exactly one of the following things will
- /// be true:
- ///
- /// 1. A non-zero number of bytes were written to `buf`.
- /// 2. The underlying reader reached EOF.
- /// 3. An error is returned: the internal buffer ran out of room.
- /// 4. An I/O error occurred.
- ///
- /// Note that `buf` must have at least 4 bytes of space.
- fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
- assert!(buf.len() >= 4);
- if self.last {
- return Ok(0);
- }
- if self.pos >= self.buflen {
- self.fill()?;
- }
- let mut nwrite = 0;
- loop {
- let (_, nin, nout, _) =
- self.decoder.as_mut().unwrap().decode_to_utf8(
- &self.buf.as_mut()[self.pos..self.buflen], buf, false);
- self.pos += nin;
- nwrite += nout;
- // If we've written at least one byte to the caller-provided
- // buffer, then our mission is complete.
- if nwrite > 0 {
- break;
- }
- // Otherwise, we know that our internal buffer has insufficient
- // data to transcode at least one char, so we attempt to refill it.
- self.fill()?;
- // Quit on EOF.
- if self.buflen == 0 {
- self.pos = 0;
- self.last = true;
- let (_, _, nout, _) =
- self.decoder.as_mut().unwrap().decode_to_utf8(
- &[], buf, true);
- return Ok(nout);
- }
- }
- Ok(nwrite)
- }
-
- #[inline(never)] // impacts perf...
- fn detect(&mut self) -> io::Result<()> {
- let bom = self.rdr.peek_bom()?;
- self.decoder = bom.decoder();
- Ok(())
- }
-}
-
-impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReader<R, B> {
- fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
- if self.first {
- self.first = false;
- self.detect()?;
- }
- if self.decoder.is_none() {
- return self.rdr.read(buf);
- }
- // When decoding UTF-8, we need at least 4 bytes of space to guarantee
- // that we can decode at least one codepoint. If we don't have it, we
- // can either return `0` for the number of bytes read or return an
- // error. Since `0` would be interpreted as a possibly premature EOF,
- // we opt for an error.
- if buf.len() < 4 {
- return Err(io::Error::new(
- io::ErrorKind::Other,
- "DecodeReader: byte buffer must have length at least 4"));
- }
- self.transcode(buf)
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::io::Read;
-
- use encoding_rs::Encoding;
-
- use super::{Bom, BomPeeker, DecodeReader};
-
- fn read_to_string<R: Read>(mut rdr: R) -> String {
- let mut s = String::new();
- rdr.read_to_string(&mut s).unwrap();
- s
- }
-
- #[test]
- fn peeker_empty() {
- let buf = [];
- let mut peeker = BomPeeker::new(&buf[..]);
- assert_eq!(Bom { bytes: [0; 3], len: 0}, peeker.peek_bom().unwrap());
-
- let mut tmp = [0; 100];
- assert_eq!(0, peeker.read(&mut tmp).unwrap());
- }
-
- #[test]
- fn peeker_one() {
- let buf = [1];
- let mut peeker = BomPeeker::new(&buf[..]);
- assert_eq!(
- Bom { bytes: [1, 0, 0], len: 1},
- peeker.peek_bom().unwrap());
-
- let mut tmp = [0; 100];
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(1, tmp[0]);
- assert_eq!(0, peeker.read(&mut tmp).unwrap());
- }
-
- #[test]
- fn peeker_two() {
- let buf = [1, 2];
- let mut peeker = BomPeeker::new(&buf[..]);
- assert_eq!(
- Bom { bytes: [1, 2, 0], len: 2},
- peeker.peek_bom().unwrap());
-
- let mut tmp = [0; 100];
- assert_eq!(2, peeker.read(&mut tmp).unwrap());
- assert_eq!(1, tmp[0]);
- assert_eq!(2, tmp[1]);
- assert_eq!(0, peeker.read(&mut tmp).unwrap());
- }
-
- #[test]
- fn peeker_three() {
- let buf = [1, 2, 3];
- let mut peeker = BomPeeker::new(&buf[..]);
- assert_eq!(
- Bom { bytes: [1, 2, 3], len: 3},
- peeker.peek_bom().unwrap());
-
- let mut tmp = [0; 100];
- assert_eq!(3, peeker.read(&mut tmp).unwrap());
- assert_eq!(1, tmp[0]);
- assert_eq!(2, tmp[1]);
- assert_eq!(3, tmp[2]);
- assert_eq!(0, peeker.read(&mut tmp).unwrap());
- }
-
- #[test]
- fn peeker_four() {
- let buf = [1, 2, 3, 4];
- let mut peeker = BomPeeker::new(&buf[..]);
- assert_eq!(
- Bom { bytes: [1, 2, 3], len: 3},
- peeker.peek_bom().unwrap());
-
- let mut tmp = [0; 100];
- assert_eq!(3, peeker.read(&mut tmp).unwrap());
- assert_eq!(1, tmp[0]);
- assert_eq!(2, tmp[1]);
- assert_eq!(3, tmp[2]);
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(4, tmp[0]);
- assert_eq!(0, peeker.read(&mut tmp).unwrap());
- }
-
- #[test]
- fn peeker_one_at_a_time() {
- let buf = [1, 2, 3, 4];
- let mut peeker = BomPeeker::new(&buf[..]);
-
- let mut tmp = [0; 1];
- assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
- assert_eq!(0, tmp[0]);
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(1, tmp[0]);
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(2, tmp[0]);
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(3, tmp[0]);
- assert_eq!(1, peeker.read(&mut tmp).unwrap());
- assert_eq!(4, tmp[0]);
- }
-
- // In cases where all we have is a bom, we expect the bytes to be
- // passed through unchanged.
- #[test]
- fn trans_utf16_bom() {
- let srcbuf = vec![0xFF, 0xFE];
- let mut dstbuf = vec![0; 8 * (1<<10)];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- let n = rdr.read(&mut dstbuf).unwrap();
- assert_eq!(&*srcbuf, &dstbuf[..n]);
-
- let srcbuf = vec![0xFE, 0xFF];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- let n = rdr.read(&mut dstbuf).unwrap();
- assert_eq!(&*srcbuf, &dstbuf[..n]);
-
- let srcbuf = vec![0xEF, 0xBB, 0xBF];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- let n = rdr.read(&mut dstbuf).unwrap();
- assert_eq!(&*srcbuf, &dstbuf[..n]);
- }
-
- // Test basic UTF-16 decoding.
- #[test]
- fn trans_utf16_basic() {
- let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- assert_eq!("a", read_to_string(&mut rdr));
-
- let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- assert_eq!("a", read_to_string(&mut rdr));
- }
-
- // Test incomplete UTF-16 decoding. This ensures we see a replacement char
- // if the stream ends with an unpaired code unit.
- #[test]
- fn trans_utf16_incomplete() {
- let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
- let mut rdr = DecodeReader::new(&*srcbuf, vec![0; 8 * (1<<10)], None);
- assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
- }
-
- macro_rules! test_trans_simple {
- ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
- #[test]
- fn $name() {
- let srcbuf = &$srcbytes[..];
- let enc = Encoding::for_label($enc.as_bytes());
- let mut rdr = DecodeReader::new(
- &*srcbuf, vec![0; 8 * (1<<10)], enc);
- assert_eq!($dst, read_to_string(&mut rdr));
- }
- }
- }
-
- // This isn't exhaustive obviously, but it lets us test base level support.
- test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
- test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
- test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
- test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
- test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
- test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
- test_trans_simple!(
- trans_simple_big5_hkscs, "big5-hkscs", b"\xC7\xFA", "Ж");
- test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
- test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
- test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
- test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
-}
diff --git a/src/main.rs b/src/main.rs
index ab0e4118..73bc8c23 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,6 +3,7 @@ extern crate bytecount;
#[macro_use]
extern crate clap;
extern crate encoding_rs;
+extern crate encoding_rs_io;
extern crate globset;
extern crate grep;
extern crate ignore;
@@ -41,7 +42,6 @@ macro_rules! errored {
mod app;
mod args;
mod config;
-mod decoder;
mod decompressor;
mod preprocessor;
mod logger;
diff --git a/src/worker.rs b/src/worker.rs
index 5b7ef0a4..8e840400 100644
--- a/src/worker.rs
+++ b/src/worker.rs
@@ -8,7 +8,8 @@ use ignore::DirEntry;
use memmap::Mmap;
use termcolor::WriteColor;
-use decoder::DecodeReader;
+// use decoder::DecodeReader;
+use encoding_rs_io::DecodeReaderBytesBuilder;
use decompressor::{self, DecompressionReader};
use preprocessor::PreprocessorReader;
use pathutil::strip_prefix;
@@ -319,8 +320,10 @@ impl Worker {
path: &Path,
rdr: R,
) -> Result<u64> {
- let rdr = DecodeReader::new(
- rdr, &mut self.decodebuf, self.opts.encoding);
+ let rdr = DecodeReaderBytesBuilder::new()
+ .encoding(self.opts.encoding)
+ .utf8_passthru(true)
+ .build_with_buffer(rdr, &mut self.decodebuf)?;
let searcher = Searcher::new(
&mut self.inpbuf, printer, &self.grep, path, rdr);
searcher