From 85417e52e919ae601b0b7eec87349fadea2da650 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 3 Apr 2019 13:04:52 -0400 Subject: searcher: partially migrate to bstr This commit causes grep-searcher to use byte strings internally for its line buffer support. We manage to remove a use of `unsafe` by doing this (by pushing it down into `bstr`). We stop short of using byte strings everywhere else because we rely heavily on the `impl ops::Index<[u8]> for grep_matcher::Match` impl, which isn't available for byte strings. (It is premature to make bstr a public dep of a core crate like grep-matcher, but maybe some day.) --- grep-searcher/Cargo.toml | 2 +- grep-searcher/src/lib.rs | 2 +- grep-searcher/src/line_buffer.rs | 125 ++++++++++++++----------------------- grep-searcher/src/lines.rs | 14 +++-- grep-searcher/src/searcher/core.rs | 4 +- grep-searcher/src/testutil.rs | 5 +- 6 files changed, 63 insertions(+), 89 deletions(-) diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml index 893e2f5c..f4875d9f 100644 --- a/grep-searcher/Cargo.toml +++ b/grep-searcher/Cargo.toml @@ -13,12 +13,12 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" [dependencies] +bstr = { version = "0.1.2", default-features = false, features = ["std"] } bytecount = "0.5" encoding_rs = "0.8.14" encoding_rs_io = "0.1.4" grep-matcher = { version = "0.1.1", path = "../grep-matcher" } log = "0.4.5" -memchr = "2.1" memmap = "0.7" [dev-dependencies] diff --git a/grep-searcher/src/lib.rs b/grep-searcher/src/lib.rs index f3ec02f2..6a9f4ba7 100644 --- a/grep-searcher/src/lib.rs +++ b/grep-searcher/src/lib.rs @@ -99,13 +99,13 @@ searches stdin. #![deny(missing_docs)] +extern crate bstr; extern crate bytecount; extern crate encoding_rs; extern crate encoding_rs_io; extern crate grep_matcher; #[macro_use] extern crate log; -extern crate memchr; extern crate memmap; #[cfg(test)] extern crate regex; diff --git a/grep-searcher/src/line_buffer.rs b/grep-searcher/src/line_buffer.rs index 0f5a2a7a..c2e54a9e 100644 --- a/grep-searcher/src/line_buffer.rs +++ b/grep-searcher/src/line_buffer.rs @@ -1,8 +1,7 @@ use std::cmp; use std::io; -use std::ptr; -use memchr::{memchr, memrchr}; +use bstr::{BStr, BString}; /// The default buffer capacity that we use for the line buffer. pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB @@ -123,7 +122,7 @@ impl LineBufferBuilder { pub fn build(&self) -> LineBuffer { LineBuffer { config: self.config, - buf: vec![0; self.config.capacity], + buf: BString::from(vec![0; self.config.capacity]), pos: 0, last_lineterm: 0, end: 0, @@ -255,6 +254,12 @@ impl<'b, R: io::Read> LineBufferReader<'b, R> { /// Return the contents of this buffer. pub fn buffer(&self) -> &[u8] { + self.line_buffer.buffer().as_bytes() + } + + /// Return the underlying buffer as a byte string. Used for tests only. + #[cfg(test)] + fn bstr(&self) -> &BStr { self.line_buffer.buffer() } @@ -284,7 +289,7 @@ pub struct LineBuffer { /// The configuration of this buffer. config: Config, /// The primary buffer with which to hold data. - buf: Vec, + buf: BString, /// The current position of this buffer. This is always a valid sliceable /// index into `buf`, and its maximum value is the length of `buf`. pos: usize, @@ -339,13 +344,13 @@ impl LineBuffer { } /// Return the contents of this buffer. - fn buffer(&self) -> &[u8] { + fn buffer(&self) -> &BStr { &self.buf[self.pos..self.last_lineterm] } /// Return the contents of the free space beyond the end of the buffer as /// a mutable slice. - fn free_buffer(&mut self) -> &mut [u8] { + fn free_buffer(&mut self) -> &mut BStr { &mut self.buf[self.end..] } @@ -396,7 +401,7 @@ impl LineBuffer { assert_eq!(self.pos, 0); loop { self.ensure_capacity()?; - let readlen = rdr.read(self.free_buffer())?; + let readlen = rdr.read(self.free_buffer().as_bytes_mut())?; if readlen == 0 { // We're only done reading for good once the caller has // consumed everything. @@ -416,7 +421,7 @@ impl LineBuffer { match self.config.binary { BinaryDetection::None => {} // nothing to do BinaryDetection::Quit(byte) => { - if let Some(i) = memchr(byte, newbytes) { + if let Some(i) = newbytes.find_byte(byte) { self.end = oldend + i; self.last_lineterm = self.end; self.binary_byte_offset = @@ -444,7 +449,7 @@ impl LineBuffer { } // Update our `last_lineterm` positions if we read one. - if let Some(i) = memrchr(self.config.lineterm, newbytes) { + if let Some(i) = newbytes.rfind_byte(self.config.lineterm) { self.last_lineterm = oldend + i + 1; return Ok(true); } @@ -467,40 +472,8 @@ impl LineBuffer { return; } - assert!(self.pos < self.end && self.end <= self.buf.len()); let roll_len = self.end - self.pos; - unsafe { - // SAFETY: A buffer contains Copy data, so there's no problem - // moving it around. Safety also depends on our indices being - // in bounds, which they should always be, and we enforce with - // an assert above. - // - // It seems like it should be possible to do this in safe code that - // results in the same codegen. I tried the obvious: - // - // for (src, dst) in (self.pos..self.end).zip(0..) { - // self.buf[dst] = self.buf[src]; - // } - // - // But the above does not work, and in fact compiles down to a slow - // byte-by-byte loop. I tried a few other minor variations, but - // alas, better minds might prevail. - // - // Overall, this doesn't save us *too* much. It mostly matters when - // the number of bytes we're copying is large, which can happen - // if the searcher is asked to produce a lot of context. We could - // decide this isn't worth it, but it does make an appreciable - // impact at or around the context=30 range on my machine. - // - // We could also use a temporary buffer that compiles down to two - // memcpys and is faster than the byte-at-a-time loop, but it - // complicates our options for limiting memory allocation a bit. - ptr::copy( - self.buf[self.pos..].as_ptr(), - self.buf.as_mut_ptr(), - roll_len, - ); - } + self.buf.copy_within(self.pos.., 0); self.pos = 0; self.last_lineterm = roll_len; self.end = roll_len; @@ -536,14 +509,15 @@ impl LineBuffer { } } -/// Replaces `src` with `replacement` in bytes. -fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option { +/// Replaces `src` with `replacement` in bytes, and return the offset of the +/// first replacement, if one exists. +fn replace_bytes(bytes: &mut BStr, src: u8, replacement: u8) -> Option { if src == replacement { return None; } let mut first_pos = None; let mut pos = 0; - while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) { + while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) { if first_pos.is_none() { first_pos = Some(i); } @@ -560,6 +534,7 @@ fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option { #[cfg(test)] mod tests { use std::str; + use bstr::BString; use super::*; const SHERLOCK: &'static str = "\ @@ -575,18 +550,14 @@ and exhibited clearly, with a label attached.\ slice.to_string() } - fn btos(slice: &[u8]) -> &str { - str::from_utf8(slice).unwrap() - } - fn replace_str( slice: &str, src: u8, replacement: u8, ) -> (String, Option) { - let mut dst = slice.to_string().into_bytes(); + let mut dst = BString::from(slice); let result = replace_bytes(&mut dst, src, replacement); - (String::from_utf8(dst).unwrap(), result) + (dst.into_string().unwrap(), result) } #[test] @@ -607,7 +578,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\n"); + assert_eq!(rdr.bstr(), "homer\nlisa\n"); assert_eq!(rdr.absolute_byte_offset(), 0); rdr.consume(5); assert_eq!(rdr.absolute_byte_offset(), 5); @@ -615,7 +586,7 @@ and exhibited clearly, with a label attached.\ assert_eq!(rdr.absolute_byte_offset(), 11); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "maggie"); + assert_eq!(rdr.bstr(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -630,7 +601,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n"); + assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -645,7 +616,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\n"); + assert_eq!(rdr.bstr(), "\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -660,7 +631,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\n\n"); + assert_eq!(rdr.bstr(), "\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -698,12 +669,12 @@ and exhibited clearly, with a label attached.\ let mut linebuf = LineBufferBuilder::new().capacity(1).build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); - let mut got = vec![]; + let mut got = BString::new(); while rdr.fill().unwrap() { - got.extend(rdr.buffer()); + got.push(rdr.buffer()); rdr.consume_all(); } - assert_eq!(bytes, btos(&got)); + assert_eq!(bytes, got); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } @@ -718,11 +689,11 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\n"); + assert_eq!(rdr.bstr(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "lisa\n"); + assert_eq!(rdr.bstr(), "lisa\n"); rdr.consume_all(); // This returns an error because while we have just enough room to @@ -732,11 +703,11 @@ and exhibited clearly, with a label attached.\ assert!(rdr.fill().is_err()); // We can mush on though! - assert_eq!(btos(rdr.buffer()), "m"); + assert_eq!(rdr.bstr(), "m"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "aggie"); + assert_eq!(rdr.bstr(), "aggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -752,16 +723,16 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\n"); + assert_eq!(rdr.bstr(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "lisa\n"); + assert_eq!(rdr.bstr(), "lisa\n"); rdr.consume_all(); // We have just enough space. assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "maggie"); + assert_eq!(rdr.bstr(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -777,7 +748,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().is_err()); - assert_eq!(btos(rdr.buffer()), ""); + assert_eq!(rdr.bstr(), ""); } #[test] @@ -789,7 +760,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n"); + assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -808,7 +779,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli"); + assert_eq!(rdr.bstr(), "homer\nli"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -825,7 +796,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(!rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), ""); + assert_eq!(rdr.bstr(), ""); assert_eq!(rdr.absolute_byte_offset(), 0); assert_eq!(rdr.binary_byte_offset(), Some(0)); } @@ -841,7 +812,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n"); + assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -860,7 +831,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie"); + assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -878,7 +849,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\ + assert_eq!(rdr.bstr(), "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, s\ "); @@ -901,7 +872,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n"); + assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -920,7 +891,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n"); + assert_eq!(rdr.bstr(), "\nhomer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -939,7 +910,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -958,7 +929,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); diff --git a/grep-searcher/src/lines.rs b/grep-searcher/src/lines.rs index ed225a42..a32f6b72 100644 --- a/grep-searcher/src/lines.rs +++ b/grep-searcher/src/lines.rs @@ -2,8 +2,8 @@ A collection of routines for performing operations on lines. */ +use bstr::B; use bytecount; -use memchr::{memchr, memrchr}; use grep_matcher::{LineTerminator, Match}; /// An iterator over lines in a particular slice of bytes. @@ -85,7 +85,7 @@ impl LineStep { #[inline(always)] fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> { bytes = &bytes[..self.end]; - match memchr(self.line_term, &bytes[self.pos..]) { + match B(&bytes[self.pos..]).find_byte(self.line_term) { None => { if self.pos < bytes.len() { let m = (self.pos, bytes.len()); @@ -135,14 +135,16 @@ pub fn locate( line_term: u8, range: Match, ) -> Match { - let line_start = memrchr(line_term, &bytes[0..range.start()]) + let line_start = B(&bytes[..range.start()]) + .rfind_byte(line_term) .map_or(0, |i| i + 1); let line_end = if range.end() > line_start && bytes[range.end() - 1] == line_term { range.end() } else { - memchr(line_term, &bytes[range.end()..]) - .map_or(bytes.len(), |i| range.end() + i + 1) + B(&bytes[range.end()..]) + .find_byte(line_term) + .map_or(bytes.len(), |i| range.end() + i + 1) }; Match::new(line_start, line_end) } @@ -180,7 +182,7 @@ fn preceding_by_pos( pos -= 1; } loop { - match memrchr(line_term, &bytes[..pos]) { + match B(&bytes[..pos]).rfind_byte(line_term) { None => { return 0; } diff --git a/grep-searcher/src/searcher/core.rs b/grep-searcher/src/searcher/core.rs index c56f643d..ff2cd18d 100644 --- a/grep-searcher/src/searcher/core.rs +++ b/grep-searcher/src/searcher/core.rs @@ -1,6 +1,6 @@ use std::cmp; -use memchr::memchr; +use bstr::B; use grep_matcher::{LineMatchKind, Matcher}; use lines::{self, LineStep}; @@ -149,7 +149,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { BinaryDetection::Quit(b) => b, _ => return false, }; - if let Some(i) = memchr(binary_byte, &buf[*range]) { + if let Some(i) = B(&buf[*range]).find_byte(binary_byte) { self.binary_byte_offset = Some(range.start() + i); true } else { diff --git a/grep-searcher/src/testutil.rs b/grep-searcher/src/testutil.rs index b51508a1..416e4e32 100644 --- a/grep-searcher/src/testutil.rs +++ b/grep-searcher/src/testutil.rs @@ -1,10 +1,10 @@ use std::io::{self, Write}; use std::str; +use bstr::B; use grep_matcher::{ LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError, }; -use memchr::memchr; use regex::bytes::{Regex, RegexBuilder}; use searcher::{BinaryDetection, Searcher, SearcherBuilder}; @@ -94,7 +94,8 @@ impl Matcher for RegexMatcher { } // Make it interesting and return the last byte in the current // line. - let i = memchr(self.line_term.unwrap().as_byte(), haystack) + let i = B(haystack) + .find_byte(self.line_term.unwrap().as_byte()) .map(|i| i) .unwrap_or(haystack.len() - 1); Ok(Some(LineMatchKind::Candidate(i))) -- cgit v1.2.3