summaryrefslogtreecommitdiffstats
path: root/crates/searcher/src/searcher/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'crates/searcher/src/searcher/mod.rs')
-rw-r--r--crates/searcher/src/searcher/mod.rs1012
1 files changed, 1012 insertions, 0 deletions
diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs
new file mode 100644
index 00000000..b64a8586
--- /dev/null
+++ b/crates/searcher/src/searcher/mod.rs
@@ -0,0 +1,1012 @@
+use std::cell::RefCell;
+use std::cmp;
+use std::fmt;
+use std::fs::File;
+use std::io::{self, Read};
+use std::path::Path;
+
+use encoding_rs;
+use encoding_rs_io::DecodeReaderBytesBuilder;
+use grep_matcher::{LineTerminator, Match, Matcher};
+use line_buffer::{
+ self, alloc_error, BufferAllocation, LineBuffer, LineBufferBuilder,
+ LineBufferReader, DEFAULT_BUFFER_CAPACITY,
+};
+use searcher::glue::{MultiLine, ReadByLine, SliceByLine};
+use sink::{Sink, SinkError};
+
+pub use self::mmap::MmapChoice;
+
+mod core;
+mod glue;
+mod mmap;
+
+/// An alias for the matcher's `Match` type. We want the ergonomics of
+/// `Match`, but in practice we use it for arbitrary ranges, so this alias
+/// gives it a more accurate name. It is only used in the searcher's internals.
+type Range = Match;
+
+/// The behavior of binary detection while searching.
+///
+/// Binary detection is the process of _heuristically_ identifying whether a
+/// given chunk of data is binary or not, and then taking an action based on
+/// the result of that heuristic. The motivation behind detecting binary data
+/// is that binary data often indicates data that is undesirable to search
+/// using textual patterns. Of course, there are many cases in which this isn't
+/// true, which is why binary detection is disabled by default.
+///
+/// Values of this type are created with the `none`, `quit` and `convert`
+/// constructors.
+///
+/// Unfortunately, binary detection works differently depending on the type of
+/// search being executed:
+///
+/// 1. When performing a search using a fixed size buffer, binary detection is
+///    applied to the buffer's contents as it is filled. Binary detection must
+///    be applied to the buffer directly because binary files may not contain
+///    line terminators, which could result in exorbitant memory usage.
+/// 2. When performing a search using memory maps or by reading data off the
+///    heap, then binary detection is only guaranteed to be applied to the
+///    parts corresponding to a match. When `Quit` is enabled, then the first
+///    few KB of the data are searched for binary data.
+#[derive(Clone, Debug, Default)]
+pub struct BinaryDetection(line_buffer::BinaryDetection);
+
+impl BinaryDetection {
+    /// Disable binary detection. With this strategy, the data reported by
+    /// the searcher may contain arbitrary bytes.
+    ///
+    /// This is the default.
+    pub fn none() -> BinaryDetection {
+        let strategy = line_buffer::BinaryDetection::None;
+        BinaryDetection(strategy)
+    }
+
+    /// Detect binary data by looking for `binary_byte`, and stop searching
+    /// once it is seen.
+    ///
+    /// When a fixed size buffer is used for searching, the contents of that
+    /// buffer are always scanned for this byte. If the byte is present, the
+    /// underlying data is considered binary and the search stops as if it
+    /// had reached EOF.
+    ///
+    /// When the entire contents are mapped into memory, detection is more
+    /// conservative: only a fixed size region at the beginning of the
+    /// contents is inspected for binary data. As a compromise, any subsequent
+    /// matching (or context) lines are inspected as well. If binary data is
+    /// detected at any point, the search stops as if it had reached EOF.
+    pub fn quit(binary_byte: u8) -> BinaryDetection {
+        let strategy = line_buffer::BinaryDetection::Quit(binary_byte);
+        BinaryDetection(strategy)
+    }
+
+    /// Detect binary data by looking for `binary_byte`, and replace every
+    /// occurrence with the line terminator configured on the searcher.
+    /// (If the searcher uses `CRLF` as its line terminator, then the byte is
+    /// replaced by just `LF`.)
+    ///
+    /// When a fixed size buffer is used for searching, the contents of that
+    /// buffer are always scanned for this byte and it is replaced with the
+    /// line terminator. In effect, the caller is guaranteed to never observe
+    /// this byte while searching.
+    ///
+    /// When the entire contents are mapped into memory, this setting has no
+    /// effect and is ignored.
+    pub fn convert(binary_byte: u8) -> BinaryDetection {
+        let strategy = line_buffer::BinaryDetection::Convert(binary_byte);
+        BinaryDetection(strategy)
+    }
+
+    /// Returns the byte that causes a search to quit when this detection uses
+    /// the "quit" strategy. For every other strategy, `None` is returned.
+    pub fn quit_byte(&self) -> Option<u8> {
+        if let line_buffer::BinaryDetection::Quit(byte) = self.0 {
+            Some(byte)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the byte that is replaced by the line terminator when this
+    /// detection uses the "convert" strategy. For every other strategy,
+    /// `None` is returned.
+    pub fn convert_byte(&self) -> Option<u8> {
+        if let line_buffer::BinaryDetection::Convert(byte) = self.0 {
+            Some(byte)
+        } else {
+            None
+        }
+    }
+}
+
+/// An encoding to use when searching.
+///
+/// An encoding can be used to configure a
+/// [`SearcherBuilder`](struct.SearcherBuilder.html)
+/// to transcode source data from an encoding to UTF-8 before searching.
+///
+/// An `Encoding` will always be cheap to clone.
+#[derive(Clone, Debug)]
+pub struct Encoding(&'static encoding_rs::Encoding);
+
+impl Encoding {
+    /// Look up the encoding that corresponds to the given label.
+    ///
+    /// Labels are mapped to encodings via the set of available choices
+    /// specified in the
+    /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
+    ///
+    /// # Errors
+    ///
+    /// If the label does not correspond to a valid encoding, then a
+    /// `ConfigError::UnknownEncoding` carrying the label's bytes is returned.
+    pub fn new(label: &str) -> Result<Encoding, ConfigError> {
+        let bytes = label.as_bytes();
+        encoding_rs::Encoding::for_label_no_replacement(bytes)
+            .map(Encoding)
+            .ok_or_else(|| ConfigError::UnknownEncoding {
+                label: bytes.to_vec(),
+            })
+    }
+}
+
+/// The internal configuration of a searcher. This is shared among several
+/// search related types. It is written to by the `SearcherBuilder`; in
+/// addition, `Searcher::set_binary_detection` updates the `binary` field.
+#[derive(Clone, Debug)]
+pub struct Config {
+ /// The line terminator to use.
+ line_term: LineTerminator,
+ /// Whether to invert matching.
+ invert_match: bool,
+ /// The number of lines after a match to include.
+ after_context: usize,
+ /// The number of lines before a match to include.
+ before_context: usize,
+ /// Whether to enable unbounded context or not.
+ passthru: bool,
+ /// Whether to count line numbers.
+ line_number: bool,
+ /// The maximum amount of heap memory to use.
+ ///
+ /// When not given, no explicit limit is enforced. When set to `0`, then
+ /// only the memory map search strategy is available.
+ heap_limit: Option<usize>,
+ /// The memory map strategy.
+ mmap: MmapChoice,
+ /// The binary data detection strategy.
+ binary: BinaryDetection,
+ /// Whether to enable matching across multiple lines.
+ multi_line: bool,
+ /// An encoding that, when present, causes the searcher to transcode all
+ /// input from the encoding to UTF-8.
+ encoding: Option<Encoding>,
+ /// Whether to do automatic transcoding based on a BOM or not.
+ bom_sniffing: bool,
+}
+
+impl Default for Config {
+    /// The default configuration: no inversion, no context, no passthru, line
+    /// numbers enabled, no heap limit, single line search, no explicit
+    /// encoding and BOM sniffing enabled. The line terminator, memory map
+    /// strategy and binary detection use their own defaults.
+    fn default() -> Config {
+        let line_term = LineTerminator::default();
+        let mmap = MmapChoice::default();
+        let binary = BinaryDetection::default();
+        Config {
+            line_term,
+            invert_match: false,
+            after_context: 0,
+            before_context: 0,
+            passthru: false,
+            line_number: true,
+            heap_limit: None,
+            mmap,
+            binary,
+            multi_line: false,
+            encoding: None,
+            bom_sniffing: true,
+        }
+    }
+}
+
+impl Config {
+    /// Return the largest number of context lines, before or after a match,
+    /// that this configuration asks for.
+    ///
+    /// If this returns `0`, then no context is ever needed.
+    fn max_context(&self) -> usize {
+        let (before, after) = (self.before_context, self.after_context);
+        if after > before {
+            after
+        } else {
+            before
+        }
+    }
+
+    /// Build a line buffer from this configuration.
+    ///
+    /// The buffer uses this configuration's line terminator and binary
+    /// detection strategy. When a heap limit is set, the initial capacity is
+    /// capped at the limit and any growth beyond the remaining allowance is
+    /// configured to be an error.
+    fn line_buffer(&self) -> LineBuffer {
+        let mut builder = LineBufferBuilder::new();
+        builder
+            .line_terminator(self.line_term.as_byte())
+            .binary_detection(self.binary.0);
+
+        match self.heap_limit {
+            None => {}
+            Some(limit) => {
+                // Spend as much of the limit as possible on the initial
+                // capacity; whatever remains is the permitted extra growth.
+                let capacity = cmp::min(limit, DEFAULT_BUFFER_CAPACITY);
+                let additional = limit - capacity;
+                builder
+                    .capacity(capacity)
+                    .buffer_alloc(BufferAllocation::Error(additional));
+            }
+        }
+        builder.build()
+    }
+}
+
+/// An error that can occur when configuring or running a searcher.
+///
+/// This error occurs when a nonsensical configuration is detected, e.g., when
+/// a search is started with settings that are inconsistent with each other or
+/// with the matcher, or when an unknown label is given to `Encoding::new`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ConfigError {
+ /// Indicates that the heap limit configuration prevents all possible
+ /// search strategies from being used. For example, if the heap limit is
+ /// set to 0 and memory map searching is disabled or unavailable.
+ SearchUnavailable,
+ /// Occurs when a matcher reports a line terminator that is different than
+ /// the one configured in the searcher.
+ MismatchedLineTerminators {
+ /// The matcher's line terminator.
+ matcher: LineTerminator,
+ /// The searcher's line terminator.
+ searcher: LineTerminator,
+ },
+ /// Occurs when no encoding could be found for a particular label.
+ UnknownEncoding {
+ /// The provided encoding label that could not be found.
+ label: Vec<u8>,
+ },
+ /// Hints that destructuring should not be exhaustive.
+ ///
+ /// This enum may grow additional variants, so this makes sure clients
+ /// don't count on exhaustive matching. (Otherwise, adding a new variant
+ /// could break existing code.)
+ #[doc(hidden)]
+ __Nonexhaustive,
+}
+
+impl ::std::error::Error for ConfigError {
+ // A fixed, generic description shared by all variants. The variant specific
+ // message is produced by the `Display` implementation.
+ fn description(&self) -> &str {
+ "grep-searcher configuration error"
+ }
+}
+
+impl fmt::Display for ConfigError {
+    /// Write a human readable message for this error. Every message is
+    /// prefixed with `grep config error: `.
+    ///
+    /// # Panics
+    ///
+    /// Panics if called on the hidden `__Nonexhaustive` variant, which is
+    /// never meant to be constructed.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            ConfigError::SearchUnavailable => {
+                f.write_str("grep config error: no available searchers")
+            }
+            ConfigError::MismatchedLineTerminators { matcher, searcher } => {
+                write!(
+                    f,
+                    "grep config error: mismatched line terminators, \
+                     matcher has {:?} but searcher has {:?}",
+                    matcher, searcher
+                )
+            }
+            ConfigError::UnknownEncoding { ref label } => {
+                // The label is arbitrary bytes, so render it lossily.
+                let shown = String::from_utf8_lossy(label);
+                write!(f, "grep config error: unknown encoding: {}", shown)
+            }
+            ConfigError::__Nonexhaustive => {
+                panic!("BUG: unexpected variant found")
+            }
+        }
+    }
+}
+
+/// A builder for configuring a searcher.
+///
+/// A search builder permits specifying the configuration of a searcher,
+/// including options like whether to invert the search or to enable multi
+/// line search.
+///
+/// Once a searcher has been built, it is beneficial to reuse that searcher
+/// for multiple searches, if possible.
+#[derive(Clone, Debug)]
+pub struct SearcherBuilder {
+ /// The configuration that is copied into every searcher built.
+ config: Config,
+}
+
+impl Default for SearcherBuilder {
+    /// Equivalent to `SearcherBuilder::new`.
+    fn default() -> SearcherBuilder {
+        Self::new()
+    }
+}
+
+impl SearcherBuilder {
+ /// Create a new searcher builder whose settings all have their default
+ /// values.
+ pub fn new() -> SearcherBuilder {
+     let config = Config::default();
+     SearcherBuilder { config }
+ }
+
+ /// Build a searcher from the current configuration of this builder.
+ ///
+ /// The builder itself is not consumed or modified, so it can be used to
+ /// build additional searchers. If passthru mode is enabled, then the
+ /// searcher's before and after context counts are reset to `0`.
+ pub fn build(&self) -> Searcher {
+     let mut config = self.config.clone();
+     if config.passthru {
+         // Passthru already reports every non-matching line as context, so
+         // explicit context counts are dropped.
+         config.before_context = 0;
+         config.after_context = 0;
+     }
+
+     let sniff_bom = self.config.bom_sniffing;
+     let encoding = self.config.encoding.as_ref().map(|enc| enc.0);
+     let mut decode_builder = DecodeReaderBytesBuilder::new();
+     decode_builder
+         .encoding(encoding)
+         .utf8_passthru(true)
+         .strip_bom(sniff_bom)
+         .bom_override(true)
+         .bom_sniffing(sniff_bom);
+
+     let line_buffer = RefCell::new(self.config.line_buffer());
+     Searcher {
+         config,
+         decode_builder,
+         decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]),
+         line_buffer,
+         multi_line_buffer: RefCell::new(Vec::new()),
+     }
+ }
+
+ /// Set the line terminator that is used by the searcher.
+ ///
+ /// When using a searcher, if the matcher provided has a line terminator
+ /// set, then it must be the same as this one. If they aren't, then the
+ /// mismatch is reported as a configuration error when a search is executed.
+ ///
+ /// By default, this is set to `b'\n'`.
+ pub fn line_terminator(
+ &mut self,
+ line_term: LineTerminator,
+ ) -> &mut SearcherBuilder {
+ self.config.line_term = line_term;
+ self
+ }
+
+ /// Whether to invert matching, whereby lines that don't match are reported
+ /// instead of reporting lines that do match.
+ ///
+ /// By default, this is disabled.
+ ///
+ /// Returns this builder so that calls can be chained.
+ pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.invert_match = yes;
+ self
+ }
+
+ /// Whether to count and include line numbers with matching lines.
+ ///
+ /// This is enabled by default. There is a small performance penalty
+ /// associated with computing line numbers, so this can be disabled when
+ /// line numbers aren't needed.
+ ///
+ /// Returns this builder so that calls can be chained.
+ pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.line_number = yes;
+ self
+ }
+
+ /// Whether to enable multi line search or not.
+ ///
+ /// When multi line search is enabled, matches *may* match across multiple
+ /// lines. Conversely, when multi line search is disabled, it is impossible
+ /// for any match to span more than one line.
+ ///
+ /// **Warning:** multi line search requires having the entire contents to
+ /// search mapped in memory at once. When searching files, memory maps
+ /// will be used if possible and if they are enabled, which avoids using
+ /// your program's heap. However, if memory maps cannot be used (e.g.,
+ /// for searching streams like `stdin` or if transcoding is necessary),
+ /// then the entire contents of the stream are read on to the heap before
+ /// starting the search.
+ ///
+ /// This is disabled by default.
+ ///
+ /// Returns this builder so that calls can be chained.
+ pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.multi_line = yes;
+ self
+ }
+
+ /// Set the number of contextual lines to include after every match.
+ ///
+ /// When this is set to a non-zero number, then the searcher will report
+ /// `line_count` contextual lines after every match.
+ ///
+ /// This setting is ignored when passthru mode is enabled.
+ ///
+ /// This is set to `0` by default.
+ pub fn after_context(
+ &mut self,
+ line_count: usize,
+ ) -> &mut SearcherBuilder {
+ self.config.after_context = line_count;
+ self
+ }
+
+ /// Set the number of contextual lines to include before every match.
+ ///
+ /// When this is set to a non-zero number, then the searcher will report
+ /// `line_count` contextual lines before every match.
+ ///
+ /// This setting is ignored when passthru mode is enabled.
+ ///
+ /// This is set to `0` by default.
+ pub fn before_context(
+ &mut self,
+ line_count: usize,
+ ) -> &mut SearcherBuilder {
+ self.config.before_context = line_count;
+ self
+ }
+
+ /// Whether to enable the "passthru" feature or not.
+ ///
+ /// When passthru is enabled, it effectively treats all non-matching lines
+ /// as contextual lines. In other words, enabling this is akin to
+ /// requesting an unbounded number of before and after contextual lines.
+ ///
+ /// When passthru mode is enabled, any `before_context` or `after_context`
+ /// settings are ignored: they are set to `0` on the searcher that is built.
+ ///
+ /// This is disabled by default.
+ pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.passthru = yes;
+ self
+ }
+
+ /// Set an approximate limit on the amount of heap space used by a
+ /// searcher.
+ ///
+ /// The heap limit is enforced in two scenarios:
+ ///
+ /// * When searching using a fixed size buffer, the heap limit controls
+ ///   how big this buffer is allowed to be. Assuming contexts are disabled,
+ ///   the minimum size of this buffer is the length (in bytes) of the
+ ///   largest single line in the contents being searched. If any line
+ ///   exceeds the heap limit, then an error will be returned.
+ /// * When performing a multi line search, a fixed size buffer cannot be
+ ///   used. Thus, the only choices are to read the entire contents on to
+ ///   the heap, or use memory maps. In the former case, the heap limit set
+ ///   here is enforced.
+ ///
+ /// If a heap limit is set to `0`, then no heap space is used. If there are
+ /// no alternative strategies available for searching without heap space
+ /// (e.g., memory maps are disabled), then the searcher will return an error
+ /// immediately.
+ ///
+ /// By default, no limit is set.
+ pub fn heap_limit(
+ &mut self,
+ bytes: Option<usize>,
+ ) -> &mut SearcherBuilder {
+ self.config.heap_limit = bytes;
+ self
+ }
+
+ /// Set the strategy for using memory maps.
+ ///
+ /// Currently, there are only two strategies that can be employed:
+ ///
+ /// * **Automatic** - A searcher will use heuristics, including but not
+ ///   limited to file size and platform, to determine whether to use memory
+ ///   maps or not.
+ /// * **Never** - Memory maps will never be used. If multi line search is
+ ///   enabled, then the entire contents will be read on to the heap before
+ ///   searching begins.
+ ///
+ /// The default behavior is **never**. Generally speaking, and perhaps
+ /// against conventional wisdom, memory maps don't necessarily enable
+ /// faster searching. For example, depending on the platform, using memory
+ /// maps while searching a large directory can actually be quite a bit
+ /// slower than using normal read calls because of the overhead of managing
+ /// the memory maps.
+ ///
+ /// Memory maps can be faster in some cases however. On some platforms,
+ /// when searching a very large file that *is already in memory*, it can
+ /// be slightly faster to search it as a memory map instead of using
+ /// normal read calls.
+ ///
+ /// Finally, memory maps have a somewhat complicated safety story in Rust.
+ /// If you aren't sure whether enabling memory maps is worth it, then just
+ /// don't bother with it.
+ ///
+ /// **WARNING**: If your process is searching a file backed memory map
+ /// at the same time that file is truncated, then it's possible for the
+ /// process to terminate with a bus error.
+ pub fn memory_map(
+ &mut self,
+ strategy: MmapChoice,
+ ) -> &mut SearcherBuilder {
+ self.config.mmap = strategy;
+ self
+ }
+
+ /// Set the binary detection strategy.
+ ///
+ /// The binary detection strategy determines not only how the searcher
+ /// detects binary data, but how it responds to the presence of binary
+ /// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type
+ /// for more information.
+ ///
+ /// By default, binary detection is disabled.
+ ///
+ /// Returns this builder so that calls can be chained.
+ pub fn binary_detection(
+ &mut self,
+ detection: BinaryDetection,
+ ) -> &mut SearcherBuilder {
+ self.config.binary = detection;
+ self
+ }
+
+ /// Set the encoding used to read the source data before searching.
+ ///
+ /// When an encoding is provided, then the source data is _unconditionally_
+ /// transcoded using the encoding, unless a BOM is present. If a BOM is
+ /// present, then the encoding indicated by the BOM is used instead. If the
+ /// transcoding process encounters an error, then bytes are replaced with
+ /// the Unicode replacement codepoint.
+ ///
+ /// When no encoding is specified (the default), then BOM sniffing is
+ /// used (if it's enabled, which it is, by default) to determine whether
+ /// the source data is UTF-8 or UTF-16, and transcoding will be performed
+ /// automatically. If no BOM could be found, then the source data is
+ /// searched _as if_ it were UTF-8. However, so long as the source data is
+ /// at least ASCII compatible, then it is possible for a search to produce
+ /// useful results.
+ ///
+ /// Passing `None` clears any previously configured encoding.
+ pub fn encoding(
+ &mut self,
+ encoding: Option<Encoding>,
+ ) -> &mut SearcherBuilder {
+ self.config.encoding = encoding;
+ self
+ }
+
+ /// Enable automatic transcoding based on BOM sniffing.
+ ///
+ /// When this is enabled and an explicit encoding is not set, then this
+ /// searcher will try to detect the encoding of the bytes being searched
+ /// by sniffing its byte-order mark (BOM). In particular, when this is
+ /// enabled, UTF-16 encoded files will be searched seamlessly.
+ ///
+ /// When this is disabled and an explicit encoding is not set, then
+ /// the bytes from the source stream will be passed through unchanged,
+ /// including its BOM, if one is present.
+ ///
+ /// This is enabled by default.
+ pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.bom_sniffing = yes;
+ self
+ }
+}
+
+/// A searcher executes searches over a haystack and writes results to a caller
+/// provided sink.
+///
+/// Matches are detected via implementations of the `Matcher` trait, which must
+/// be provided by the caller when executing a search.
+///
+/// A searcher is created with `Searcher::new` or `SearcherBuilder::build`.
+/// When possible, a searcher should be reused.
+#[derive(Clone, Debug)]
+pub struct Searcher {
+ /// The configuration for this searcher.
+ ///
+ /// We make most of these settings available to users of `Searcher` via
+ /// public API methods, which can be queried in implementations of `Sink`
+ /// if necessary.
+ config: Config,
+ /// A builder for constructing a streaming reader that transcodes source
+ /// data according to either an explicitly specified encoding or via an
+ /// automatically detected encoding via BOM sniffing.
+ ///
+ /// When no transcoding is needed, then the transcoder built will pass
+ /// through the underlying bytes with no additional overhead.
+ decode_builder: DecodeReaderBytesBuilder,
+ /// A buffer that is used for transcoding scratch space.
+ decode_buffer: RefCell<Vec<u8>>,
+ /// A line buffer for use in line oriented searching.
+ ///
+ /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
+ /// to sinks. We still require a mutable borrow to execute a search, so
+ /// we statically prevent callers from causing RefCell to panic at runtime
+ /// due to a borrowing violation.
+ line_buffer: RefCell<LineBuffer>,
+ /// A buffer in which to store the contents of a reader when performing a
+ /// multi line search. In particular, multi line searches cannot be
+ /// performed incrementally, and need the entire haystack in memory at
+ /// once.
+ multi_line_buffer: RefCell<Vec<u8>>,
+}
+
+impl Searcher {
+ /// Create a new searcher with a default configuration.
+ ///
+ /// This is the same as building a searcher from a fresh
+ /// [`SearcherBuilder`](struct.SearcherBuilder.html). To configure the
+ /// searcher (e.g., invert matching, enable memory maps, enable contexts,
+ /// etc.), use the builder directly.
+ pub fn new() -> Searcher {
+ SearcherBuilder::new().build()
+ }
+
+ /// Open the file at the given path, search it and write the results to
+ /// the given sink.
+ ///
+ /// If memory maps are enabled and the searcher heuristically believes
+ /// memory maps will help the search run faster, then this will use
+ /// memory maps. For this reason, callers should prefer using this method
+ /// or `search_file` over the more generic `search_reader` when possible.
+ ///
+ /// # Errors
+ ///
+ /// If the file cannot be opened, the I/O error is converted into the
+ /// sink's error type and returned. Errors from the search itself are
+ /// returned as well.
+ pub fn search_path<P, M, S>(
+     &mut self,
+     matcher: M,
+     path: P,
+     write_to: S,
+ ) -> Result<(), S::Error>
+ where
+     P: AsRef<Path>,
+     M: Matcher,
+     S: Sink,
+ {
+     let path = path.as_ref();
+     let file = match File::open(path) {
+         Ok(file) => file,
+         Err(err) => return Err(S::Error::error_io(err)),
+     };
+     self.search_file_maybe_path(matcher, Some(path), &file, write_to)
+ }
+
+ /// Execute a search over a file and write the results to the given sink.
+ ///
+ /// If memory maps are enabled and the searcher heuristically believes
+ /// memory maps will help the search run faster, then this will use
+ /// memory maps. For this reason, callers should prefer using this method
+ /// or `search_path` over the more generic `search_reader` when possible.
+ ///
+ /// Unlike `search_path`, no file path is available to this method.
+ pub fn search_file<M, S>(
+ &mut self,
+ matcher: M,
+ file: &File,
+ write_to: S,
+ ) -> Result<(), S::Error>
+ where
+ M: Matcher,
+ S: Sink,
+ {
+ self.search_file_maybe_path(matcher, None, file, write_to)
+ }
+
+ /// Search the given file and write the results to the given sink. The
+ /// path, if one is known, is passed to the memory map strategy and is
+ /// included in trace messages.
+ ///
+ /// One of three strategies is used:
+ ///
+ /// 1. If the memory map strategy yields a map for the file, the mapped
+ ///    bytes are searched as a slice.
+ /// 2. Otherwise, if a multi line strategy is required, the whole file is
+ ///    read on to the heap and searched at once.
+ /// 3. Otherwise, the file is searched as a generic reader.
+ ///
+ /// # Errors
+ ///
+ /// Returns a configuration error if the searcher's configuration is not
+ /// consistent with the matcher, in addition to any error produced while
+ /// reading or searching.
+ fn search_file_maybe_path<M, S>(
+     &mut self,
+     matcher: M,
+     path: Option<&Path>,
+     file: &File,
+     write_to: S,
+ ) -> Result<(), S::Error>
+ where
+     M: Matcher,
+     S: Sink,
+ {
+     // Validate up front. The slice and reader strategies below validate
+     // on their own, but the heap based multi line strategy does not, which
+     // would otherwise let an inconsistent configuration slip through.
+     self.check_config(&matcher).map_err(S::Error::error_config)?;
+
+     if let Some(mmap) = self.config.mmap.open(file, path) {
+         trace!("{:?}: searching via memory map", path);
+         return self.search_slice(matcher, &mmap, write_to);
+     }
+     // Fast path for multi-line searches of files when memory maps are
+     // not enabled. This pre-allocates a buffer roughly the size of the
+     // file, which isn't possible when searching an arbitrary io::Read.
+     if self.multi_line_with_matcher(&matcher) {
+         trace!("{:?}: reading entire file on to heap for mulitline", path);
+         self.fill_multi_line_buffer_from_file::<S>(file)?;
+         trace!("{:?}: searching via multiline strategy", path);
+         MultiLine::new(
+             self,
+             matcher,
+             &*self.multi_line_buffer.borrow(),
+             write_to,
+         )
+         .run()
+     } else {
+         trace!("{:?}: searching using generic reader", path);
+         self.search_reader(matcher, file, write_to)
+     }
+ }
+
+ /// Execute a search over any implementation of `io::Read` and write the
+ /// results to the given sink.
+ ///
+ /// When possible, this implementation will search the reader incrementally
+ /// without reading it into memory. In some cases---for example, if multi
+ /// line search is enabled---an incremental search isn't possible and the
+ /// given reader is consumed completely and placed on the heap before
+ /// searching begins. For this reason, when multi line search is enabled,
+ /// one should try to use higher level APIs (e.g., searching by file or
+ /// file path) so that memory maps can be used if they are available and
+ /// enabled.
+ ///
+ /// # Errors
+ ///
+ /// Returns a configuration error if the searcher's configuration is not
+ /// consistent with the matcher, in addition to any error produced while
+ /// reading or searching.
+ pub fn search_reader<M, R, S>(
+ &mut self,
+ matcher: M,
+ read_from: R,
+ write_to: S,
+ ) -> Result<(), S::Error>
+ where
+ M: Matcher,
+ R: io::Read,
+ S: Sink,
+ {
+ // Reject inconsistent configurations before touching the reader.
+ self.check_config(&matcher).map_err(S::Error::error_config)?;
+
+ // Wrap the reader in a transcoding reader that uses the searcher's
+ // reusable scratch buffer.
+ let mut decode_buffer = self.decode_buffer.borrow_mut();
+ let read_from = self
+ .decode_builder
+ .build_with_buffer(read_from, &mut *decode_buffer)
+ .map_err(S::Error::error_io)?;
+
+ if self.multi_line_with_matcher(&matcher) {
+ trace!("generic reader: reading everything to heap for multiline");
+ self.fill_multi_line_buffer_from_reader::<_, S>(read_from)?;
+ trace!("generic reader: searching via multiline strategy");
+ MultiLine::new(
+ self,
+ matcher,
+ &*self.multi_line_buffer.borrow(),
+ write_to,
+ )
+ .run()
+ } else {
+ // Line oriented search: read through the searcher's line buffer.
+ let mut line_buffer = self.line_buffer.borrow_mut();
+ let rdr = LineBufferReader::new(read_from, &mut *line_buffer);
+ trace!("generic reader: searching via roll buffer strategy");
+ ReadByLine::new(self, matcher, rdr, write_to).run()
+ }
+ }
+
+ /// Search the bytes of the given slice and write the results to the given
+ /// sink.
+ ///
+ /// The slice is searched directly unless it needs to be transcoded, in
+ /// which case it is searched as a generic reader instead.
+ ///
+ /// # Errors
+ ///
+ /// Returns a configuration error if the searcher's configuration is not
+ /// consistent with the matcher, in addition to any error produced while
+ /// searching.
+ pub fn search_slice<M, S>(
+     &mut self,
+     matcher: M,
+     slice: &[u8],
+     write_to: S,
+ ) -> Result<(), S::Error>
+ where
+     M: Matcher,
+     S: Sink,
+ {
+     if let Err(err) = self.check_config(&matcher) {
+         return Err(S::Error::error_config(err));
+     }
+
+     // Transcoding produces new bytes, so it has to go through a reader.
+     if self.slice_needs_transcoding(slice) {
+         trace!("slice reader: needs transcoding, using generic reader");
+         return self.search_reader(matcher, slice, write_to);
+     }
+     if !self.multi_line_with_matcher(&matcher) {
+         trace!("slice reader: searching via slice-by-line strategy");
+         return SliceByLine::new(self, matcher, slice, write_to).run();
+     }
+     trace!("slice reader: searching via multiline strategy");
+     MultiLine::new(self, matcher, slice, write_to).run()
+ }
+
+ /// Replace the binary detection method used by this searcher. Both the
+ /// searcher's configuration and its line buffer are updated.
+ pub fn set_binary_detection(&mut self, detection: BinaryDetection) {
+     let strategy = detection.0;
+     self.config.binary = detection;
+     self.line_buffer.borrow_mut().set_binary_detection(strategy);
+ }
+
+ /// Verify that this searcher's configuration is usable and agrees with the
+ /// given matcher.
+ ///
+ /// # Errors
+ ///
+ /// * `ConfigError::SearchUnavailable` if the heap limit is `0` and memory
+ ///   maps are not enabled.
+ /// * `ConfigError::MismatchedLineTerminators` if the matcher reports a
+ ///   line terminator that differs from the searcher's.
+ fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
+     let heap_forbidden = self.config.heap_limit == Some(0);
+     if heap_forbidden && !self.config.mmap.is_enabled() {
+         return Err(ConfigError::SearchUnavailable);
+     }
+     match matcher.line_terminator() {
+         Some(matcher_term) if matcher_term != self.config.line_term => {
+             Err(ConfigError::MismatchedLineTerminators {
+                 matcher: matcher_term,
+                 searcher: self.config.line_term,
+             })
+         }
+         // Either the matcher has no opinion or both sides agree.
+         _ => Ok(()),
+     }
+ }
+
+ /// Returns true if and only if the given slice must be transcoded before
+ /// it is searched. That is the case when an explicit encoding is
+ /// configured, or when BOM sniffing is enabled and the slice has a UTF-16
+ /// BOM.
+ fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
+     if self.config.encoding.is_some() {
+         return true;
+     }
+     self.config.bom_sniffing && slice_has_utf16_bom(slice)
+ }
+}
+
+/// The following methods permit querying the configuration of a searcher.
+/// These can be useful in generic implementations of
+/// [`Sink`](trait.Sink.html),
+/// where the output may be tailored based on how the searcher is configured.
+impl Searcher {
+ /// Returns the line terminator configured on this searcher.
+ #[inline]
+ pub fn line_terminator(&self) -> LineTerminator {
+ self.config.line_term
+ }
+
+ /// Returns the binary detection strategy configured on this searcher.
+ #[inline]
+ pub fn binary_detection(&self) -> &BinaryDetection {
+ &self.config.binary
+ }
+
+ /// Returns true if and only if this searcher is configured to invert its
+ /// search results. That is, the lines reported as matching are the lines
+ /// that do **not** match the searcher's matcher.
+ #[inline]
+ pub fn invert_match(&self) -> bool {
+ self.config.invert_match
+ }
+
+ /// Returns true if and only if this searcher is configured to count line
+ /// numbers.
+ #[inline]
+ pub fn line_number(&self) -> bool {
+ self.config.line_number
+ }
+
+ /// Returns true if and only if this searcher is configured to perform
+ /// multi line search. Whether a multi line strategy is actually chosen for
+ /// a particular matcher is reported by `multi_line_with_matcher`.
+ #[inline]
+ pub fn multi_line(&self) -> bool {
+ self.config.multi_line
+ }
+
+ /// Returns true if and only if this searcher will use a multi-line
+ /// strategy when searching with the provided matcher.
+ ///
+ /// The result can differ from `multi_line`: even when the searcher is
+ /// configured to report matches spanning multiple lines, a multi-line
+ /// strategy is not needed if the matcher guarantees that it never produces
+ /// a match over multiple lines.
+ pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
+     if !self.multi_line() {
+         return false;
+     }
+     let searcher_term = self.line_terminator();
+     // A matcher bound to the searcher's line terminator never matches
+     // across lines.
+     if matcher.line_terminator() == Some(searcher_term) {
+         return false;
+     }
+     match matcher.non_matching_bytes() {
+         // If the line terminator is CRLF, we don't actually need to care
+         // whether the regex can match `\r` or not. Namely, a `\r` is
+         // neither necessary nor sufficient to terminate a line. A `\n` is
+         // always required.
+         Some(non_matching) => !non_matching.contains(searcher_term.as_byte()),
+         None => true,
+     }
+ }
+
+ /// Returns the number of "after" context lines to report. When context
+ /// reporting is not enabled, this returns `0`. This is also `0` for a
+ /// searcher that was built with passthru mode enabled.
+ #[inline]
+ pub fn after_context(&self) -> usize {
+ self.config.after_context
+ }
+
+ /// Returns the number of "before" context lines to report. When context
+ /// reporting is not enabled, this returns `0`. This is also `0` for a
+ /// searcher that was built with passthru mode enabled.
+ #[inline]
+ pub fn before_context(&self) -> usize {
+ self.config.before_context
+ }
+
+ /// Returns true if and only if the searcher has "passthru" mode enabled,
+ /// in which non-matching lines are treated as contextual lines.
+ #[inline]
+ pub fn passthru(&self) -> bool {
+ self.config.passthru
+ }
+
+ /// Fill the buffer for use with multi-line searching from the given file.
+ /// This reads from the file until EOF or until an error occurs. If the
+ /// contents exceed the configured heap limit, then an error is returned.
+ ///
+ /// The file is read through the searcher's transcoding reader. When a heap
+ /// limit is configured, the work is delegated to
+ /// `fill_multi_line_buffer_from_reader`.
+ ///
+ /// # Panics
+ ///
+ /// Panics if multi line search is not enabled on this searcher.
+ fn fill_multi_line_buffer_from_file<S: Sink>(
+ &self,
+ file: &File,
+ ) -> Result<(), S::Error> {
+ assert!(self.config.multi_line);
+
+ let mut decode_buffer = self.decode_buffer.borrow_mut();
+ let mut read_from = self
+ .decode_builder
+ .build_with_buffer(file, &mut *decode_buffer)
+ .map_err(S::Error::error_io)?;
+
+ // If we don't have a heap limit, then we can defer to std's
+ // read_to_end implementation. fill_multi_line_buffer_from_reader will
+ // do this too, but since we have a File, we can be a bit smarter about
+ // pre-allocating here.
+ //
+ // If we're transcoding, then our pre-allocation might not be exact,
+ // but is probably still better than nothing.
+ if self.config.heap_limit.is_none() {
+ let mut buf = self.multi_line_buffer.borrow_mut();
+ buf.clear();
+ // If the metadata can't be read, no capacity is reserved up front.
+ // NOTE(review): `as usize` truncates lengths above `usize::MAX` on
+ // 32-bit targets and `+ 1` is unchecked; this only sizes the
+ // pre-allocation, but confirm it is acceptable.
+ let cap =
+ file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0);
+ buf.reserve(cap);
+ read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
+ return Ok(());
+ }
+ // The borrow of the multi line buffer above is scoped to the `if` block,
+ // so the callee is free to borrow it again.
+ self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
+ }
+
+ /// Fill the buffer for use with multi-line searching from the given
+ /// reader. This reads from the reader until EOF or until an error occurs.
+ /// If the contents exceed the configured heap limit, then an error is
+ /// returned.
+ fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
+ &self,
+ mut read_from: R,
+ ) -> Result<(), S::Error> {
+ assert!(self.config.multi_line);
+
+ let mut buf = self.multi_line_buffer.borrow_mut();
+ buf.clear();
+
+ // If we don't have a heap limit, then we can defer to std's
+ // read_to_end implementation...
+ let heap_limit = match self.config.heap_limit {
+ Some(heap_limit) => heap_limit,
+ None => {
+ read_from
+ .read_to_end(&mut *buf)
+ .map_e