summaryrefslogtreecommitdiffstats
path: root/grep-matcher
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2018-04-29 09:29:52 -0400
committerAndrew Gallant <jamslam@gmail.com>2018-08-20 07:10:19 -0400
commitd9ca5293569efb255608d3c601107bcfe7060f15 (patch)
tree7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-matcher
parent0958837ee104985412f08e81b6f08df1e5291042 (diff)
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162
Diffstat (limited to 'grep-matcher')
-rw-r--r--grep-matcher/Cargo.toml24
-rw-r--r--grep-matcher/LICENSE-MIT21
-rw-r--r--grep-matcher/README.md36
-rw-r--r--grep-matcher/UNLICENSE24
-rw-r--r--grep-matcher/src/interpolate.rs328
-rw-r--r--grep-matcher/src/lib.rs1126
-rw-r--r--grep-matcher/tests/test_matcher.rs208
-rw-r--r--grep-matcher/tests/tests.rs6
-rw-r--r--grep-matcher/tests/util.rs104
9 files changed, 1877 insertions, 0 deletions
diff --git a/grep-matcher/Cargo.toml b/grep-matcher/Cargo.toml
new file mode 100644
index 00000000..8056dec1
--- /dev/null
+++ b/grep-matcher/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "grep-matcher"
+version = "0.0.1" #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = """
+A trait for regular expressions, with a focus on line oriented search.
+"""
+documentation = "https://docs.rs/grep-matcher"
+homepage = "https://github.com/BurntSushi/ripgrep"
+repository = "https://github.com/BurntSushi/ripgrep"
+readme = "README.md"
+keywords = ["regex", "pattern", "trait"]
+license = "Unlicense/MIT"
+autotests = false
+
+[dependencies]
+memchr = "2"
+
+[dev-dependencies]
+regex = "1"
+
+[[test]]
+name = "integration"
+path = "tests/tests.rs"
diff --git a/grep-matcher/LICENSE-MIT b/grep-matcher/LICENSE-MIT
new file mode 100644
index 00000000..3b0a5dc0
--- /dev/null
+++ b/grep-matcher/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/grep-matcher/README.md b/grep-matcher/README.md
new file mode 100644
index 00000000..f83ceade
--- /dev/null
+++ b/grep-matcher/README.md
@@ -0,0 +1,36 @@
+grep-matcher
+------------
+This crate provides a low level interface for describing regular expression
+matchers. The `grep` crate uses this interface in order to make the regex
+engine it uses pluggable.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
+[![](https://img.shields.io/crates/v/grep-matcher.svg)](https://crates.io/crates/grep-matcher)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+### Documentation
+
+[https://docs.rs/grep-matcher](https://docs.rs/grep-matcher)
+
+**NOTE:** You probably don't want to use this crate directly. Instead, you
+should prefer the facade defined in the
+[`grep`](https://docs.rs/grep)
+crate.
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+grep-matcher = "0.1"
+```
+
+and this to your crate root:
+
+```rust
+extern crate grep_matcher;
+```
diff --git a/grep-matcher/UNLICENSE b/grep-matcher/UNLICENSE
new file mode 100644
index 00000000..68a49daa
--- /dev/null
+++ b/grep-matcher/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/grep-matcher/src/interpolate.rs b/grep-matcher/src/interpolate.rs
new file mode 100644
index 00000000..168dd343
--- /dev/null
+++ b/grep-matcher/src/interpolate.rs
@@ -0,0 +1,328 @@
+use std::str;
+
+use memchr::memchr;
+
+/// Interpolate capture references in `replacement` and write the interpolation
+/// result to `dst`. References in `replacement` take the form of $N or $name,
+/// where `N` is a capture group index and `name` is a capture group name. The
+/// function provided, `name_to_index`, maps capture group names to indices.
+///
+/// The `append` function given is responsible for writing the replacement
+/// to the `dst` buffer. That is, it is called with the capture group index
+/// of a capture group reference and is expected to resolve the index to its
+/// corresponding matched text. If no such match exists, then `append` should
+/// not write anything to its given buffer.
+pub fn interpolate<A, N>(
+ mut replacement: &[u8],
+ mut append: A,
+ mut name_to_index: N,
+ dst: &mut Vec<u8>,
+) where
+ A: FnMut(usize, &mut Vec<u8>),
+ N: FnMut(&str) -> Option<usize>
+{
+ while !replacement.is_empty() {
+ match memchr(b'$', replacement) {
+ None => break,
+ Some(i) => {
+ dst.extend(&replacement[..i]);
+ replacement = &replacement[i..];
+ }
+ }
+ if replacement.get(1).map_or(false, |&b| b == b'$') {
+ dst.push(b'$');
+ replacement = &replacement[2..];
+ continue;
+ }
+ debug_assert!(!replacement.is_empty());
+ let cap_ref = match find_cap_ref(replacement) {
+ Some(cap_ref) => cap_ref,
+ None => {
+ dst.push(b'$');
+ replacement = &replacement[1..];
+ continue;
+ }
+ };
+ replacement = &replacement[cap_ref.end..];
+ match cap_ref.cap {
+ Ref::Number(i) => append(i, dst),
+ Ref::Named(name) => {
+ if let Some(i) = name_to_index(name) {
+ append(i, dst);
+ }
+ }
+ }
+ }
+ dst.extend(replacement);
+}
+
+/// `CaptureRef` represents a reference to a capture group inside some text.
+/// The reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text immediately proceding the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+struct CaptureRef<'a> {
+ cap: Ref<'a>,
+ end: usize,
+}
+
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum Ref<'a> {
+ Named(&'a str),
+ Number(usize),
+}
+
+impl<'a> From<&'a str> for Ref<'a> {
+ fn from(x: &'a str) -> Ref<'a> {
+ Ref::Named(x)
+ }
+}
+
+impl From<usize> for Ref<'static> {
+ fn from(x: usize) -> Ref<'static> {
+ Ref::Number(x)
+ }
+}
+
+/// Parses a possible reference to a capture group name in the given text,
+/// starting at the beginning of `replacement`.
+///
+/// If no such valid reference could be found, None is returned.
+fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
+ let mut i = 0;
+ if replacement.len() <= 1 || replacement[0] != b'$' {
+ return None;
+ }
+ let mut brace = false;
+ i += 1;
+ if replacement[i] == b'{' {
+ brace = true;
+ i += 1;
+ }
+ let mut cap_end = i;
+ while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
+ cap_end += 1;
+ }
+ if cap_end == i {
+ return None;
+ }
+ // We just verified that the range 0..cap_end is valid ASCII, so it must
+ // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
+ // check with an unchecked conversion or by parsing the number straight
+ // from &[u8].
+ let cap = str::from_utf8(&replacement[i..cap_end])
+ .expect("valid UTF-8 capture name");
+ if brace {
+ if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
+ return None;
+ }
+ cap_end += 1;
+ }
+ Some(CaptureRef {
+ cap: match cap.parse::<u32>() {
+ Ok(i) => Ref::Number(i as usize),
+ Err(_) => Ref::Named(cap),
+ },
+ end: cap_end,
+ })
+}
+
+/// Returns true if and only if the given byte is allowed in a capture name.
+fn is_valid_cap_letter(b: &u8) -> bool {
+ match *b {
+ b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true,
+ _ => false,
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{CaptureRef, find_cap_ref, interpolate};
+
+ macro_rules! find {
+ ($name:ident, $text:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(None, find_cap_ref($text.as_bytes()));
+ }
+ };
+ ($name:ident, $text:expr, $capref:expr) => {
+ #[test]
+ fn $name() {
+ assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
+ }
+ };
+ }
+
+ macro_rules! c {
+ ($name_or_number:expr, $pos:expr) => {
+ CaptureRef { cap: $name_or_number.into(), end: $pos }
+ };
+ }
+
+ find!(find_cap_ref1, "$foo", c!("foo", 4));
+ find!(find_cap_ref2, "${foo}", c!("foo", 6));
+ find!(find_cap_ref3, "$0", c!(0, 2));
+ find!(find_cap_ref4, "$5", c!(5, 2));
+ find!(find_cap_ref5, "$10", c!(10, 3));
+ find!(find_cap_ref6, "$42a", c!("42a", 4));
+ find!(find_cap_ref7, "${42}a", c!(42, 5));
+ find!(find_cap_ref8, "${42");
+ find!(find_cap_ref9, "${42 ");
+ find!(find_cap_ref10, " $0 ");
+ find!(find_cap_ref11, "$");
+ find!(find_cap_ref12, " ");
+ find!(find_cap_ref13, "");
+
+ // A convenience routine for using interpolate's unwieldy but flexible API.
+ fn interpolate_string(
+ mut name_to_index: Vec<(&'static str, usize)>,
+ caps: Vec<&'static str>,
+ replacement: &str,
+ ) -> String {
+ name_to_index.sort_by_key(|x| x.0);
+
+ let mut dst = vec![];
+ interpolate(
+ replacement.as_bytes(),
+ |i, dst| {
+ if let Some(&s) = caps.get(i) {
+ dst.extend(s.as_bytes());
+ }
+ },
+ |name| -> Option<usize> {
+ name_to_index
+ .binary_search_by_key(&name, |x| x.0)
+ .ok()
+ .map(|i| name_to_index[i].1)
+ },
+ &mut dst,
+ );
+ String::from_utf8(dst).unwrap()
+ }
+
+ macro_rules! interp {
+ ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
+ #[test]
+ fn $name() {
+ assert_eq!($expected, interpolate_string($map, $caps, $hay));
+ }
+ }
+ }
+
+ interp!(
+ interp1,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo test",
+ "test xxx test",
+ );
+
+ interp!(
+ interp2,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$footest",
+ "test",
+ );
+
+ interp!(
+ interp3,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${foo}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp4,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test$2test",
+ "test",
+ );
+
+ interp!(
+ interp5,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test${2}test",
+ "testxxxtest",
+ );
+
+ interp!(
+ interp6,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $$foo test",
+ "test $foo test",
+ );
+
+ interp!(
+ interp7,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "test $foo",
+ "test xxx",
+ );
+
+ interp!(
+ interp8,
+ vec![("foo", 2)],
+ vec!["", "", "xxx"],
+ "$foo test",
+ "xxx test",
+ );
+
+ interp!(
+ interp9,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $bar$foo",
+ "test yyyxxx",
+ );
+
+ interp!(
+ interp10,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test $ test",
+ "test $ test",
+ );
+
+ interp!(
+ interp11,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${} test",
+ "test ${} test",
+ );
+
+ interp!(
+ interp12,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${ } test",
+ "test ${ } test",
+ );
+
+ interp!(
+ interp13,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a b} test",
+ "test ${a b} test",
+ );
+
+ interp!(
+ interp14,
+ vec![("bar", 1), ("foo", 2)],
+ vec!["", "yyy", "xxx"],
+ "test ${a} test",
+ "test test",
+ );
+}
diff --git a/grep-matcher/src/lib.rs b/grep-matcher/src/lib.rs
new file mode 100644
index 00000000..49d6358f
--- /dev/null
+++ b/grep-matcher/src/lib.rs
@@ -0,0 +1,1126 @@
+/*!
+This crate provides an interface for regular expressions, with a focus on line
+oriented search. The purpose of this crate is to provide a low level matching
+interface that permits any kind of substring or regex implementation to power
+the search routines provided by the
+[`grep-searcher`](https://docs.rs/grep-searcher)
+crate.
+
+The primary thing provided by this crate is the
+[`Matcher`](trait.Matcher.html)
+trait. The trait defines an abstract interface for text search. It is robust
+enough to support everything from basic substring search all the way to
+arbitrarily complex regular expression implementations without sacrificing
+performance.
+
+A key design decision made in this crate is the use of *internal iteration*,
+or otherwise known as the "push" model of searching. In this paradigm,
+implementations of the `Matcher` trait will drive search and execute callbacks
+provided by the caller when a match is found. This is in contrast to the
+usual style of *external iteration* (the "pull" model) found throughout the
+Rust ecosystem. There are two primary reasons why internal iteration was
+chosen:
+
+* Some search implementations may themselves require internal iteration.
+ Converting an internal iterator to an external iterator can be non-trivial
+ and sometimes even practically impossible.
+* Rust's type system isn't quite expressive enough to write a generic interface
+ using external iteration without giving something else up (namely, ease of
+ use and/or performance).
+
+In other words, internal iteration was chosen because it is the lowest common
+denominator and because it is probably the least bad way of expressing the
+interface in today's Rust. As a result, this trait isn't specifically intended
+for everyday use, although, you might find it to be a happy price to pay if you
+want to write code that is generic over multiple different regex
+implementations.
+*/
+
+#![deny(missing_docs)]
+
+extern crate memchr;
+
+use std::fmt;
+use std::io;
+use std::ops;
+use std::u64;
+
+use interpolate::interpolate;
+
+mod interpolate;
+
+/// The type of a match.
+///
+/// The type of a match is a possibly empty range pointing to a contiguous
+/// block of addressable memory.
+///
+/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`.
+///
+/// # Indexing
+///
+/// This type is structurally identical to `std::ops::Range<usize>`, but
+/// is a bit more ergonomic for dealing with match indices. In particular,
+/// this type implements `Copy` and provides methods for building new `Match`
+/// values based on old `Match` values. Finally, the invariant that `start`
+/// is always less than or equal to `end` is enforced.
+///
+/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
+/// range notation. e.g.,
+///
+/// ```
+/// use grep_matcher::Match;
+///
+/// let m = Match::new(2, 5);
+/// let bytes = b"abcdefghi";
+/// assert_eq!(b"cde", &bytes[m]);
+/// ```
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct Match {
+ start: usize,
+ end: usize,
+}
+
+impl Match {
+ /// Create a new match.
+ ///
+ /// # Panics
+ ///
+ /// This function panics if `start > end`.
+ #[inline]
+ pub fn new(start: usize, end: usize) -> Match {
+ assert!(start <= end);
+ Match { start, end }
+ }
+
+ /// Creates a zero width match at the given offset.
+ #[inline]
+ pub fn zero(offset: usize) -> Match {
+ Match { start: offset, end: offset }
+ }
+
+ /// Return the start offset of this match.
+ #[inline]
+ pub fn start(&self) -> usize {
+ self.start
+ }
+
+ /// Return the end offset of this match.
+ #[inline]
+ pub fn end(&self) -> usize {
+ self.end
+ }
+
+ /// Return a new match with the start offset replaced with the given
+ /// value.
+ ///
+ /// # Panics
+ ///
+ /// This method panics if `start > self.end`.
+ #[inline]
+ pub fn with_start(&self, start: usize) -> Match {
+ assert!(start <= self.end);
+ Match { start, ..*self }
+ }
+
+ /// Return a new match with the end offset replaced with the given
+ /// value.
+ ///
+ /// # Panics
+ ///
+ /// This method panics if `self.start > end`.
+ #[inline]
+ pub fn with_end(&self, end: usize) -> Match {
+ assert!(self.start <= end);
+ Match { end, ..*self }
+ }
+
+ /// Offset this match by the given amount and return a new match.
+ ///
+ /// This adds the given offset to the start and end of this match, and
+ /// returns the resulting match.
+ ///
+ /// # Panics
+ ///
+ /// This panics if adding the given amount to either the start or end
+ /// offset would result in an overflow.
+ #[inline]
+ pub fn offset(&self, amount: usize) -> Match {
+ Match {
+ start: self.start.checked_add(amount).unwrap(),
+ end: self.end.checked_add(amount).unwrap(),
+ }
+ }
+
+ /// Returns the number of bytes in this match.
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.end - self.start
+ }
+
+ /// Returns true if and only if this match is empty.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+}
+
+impl ops::Index<Match> for [u8] {
+ type Output = [u8];
+
+ #[inline]
+ fn index(&self, index: Match) -> &[u8] {
+ &self[index.start..index.end]
+ }
+}
+
+impl ops::IndexMut<Match> for [u8] {
+ #[inline]
+ fn index_mut(&mut self, index: Match) -> &mut [u8] {
+ &mut self[index.start..index.end]
+ }
+}
+
+impl ops::Index<Match> for str {
+ type Output = str;
+
+ #[inline]
+ fn index(&self, index: Match) -> &str {
+ &self[index.start..index.end]
+ }
+}
+
+/// A line terminator.
+///
+/// A line terminator represents the end of a line. Generally, every line is
+/// either "terminated" by the end of a stream or a specific byte (or sequence
+/// of bytes).
+///
+/// Generally, a line terminator is a single byte, specifically, `\n`, on
+/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to
+/// as `CRLF` for `Carriage Return; Line Feed`).
+///
+/// The default line terminator is `\n` on all platforms.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+pub struct LineTerminator(LineTerminatorImp);
+
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
+enum LineTerminatorImp {
+ /// Any single byte representing a line terminator.
+ ///
+ /// We represent this as an array so we can safely convert it to a slice
+ /// for convenient access. At some point, we can use `std::slice::from_ref`
+ /// instead.
+ Byte([u8; 1]),
+ /// A line terminator represented by `\r\n`.
+ ///
+ /// When this option is used, consumers may generally treat a lone `\n` as
+ /// a line terminator in addition to `\r\n`.
+ CRLF,
+}
+
+impl LineTerminator {
+ /// Return a new single-byte line terminator. Any byte is valid.
+ #[inline]
+ pub fn byte(byte: u8) -> LineTerminator {
+ LineTerminator(LineTerminatorImp::Byte([byte]))
+ }
+
+ /// Return a new line terminator represented by `\r\n`.
+ ///
+ /// When this option is used, consumers may generally treat a lone `\n` as
+ /// a line terminator in addition to `\r\n`.
+ #[inline]
+ pub fn crlf() -> LineTerminator {
+ LineTerminator(LineTerminatorImp::CRLF)
+ }
+
+ /// Returns true if and only if this line terminator is CRLF.
+ #[inline]
+ pub fn is_crlf(&self) -> bool {
+ self.0 == LineTerminatorImp::CRLF
+ }
+
+ /// Returns this line terminator as a single byte.
+ ///
+ /// If the line terminator is CRLF, then this returns `\n`. This is
+ /// useful for routines that, for example, find line boundaries by treating
+ /// `\n` as a line terminator even when it isn't preceded by `\r`.
+ #[inline]
+ pub fn as_byte(&self) -> u8 {
+ match self.0 {
+ LineTerminatorImp::Byte(array) => array[0],
+ LineTerminatorImp::CRLF => b'\n',
+ }
+ }
+
+ /// Returns this line terminator as a sequence of bytes.
+ ///
+ /// This returns a singleton sequence for all line terminators except for
+ /// `CRLF`, in which case, it returns `\r\n`.
+ ///
+ /// The slice returned is guaranteed to have length at least `1`.
+ #[inline]
+ pub fn as_bytes(&self) -> &[u8] {
+ match self.0 {
+ LineTerminatorImp::Byte(ref array) => array,
+ LineTerminatorImp::CRLF => &[b'\r', b'\n'],
+ }
+ }
+}
+
+impl Default for LineTerminator {
+ #[inline]
+ fn default() -> LineTerminator {
+ LineTerminator::byte(b'\n')
+ }
+}
+
+/// A set of bytes.
+///
+/// In this crate, byte sets are used to express bytes that can never appear
+/// anywhere in a match for a particular implementation of the `Matcher` trait.
+/// Specifically, if such a set can be determined, then it's possible for
+/// callers to perform additional operations on the basis that certain bytes
+/// may never match.
+///
+/// For example, if a search is configured to possibly produce results that
+/// span multiple lines but a caller provided pattern can never match across
+/// multiple lines, then it may make sense to divert to more optimized line
+/// oriented routines that don't need to handle the multi-line match case.
+#[derive(Clone, Debug)]
+pub struct ByteSet(BitSet);
+
+#[derive(Clone, Copy)]
+struct BitSet([u64; 4]);
+
+impl fmt::Debug for BitSet {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut fmtd = f.debug_set();
+ for b in (0..256).map(|b| b as u8) {
+ if ByteSet(*self).contains(b) {
+ fmtd.entry(&b);
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+impl ByteSet {
+ /// Create an empty set of bytes.
+ pub fn empty() -> ByteSet {
+ ByteSet(BitSet([0; 4]))
+ }
+
+ /// Create a full set of bytes such that every possible byte is in the set
+ /// returned.
+ pub fn full() -> ByteSet {
+ ByteSet(BitSet([u64::MAX; 4]))
+ }
+
+ /// Add a byte to this set.
+ ///
+ /// If the given byte already belongs to this set, then this is a no-op.
+ pub fn add(&mut self, byte: u8) {
+ let bucket = byte / 64;
+ let bit = byte % 64;
+ (self.0).0[bucket as usize] |= 1 << bit;
+ }
+
+ /// Add an inclusive range of bytes.
+ pub fn add_all(&mut self, start: u8, end: u8) {
+ for b in (start as u64..end as u64 + 1).map(|b| b as u8) {
+ self.add(b);
+ }
+ }
+
+ /// Remove a byte from this set.
+ ///
+ /// If the given byte is not in this set, then this is a no-op.
+ pub fn remove(&mut self, byte: u8) {
+ let bucket = byte / 64;
+ let bit = byte % 64;
+ (self.0).0[bucket as usize] &= !(1 << bit);
+ }
+
+ /// Remove an inclusive range of bytes.
+ pub fn remove_all(&mut self, start: u8, end: u8) {
+ for b in (start as u64..end as u64 + 1).map(|b| b as u8) {
+ self.remove(b);
+ }
+ }
+
+ /// Return true if and only if the given byte is in this set.
+ pub fn contains(&self, byte: u8) -> bool {
+ let bucket = byte / 64;
+ let bit = byte % 64;
+ (self.0).0[bucket as usize] & (1 << bit) > 0
+ }
+}
+
+/// A trait that describes implementations of capturing groups.
+///
+/// When a matcher supports capturing group extraction, then it is the
+/// matcher's responsibility to provide an implementation of this trait.
+///
+/// Principally, this trait provides a way to access capturing groups
+/// in a uniform way that does not require any specific representation.
+/// Namely, different matcher implementations may require different in-memory
+/// representations of capturing groups. This trait permits matchers to
+/// maintain their specific in-memory representation.
+///
+/// Note that this trait explicitly does not provide a way to construct a new
+/// capture value. Instead, it is the responsibility of a `Matcher` to build
+/// one, which might require knowledge of the matcher's internal implementation
+/// details.
+pub trait Captures {
+ /// Return the total number of capturing groups. This includes capturing
+ /// groups that have not matched anything.
+ fn len(&self) -> usize;
+
+ /// Return the capturing group match at the given index. If no match of
+ /// that capturing group exists, then this returns `None`.
+ ///
+ /// When a matcher reports a match with capturing groups, then the first
+ /// capturing group (at index `0`) must always correspond to the offsets
+ /// for the overall match.
+ fn get(&self, i: usize) -> Option<Match>;
+
+ /// Returns true if and only if these captures are empty. This occurs
+ /// when `len` is `0`.
+ ///
+ /// Note that capturing groups that have non-zero length but otherwise
+ /// contain no matching groups are *not* empty.
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Expands all instances of `$name` in `replacement` to the corresponding
+ /// capture group `name`, and writes them to the `dst` buffer given.
+ ///
+ /// (Note: If you're looking for a convenient way to perform replacements
+ /// with interpolation, then you'll want to use the `replace_with_captures`
+ /// method on the `Matcher` trait.)
+ ///
+ /// `name` may be an integer corresponding to the index of the
+ /// capture group (counted by order of opening parenthesis where `0` is the
+ /// entire match) or it can be a name (consisting of letters, digits or
+ /// underscores) corresponding to a named capture group.
+ ///
+ /// A `name` is translated to a capture group index via the given
+ /// `name_to_index` function. If `name` isn't a valid capture group
+ /// (whether the name doesn't exist or isn't a valid index), then it is
+ /// replaced with the empty string.
+ ///
+ /// The longest possible name is used. e.g., `$1a` looks up the capture
+ /// group named `1a` and not the capture group at index `1`. To exert
+ /// more precise control over the name, use braces, e.g., `${1}a`. In all
+ /// cases, capture group names are limited to ASCII letters, numbers and
+ /// underscores.
+ ///
+ /// To write a literal `$` use `$$`.
+ ///
+ /// Note that the capture group match indices are resolved by slicing
+ /// the given `haystack`. Generally, this means that `haystack` should be
+ /// the same slice that was searched to get the current capture group
+ /// matches.
+ fn interpolate<F>(
+ &self,
+ name_to_index: F,
+ haystack: &[u8],
+ replacement: &[u8],
+ dst: &mut Vec<u8>,
+ ) where F: FnMut(&str) -> Option<usize>
+ {
+ interpolate(
+ replacement,
+ |i, dst| {
+ if let Some(range) = self.get(i) {
+ dst.extend(&haystack[range]);
+ }
+ },
+ name_to_index,
+ dst,
+ )
+ }
+}
+
+/// NoCaptures provides an always-empty implementation of the `Captures` trait.
+///
+/// This type is useful for implementations of `Matcher` that don't support
+/// capturing groups.
+#[derive(Clone, Debug)]
+pub struct NoCaptures(());
+
+impl NoCaptures {
+ /// Create an empty set of capturing groups.
+ pub fn new() -> NoCaptures { NoCaptures(()) }
+}
+
+impl Captures for NoCaptures {
+ fn len(&self) -> usize { 0 }
+ fn get(&self, _: usize) -> Option<Match> { None }
+}
+
+/// NoError provides an error type for matchers that never produce errors.
+///
+/// This error type implements the `std::error::Error` and `fmt::Display`
+/// traits for use in matcher implementations that can never produce errors.
+///
+/// The `fmt::Debug` and `fmt::Display` impls for this type panics.
+#[derive(Debug, Eq, PartialEq)]
+pub struct NoError(());
+
+impl ::std::error::Error for NoError {
+ fn description(&self) -> &str { "no error" }
+}
+
+impl fmt::Display for NoError {
+ fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result {
+ panic!("BUG for NoError: an impossible error occurred")
+ }
+}
+
+impl From<NoError> for io::Error {
+ fn from(_: NoError) -> io::Error {
+ panic!("BUG for NoError: an impossible error occurred")
+ }
+}
+
+/// The type of match for a line oriented matcher.
+#[derive(Clone, Copy, Debug)]
+pub enum LineMatchKind {
+ /// A position inside a line that is known to contain a match.
+ ///
+ /// This position can be anywhere in the line. It does not need to point
+ /// at the location of the match.
+ Confirmed(usize),
+ /// A position inside a line that may contain a match, and must be searched
+ /// for verification.
+ ///
+ /// This position can be anywhere in the line. It does not need to point
+ /// at the location of the match.
+ Candidate(usize),
+}
+
+/// A matcher defines an interface for regular expression implementations.
+///
+/// While this trait is large, there are only two required methods that
+/// implementors must provide: `find_at` and `new_captures`. If captures
+/// aren't supported by your implementation, then `new_captures` can be