diff options
author | Andrew Gallant <jamslam@gmail.com> | 2018-04-29 09:29:52 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2018-08-20 07:10:19 -0400 |
commit | d9ca5293569efb255608d3c601107bcfe7060f15 (patch) | |
tree | 7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-matcher | |
parent | 0958837ee104985412f08e81b6f08df1e5291042 (diff) |
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries
that roughly separate the following key distinct phases in a grep
implementation:
1. Pattern matching (e.g., by a regex engine).
2. Searching a file using a pattern matcher.
3. Printing results.
Ultimately, both (1) and (3) are defined by de-coupled interfaces, of
which there may be multiple implementations. Namely, (1) is satisfied by
the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by
the `Sink` trait in the `grep2` crate. The searcher (2) ties everything
together and finds results using a matcher and reports those results
using a `Sink` implementation.
Closes #162
Diffstat (limited to 'grep-matcher')
-rw-r--r-- | grep-matcher/Cargo.toml | 24 | ||||
-rw-r--r-- | grep-matcher/LICENSE-MIT | 21 | ||||
-rw-r--r-- | grep-matcher/README.md | 36 | ||||
-rw-r--r-- | grep-matcher/UNLICENSE | 24 | ||||
-rw-r--r-- | grep-matcher/src/interpolate.rs | 328 | ||||
-rw-r--r-- | grep-matcher/src/lib.rs | 1126 | ||||
-rw-r--r-- | grep-matcher/tests/test_matcher.rs | 208 | ||||
-rw-r--r-- | grep-matcher/tests/tests.rs | 6 | ||||
-rw-r--r-- | grep-matcher/tests/util.rs | 104 |
9 files changed, 1877 insertions, 0 deletions
diff --git a/grep-matcher/Cargo.toml b/grep-matcher/Cargo.toml new file mode 100644 index 00000000..8056dec1 --- /dev/null +++ b/grep-matcher/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "grep-matcher" +version = "0.0.1" #:version +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = """ +A trait for regular expressions, with a focus on line oriented search. +""" +documentation = "https://docs.rs/grep-matcher" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "pattern", "trait"] +license = "Unlicense/MIT" +autotests = false + +[dependencies] +memchr = "2" + +[dev-dependencies] +regex = "1" + +[[test]] +name = "integration" +path = "tests/tests.rs" diff --git a/grep-matcher/LICENSE-MIT b/grep-matcher/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-matcher/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-matcher/README.md b/grep-matcher/README.md new file mode 100644 index 00000000..f83ceade --- /dev/null +++ b/grep-matcher/README.md @@ -0,0 +1,36 @@ +grep-matcher +------------ +This crate provides a low level interface for describing regular expression +matchers. The `grep` crate uses this interface in order to make the regex +engine it uses pluggable. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-matcher.svg)](https://crates.io/crates/grep-matcher) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-matcher](https://docs.rs/grep-matcher) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-matcher = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_matcher; +``` diff --git a/grep-matcher/UNLICENSE b/grep-matcher/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-matcher/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-matcher/src/interpolate.rs b/grep-matcher/src/interpolate.rs new file mode 100644 index 00000000..168dd343 --- /dev/null +++ b/grep-matcher/src/interpolate.rs @@ -0,0 +1,328 @@ +use std::str; + +use memchr::memchr; + +/// Interpolate capture references in `replacement` and write the interpolation +/// result to `dst`. References in `replacement` take the form of $N or $name, +/// where `N` is a capture group index and `name` is a capture group name. The +/// function provided, `name_to_index`, maps capture group names to indices. +/// +/// The `append` function given is responsible for writing the replacement +/// to the `dst` buffer. That is, it is called with the capture group index +/// of a capture group reference and is expected to resolve the index to its +/// corresponding matched text. If no such match exists, then `append` should +/// not write anything to its given buffer. +pub fn interpolate<A, N>( + mut replacement: &[u8], + mut append: A, + mut name_to_index: N, + dst: &mut Vec<u8>, +) where + A: FnMut(usize, &mut Vec<u8>), + N: FnMut(&str) -> Option<usize> +{ + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately proceding the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From<usize> for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. +fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> { + let mut i = 0; + if replacement.len() <= 1 || replacement[0] != b'$' { + return None; + } + let mut brace = false; + i += 1; + if replacement[i] == b'{' { + brace = true; + i += 1; + } + let mut cap_end = i; + while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check with an unchecked conversion or by parsing the number straight + // from &[u8]. + let cap = str::from_utf8(&replacement[i..cap_end]) + .expect("valid UTF-8 capture name"); + if brace { + if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + return None; + } + cap_end += 1; + } + Some(CaptureRef { + cap: match cap.parse::<u32>() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name. +fn is_valid_cap_letter(b: &u8) -> bool { + match *b { + b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref, interpolate}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + + // A convenience routine for using interpolate's unwieldy but flexible API. + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + interpolate( + replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend(s.as_bytes()); + } + }, + |name| -> Option<usize> { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!($expected, interpolate_string($map, $caps, $hay)); + } + } + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test ${} test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test ${ } test", + ); + + interp!( + interp13, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test ${a b} test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); +} diff --git a/grep-matcher/src/lib.rs b/grep-matcher/src/lib.rs new file mode 100644 index 00000000..49d6358f --- /dev/null +++ b/grep-matcher/src/lib.rs @@ -0,0 +1,1126 @@ +/*! +This crate provides an interface for regular expressions, with a focus on line +oriented search. The purpose of this crate is to provide a low level matching +interface that permits any kind of substring or regex implementation to power +the search routines provided by the +[`grep-searcher`](https://docs.rs/grep-searcher) +crate. + +The primary thing provided by this crate is the +[`Matcher`](trait.Matcher.html) +trait. The trait defines an abstract interface for text search. It is robust +enough to support everything from basic substring search all the way to +arbitrarily complex regular expression implementations without sacrificing +performance. + +A key design decision made in this crate is the use of *internal iteration*, +or otherwise known as the "push" model of searching. In this paradigm, +implementations of the `Matcher` trait will drive search and execute callbacks +provided by the caller when a match is found. This is in contrast to the +usual style of *external iteration* (the "pull" model) found throughout the +Rust ecosystem. There are two primary reasons why internal iteration was +chosen: + +* Some search implementations may themselves require internal iteration. + Converting an internal iterator to an external iterator can be non-trivial + and sometimes even practically impossible. +* Rust's type system isn't quite expressive enough to write a generic interface + using external iteration without giving something else up (namely, ease of + use and/or performance). + +In other words, internal iteration was chosen because it is the lowest common +denominator and because it is probably the least bad way of expressing the +interface in today's Rust. As a result, this trait isn't specifically intended +for everyday use, although, you might find it to be a happy price to pay if you +want to write code that is generic over multiple different regex +implementations. +*/ + +#![deny(missing_docs)] + +extern crate memchr; + +use std::fmt; +use std::io; +use std::ops; +use std::u64; + +use interpolate::interpolate; + +mod interpolate; + +/// The type of a match. +/// +/// The type of a match is a possibly empty range pointing to a contiguous +/// block of addressable memory. +/// +/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`. +/// +/// # Indexing +/// +/// This type is structurally identical to `std::ops::Range<usize>`, but +/// is a bit more ergonomic for dealing with match indices. In particular, +/// this type implements `Copy` and provides methods for building new `Match` +/// values based on old `Match` values. Finally, the invariant that `start` +/// is always less than or equal to `end` is enforced. +/// +/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using +/// range notation. e.g., +/// +/// ``` +/// use grep_matcher::Match; +/// +/// let m = Match::new(2, 5); +/// let bytes = b"abcdefghi"; +/// assert_eq!(b"cde", &bytes[m]); +/// ``` +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + start: usize, + end: usize, +} + +impl Match { + /// Create a new match. + /// + /// # Panics + /// + /// This function panics if `start > end`. + #[inline] + pub fn new(start: usize, end: usize) -> Match { + assert!(start <= end); + Match { start, end } + } + + /// Creates a zero width match at the given offset. + #[inline] + pub fn zero(offset: usize) -> Match { + Match { start: offset, end: offset } + } + + /// Return the start offset of this match. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Return the end offset of this match. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Return a new match with the start offset replaced with the given + /// value. + /// + /// # Panics + /// + /// This method panics if `start > self.end`. + #[inline] + pub fn with_start(&self, start: usize) -> Match { + assert!(start <= self.end); + Match { start, ..*self } + } + + /// Return a new match with the end offset replaced with the given + /// value. + /// + /// # Panics + /// + /// This method panics if `self.start > end`. + #[inline] + pub fn with_end(&self, end: usize) -> Match { + assert!(self.start <= end); + Match { end, ..*self } + } + + /// Offset this match by the given amount and return a new match. + /// + /// This adds the given offset to the start and end of this match, and + /// returns the resulting match. + /// + /// # Panics + /// + /// This panics if adding the given amount to either the start or end + /// offset would result in an overflow. + #[inline] + pub fn offset(&self, amount: usize) -> Match { + Match { + start: self.start.checked_add(amount).unwrap(), + end: self.end.checked_add(amount).unwrap(), + } + } + + /// Returns the number of bytes in this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns true if and only if this match is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } +} + +impl ops::Index<Match> for [u8] { + type Output = [u8]; + + #[inline] + fn index(&self, index: Match) -> &[u8] { + &self[index.start..index.end] + } +} + +impl ops::IndexMut<Match> for [u8] { + #[inline] + fn index_mut(&mut self, index: Match) -> &mut [u8] { + &mut self[index.start..index.end] + } +} + +impl ops::Index<Match> for str { + type Output = str; + + #[inline] + fn index(&self, index: Match) -> &str { + &self[index.start..index.end] + } +} + +/// A line terminator. +/// +/// A line terminator represents the end of a line. Generally, every line is +/// either "terminated" by the end of a stream or a specific byte (or sequence +/// of bytes). +/// +/// Generally, a line terminator is a single byte, specifically, `\n`, on +/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to +/// as `CRLF` for `Carriage Return; Line Feed`). +/// +/// The default line terminator is `\n` on all platforms. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct LineTerminator(LineTerminatorImp); + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +enum LineTerminatorImp { + /// Any single byte representing a line terminator. + /// + /// We represent this as an array so we can safely convert it to a slice + /// for convenient access. At some point, we can use `std::slice::from_ref` + /// instead. + Byte([u8; 1]), + /// A line terminator represented by `\r\n`. + /// + /// When this option is used, consumers may generally treat a lone `\n` as + /// a line terminator in addition to `\r\n`. + CRLF, +} + +impl LineTerminator { + /// Return a new single-byte line terminator. Any byte is valid. + #[inline] + pub fn byte(byte: u8) -> LineTerminator { + LineTerminator(LineTerminatorImp::Byte([byte])) + } + + /// Return a new line terminator represented by `\r\n`. + /// + /// When this option is used, consumers may generally treat a lone `\n` as + /// a line terminator in addition to `\r\n`. + #[inline] + pub fn crlf() -> LineTerminator { + LineTerminator(LineTerminatorImp::CRLF) + } + + /// Returns true if and only if this line terminator is CRLF. + #[inline] + pub fn is_crlf(&self) -> bool { + self.0 == LineTerminatorImp::CRLF + } + + /// Returns this line terminator as a single byte. + /// + /// If the line terminator is CRLF, then this returns `\n`. This is + /// useful for routines that, for example, find line boundaries by treating + /// `\n` as a line terminator even when it isn't preceded by `\r`. + #[inline] + pub fn as_byte(&self) -> u8 { + match self.0 { + LineTerminatorImp::Byte(array) => array[0], + LineTerminatorImp::CRLF => b'\n', + } + } + + /// Returns this line terminator as a sequence of bytes. + /// + /// This returns a singleton sequence for all line terminators except for + /// `CRLF`, in which case, it returns `\r\n`. + /// + /// The slice returned is guaranteed to have length at least `1`. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + match self.0 { + LineTerminatorImp::Byte(ref array) => array, + LineTerminatorImp::CRLF => &[b'\r', b'\n'], + } + } +} + +impl Default for LineTerminator { + #[inline] + fn default() -> LineTerminator { + LineTerminator::byte(b'\n') + } +} + +/// A set of bytes. +/// +/// In this crate, byte sets are used to express bytes that can never appear +/// anywhere in a match for a particular implementation of the `Matcher` trait. +/// Specifically, if such a set can be determined, then it's possible for +/// callers to perform additional operations on the basis that certain bytes +/// may never match. +/// +/// For example, if a search is configured to possibly produce results that +/// span multiple lines but a caller provided pattern can never match across +/// multiple lines, then it may make sense to divert to more optimized line +/// oriented routines that don't need to handle the multi-line match case. +#[derive(Clone, Debug)] +pub struct ByteSet(BitSet); + +#[derive(Clone, Copy)] +struct BitSet([u64; 4]); + +impl fmt::Debug for BitSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut fmtd = f.debug_set(); + for b in (0..256).map(|b| b as u8) { + if ByteSet(*self).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +impl ByteSet { + /// Create an empty set of bytes. + pub fn empty() -> ByteSet { + ByteSet(BitSet([0; 4])) + } + + /// Create a full set of bytes such that every possible byte is in the set + /// returned. + pub fn full() -> ByteSet { + ByteSet(BitSet([u64::MAX; 4])) + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. + pub fn add(&mut self, byte: u8) { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] |= 1 << bit; + } + + /// Add an inclusive range of bytes. + pub fn add_all(&mut self, start: u8, end: u8) { + for b in (start as u64..end as u64 + 1).map(|b| b as u8) { + self.add(b); + } + } + + /// Remove a byte from this set. + /// + /// If the given byte is not in this set, then this is a no-op. + pub fn remove(&mut self, byte: u8) { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] &= !(1 << bit); + } + + /// Remove an inclusive range of bytes. + pub fn remove_all(&mut self, start: u8, end: u8) { + for b in (start as u64..end as u64 + 1).map(|b| b as u8) { + self.remove(b); + } + } + + /// Return true if and only if the given byte is in this set. + pub fn contains(&self, byte: u8) -> bool { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] & (1 << bit) > 0 + } +} + +/// A trait that describes implementations of capturing groups. +/// +/// When a matcher supports capturing group extraction, then it is the +/// matcher's responsibility to provide an implementation of this trait. +/// +/// Principally, this trait provides a way to access capturing groups +/// in a uniform way that does not require any specific representation. +/// Namely, different matcher implementations may require different in-memory +/// representations of capturing groups. This trait permits matchers to +/// maintain their specific in-memory representation. +/// +/// Note that this trait explicitly does not provide a way to construct a new +/// capture value. Instead, it is the responsibility of a `Matcher` to build +/// one, which might require knowledge of the matcher's internal implementation +/// details. +pub trait Captures { + /// Return the total number of capturing groups. This includes capturing + /// groups that have not matched anything. + fn len(&self) -> usize; + + /// Return the capturing group match at the given index. If no match of + /// that capturing group exists, then this returns `None`. + /// + /// When a matcher reports a match with capturing groups, then the first + /// capturing group (at index `0`) must always correspond to the offsets + /// for the overall match. + fn get(&self, i: usize) -> Option<Match>; + + /// Returns true if and only if these captures are empty. This occurs + /// when `len` is `0`. + /// + /// Note that capturing groups that have non-zero length but otherwise + /// contain no matching groups are *not* empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Expands all instances of `$name` in `replacement` to the corresponding + /// capture group `name`, and writes them to the `dst` buffer given. + /// + /// (Note: If you're looking for a convenient way to perform replacements + /// with interpolation, then you'll want to use the `replace_with_captures` + /// method on the `Matcher` trait.) + /// + /// `name` may be an integer corresponding to the index of the + /// capture group (counted by order of opening parenthesis where `0` is the + /// entire match) or it can be a name (consisting of letters, digits or + /// underscores) corresponding to a named capture group. + /// + /// A `name` is translated to a capture group index via the given + /// `name_to_index` function. If `name` isn't a valid capture group + /// (whether the name doesn't exist or isn't a valid index), then it is + /// replaced with the empty string. + /// + /// The longest possible name is used. e.g., `$1a` looks up the capture + /// group named `1a` and not the capture group at index `1`. To exert + /// more precise control over the name, use braces, e.g., `${1}a`. In all + /// cases, capture group names are limited to ASCII letters, numbers and + /// underscores. + /// + /// To write a literal `$` use `$$`. + /// + /// Note that the capture group match indices are resolved by slicing + /// the given `haystack`. Generally, this means that `haystack` should be + /// the same slice that was searched to get the current capture group + /// matches. + fn interpolate<F>( + &self, + name_to_index: F, + haystack: &[u8], + replacement: &[u8], + dst: &mut Vec<u8>, + ) where F: FnMut(&str) -> Option<usize> + { + interpolate( + replacement, + |i, dst| { + if let Some(range) = self.get(i) { + dst.extend(&haystack[range]); + } + }, + name_to_index, + dst, + ) + } +} + +/// NoCaptures provides an always-empty implementation of the `Captures` trait. +/// +/// This type is useful for implementations of `Matcher` that don't support +/// capturing groups. +#[derive(Clone, Debug)] +pub struct NoCaptures(()); + +impl NoCaptures { + /// Create an empty set of capturing groups. + pub fn new() -> NoCaptures { NoCaptures(()) } +} + +impl Captures for NoCaptures { + fn len(&self) -> usize { 0 } + fn get(&self, _: usize) -> Option<Match> { None } +} + +/// NoError provides an error type for matchers that never produce errors. +/// +/// This error type implements the `std::error::Error` and `fmt::Display` +/// traits for use in matcher implementations that can never produce errors. +/// +/// The `fmt::Debug` and `fmt::Display` impls for this type panics. +#[derive(Debug, Eq, PartialEq)] +pub struct NoError(()); + +impl ::std::error::Error for NoError { + fn description(&self) -> &str { "no error" } +} + +impl fmt::Display for NoError { + fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result { + panic!("BUG for NoError: an impossible error occurred") + } +} + +impl From<NoError> for io::Error { + fn from(_: NoError) -> io::Error { + panic!("BUG for NoError: an impossible error occurred") + } +} + +/// The type of match for a line oriented matcher. +#[derive(Clone, Copy, Debug)] +pub enum LineMatchKind { + /// A position inside a line that is known to contain a match. + /// + /// This position can be anywhere in the line. It does not need to point + /// at the location of the match. + Confirmed(usize), + /// A position inside a line that may contain a match, and must be searched + /// for verification. + /// + /// This position can be anywhere in the line. It does not need to point + /// at the location of the match. + Candidate(usize), +} + +/// A matcher defines an interface for regular expression implementations. +/// +/// While this trait is large, there are only two required methods that +/// implementors must provide: `find_at` and `new_captures`. If captures +/// aren't su |