diff options
author | Andrew Gallant <jamslam@gmail.com> | 2018-04-29 09:29:52 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2018-08-20 07:10:19 -0400 |
commit | d9ca5293569efb255608d3c601107bcfe7060f15 (patch) | |
tree | 7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-pcre2 | |
parent | 0958837ee104985412f08e81b6f08df1e5291042 (diff) |
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries
that roughly separate the following key distinct phases in a grep
implementation:
1. Pattern matching (e.g., by a regex engine).
2. Searching a file using a pattern matcher.
3. Printing results.
Ultimately, both (1) and (3) are defined by de-coupled interfaces, of
which there may be multiple implementations. Namely, (1) is satisfied by
the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by
the `Sink` trait in the `grep2` crate. The searcher (2) ties everything
together and finds results using a matcher and reports those results
using a `Sink` implementation.
Closes #162
Diffstat (limited to 'grep-pcre2')
-rw-r--r-- | grep-pcre2/Cargo.toml | 17 | ||||
-rw-r--r-- | grep-pcre2/LICENSE-MIT | 21 | ||||
-rw-r--r-- | grep-pcre2/README.md | 39 | ||||
-rw-r--r-- | grep-pcre2/UNLICENSE | 24 | ||||
-rw-r--r-- | grep-pcre2/src/error.rs | 59 | ||||
-rw-r--r-- | grep-pcre2/src/lib.rs | 15 | ||||
-rw-r--r-- | grep-pcre2/src/matcher.rs | 425 |
7 files changed, 600 insertions, 0 deletions
diff --git a/grep-pcre2/Cargo.toml b/grep-pcre2/Cargo.toml new file mode 100644 index 00000000..28d1f167 --- /dev/null +++ b/grep-pcre2/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "grep-pcre2" +version = "0.0.1" #:version +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = """ +Use PCRE2 with the 'grep' crate. +""" +documentation = "https://docs.rs/grep-pcre2" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "pcre", "backreference", "look"] +license = "Unlicense/MIT" + +[dependencies] +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +pcre2 = "0.1" diff --git a/grep-pcre2/LICENSE-MIT b/grep-pcre2/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-pcre2/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-pcre2/README.md b/grep-pcre2/README.md new file mode 100644 index 00000000..7b904256 --- /dev/null +++ b/grep-pcre2/README.md @@ -0,0 +1,39 @@ +grep-pcre2 +---------- +The `grep-pcre2` crate provides an implementation of the `Matcher` trait from +the `grep-matcher` crate. This implementation permits PCRE2 to be used in the +`grep` crate for fast line oriented searching. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-pcre2.svg)](https://crates.io/crates/grep-pcre2) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-pcre2](https://docs.rs/grep-pcre2) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + +If you're looking to just use PCRE2 from Rust, then you probably want the +[`pcre2`](https://docs.rs/pcre2) +crate, which provide high level safe bindings to PCRE2. + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-pcre2 = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_pcre2; +``` diff --git a/grep-pcre2/UNLICENSE b/grep-pcre2/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-pcre2/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-pcre2/src/error.rs b/grep-pcre2/src/error.rs new file mode 100644 index 00000000..7d0b17bb --- /dev/null +++ b/grep-pcre2/src/error.rs @@ -0,0 +1,59 @@ +use std::error; +use std::fmt; + +/// An error that can occur in this crate. +/// +/// Generally, this error corresponds to problems building a regular +/// expression, whether it's in parsing, compilation or a problem with +/// guaranteeing a configured optimization. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +impl Error { + pub(crate) fn regex<E: error::Error>(err: E) -> Error { + Error { kind: ErrorKind::Regex(err.to_string()) } + } + + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +/// The kind of an error that can occur. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurred as a result of parsing a regular expression. + /// This can be a syntax error or an error that results from attempting to + /// compile a regular expression that is too big. + /// + /// The string here is the underlying error converted to a string. + Regex(String), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::Regex(_) => "regex error", + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Regex(ref s) => write!(f, "{}", s), + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/grep-pcre2/src/lib.rs b/grep-pcre2/src/lib.rs new file mode 100644 index 00000000..245ceab7 --- /dev/null +++ b/grep-pcre2/src/lib.rs @@ -0,0 +1,15 @@ +/*! +An implementation of `grep-matcher`'s `Matcher` trait for +[PCRE2](https://www.pcre.org/). +*/ + +#![deny(missing_docs)] + +extern crate grep_matcher; +extern crate pcre2; + +pub use error::{Error, ErrorKind}; +pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}; + +mod error; +mod matcher; diff --git a/grep-pcre2/src/matcher.rs b/grep-pcre2/src/matcher.rs new file mode 100644 index 00000000..e9c51be2 --- /dev/null +++ b/grep-pcre2/src/matcher.rs @@ -0,0 +1,425 @@ +use std::collections::HashMap; + +use grep_matcher::{Captures, Match, Matcher}; +use pcre2::bytes::{CaptureLocations, Regex, RegexBuilder}; + +use error::Error; + +/// A builder for configuring the compilation of a PCRE2 regex. +#[derive(Clone, Debug)] +pub struct RegexMatcherBuilder { + builder: RegexBuilder, + case_smart: bool, + word: bool, +} + +impl RegexMatcherBuilder { + /// Create a new matcher builder with a default configuration. + pub fn new() -> RegexMatcherBuilder { + RegexMatcherBuilder { + builder: RegexBuilder::new(), + case_smart: false, + word: false, + } + } + + /// Compile the given pattern into a PCRE matcher using the current + /// configuration. + /// + /// If there was a problem compiling the pattern, then an error is + /// returned. + pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> { + let mut builder = self.builder.clone(); + if self.case_smart && !has_uppercase_literal(pattern) { + builder.caseless(true); + } + let res = + if self.word { + let pattern = format!(r"(?<!\w)(?:{})(?!\w)", pattern); + builder.build(&pattern) + } else { + builder.build(pattern) + }; + res.map_err(Error::regex).map(|regex| { + let mut names = HashMap::new(); + for (i, name) in regex.capture_names().iter().enumerate() { + if let Some(ref name) = *name { + names.insert(name.to_string(), i); + } + } + RegexMatcher { regex, names } + }) + } + + /// Enables case insensitive matching. + /// + /// If the `utf` option is also set, then Unicode case folding is used + /// to determine case insensitivity. When the `utf` option is not set, + /// then only standard ASCII case insensitivity is considered. + /// + /// This option corresponds to the `i` flag. + pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.caseless(yes); + self + } + + /// Whether to enable "smart case" or not. + /// + /// When smart case is enabled, the builder will automatically enable + /// case insensitive matching based on how the pattern is written. Namely, + /// case insensitive mode is enabled when both of the following things + /// are believed to be true: + /// + /// 1. The pattern contains at least one literal character. For example, + /// `a\w` contains a literal (`a`) but `\w` does not. + /// 2. Of the literals in the pattern, none of them are considered to be + /// uppercase according to Unicode. For example, `foo\pL` has no + /// uppercase literals but `Foo\pL` does. + /// + /// Note that the implementation of this is not perfect. Namely, `\p{Ll}` + /// will prevent case insensitive matching even though it is part of a meta + /// sequence. This bug will probably never be fixed. + pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.case_smart = yes; + self + } + + /// Enables "dot all" matching. + /// + /// When enabled, the `.` metacharacter in the pattern matches any + /// character, include `\n`. When disabled (the default), `.` will match + /// any character except for `\n`. + /// + /// This option corresponds to the `s` flag. + pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.dotall(yes); + self + } + + /// Enable "extended" mode in the pattern, where whitespace is ignored. + /// + /// This option corresponds to the `x` flag. + pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.extended(yes); + self + } + + /// Enable multiline matching mode. + /// + /// When enabled, the `^` and `$` anchors will match both at the beginning + /// and end of a subject string, in addition to matching at the start of + /// a line and the end of a line. When disabled, the `^` and `$` anchors + /// will only match at the beginning and end of a subject string. + /// + /// This option corresponds to the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.multi_line(yes); + self + } + + /// Enable matching of CRLF as a line terminator. + /// + /// When enabled, anchors such as `^` and `$` will match any of the + /// following as a line terminator: `\r`, `\n` or `\r\n`. + /// + /// This is disabled by default, in which case, only `\n` is recognized as + /// a line terminator. + pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.crlf(yes); + self + } + + /// Require that all matches occur on word boundaries. + /// + /// Enabling this option is subtly different than putting `\b` assertions + /// on both sides of your pattern. In particular, a `\b` assertion requires + /// that one side of it match a word character while the other match a + /// non-word character. This option, in contrast, merely requires that + /// one side match a non-word character. + /// + /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a + /// word character. However, `-2` with this `word` option enabled will + /// match the `-2` in `foo -2 bar`. + pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.word = yes; + self + } + + /// Enable Unicode matching mode. + /// + /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, + /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. + /// + /// When set, this implies UTF matching mode. It is not possible to enable + /// Unicode matching mode without enabling UTF matching mode. + /// + /// This is disabled by default. + pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.ucp(yes); + self + } + + /// Enable UTF matching mode. + /// + /// When enabled, characters are treated as sequences of code units that + /// make up a single codepoint instead of as single bytes. For example, + /// this will cause `.` to match any single UTF-8 encoded codepoint, where + /// as when this is disabled, `.` will any single byte (except for `\n` in + /// both cases, unless "dot all" mode is enabled). + /// + /// Note that when UTF matching mode is enabled, every search performed + /// will do a UTF-8 validation check, which can impact performance. The + /// UTF-8 check can be disabled via the `disable_utf_check` option, but it + /// is undefined behavior to enable UTF matching mode and search invalid + /// UTF-8. + /// + /// This is disabled by default. + pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.utf(yes); + self + } + + /// When UTF matching mode is enabled, this will disable the UTF checking + /// that PCRE2 will normally perform automatically. If UTF matching mode + /// is not enabled, then this has no effect. + /// + /// UTF checking is enabled by default when UTF matching mode is enabled. + /// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2 + /// will return an error if you attempt to search a subject string that is + /// not valid UTF-8. + /// + /// # Safety + /// + /// It is undefined behavior to disable the UTF check in UTF matching mode + /// and search a subject string that is not valid UTF-8. When the UTF check + /// is disabled, callers must guarantee that the subject string is valid + /// UTF-8. + pub unsafe fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder { + self.builder.disable_utf_check(); + self + } + + /// Enable PCRE2's JIT. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern. + /// + /// This is disabled by default. + pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.jit(yes); + self + } +} + +/// An implementation of the `Matcher` trait using PCRE2. +#[derive(Clone, Debug)] +pub struct RegexMatcher { + regex: Regex, + names: HashMap<String, usize>, +} + +impl RegexMatcher { + /// Create a new matcher from the given pattern using the default + /// configuration. + pub fn new(pattern: &str) -> Result<RegexMatcher, Error> { + RegexMatcherBuilder::new().build(pattern) + } +} + +impl Matcher for RegexMatcher { + type Captures = RegexCaptures; + type Error = Error; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result<Option<Match>, Error> { + Ok(self.regex + .find_at(haystack, at) + .map_err(Error::regex)? + .map(|m| Match::new(m.start(), m.end()))) + } + + fn new_captures(&self) -> Result<RegexCaptures, Error> { + Ok(RegexCaptures::new(self.regex.capture_locations())) + } + + fn capture_count(&self) -> usize { + self.regex.captures_len() + } + + fn capture_index(&self, name: &str) -> Option<usize> { + self.names.get(name).map(|i| *i) + } + + fn try_find_iter<F, E>( + &self, + haystack: &[u8], + mut matched: F, + ) -> Result<Result<(), E>, Error> + where F: FnMut(Match) -> Result<bool, E> + { + for result in self.regex.find_iter(haystack) { + let m = result.map_err(Error::regex)?; + match matched(Match::new(m.start(), m.end())) { + Ok(true) => continue, + Ok(false) => return Ok(Ok(())), + Err(err) => return Ok(Err(err)), + } + } + Ok(Ok(())) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result<bool, Error> { + Ok(self.regex + .captures_read_at(&mut caps.locs, haystack, at) + .map_err(Error::regex)? + .is_some()) + } +} + +/// Represents the match offsets of each capturing group in a match. +/// +/// The first, or `0`th capture group, always corresponds to the entire match +/// and is guaranteed to be present when a match occurs. The next capture +/// group, at index `1`, corresponds to the first capturing group in the regex, +/// ordered by the position at which the left opening parenthesis occurs. +/// +/// Note that not all capturing groups are guaranteed to be present in a match. +/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo` +/// or `bar` will ever be set in any given match. +/// +/// In order to access a capture group by name, you'll need to first find the +/// index of the group using the corresponding matcher's `capture_index` +/// method, and then use that index with `RegexCaptures::get`. +#[derive(Clone, Debug)] +pub struct RegexCaptures { + /// Where the locations are stored. + locs: CaptureLocations, +} + +impl Captures for RegexCaptures { + fn len(&self) -> usize { + self.locs.len() + } + + fn get(&self, i: usize) -> Option<Match> { + self.locs.get(i).map(|(s, e)| Match::new(s, e)) + } +} + +impl RegexCaptures { + pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures { + RegexCaptures { locs } + } +} + +/// Determine whether the pattern contains an uppercase character which should +/// negate the effect of the smart-case option. +/// +/// Ideally we would be able to check the AST in order to correctly handle +/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly +/// cased), but PCRE doesn't expose enough details for that kind of analysis. +/// For now, our 'good enough' solution is to simply perform a semi-naïve +/// scan of the input pattern and ignore all characters following a '\'. The +/// This at least lets us support the most common cases, like 'foo\w' and +/// 'foo\S', in an intuitive manner. +fn has_uppercase_literal(pattern: &str) -> bool { + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + if c == '\\' { + chars.next(); + } else if c.is_uppercase() { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use grep_matcher::{LineMatchKind, Matcher}; + use super::*; + + // Test that enabling word matches does the right thing and demonstrate + // the difference between it and surrounding the regex in `\b`. + #[test] + fn word() { + let matcher = RegexMatcherBuilder::new() + .word(true) + .build(r"-2") + .unwrap(); + assert!(matcher.is_match(b"abc -2 foo").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .word(false) + .build(r"\b-2\b") + .unwrap(); + assert!(!matcher.is_match(b"abc -2 foo").unwrap()); + } + + // Test that enabling CRLF permits `$` to match at the end of a line. + #[test] + fn line_terminator_crlf() { + // Test normal use of `$` with a `\n` line terminator. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\n").unwrap()); + + // Test that `$` doesn't match at `\r\n` boundary normally. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(!matcher.is_match(b"abc\r\n").unwrap()); + + // Now check the CRLF handling. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .crlf(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\r\n").unwrap()); + } + + // Test that smart case works. + #[test] + fn case_smart() { + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"abc") + .unwrap(); + assert!(matcher.is_match(b"ABC").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"aBc") + .unwrap(); + assert!(!matcher.is_match(b"ABC").unwrap()); + } + + // Test that finding candidate lines works as expected. + #[test] + fn candidate_lines() { + fn is_confirmed(m: LineMatchKind) -> bool { + match m { + LineMatchKind::Confirmed(_) => true, + _ => false, + } + } + + let matcher = RegexMatcherBuilder::new() + .build(r"\wfoo\s") + .unwrap(); + let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap(); + assert!(is_confirmed(m)); + } +} |