diff options
author | Andrew Gallant <jamslam@gmail.com> | 2018-04-29 09:29:52 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2018-08-20 07:10:19 -0400 |
commit | d9ca5293569efb255608d3c601107bcfe7060f15 (patch) | |
tree | 7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-regex | |
parent | 0958837ee104985412f08e81b6f08df1e5291042 (diff) |
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries
that roughly separate the following key distinct phases in a grep
implementation:
1. Pattern matching (e.g., by a regex engine).
2. Searching a file using a pattern matcher.
3. Printing results.
Ultimately, both (1) and (3) are defined by de-coupled interfaces, of
which there may be multiple implementations. Namely, (1) is satisfied by
the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by
the `Sink` trait in the `grep2` crate. The searcher (2) ties everything
together and finds results using a matcher and reports those results
using a `Sink` implementation.
Closes #162
Diffstat (limited to 'grep-regex')
-rw-r--r-- | grep-regex/Cargo.toml | 21 | ||||
-rw-r--r-- | grep-regex/LICENSE-MIT | 21 | ||||
-rw-r--r-- | grep-regex/README.md | 35 | ||||
-rw-r--r-- | grep-regex/UNLICENSE | 24 | ||||
-rw-r--r-- | grep-regex/src/ast.rs | 263 | ||||
-rw-r--r-- | grep-regex/src/config.rs | 265 | ||||
-rw-r--r-- | grep-regex/src/crlf.rs | 83 | ||||
-rw-r--r-- | grep-regex/src/error.rs | 88 | ||||
-rw-r--r-- | grep-regex/src/lib.rs | 27 | ||||
-rw-r--r-- | grep-regex/src/literal.rs | 304 | ||||
-rw-r--r-- | grep-regex/src/matcher.rs | 864 | ||||
-rw-r--r-- | grep-regex/src/non_matching.rs | 128 | ||||
-rw-r--r-- | grep-regex/src/strip.rs | 154 | ||||
-rw-r--r-- | grep-regex/src/util.rs | 29 | ||||
-rw-r--r-- | grep-regex/src/word.rs | 196 |
15 files changed, 2502 insertions, 0 deletions
diff --git a/grep-regex/Cargo.toml b/grep-regex/Cargo.toml new file mode 100644 index 00000000..e39c68dd --- /dev/null +++ b/grep-regex/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "grep-regex" +version = "0.0.1" #:version +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = """ +Use Rust's regex library with the 'grep' crate. +""" +documentation = "https://docs.rs/grep-regex" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "search", "pattern", "line"] +license = "Unlicense/MIT" + +[dependencies] +log = "0.4" +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +regex = "1" +regex-syntax = "0.6" +thread_local = "0.3.5" +utf8-ranges = "1" diff --git a/grep-regex/LICENSE-MIT b/grep-regex/LICENSE-MIT new file mode 100644 index 00000000..3b0a5dc0 --- /dev/null +++ b/grep-regex/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-regex/README.md b/grep-regex/README.md new file mode 100644 index 00000000..7940c867 --- /dev/null +++ b/grep-regex/README.md @@ -0,0 +1,35 @@ +grep-regex +---------- +The `grep-regex` crate provides an implementation of the `Matcher` trait from +the `grep-matcher` crate. This implementation permits Rust's regex engine to +be used in the `grep` crate for fast line oriented searching. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-regex) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-regex](https://docs.rs/grep-regex) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-regex = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_regex; +``` diff --git a/grep-regex/UNLICENSE b/grep-regex/UNLICENSE new file mode 100644 index 00000000..68a49daa --- /dev/null +++ b/grep-regex/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org/> diff --git a/grep-regex/src/ast.rs b/grep-regex/src/ast.rs new file mode 100644 index 00000000..4e6067ee --- /dev/null +++ b/grep-regex/src/ast.rs @@ -0,0 +1,263 @@ +use regex_syntax::ast::{self, Ast}; +use regex_syntax::ast::parse::Parser; + +/// The results of analyzing AST of a regular expression (e.g., for supporting +/// smart case). +#[derive(Clone, Debug)] +pub struct AstAnalysis { + /// True if and only if a literal uppercase character occurs in the regex. + any_uppercase: bool, + /// True if and only if the regex contains any literal at all. + any_literal: bool, + /// True if and only if the regex consists entirely of a literal and no + /// other special regex characters. + all_verbatim_literal: bool, +} + +impl AstAnalysis { + /// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`. + /// + /// If `pattern` is not a valid regular expression, then `None` is + /// returned. + #[allow(dead_code)] + pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> { + Parser::new() + .parse(pattern) + .map(|ast| AstAnalysis::from_ast(&ast)) + .ok() + } + + /// Perform an AST analysis given the AST. + pub fn from_ast(ast: &Ast) -> AstAnalysis { + let mut analysis = AstAnalysis::new(); + analysis.from_ast_impl(ast); + analysis + } + + /// Returns true if and only if a literal uppercase character occurs in + /// the pattern. + /// + /// For example, a pattern like `\pL` contains no uppercase literals, + /// even though `L` is uppercase and the `\pL` class contains uppercase + /// characters. + pub fn any_uppercase(&self) -> bool { + self.any_uppercase + } + + /// Returns true if and only if the regex contains any literal at all. + /// + /// For example, a pattern like `\pL` reports `false`, but a pattern like + /// `\pLfoo` reports `true`. + pub fn any_literal(&self) -> bool { + self.any_literal + } + + /// Returns true if and only if the entire pattern is a verbatim literal + /// with no special meta characters. + /// + /// When this is true, then the pattern satisfies the following law: + /// `escape(pattern) == pattern`. Notable examples where this returns + /// `false` include patterns like `a\u0061` even though `\u0061` is just + /// a literal `a`. + /// + /// The purpose of this flag is to determine whether the patterns can be + /// given to non-regex substring search algorithms as-is. + #[allow(dead_code)] + pub fn all_verbatim_literal(&self) -> bool { + self.all_verbatim_literal + } + + /// Creates a new `AstAnalysis` value with an initial configuration. + fn new() -> AstAnalysis { + AstAnalysis { + any_uppercase: false, + any_literal: false, + all_verbatim_literal: true, + } + } + + fn from_ast_impl(&mut self, ast: &Ast) { + if self.done() { + return; + } + match *ast { + Ast::Empty(_) => {} + Ast::Flags(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + self.all_verbatim_literal = false; + } + Ast::Literal(ref x) => { + self.from_ast_literal(x); + } + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.all_verbatim_literal = false; + self.from_ast_class_set(&x.kind); + } + Ast::Repetition(ref x) => { + self.all_verbatim_literal = false; + self.from_ast_impl(&x.ast); + } + Ast::Group(ref x) => { + self.all_verbatim_literal = false; + self.from_ast_impl(&x.ast); + } + Ast::Alternation(ref alt) => { + self.all_verbatim_literal = false; + for x in &alt.asts { + self.from_ast_impl(x); + } + } + Ast::Concat(ref alt) => { + for x in &alt.asts { + self.from_ast_impl(x); + } + } + } + } + + fn from_ast_class_set(&mut self, ast: &ast::ClassSet) { + if self.done() { + return; + } + match *ast { + ast::ClassSet::Item(ref item) => { + self.from_ast_class_set_item(item); + } + ast::ClassSet::BinaryOp(ref x) => { + self.from_ast_class_set(&x.lhs); + self.from_ast_class_set(&x.rhs); + } + } + } + + fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) { + if self.done() { + return; + } + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => {} + ast::ClassSetItem::Literal(ref x) => { + self.from_ast_literal(x); + } + ast::ClassSetItem::Range(ref x) => { + self.from_ast_literal(&x.start); + self.from_ast_literal(&x.end); + } + ast::ClassSetItem::Bracketed(ref x) => { + self.from_ast_class_set(&x.kind); + } + ast::ClassSetItem::Union(ref union) => { + for x in &union.items { + self.from_ast_class_set_item(x); + } + } + } + } + + fn from_ast_literal(&mut self, ast: &ast::Literal) { + if ast.kind != ast::LiteralKind::Verbatim { + self.all_verbatim_literal = false; + } + self.any_literal = true; + self.any_uppercase = self.any_uppercase || ast.c.is_uppercase(); + } + + /// Returns true if and only if the attributes can never change no matter + /// what other AST it might see. + fn done(&self) -> bool { + self.any_uppercase && self.any_literal && !self.all_verbatim_literal + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn analysis(pattern: &str) -> AstAnalysis { + AstAnalysis::from_pattern(pattern).unwrap() + } + + #[test] + fn various() { + let x = analysis(""); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("foo"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("Foo"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("foO"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis(r"foo\\"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\w"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\S"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\p{Ll}"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[a-z]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[A-Z]"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[\S\t]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\\S"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"\p{Ll}"); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"aBc\w"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"a\u0061"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + } +} diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs new file mode 100644 index 00000000..f3d1f1c1 --- /dev/null +++ b/grep-regex/src/config.rs @@ -0,0 +1,265 @@ +use grep_matcher::{ByteSet, LineTerminator}; +use regex::bytes::{Regex, RegexBuilder}; +use regex_syntax::ast::{self, Ast}; +use regex_syntax::hir::Hir; + +use ast::AstAnalysis; +use crlf::crlfify; +use error::Error; +use literal::LiteralSets; +use non_matching::non_matching_bytes; +use strip::strip_from_match; + +/// Config represents the configuration of a regex matcher in this crate. +/// The configuration is itself a rough combination of the knobs found in +/// the `regex` crate itself, along with additional `grep-matcher` specific +/// options. +/// +/// The configuration can be used to build a "configured" HIR expression. A +/// configured HIR expression is an HIR expression that is aware of the +/// configuration which generated it, and provides transformation on that HIR +/// such that the configuration is preserved. +#[derive(Clone, Debug)] +pub struct Config { + pub case_insensitive: bool, + pub case_smart: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub line_terminator: Option<LineTerminator>, + pub crlf: bool, + pub word: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + case_insensitive: false, + case_smart: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + // These size limits are much bigger than what's in the regex + // crate. + size_limit: 100 * (1<<20), + dfa_size_limit: 1000 * (1<<20), + nest_limit: 250, + line_terminator: None, + crlf: false, + word: false, + } + } +} + +impl Config { + /// Parse the given pattern and returned its HIR expression along with + /// the current configuration. + /// + /// If there was a problem parsing the given expression then an error + /// is returned. + pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> { + let analysis = self.analysis(pattern)?; + let expr = ::regex_syntax::ParserBuilder::new() + .nest_limit(self.nest_limit) + .octal(self.octal) + .allow_invalid_utf8(true) + .ignore_whitespace(self.ignore_whitespace) + .case_insensitive(self.is_case_insensitive(&analysis)?) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .unicode(self.unicode) + .build() + .parse(pattern) + .map_err(Error::regex)?; + let expr = match self.line_terminator { + None => expr, + Some(line_term) => strip_from_match(expr, line_term)?, + }; + Ok(ConfiguredHIR { + original: pattern.to_string(), + config: self.clone(), + analysis: analysis, + // If CRLF mode is enabled, replace `$` with `(?:\r?$)`. + expr: if self.crlf { crlfify(expr) } else { expr }, + }) + } + + /// Accounting for the `smart_case` config knob, return true if and only if + /// this pattern should be matched case insensitively. + fn is_case_insensitive( + &self, + analysis: &AstAnalysis, + ) -> Result<bool, Error> { + if self.case_insensitive { + return Ok(true); + } + if !self.case_smart { + return Ok(false); + } + Ok(analysis.any_literal() && !analysis.any_uppercase()) + } + + /// Perform analysis on the AST of this pattern. + /// + /// This returns an error if the given pattern failed to parse. + fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> { + Ok(AstAnalysis::from_ast(&self.ast(pattern)?)) + } + + /// Parse the given pattern into its abstract syntax. + /// + /// This returns an error if the given pattern failed to parse. + fn ast(&self, pattern: &str) -> Result<Ast, Error> { + ast::parse::ParserBuilder::new() + .nest_limit(self.nest_limit) + .octal(self.octal) + .ignore_whitespace(self.ignore_whitespace) + .build() + .parse(pattern) + .map_err(Error::regex) + } +} + +/// A "configured" HIR expression, which is aware of the configuration which +/// produced this HIR. +/// +/// Since the configuration is tracked, values with this type can be +/// transformed into other HIR expressions (or regular expressions) in a way +/// that preserves the configuration. For example, the `fast_line_regex` +/// method will apply literal extraction to the inner HIR and use that to build +/// a new regex that matches the extracted literals in a way that is +/// consistent with the configuration that produced this HIR. For example, the +/// size limits set on the configured HIR will be propagated out to any +/// subsequently constructed HIR or regular expression. +#[derive(Clone, Debug)] +pub struct ConfiguredHIR { + original: String, + config: Config, + analysis: AstAnalysis, + expr: Hir, +} + +impl ConfiguredHIR { + /// Return the configuration for this HIR expression. + pub fn config(&self) -> &Config { + &self.config + } + + /// Compute the set of non-matching bytes for this HIR expression. + pub fn non_matching_bytes(&self) -> ByteSet { + non_matching_bytes(&self.expr) + } + + /// Builds a regular expression from this HIR expression. + pub fn regex(&self) -> Result<Regex, Error> { + self.pattern_to_regex(&self.expr.to_string()) + } + + /// Applies the given function to the concrete syntax of this HIR and then + /// generates a new HIR based on the result of the function in a way that + /// preserves the configuration. + /// + /// For example, this can be used to wrap a user provided regular + /// expression with additional semantics. e.g., See the `WordMatcher`. + pub fn with_pattern<F: FnMut(&str) -> String>( + &self, + mut f: F, + ) -> Result<ConfiguredHIR, Error> + { + self.pattern_to_hir(&f(&self.expr.to_string())) + } + + /// If the current configuration has a line terminator set and if useful + /// literals could be extracted, then a regular expression matching those + /// literals is returned. If no line terminator is set, then `None` is + /// returned. + /// + /// If compiling the resulting regular expression failed, then an error + /// is returned. + /// + /// This method only returns something when a line terminator is set + /// because matches from this regex are generally candidates that must be + /// confirmed before reporting a match. When performing a line oriented + /// search, confirmation is easy: just extend the candidate match to its + /// respective line boundaries and then re-search that line for a full + /// match. This only works when the line terminator is set because the line + /// terminator setting guarantees that the regex itself can never match + /// through the line terminator byte. + pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> { + if self.config.line_terminator.is_none() { + return Ok(None); + } + match LiteralSets::new(&self.expr).one_regex() { + None => Ok(None), + Some(pattern) => self.pattern_to_regex(&pattern).map(Some), + } + } + + /// Create a regex from the given pattern using this HIR's configuration. + fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> { + // The settings we explicitly set here are intentionally a subset + // of the settings we have. The key point here is that our HIR + // expression is computed with the settings in mind, such that setting + // them here could actually lead to unintended behavior. For example, + // consider the pattern `(?U)a+`. This will get folded into the HIR + // as a non-greedy repetition operator which will in turn get printed + // to the concrete syntax as `a+?`, which is correct. But if we + // set the `swap_greed` option again, then we'll wind up with `(?U)a+?` + // which is equal to `a+` which is not the same as what we were given. + // + // We also don't need to apply `case_insensitive` since this gets + // folded into the HIR and would just cause us to do redundant work. + // + // Finally, we don't need to set `ignore_whitespace` since the concrete + // syntax emitted by the HIR printer never needs it. + // + // We set the rest of the options. Some of them are important, such as + // the size limit, and some of them are necessary to preserve the + // intention of the original pattern. For example, the Unicode flag + // will impact how the WordMatcher functions, namely, whether its + // word boundaries are Unicode aware or not. + RegexBuilder::new(&pattern) + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .size_limit(self.config.size_limit) + .dfa_size_limit(self.config.dfa_size_limit) + .build() + .map_err(Error::regex) + } + + /// Create an HIR expression from the given pattern using this HIR's + /// configuration. + fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> { + // See `pattern_to_regex` comment for explanation of why we only set + // a subset of knobs here. e.g., `swap_greed` is explicitly left out. + let expr = ::regex_syntax::ParserBuilder::new() + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .allow_invalid_utf8(true) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .build() + .parse(pattern) + .map_err(Error::regex)?; + Ok(ConfiguredHIR { + original: self.original.clone(), + config: self.config.clone(), + analysis: self.analysis.clone(), + expr: expr, + }) + } +} diff --git a/grep-regex/src/crlf.rs b/grep-regex/src/crlf.rs new file mode 100644 index 00000000..ff6b15bf --- /dev/null +++ b/grep-regex/src/crlf.rs @@ -0,0 +1,83 @@ +use regex_syntax::hir::{self, Hir, HirKind}; + +/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. +/// +/// This does not preserve the exact semantics of the given expression, +/// however, it does have the useful property that anything that matched the +/// given expression will also match the returned expression. The difference is +/// that the returned expression can match possibly other things as well. +/// +/// The principle reason why we do this is because the underlying regex engine +/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that +/// level, but we perform this kludge in the mean time. +/// +/// Note that while the match preserving semantics are nice and neat, the +/// match position semantics are quite a bit messier. Namely, `$` only ever +/// matches the position between characters where as `\r??` can match a +/// character and change the offset. This is regretable, but works out pretty +/// nicely in most cases, especially when a match is limited to a single line. +pub fn crlfify(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Anchor(hir::Anchor::EndLine) => { + let concat = Hir::concat(vec![ + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: false, + hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))), + }), + Hir::anchor(hir::Anchor::EndLine), + ]); + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(concat), + }) + } + HirKind::Empty => Hir::empty(), + HirKind::Literal(x) => Hir::literal(x), + HirKind::Class(x) => Hir::class(x), + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(crlfify(*x.hir)); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(crlfify(*x.hir)); + Hir::group(x) + } + HirKind::Concat(xs) => { + Hir::concat(xs.into_iter().map(crlfify).collect()) + } + HirKind::Alternation(xs) => { + Hir::alternation(xs.into_iter().map(crlfify).collect()) + } + } +} + +#[cfg(test)] +mod tests { + use regex_syntax::Parser; + use super::crlfify; + + fn roundtrip(pattern: &str) -> String { + let expr1 = Parser::new().parse(pattern).unwrap(); + let expr2 = crlfify(expr1); + expr2.to_string() + } + + #[test] + fn various() { + assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))"); + assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))"); + assert_eq!( + roundtrip(r"(?m)(?:foo$|bar$)"), + "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))" + ); + assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a"); + + // Not a multiline `$`, so no crlfifying occurs. + assert_eq!(roundtrip(r"$"), "\\z"); + // It's a literal, derp. + assert_eq!(roundtrip(r"\$"), "\\$"); + } +} diff --git a/grep-regex/src/error.rs b/grep-regex/src/error.rs new file mode 100644 index 00000000..f3bf0825 --- /dev/null +++ b/grep-regex/src/error.rs @@ -0,0 +1,88 @@ +use std::error; +use std::fmt; + +use util; + +/// An error that can occur in this crate. +/// +/// Generally, this error corresponds to problems building a regular +/// expression, whether it's in parsing, compilation or a problem with +/// guaranteeing a configured optimization. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +impl Error { + pub(crate) fn new(kind: ErrorKind) -> Error { + Error { kind } + } + + pub(crate) fn regex<E: error::Error>(err: E) -> Error { + Error { kind: ErrorKind::Regex(err.to_string()) } + } + + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +/// The kind of an error that can occur. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurred as a result of parsing a regular expression. + /// This can be a syntax error or an error that results from attempting to + /// compile a regular expression that is too big. + /// + /// The string here is the underlying error converted to a string. + Regex(String), + /// An error that occurs when a building a regex that isn't permitted to + /// match a line terminator. In general, building the regex will do its + /// best to make matching a line terminator impossible (e.g., by removing + /// `\n` from the `\s` character class), but if the regex contains a + /// `\n` literal, then there is no reasonable choice that can be made and + /// therefore an error is reported. + /// + /// The string is the literal sequence found in the regex that is not + /// allowed. + NotAllowed(String), + /// This error occurs when a non-ASCII line terminator was provided. + /// + /// The invalid byte is included in this error. + InvalidLineTerminator(u8), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::Regex(_) => "regex error", + ErrorKind::NotAllowed(_) => "literal not allowed", + ErrorKind::InvalidLineTerminator(_) => "invalid line terminator", + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Regex(ref s) => write!(f, "{}", s), + ErrorKind::NotAllowed(ref lit) => { + write!(f, "the literal '{:?}' is not allowed in a regex", lit) + } + ErrorKind::InvalidLineTerminator(byte) => { + let x = util::show_bytes(&[byte]); + write!(f, "line terminators must be ASCII, but '{}' is not", x) + } + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/grep-regex/src/lib.rs b/grep-regex/src/lib.rs new file mode 100644 index 00000000..a578d0fc --- /dev/null +++ b/grep-regex/src/lib.rs @@ -0,0 +1,27 @@ +/*! +An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine. +*/ + +#![deny(missing_docs)] + +extern crate grep_matcher; +#[macro_use] +extern crate log; +extern crate regex; +extern crate regex_syntax; +extern crate thread_local; +extern crate utf8_ranges; + +pub use error::{Error, ErrorKind}; |