libripgrep: initial commit introducing libripgrep

libripgrep is not any one library, but rather, a collection of libraries that roughly separate the following key distinct phases in a grep implementation: 1. Pattern matching (e.g., by a regex engine). 2. Searching a file using a pattern matcher. 3. Printing results. Ultimately, both (1) and (3) are defined by de-coupled interfaces, of which there may be multiple implementations. Namely, (1) is satisfied by the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by the `Sink` trait in the `grep2` crate. The searcher (2) ties everything together and finds results using a matcher and reports those results using a `Sink` implementation. Closes #162
author: Andrew Gallant <jamslam@gmail.com> 2018-04-29 09:29:52 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2018-08-20 07:10:19 -0400
commit: d9ca5293569efb255608d3c601107bcfe7060f15 (patch)
tree: 7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-regex
parent: 0958837ee104985412f08e81b6f08df1e5291042 (diff)
15 files changed, 2502 insertions, 0 deletions
diff --git a/grep-regex/Cargo.toml b/grep-regex/Cargo.toml
new file mode 100644
index 00000000..e39c68dd
--- /dev/null
+++ b/grep-regex/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "grep-regex"
+version = "0.0.1"  #:version
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = """
+Use Rust's regex library with the 'grep' crate.
+"""
+documentation = "https://docs.rs/grep-regex"
+homepage = "https://github.com/BurntSushi/ripgrep"
+repository = "https://github.com/BurntSushi/ripgrep"
+readme = "README.md"
+keywords = ["regex", "grep", "search", "pattern", "line"]
+license = "Unlicense/MIT"
+
+[dependencies]
+log = "0.4"
+grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
+regex = "1"
+regex-syntax = "0.6"
+thread_local = "0.3.5"
+utf8-ranges = "1"
diff --git a/grep-regex/LICENSE-MIT b/grep-regex/LICENSE-MIT
new file mode 100644
index 00000000..3b0a5dc0
--- /dev/null
+++ b/grep-regex/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/grep-regex/README.md b/grep-regex/README.md
new file mode 100644
index 00000000..7940c867
--- /dev/null
+++ b/grep-regex/README.md
@@ -0,0 +1,35 @@
+grep-regex
+----------
+The `grep-regex` crate provides an implementation of the `Matcher` trait from
+the `grep-matcher` crate. This implementation permits Rust's regex engine to
+be used in the `grep` crate for fast line oriented searching.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
+[![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-regex)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+### Documentation
+
+[https://docs.rs/grep-regex](https://docs.rs/grep-regex)
+
+**NOTE:** You probably don't want to use this crate directly. Instead, you
+should prefer the facade defined in the
+[`grep`](https://docs.rs/grep)
+crate.
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+grep-regex = "0.1"
+```
+
+and this to your crate root:
+
+```rust
+extern crate grep_regex;
+```
diff --git a/grep-regex/UNLICENSE b/grep-regex/UNLICENSE
new file mode 100644
index 00000000..68a49daa
--- /dev/null
+++ b/grep-regex/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/grep-regex/src/ast.rs b/grep-regex/src/ast.rs
new file mode 100644
index 00000000..4e6067ee
--- /dev/null
+++ b/grep-regex/src/ast.rs
@@ -0,0 +1,263 @@
+use regex_syntax::ast::{self, Ast};
+use regex_syntax::ast::parse::Parser;
+
+/// The results of analyzing AST of a regular expression (e.g., for supporting
+/// smart case).
+#[derive(Clone, Debug)]
+pub struct AstAnalysis {
+    /// True if and only if a literal uppercase character occurs in the regex.
+    any_uppercase: bool,
+    /// True if and only if the regex contains any literal at all.
+    any_literal: bool,
+    /// True if and only if the regex consists entirely of a literal and no
+    /// other special regex characters.
+    all_verbatim_literal: bool,
+}
+
+impl AstAnalysis {
+    /// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`.
+    ///
+    /// If `pattern` is not a valid regular expression, then `None` is
+    /// returned.
+    #[allow(dead_code)]
+    pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
+        Parser::new()
+            .parse(pattern)
+            .map(|ast| AstAnalysis::from_ast(&ast))
+            .ok()
+    }
+
+    /// Perform an AST analysis given the AST.
+    pub fn from_ast(ast: &Ast) -> AstAnalysis {
+        let mut analysis = AstAnalysis::new();
+        analysis.from_ast_impl(ast);
+        analysis
+    }
+
+    /// Returns true if and only if a literal uppercase character occurs in
+    /// the pattern.
+    ///
+    /// For example, a pattern like `\pL` contains no uppercase literals,
+    /// even though `L` is uppercase and the `\pL` class contains uppercase
+    /// characters.
+    pub fn any_uppercase(&self) -> bool {
+        self.any_uppercase
+    }
+
+    /// Returns true if and only if the regex contains any literal at all.
+    ///
+    /// For example, a pattern like `\pL` reports `false`, but a pattern like
+    /// `\pLfoo` reports `true`.
+    pub fn any_literal(&self) -> bool {
+        self.any_literal
+    }
+
+    /// Returns true if and only if the entire pattern is a verbatim literal
+    /// with no special meta characters.
+    ///
+    /// When this is true, then the pattern satisfies the following law:
+    /// `escape(pattern) == pattern`. Notable examples where this returns
+    /// `false` include patterns like `a\u0061` even though `\u0061` is just
+    /// a literal `a`.
+    ///
+    /// The purpose of this flag is to determine whether the patterns can be
+    /// given to non-regex substring search algorithms as-is.
+    #[allow(dead_code)]
+    pub fn all_verbatim_literal(&self) -> bool {
+        self.all_verbatim_literal
+    }
+
+    /// Creates a new `AstAnalysis` value with an initial configuration.
+    fn new() -> AstAnalysis {
+        AstAnalysis {
+            any_uppercase: false,
+            any_literal: false,
+            all_verbatim_literal: true,
+        }
+    }
+
+    fn from_ast_impl(&mut self, ast: &Ast) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            Ast::Empty(_) => {}
+            Ast::Flags(_)
+            | Ast::Dot(_)
+            | Ast::Assertion(_)
+            | Ast::Class(ast::Class::Unicode(_))
+            | Ast::Class(ast::Class::Perl(_)) => {
+                self.all_verbatim_literal = false;
+            }
+            Ast::Literal(ref x) => {
+                self.from_ast_literal(x);
+            }
+            Ast::Class(ast::Class::Bracketed(ref x)) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_class_set(&x.kind);
+            }
+            Ast::Repetition(ref x) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_impl(&x.ast);
+            }
+            Ast::Group(ref x) => {
+                self.all_verbatim_literal = false;
+                self.from_ast_impl(&x.ast);
+            }
+            Ast::Alternation(ref alt) => {
+                self.all_verbatim_literal = false;
+                for x in &alt.asts {
+                    self.from_ast_impl(x);
+                }
+            }
+            Ast::Concat(ref alt) => {
+                for x in &alt.asts {
+                    self.from_ast_impl(x);
+                }
+            }
+        }
+    }
+
+    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSet::Item(ref item) => {
+                self.from_ast_class_set_item(item);
+            }
+            ast::ClassSet::BinaryOp(ref x) => {
+                self.from_ast_class_set(&x.lhs);
+                self.from_ast_class_set(&x.rhs);
+            }
+        }
+    }
+
+    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
+        if self.done() {
+            return;
+        }
+        match *ast {
+            ast::ClassSetItem::Empty(_)
+            | ast::ClassSetItem::Ascii(_)
+            | ast::ClassSetItem::Unicode(_)
+            | ast::ClassSetItem::Perl(_) => {}
+            ast::ClassSetItem::Literal(ref x) => {
+                self.from_ast_literal(x);
+            }
+            ast::ClassSetItem::Range(ref x) => {
+                self.from_ast_literal(&x.start);
+                self.from_ast_literal(&x.end);
+            }
+            ast::ClassSetItem::Bracketed(ref x) => {
+                self.from_ast_class_set(&x.kind);
+            }
+            ast::ClassSetItem::Union(ref union) => {
+                for x in &union.items {
+                    self.from_ast_class_set_item(x);
+                }
+            }
+        }
+    }
+
+    fn from_ast_literal(&mut self, ast: &ast::Literal) {
+        if ast.kind != ast::LiteralKind::Verbatim {
+            self.all_verbatim_literal = false;
+        }
+        self.any_literal = true;
+        self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
+    }
+
+    /// Returns true if and only if the attributes can never change no matter
+    /// what other AST it might see.
+    fn done(&self) -> bool {
+        self.any_uppercase && self.any_literal && !self.all_verbatim_literal
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn analysis(pattern: &str) -> AstAnalysis {
+        AstAnalysis::from_pattern(pattern).unwrap()
+    }
+
+    #[test]
+    fn various() {
+        let x = analysis("");
+        assert!(!x.any_uppercase);
+        assert!(!x.any_literal);
+        assert!(x.all_verbatim_literal);
+
+        let x = analysis("foo");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(x.all_verbatim_literal);
+
+        let x = analysis("Foo");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(x.all_verbatim_literal);
+
+        let x = analysis("foO");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(x.all_verbatim_literal);
+
+        let x = analysis(r"foo\\");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo\w");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo\S");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo\p{Ll}");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo[a-z]");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo[A-Z]");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo[\S\t]");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"foo\\S");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"\p{Ll}");
+        assert!(!x.any_uppercase);
+        assert!(!x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"aBc\w");
+        assert!(x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+
+        let x = analysis(r"a\u0061");
+        assert!(!x.any_uppercase);
+        assert!(x.any_literal);
+        assert!(!x.all_verbatim_literal);
+    }
+}
diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs
new file mode 100644
index 00000000..f3d1f1c1
--- /dev/null
+++ b/grep-regex/src/config.rs
@@ -0,0 +1,265 @@
+use grep_matcher::{ByteSet, LineTerminator};
+use regex::bytes::{Regex, RegexBuilder};
+use regex_syntax::ast::{self, Ast};
+use regex_syntax::hir::Hir;
+
+use ast::AstAnalysis;
+use crlf::crlfify;
+use error::Error;
+use literal::LiteralSets;
+use non_matching::non_matching_bytes;
+use strip::strip_from_match;
+
+/// Config represents the configuration of a regex matcher in this crate.
+/// The configuration is itself a rough combination of the knobs found in
+/// the `regex` crate itself, along with additional `grep-matcher` specific
+/// options.
+///
+/// The configuration can be used to build a "configured" HIR expression. A
+/// configured HIR expression is an HIR expression that is aware of the
+/// configuration which generated it, and provides transformation on that HIR
+/// such that the configuration is preserved.
+#[derive(Clone, Debug)]
+pub struct Config {
+    pub case_insensitive: bool,
+    pub case_smart: bool,
+    pub multi_line: bool,
+    pub dot_matches_new_line: bool,
+    pub swap_greed: bool,
+    pub ignore_whitespace: bool,
+    pub unicode: bool,
+    pub octal: bool,
+    pub size_limit: usize,
+    pub dfa_size_limit: usize,
+    pub nest_limit: u32,
+    pub line_terminator: Option<LineTerminator>,
+    pub crlf: bool,
+    pub word: bool,
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        Config {
+            case_insensitive: false,
+            case_smart: false,
+            multi_line: false,
+            dot_matches_new_line: false,
+            swap_greed: false,
+            ignore_whitespace: false,
+            unicode: true,
+            octal: false,
+            // These size limits are much bigger than what's in the regex
+            // crate.
+            size_limit: 100 * (1<<20),
+            dfa_size_limit: 1000 * (1<<20),
+            nest_limit: 250,
+            line_terminator: None,
+            crlf: false,
+            word: false,
+        }
+    }
+}
+
+impl Config {
+    /// Parse the given pattern and returned its HIR expression along with
+    /// the current configuration.
+    ///
+    /// If there was a problem parsing the given expression then an error
+    /// is returned.
+    pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+        let analysis = self.analysis(pattern)?;
+        let expr = ::regex_syntax::ParserBuilder::new()
+            .nest_limit(self.nest_limit)
+            .octal(self.octal)
+            .allow_invalid_utf8(true)
+            .ignore_whitespace(self.ignore_whitespace)
+            .case_insensitive(self.is_case_insensitive(&analysis)?)
+            .multi_line(self.multi_line)
+            .dot_matches_new_line(self.dot_matches_new_line)
+            .swap_greed(self.swap_greed)
+            .unicode(self.unicode)
+            .build()
+            .parse(pattern)
+            .map_err(Error::regex)?;
+        let expr = match self.line_terminator {
+            None => expr,
+            Some(line_term) => strip_from_match(expr, line_term)?,
+        };
+        Ok(ConfiguredHIR {
+            original: pattern.to_string(),
+            config: self.clone(),
+            analysis: analysis,
+            // If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
+            expr: if self.crlf { crlfify(expr) } else { expr },
+        })
+    }
+
+    /// Accounting for the `smart_case` config knob, return true if and only if
+    /// this pattern should be matched case insensitively.
+    fn is_case_insensitive(
+        &self,
+        analysis: &AstAnalysis,
+    ) -> Result<bool, Error> {
+        if self.case_insensitive {
+            return Ok(true);
+        }
+        if !self.case_smart {
+            return Ok(false);
+        }
+        Ok(analysis.any_literal() && !analysis.any_uppercase())
+    }
+
+    /// Perform analysis on the AST of this pattern.
+    ///
+    /// This returns an error if the given pattern failed to parse.
+    fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
+        Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
+    }
+
+    /// Parse the given pattern into its abstract syntax.
+    ///
+    /// This returns an error if the given pattern failed to parse.
+    fn ast(&self, pattern: &str) -> Result<Ast, Error> {
+        ast::parse::ParserBuilder::new()
+            .nest_limit(self.nest_limit)
+            .octal(self.octal)
+            .ignore_whitespace(self.ignore_whitespace)
+            .build()
+            .parse(pattern)
+            .map_err(Error::regex)
+    }
+}
+
+/// A "configured" HIR expression, which is aware of the configuration which
+/// produced this HIR.
+///
+/// Since the configuration is tracked, values with this type can be
+/// transformed into other HIR expressions (or regular expressions) in a way
+/// that preserves the configuration. For example, the `fast_line_regex`
+/// method will apply literal extraction to the inner HIR and use that to build
+/// a new regex that matches the extracted literals in a way that is
+/// consistent with the configuration that produced this HIR. For example, the
+/// size limits set on the configured HIR will be propagated out to any
+/// subsequently constructed HIR or regular expression.
+#[derive(Clone, Debug)]
+pub struct ConfiguredHIR {
+    original: String,
+    config: Config,
+    analysis: AstAnalysis,
+    expr: Hir,
+}
+
+impl ConfiguredHIR {
+    /// Return the configuration for this HIR expression.
+    pub fn config(&self) -> &Config {
+        &self.config
+    }
+
+    /// Compute the set of non-matching bytes for this HIR expression.
+    pub fn non_matching_bytes(&self) -> ByteSet {
+        non_matching_bytes(&self.expr)
+    }
+
+    /// Builds a regular expression from this HIR expression.
+    pub fn regex(&self) -> Result<Regex, Error> {
+        self.pattern_to_regex(&self.expr.to_string())
+    }
+
+    /// Applies the given function to the concrete syntax of this HIR and then
+    /// generates a new HIR based on the result of the function in a way that
+    /// preserves the configuration.
+    ///
+    /// For example, this can be used to wrap a user provided regular
+    /// expression with additional semantics. e.g., See the `WordMatcher`.
+    pub fn with_pattern<F: FnMut(&str) -> String>(
+        &self,
+        mut f: F,
+    ) -> Result<ConfiguredHIR, Error>
+    {
+        self.pattern_to_hir(&f(&self.expr.to_string()))
+    }
+
+    /// If the current configuration has a line terminator set and if useful
+    /// literals could be extracted, then a regular expression matching those
+    /// literals is returned. If no line terminator is set, then `None` is
+    /// returned.
+    ///
+    /// If compiling the resulting regular expression failed, then an error
+    /// is returned.
+    ///
+    /// This method only returns something when a line terminator is set
+    /// because matches from this regex are generally candidates that must be
+    /// confirmed before reporting a match. When performing a line oriented
+    /// search, confirmation is easy: just extend the candidate match to its
+    /// respective line boundaries and then re-search that line for a full
+    /// match. This only works when the line terminator is set because the line
+    /// terminator setting guarantees that the regex itself can never match
+    /// through the line terminator byte.
+    pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
+        if self.config.line_terminator.is_none() {
+            return Ok(None);
+        }
+        match LiteralSets::new(&self.expr).one_regex() {
+            None => Ok(None),
+            Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
+        }
+    }
+
+    /// Create a regex from the given pattern using this HIR's configuration.
+    fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
+        // The settings we explicitly set here are intentionally a subset
+        // of the settings we have. The key point here is that our HIR
+        // expression is computed with the settings in mind, such that setting
+        // them here could actually lead to unintended behavior. For example,
+        // consider the pattern `(?U)a+`. This will get folded into the HIR
+        // as a non-greedy repetition operator which will in turn get printed
+        // to the concrete syntax as `a+?`, which is correct. But if we
+        // set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
+        // which is equal to `a+` which is not the same as what we were given.
+        //
+        // We also don't need to apply `case_insensitive` since this gets
+        // folded into the HIR and would just cause us to do redundant work.
+        //
+        // Finally, we don't need to set `ignore_whitespace` since the concrete
+        // syntax emitted by the HIR printer never needs it.
+        //
+        // We set the rest of the options. Some of them are important, such as
+        // the size limit, and some of them are necessary to preserve the
+        // intention of the original pattern. For example, the Unicode flag
+        // will impact how the WordMatcher functions, namely, whether its
+        // word boundaries are Unicode aware or not.
+        RegexBuilder::new(&pattern)
+            .nest_limit(self.config.nest_limit)
+            .octal(self.config.octal)
+            .multi_line(self.config.multi_line)
+            .dot_matches_new_line(self.config.dot_matches_new_line)
+            .unicode(self.config.unicode)
+            .size_limit(self.config.size_limit)
+            .dfa_size_limit(self.config.dfa_size_limit)
+            .build()
+            .map_err(Error::regex)
+    }
+
+    /// Create an HIR expression from the given pattern using this HIR's
+    /// configuration.
+    fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+        // See `pattern_to_regex` comment for explanation of why we only set
+        // a subset of knobs here. e.g., `swap_greed` is explicitly left out.
+        let expr = ::regex_syntax::ParserBuilder::new()
+            .nest_limit(self.config.nest_limit)
+            .octal(self.config.octal)
+            .allow_invalid_utf8(true)
+            .multi_line(self.config.multi_line)
+            .dot_matches_new_line(self.config.dot_matches_new_line)
+            .unicode(self.config.unicode)
+            .build()
+            .parse(pattern)
+            .map_err(Error::regex)?;
+        Ok(ConfiguredHIR {
+            original: self.original.clone(),
+            config: self.config.clone(),
+            analysis: self.analysis.clone(),
+            expr: expr,
+        })
+    }
+}
diff --git a/grep-regex/src/crlf.rs b/grep-regex/src/crlf.rs
new file mode 100644
index 00000000..ff6b15bf
--- /dev/null
+++ b/grep-regex/src/crlf.rs
@@ -0,0 +1,83 @@
+use regex_syntax::hir::{self, Hir, HirKind};
+
+/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
+///
+/// This does not preserve the exact semantics of the given expression,
+/// however, it does have the useful property that anything that matched the
+/// given expression will also match the returned expression. The difference is
+/// that the returned expression can match possibly other things as well.
+///
+/// The principle reason why we do this is because the underlying regex engine
+/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
+/// level, but we perform this kludge in the mean time.
+///
+/// Note that while the match preserving semantics are nice and neat, the
+/// match position semantics are quite a bit messier. Namely, `$` only ever
+/// matches the position between characters where as `\r??` can match a
+/// character and change the offset. This is regretable, but works out pretty
+/// nicely in most cases, especially when a match is limited to a single line.
+pub fn crlfify(expr: Hir) -> Hir {
+    match expr.into_kind() {
+        HirKind::Anchor(hir::Anchor::EndLine) => {
+            let concat = Hir::concat(vec![
+                Hir::repetition(hir::Repetition {
+                    kind: hir::RepetitionKind::ZeroOrOne,
+                    greedy: false,
+                    hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
+                }),
+                Hir::anchor(hir::Anchor::EndLine),
+            ]);
+            Hir::group(hir::Group {
+                kind: hir::GroupKind::NonCapturing,
+                hir: Box::new(concat),
+            })
+        }
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(x) => Hir::literal(x),
+        HirKind::Class(x) => Hir::class(x),
+        HirKind::Anchor(x) => Hir::anchor(x),
+        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Repetition(mut x) => {
+            x.hir = Box::new(crlfify(*x.hir));
+            Hir::repetition(x)
+        }
+        HirKind::Group(mut x) => {
+            x.hir = Box::new(crlfify(*x.hir));
+            Hir::group(x)
+        }
+        HirKind::Concat(xs) => {
+            Hir::concat(xs.into_iter().map(crlfify).collect())
+        }
+        HirKind::Alternation(xs) => {
+            Hir::alternation(xs.into_iter().map(crlfify).collect())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use regex_syntax::Parser;
+    use super::crlfify;
+
+    fn roundtrip(pattern: &str) -> String {
+        let expr1 = Parser::new().parse(pattern).unwrap();
+        let expr2 = crlfify(expr1);
+        expr2.to_string()
+    }
+
+    #[test]
+    fn various() {
+        assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
+        assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
+        assert_eq!(
+            roundtrip(r"(?m)(?:foo$|bar$)"),
+            "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
+        );
+        assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");
+
+        // Not a multiline `$`, so no crlfifying occurs.
+        assert_eq!(roundtrip(r"$"), "\\z");
+        // It's a literal, derp.
+        assert_eq!(roundtrip(r"\$"), "\\$");
+    }
+}
diff --git a/grep-regex/src/error.rs b/grep-regex/src/error.rs
new file mode 100644
index 00000000..f3bf0825
--- /dev/null
+++ b/grep-regex/src/error.rs
@@ -0,0 +1,88 @@
+use std::error;
+use std::fmt;
+
+use util;
+
+/// An error that can occur in this crate.
+///
+/// Generally, this error corresponds to problems building a regular
+/// expression, whether it's in parsing, compilation or a problem with
+/// guaranteeing a configured optimization.
+#[derive(Clone, Debug)]
+pub struct Error {
+    kind: ErrorKind,
+}
+
+impl Error {
+    pub(crate) fn new(kind: ErrorKind) -> Error {
+        Error { kind }
+    }
+
+    pub(crate) fn regex<E: error::Error>(err: E) -> Error {
+        Error { kind: ErrorKind::Regex(err.to_string()) }
+    }
+
+    /// Return the kind of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+}
+
+/// The kind of an error that can occur.
+#[derive(Clone, Debug)]
+pub enum ErrorKind {
+    /// An error that occurred as a result of parsing a regular expression.
+    /// This can be a syntax error or an error that results from attempting to
+    /// compile a regular expression that is too big.
+    ///
+    /// The string here is the underlying error converted to a string.
+    Regex(String),
+    /// An error that occurs when a building a regex that isn't permitted to
+    /// match a line terminator. In general, building the regex will do its
+    /// best to make matching a line terminator impossible (e.g., by removing
+    /// `\n` from the `\s` character class), but if the regex contains a
+    /// `\n` literal, then there is no reasonable choice that can be made and
+    /// therefore an error is reported.
+    ///
+    /// The string is the literal sequence found in the regex that is not
+    /// allowed.
+    NotAllowed(String),
+    /// This error occurs when a non-ASCII line terminator was provided.
+    ///
+    /// The invalid byte is included in this error.
+    InvalidLineTerminator(u8),
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+    #[doc(hidden)]
+    __Nonexhaustive,
+}
+
+impl error::Error for Error {
+    fn description(&self) -> &str {
+        match self.kind {
+            ErrorKind::Regex(_) => "regex error",
+            ErrorKind::NotAllowed(_) => "literal not allowed",
+            ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
+            ErrorKind::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.kind {
+            ErrorKind::Regex(ref s) => write!(f, "{}", s),
+            ErrorKind::NotAllowed(ref lit) => {
+                write!(f, "the literal '{:?}' is not allowed in a regex", lit)
+            }
+            ErrorKind::InvalidLineTerminator(byte) => {
+                let x = util::show_bytes(&[byte]);
+                write!(f, "line terminators must be ASCII, but '{}' is not", x)
+            }
+            ErrorKind::__Nonexhaustive => unreachable!(),
+        }
+    }
+}
diff --git a/grep-regex/src/lib.rs b/grep-regex/src/lib.rs
new file mode 100644
index 00000000..a578d0fc
--- /dev/null
+++ b/grep-regex/src/lib.rs
@@ -0,0 +1,27 @@
+/*!
+An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
+*/
+
+#![deny(missing_docs)]
+
+extern crate grep_matcher;
+#[macro_use]
+extern crate log;
+extern crate regex;
+extern crate regex_syntax;
+extern crate thread_local;
+extern crate utf8_ranges;
+
+pub use error::{Error, ErrorKind};
author	Andrew Gallant <jamslam@gmail.com>	2018-04-29 09:29:52 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2018-08-20 07:10:19 -0400
commit	d9ca5293569efb255608d3c601107bcfe7060f15 (patch)
tree	7fd8611c333a2f7d703987de3a379ee8292013e2 /grep-regex
parent	0958837ee104985412f08e81b6f08df1e5291042 (diff)