author    Andrew Gallant <jamslam@gmail.com>  2018-08-03 17:26:22 -0400
committer Andrew Gallant <jamslam@gmail.com>  2018-08-20 07:10:19 -0400
commit    bb110c1ebeeda452046830b3991f705f5759da92 (patch)
tree      cc2b0112a3ca9b8d05cf1e953553907d71564082 /grep
parent    d9ca5293569efb255608d3c601107bcfe7060f15 (diff)
ripgrep: migrate to libripgrep
This commit does the work to delete the old `grep` crate and effectively rewrite most of ripgrep core to use the new libripgrep crates. The new `grep` crate is now a facade that collects the various crates that make up libripgrep. The most complex part of ripgrep core is now arguably the translation between command line parameters and the library options, which is ultimately where we want to be.
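For orientation, the sketch below shows what searching through the new facade looks like. It is modeled on the `simplegrep` example added in this commit, with the `sinks::UTF8` convenience sink from `grep-searcher` assumed for brevity:

```rust
extern crate grep;

use std::error::Error;

use grep::regex::RegexMatcher;
use grep::searcher::SearcherBuilder;
use grep::searcher::sinks::UTF8;

fn main() -> Result<(), Box<Error>> {
    // The facade re-exports grep-regex and grep-searcher as `regex` and
    // `searcher`; the line matcher mirrors the simplegrep example below.
    let matcher = RegexMatcher::new_line_matcher(r"Watson")?;
    let mut searcher = SearcherBuilder::new().build();
    searcher.search_slice(
        &matcher,
        b"Sherlock Holmes\nDr. Watson\n",
        UTF8(|line_number, line| {
            print!("{}: {}", line_number, line);
            Ok(true)
        }),
    )?;
    Ok(())
}
```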
Diffstat (limited to 'grep')
-rw-r--r--  grep/Cargo.toml              |  21
-rw-r--r--  grep/README.md               |  41
-rw-r--r--  grep/examples/simplegrep.rs  | 107
-rw-r--r--  grep/src/lib.rs              |  94
-rw-r--r--  grep/src/literals.rs         | 274
-rw-r--r--  grep/src/nonl.rs             |  74
-rw-r--r--  grep/src/search.rs           | 317
-rw-r--r--  grep/src/smart_case.rs       | 191
-rw-r--r--  grep/src/word_boundary.rs    |  53
9 files changed, 178 insertions(+), 994 deletions(-)
diff --git a/grep/Cargo.toml b/grep/Cargo.toml
index 562bde1e..e6e2fc07 100644
--- a/grep/Cargo.toml
+++ b/grep/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "grep"
-version = "0.1.9" #:version
+version = "0.2.0" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Fast line oriented regex searching as a library.
@@ -13,7 +13,18 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
-log = "0.4"
-memchr = "2"
-regex = "1"
-regex-syntax = "0.6"
+grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
+grep-pcre2 = { version = "0.0.1", path = "../grep-pcre2", optional = true }
+grep-printer = { version = "0.0.1", path = "../grep-printer" }
+grep-regex = { version = "0.0.1", path = "../grep-regex" }
+grep-searcher = { version = "0.0.1", path = "../grep-searcher" }
+
+[dev-dependencies]
+atty = "0.2.11"
+termcolor = "1"
+walkdir = "2.2.0"
+
+[features]
+avx-accel = ["grep-searcher/avx-accel"]
+simd-accel = ["grep-searcher/simd-accel"]
+pcre2 = ["grep-pcre2"]
diff --git a/grep/README.md b/grep/README.md
index 86cc8c2c..c376d8af 100644
--- a/grep/README.md
+++ b/grep/README.md
@@ -1,4 +1,41 @@
grep
----
-This is a *library* that provides grep-style line-by-line regex searching (with
-comparable performance to `grep` itself).
+ripgrep, as a library.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep)
+[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
+[![](https://img.shields.io/crates/v/grep.svg)](https://crates.io/crates/grep)
+
+Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
+
+
+### Documentation
+
+[https://docs.rs/grep](https://docs.rs/grep)
+
+NOTE: This crate isn't ready for wide use yet. Ambitious individuals can
+probably piece together the parts, but there is no high level documentation
+describing how all of the pieces fit together.
+
+
+### Usage
+
+Add this to your `Cargo.toml`:
+
+```toml
+[dependencies]
+grep = "0.2"
+```
+
+and this to your crate root:
+
+```rust
+extern crate grep;
+```
+
+
+### Features
+
+This crate provides a `pcre2` feature (disabled by default) which, when
+enabled, re-exports the `grep-pcre2` crate as an alternative `Matcher`
+implementation to the standard `grep-regex` implementation.
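For completeness, a downstream `Cargo.toml` would likely enable that feature like this (a sketch; the feature name comes from the `[features]` table in the Cargo.toml diff above):

```toml
[dependencies]
grep = { version = "0.2", features = ["pcre2"] }
```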
diff --git a/grep/examples/simplegrep.rs b/grep/examples/simplegrep.rs
new file mode 100644
index 00000000..fb2d4001
--- /dev/null
+++ b/grep/examples/simplegrep.rs
@@ -0,0 +1,107 @@
+extern crate atty;
+extern crate grep;
+extern crate termcolor;
+extern crate walkdir;
+
+use std::env;
+use std::error;
+use std::ffi::OsString;
+use std::path::Path;
+use std::process;
+use std::result;
+
+use grep::printer::{ColorSpecs, StandardBuilder};
+use grep::regex::RegexMatcher;
+use grep::searcher::{BinaryDetection, SearcherBuilder};
+use termcolor::{ColorChoice, StandardStream};
+use walkdir::WalkDir;
+
+macro_rules! fail {
+ ($($tt:tt)*) => {
+ return Err(From::from(format!($($tt)*)));
+ }
+}
+
+type Result<T> = result::Result<T, Box<error::Error>>;
+
+fn main() {
+ if let Err(err) = try_main() {
+ eprintln!("{}", err);
+ process::exit(1);
+ }
+}
+
+fn try_main() -> Result<()> {
+ let mut args: Vec<OsString> = env::args_os().collect();
+ if args.len() < 2 {
+ fail!("Usage: simplegrep <pattern> [<path> ...]");
+ }
+ if args.len() == 2 {
+ args.push(OsString::from("./"));
+ }
+ let pattern = match args[1].clone().into_string() {
+ Ok(pattern) => pattern,
+ Err(_) => {
+ fail!(
+ "pattern is not valid UTF-8: '{:?}'",
+ args[1].to_string_lossy()
+ );
+ }
+ };
+ search(&pattern, &args[2..])
+}
+
+fn search(pattern: &str, paths: &[OsString]) -> Result<()> {
+ let matcher = RegexMatcher::new_line_matcher(&pattern)?;
+ let mut searcher = SearcherBuilder::new()
+ .binary_detection(BinaryDetection::quit(b'\x00'))
+ .build();
+ let mut printer = StandardBuilder::new()
+ .color_specs(colors())
+ .build(StandardStream::stdout(color_choice()));
+
+ for path in paths {
+ for result in WalkDir::new(path) {
+ let dent = match result {
+ Ok(dent) => dent,
+ Err(err) => {
+ eprintln!(
+ "{}: {}",
+ err.path().unwrap_or(Path::new("error")).display(),
+ err,
+ );
+ continue;
+ }
+ };
+ if !dent.file_type().is_file() {
+ continue;
+ }
+ let result = searcher.search_path(
+ &matcher,
+ dent.path(),
+ printer.sink_with_path(&matcher, dent.path()),
+ );
+ if let Err(err) = result {
+ eprintln!("{}: {}", dent.path().display(), err);
+ }
+ }
+ }
+ Ok(())
+}
+
+fn color_choice() -> ColorChoice {
+ if atty::is(atty::Stream::Stdout) {
+ ColorChoice::Auto
+ } else {
+ ColorChoice::Never
+ }
+}
+
+fn colors() -> ColorSpecs {
+ ColorSpecs::new(&[
+ "path:fg:magenta".parse().unwrap(),
+ "line:fg:green".parse().unwrap(),
+ "match:fg:red".parse().unwrap(),
+ "match:style:bold".parse().unwrap(),
+ ])
+}
diff --git a/grep/src/lib.rs b/grep/src/lib.rs
index 023cd64a..ab0d78eb 100644
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@@ -1,84 +1,22 @@
-#![deny(missing_docs)]
-
/*!
-A fast line oriented regex searcher.
-*/
-
-#[macro_use]
-extern crate log;
-extern crate memchr;
-extern crate regex;
-extern crate regex_syntax as syntax;
-
-use std::error;
-use std::fmt;
-use std::result;
-
-pub use search::{Grep, GrepBuilder, Iter, Match};
+ripgrep, as a library.
-mod literals;
-mod nonl;
-mod search;
-mod smart_case;
-mod word_boundary;
+This library is intended to provide a high level facade to the crates that
+make up ripgrep's core searching routines. However, there is no high level
+documentation available yet guiding users on how to fit all of the pieces
+together.
-/// Result is a convenient type alias that fixes the type of the error to
-/// the `Error` type defined in this crate.
-pub type Result<T> = result::Result<T, Error>;
+Every public API item in the constituent crates is documented, but examples
+are sparse.
-/// Error enumerates the list of possible error conditions when building or
-/// using a `Grep` line searcher.
-#[derive(Debug)]
-pub enum Error {
- /// An error from parsing or compiling a regex.
- Regex(regex::Error),
- /// This error occurs when an illegal literal was found in the regex
- /// pattern. For example, if the line terminator is `\n` and the regex
- /// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error.
- LiteralNotAllowed(char),
- /// An unused enum variant that indicates this enum may be expanded in
- /// the future and therefore should not be exhaustively matched.
- #[doc(hidden)]
- __Nonexhaustive,
-}
-
-impl error::Error for Error {
- fn description(&self) -> &str {
- match *self {
- Error::Regex(ref err) => err.description(),
- Error::LiteralNotAllowed(_) => "use of forbidden literal",
- Error::__Nonexhaustive => unreachable!(),
- }
- }
-
- fn cause(&self) -> Option<&error::Error> {
- match *self {
- Error::Regex(ref err) => err.cause(),
- _ => None,
- }
- }
-}
-
-impl fmt::Display for Error {
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- match *self {
- Error::Regex(ref err) => err.fmt(f),
- Error::LiteralNotAllowed(chr) => {
- write!(f, "Literal {:?} not allowed.", chr)
- }
- Error::__Nonexhaustive => unreachable!(),
- }
- }
-}
+A cookbook and a guide are planned.
+*/
-impl From<regex::Error> for Error {
- fn from(err: regex::Error) -> Error {
- Error::Regex(err)
- }
-}
+#![deny(missing_docs)]
-impl From<syntax::Error> for Error {
- fn from(err: syntax::Error) -> Error {
- Error::Regex(regex::Error::Syntax(err.to_string()))
- }
-}
+pub extern crate grep_matcher as matcher;
+#[cfg(feature = "pcre2")]
+pub extern crate grep_pcre2 as pcre2;
+pub extern crate grep_printer as printer;
+pub extern crate grep_regex as regex;
+pub extern crate grep_searcher as searcher;
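A brief sketch of why the facade groups these crates: downstream code is written against the `Matcher` trait from `grep::matcher`, so the default `grep::regex::RegexMatcher` and, with the `pcre2` feature, `grep::pcre2::RegexMatcher` are interchangeable. The helper below is illustrative only:

```rust
extern crate grep;

use grep::matcher::Matcher;

// Works with any Matcher implementation, e.g. grep::regex::RegexMatcher or
// (behind the `pcre2` feature) grep::pcre2::RegexMatcher.
fn hit<M: Matcher>(matcher: &M, line: &[u8]) -> bool {
    matcher.is_match(line).unwrap_or(false)
}

fn main() {
    let m = grep::regex::RegexMatcher::new(r"\bWatson\b").unwrap();
    assert!(hit(&m, b"Dr. Watson"));
    assert!(!hit(&m, b"Watsonian"));
}
```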
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
deleted file mode 100644
index 5e3dc8ea..00000000
--- a/grep/src/literals.rs
+++ /dev/null
@@ -1,274 +0,0 @@
-/*!
-The literals module is responsible for extracting *inner* literals out of the
-AST of a regular expression. Normally this is the job of the regex engine
-itself, but the regex engine doesn't look for inner literals. Since we're doing
-line based searching, we can use them, so we need to do it ourselves.
-
-Note that this implementation is incredibly suspicious. We need something more
-principled.
-*/
-use std::cmp;
-
-use regex::bytes::RegexBuilder;
-use syntax::hir::{self, Hir, HirKind};
-use syntax::hir::literal::{Literal, Literals};
-
-#[derive(Clone, Debug)]
-pub struct LiteralSets {
- prefixes: Literals,
- suffixes: Literals,
- required: Literals,
-}
-
-impl LiteralSets {
- pub fn create(expr: &Hir) -> Self {
- let mut required = Literals::empty();
- union_required(expr, &mut required);
- LiteralSets {
- prefixes: Literals::prefixes(expr),
- suffixes: Literals::suffixes(expr),
- required: required,
- }
- }
-
- pub fn to_regex_builder(&self) -> Option<RegexBuilder> {
- if self.prefixes.all_complete() && !self.prefixes.is_empty() {
- debug!("literal prefixes detected: {:?}", self.prefixes);
- // When this is true, the regex engine will do a literal scan.
- return None;
- }
-
- // Out of inner required literals, prefixes and suffixes, which one
- // is the longest? We pick the longest to do fast literal scan under
- // the assumption that a longer literal will have a lower false
- // positive rate.
- let pre_lcp = self.prefixes.longest_common_prefix();
- let pre_lcs = self.prefixes.longest_common_suffix();
- let suf_lcp = self.suffixes.longest_common_prefix();
- let suf_lcs = self.suffixes.longest_common_suffix();
-
- let req_lits = self.required.literals();
- let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
- None => &[],
- Some(req) => &***req,
- };
-
- let mut lit = pre_lcp;
- if pre_lcs.len() > lit.len() {
- lit = pre_lcs;
- }
- if suf_lcp.len() > lit.len() {
- lit = suf_lcp;
- }
- if suf_lcs.len() > lit.len() {
- lit = suf_lcs;
- }
- if req_lits.len() == 1 && req.len() > lit.len() {
- lit = req;
- }
-
- // Special case: if we have any literals that are all whitespace,
- // then this is probably a failing of the literal detection since
- // whitespace is typically pretty common. In this case, don't bother
- // with inner literal scanning at all and just defer to the regex.
- let any_all_white = req_lits.iter()
- .any(|lit| lit.iter().all(|&b| (b as char).is_whitespace()));
- if any_all_white {
- return None;
- }
-
- // Special case: if we detected an alternation of inner required
- // literals and its longest literal is bigger than the longest
- // prefix/suffix, then choose the alternation. In practice, this
- // helps with case insensitive matching, which can generate lots of
- // inner required literals.
- let any_empty = req_lits.iter().any(|lit| lit.is_empty());
- if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
- debug!("required literals found: {:?}", req_lits);
- let alts: Vec<String> =
- req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
- let mut builder = RegexBuilder::new(&alts.join("|"));
- builder.unicode(false);
- Some(builder)
- } else if lit.is_empty() {
- None
- } else {
- debug!("required literal found: {:?}", show(lit));
- let mut builder = RegexBuilder::new(&bytes_to_regex(&lit));
- builder.unicode(false);
- Some(builder)
- }
- }
-}
-
-fn union_required(expr: &Hir, lits: &mut Literals) {
- match *expr.kind() {
- HirKind::Literal(hir::Literal::Unicode(c)) => {
- let mut buf = [0u8; 4];
- lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
- }
- HirKind::Literal(hir::Literal::Byte(b)) => {
- lits.cross_add(&[b]);
- }
- HirKind::Class(hir::Class::Unicode(ref cls)) => {
- if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
- lits.cut();
- }
- }
- HirKind::Class(hir::Class::Bytes(ref cls)) => {
- if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
- lits.cut();
- }
- }
- HirKind::Group(hir::Group { ref hir, .. }) => {
- union_required(&**hir, lits);
- }
- HirKind::Repetition(ref x) => {
- match x.kind {
- hir::RepetitionKind::ZeroOrOne => lits.cut(),
- hir::RepetitionKind::ZeroOrMore => lits.cut(),
- hir::RepetitionKind::OneOrMore => {
- union_required(&x.hir, lits);
- lits.cut();
- }
- hir::RepetitionKind::Range(ref rng) => {
- let (min, max) = match *rng {
- hir::RepetitionRange::Exactly(m) => (m, Some(m)),
- hir::RepetitionRange::AtLeast(m) => (m, None),
- hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
- };
- repeat_range_literals(
- &x.hir, min, max, x.greedy, lits, union_required);
- }
- }
- }
- HirKind::Concat(ref es) if es.is_empty() => {}
- HirKind::Concat(ref es) if es.len() == 1 => {
- union_required(&es[0], lits)
- }
- HirKind::Concat(ref es) => {
- for e in es {
- let mut lits2 = lits.to_empty();
- union_required(e, &mut lits2);
- if lits2.is_empty() {
- lits.cut();
- continue;
- }
- if lits2.contains_empty() {
- lits.cut();
- }
- if !lits.cross_product(&lits2) {
- // If this expression couldn't yield any literal that
- // could be extended, then we need to quit. Since we're
- // short-circuiting, we also need to freeze every member.
- lits.cut();
- break;
- }
- }
- }
- HirKind::Alternation(ref es) => {
- alternate_literals(es, lits, union_required);
- }
- _ => lits.cut(),
- }
-}
-
-fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
- e: &Hir,
- min: u32,
- max: Option<u32>,
- _greedy: bool,
- lits: &mut Literals,
- mut f: F,
-) {
- if min == 0 {
- // This is a bit conservative. If `max` is set, then we could
- // treat this as a finite set of alternations. For now, we
- // just treat it as `e*`.
- lits.cut();
- } else {
- let n = cmp::min(lits.limit_size(), min as usize);
- // We only extract literals from a single repetition, even though
- // we could do more. e.g., `a{3}` will have `a` extracted instead of
- // `aaa`. The reason is that inner literal extraction can't be unioned
- // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
- // is wrong.
- f(e, lits);
- if n < min as usize {
- lits.cut();
- }
- if max.map_or(true, |max| min < max) {
- lits.cut();
- }
- }
-}
-
-fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
- es: &[Hir],
- lits: &mut Literals,
- mut f: F,
-) {
- let mut lits2 = lits.to_empty();
- for e in es {
- let mut lits3 = lits.to_empty();
- lits3.set_limit_size(lits.limit_size() / 5);
- f(e, &mut lits3);
- if lits3.is_empty() || !lits2.union(lits3) {
- // If we couldn't find suffixes for *any* of the
- // alternates, then the entire alternation has to be thrown
- // away and any existing members must be frozen. Similarly,
- // if the union couldn't complete, stop and freeze.
- lits.cut();
- return;
- }
- }
- // All we do at the moment is look for prefixes and suffixes. If both
- // are empty, then we report nothing. We should be able to do better than
- // this, but we'll need something more expressive than just a "set of
- // literals."
- let lcp = lits2.longest_common_prefix();
- let lcs = lits2.longest_common_suffix();
- if !lcp.is_empty() {
- lits.cross_add(lcp);
- }
- lits.cut();
- if !lcs.is_empty() {
- lits.add(Literal::empty());
- lits.add(Literal::new(lcs.to_vec()));
- }
-}
-
-/// Return the number of characters in the given class.
-fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
- cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
-}
-
-/// Return the number of bytes in the given class.
-fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
- cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
-}
-
-/// Converts an arbitrary sequence of bytes to a literal suitable for building
-/// a regular expression.
-fn bytes_to_regex(bs: &[u8]) -> String {
- let mut s = String::with_capacity(bs.len());
- for &b in bs {
- s.push_str(&format!("\\x{:02x}", b));
- }
- s
-}
-
-/// Converts arbitrary bytes to a nice string.
-fn show(bs: &[u8]) -> String {
- // Why aren't we using this to feed to the regex? Doesn't really matter
- // I guess. ---AG
- use std::ascii::escape_default;
- use std::str;
-
- let mut nice = String::new();
- for &b in bs {
- let part: Vec<u8> = escape_default(b).collect();
- nice.push_str(str::from_utf8(&part).unwrap());
- }
- nice
-}
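The module deleted above implemented the prefilter idea sketched below: pull a required inner literal out of the pattern, scan for that cheap literal first, and only run the full regex on the enclosing line of each candidate hit. This sketch is illustrative and hand-picks the literal rather than extracting it from the regex AST:

```rust
extern crate regex;

use regex::bytes::Regex;

// Naive substring search; the real code used memchr-accelerated scanning.
fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    haystack.windows(needle.len()).position(|w| w == needle)
}

// Report every line that contains `required` and also matches `re`.
fn matching_lines<'a>(re: &Regex, required: &[u8], buf: &'a [u8]) -> Vec<&'a [u8]> {
    let mut out = Vec::new();
    let mut pos = 0;
    while let Some(i) = find(&buf[pos..], required).map(|i| pos + i) {
        // Expand the candidate hit to its enclosing line.
        let start = buf[..i].iter().rposition(|&b| b == b'\n').map_or(0, |p| p + 1);
        let end = buf[i..].iter().position(|&b| b == b'\n').map_or(buf.len(), |p| i + p + 1);
        if re.is_match(&buf[start..end]) {
            out.push(&buf[start..end]);
        }
        pos = end;
    }
    out
}

fn main() {
    let re = Regex::new(r"\w+foo\w+").unwrap();
    // `foo` stands in for the inner literal the deleted module would extract.
    let lines = matching_lines(&re, b"foo", b"a xfooy b\nno match here\n");
    assert_eq!(lines, vec![&b"a xfooy b\n"[..]]);
}
```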
diff --git a/grep/src/nonl.rs b/grep/src/nonl.rs
deleted file mode 100644
index 3beb5f61..00000000
--- a/grep/src/nonl.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-use syntax::hir::{self, Hir, HirKind};
-
-use {Error, Result};
-
-/// Returns a new expression that is guaranteed to never match the given
-/// ASCII character.
-///
-/// If the expression contains the literal byte, then an error is returned.
-///
-/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
-/// function panics.
-pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
- assert!(byte <= 0x7F);
- let chr = byte as char;
- assert!(chr.len_utf8() == 1);
-
- Ok(match expr.into_kind() {
- HirKind::Empty => Hir::empty(),
- HirKind::Literal(hir::Literal::Unicode(c)) => {
- if c == chr {
- return Err(Error::LiteralNotAllowed(chr));
- }
- Hir::literal(hir::Literal::Unicode(c))
- }
- HirKind::Literal(hir::Literal::Byte(b)) => {
- if b as char == chr {
- return Err(Error::LiteralNotAllowed(chr));
- }
- Hir::literal(hir::Literal::Byte(b))
- }
- HirKind::Class(hir::Class::Unicode(mut cls)) => {
- let remove = hir::ClassUnicode::new(Some(
- hir::ClassUnicodeRange::new(chr, chr),
- ));
- cls.difference(&remove);
- if cls.iter().next().is_none() {
- return Err(Error::LiteralNotAllowed(chr));
- }
- Hir::class(hir::Class::Unicode(cls))
- }
- HirKind::Class(hir::Class::Bytes(mut cls)) => {
- let remove = hir::ClassBytes::new(Some(
- hir::ClassBytesRange::new(byte, byte),
- ));
- cls.difference(&remove);
- if cls.iter().next().is_none() {
- return Err(Error::LiteralNotAllowed(chr));
- }
- Hir::class(hir::Class::Bytes(cls))
- }
- HirKind::Anchor(x) => Hir::anchor(x),
- HirKind::WordBoundary(x) => Hir::word_boundary(x),
- HirKind::Repetition(mut x) => {
- x.hir = Box::new(remove(*x.hir, byte)?);
- Hir::repetition(x)
- }
- HirKind::Group(mut x) => {
- x.hir = Box::new(remove(*x.hir, byte)?);
- Hir::group(x)
- }
- HirKind::Concat(xs) => {
- let xs = xs.into_iter()
- .map(|e| remove(e, byte))
- .collect::<Result<Vec<Hir>>>()?;
- Hir::concat(xs)
- }
- HirKind::Alternation(xs) => {
- let xs = xs.into_iter()
- .map(|e| remove(e, byte))
- .collect::<Result<Vec<Hir>>>()?;
- Hir::alternation(xs)
- }
- })
-}
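The deleted `nonl` pass existed because character classes match the line terminator unless it is explicitly subtracted, which would let a single "line" match span two real lines. A small illustration using only the regex crate (not the removed API):

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // `[^a]` includes `\n`, so this match crosses a line boundary...
    assert!(Regex::new(r"foo[^a]bar").unwrap().is_match(b"foo\nbar"));
    // ...while subtracting `\n` from the class (what `nonl::remove` did on
    // the regex HIR) keeps matches confined to one line.
    assert!(!Regex::new(r"foo[^a\n]bar").unwrap().is_match(b"foo\nbar"));
}
```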
diff --git a/grep/src/search.rs b/grep/src/search.rs
deleted file mode 100644
index af7d680d..00000000
--- a/grep/src/search.rs
+++ /dev/null
@@ -1,317 +0,0 @@
-use memchr::{memchr, memrchr};
-use syntax::ParserBuilder;
-use syntax::hir::Hir;
-use regex::bytes::{Regex, RegexBuilder};
-
-use literals::LiteralSets;
-use nonl;
-use smart_case::Cased;
-use word_boundary::strip_unicode_word_boundaries;
-use Result;
-
-/// A matched line.
-#[derive(Clone, Debug, Default, Eq, PartialEq)]
-pub struct Match {
- start: usize,
- end: usize,
-}
-
-impl Match {
- /// Create a new empty match value.
- pub fn new() -> Match {
- Match::default()
- }
-
- /// Return the starting byte offset of the line that matched.
- #[inline]
- pub fn start(&self) -> usize {
- self.start
- }
-
- /// Return the ending byte offset of the line that matched.
- #[inline]
- pub fn end(&self) -> usize {
- self.end
- }
-}
-
-/// A fast line oriented regex searcher.
-#[derive(Clone, Debug)]
-pub struct Grep {
- re: Regex,
- required: Option<Regex>,
- opts: Options,
-}
-
-/// A builder for a grep searcher.
-#[derive(Clone, Debug)]
-pub struct GrepBuilder {
- pattern: String,
- opts: Options,
-}
-
-#[derive(Clone, Debug)]
-struct Options {
- case_insensitive: bool,
- case_smart: bool,
- line_terminator: u8,
- size_limit: usize,
- dfa_size_limit: usize,
-}
-
-impl Default for Options {
- fn default() -> Options {
- Options {
- case_insensitive: false,
- case_smart: false,
- line_terminator: b'\n',
- size_limit: 10 * (1 << 20),
- dfa_size_limit: 10 * (1 << 20),
- }
- }
-}
-
-impl GrepBuilder {
- /// Create a new builder for line searching.
- ///
- /// The pattern given should be a regular expression. The precise syntax
- /// supported is documented on the regex crate.
- pub fn new(pattern: &str) -> GrepBuilder {
- GrepBuilder {
- pattern: pattern.to_string(),
- opts: Options::default(),
- }
- }
-
- /// Set the line terminator.
- ///
- /// The line terminator can be any ASCII character and serves to delineate
- /// the match boundaries in the text searched.
- ///
- /// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII).
- pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder {
- assert!(ascii_byte <= 0x7F);
- self.opts.line_terminator = ascii_byte;
- self
- }
-
- /// Set the case sensitive flag (`i`) on the regex.
- pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder {
- self.opts.case_insensitive = yes;
- self
- }
-
- /// Whether to enable smart case search or not (disabled by default).
- ///
- /// Smart case uses case insensitive search if the pattern contains only
- /// lowercase characters (ignoring any characters which immediately follow
- /// a '\'). Otherwise, a case sensitive search is used instead.
- ///
- /// Enabling the case_insensitive flag overrides this.
- pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
- self.opts.case_smart = yes;
- self
- }
-
- /// Set the approximate size limit of the compiled regular expression.
- ///
- /// This roughly corresponds to the number of bytes occupied by a
- /// single compiled program. If the program exceeds this number, then a
- /// compilation error is returned.
- pub fn size_limit(mut self, limit: usize) -> GrepBuilder {
- self.opts.size_limit = limit;
- self
- }
-
- /// Set the approximate size of the cache used by the DFA.
- ///
- /// This roughly corresponds to the number of bytes that the DFA will use
- /// while searching.
- ///
- /// Note that this is a per thread limit. There is no way to set a global
- /// limit. In particular, if a regex is used from multiple threads
- /// simulanteously, then each thread may use up to the number of bytes
- /// specified here.
- pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder {
- self.opts.dfa_size_limit = limit;
- self
- }
-
- /// Create a line searcher.
- ///
- /// If there was a problem parsing or compiling the regex with the given
- /// options, then an error is returned.
- pub fn build(self) -> Result<Grep> {
- let expr = self.parse()?;
- let literals = LiteralSets::create(&expr);
- let re = self.regex(&expr)?;
- let required = match literals.to_regex_builder() {
- Some(builder) => Some(self.regex_build(builder)?),
- None => {
- match strip_unicode_word_boundaries(&expr) {
- None => None,
- Some(expr) => {
- debug!("Stripped Unicode word boundaries. \
- New AST:\n{:?}", expr);
- self.regex(&expr).ok()
- }
- }
- }
- };
- Ok(Grep {
- re: re,
- required: required,
- opts: self.opts,
- })
- }
-
- /// Creates a new regex from the given expression with the current
- /// configuration.
- fn regex(&self, expr: &Hir) -> Result<Regex> {
- let mut builder = RegexBuilder::new(&expr.to_string());
- builder.unicode(true);
- self.regex_build(builder)
- }
-
- /// Builds a new regex from the given builder using the caller's settings.
- fn regex_build(&self, mut builder: RegexBuilder) -> Result<Regex> {
- builder
- .multi_line(true)
- .size_limit(self.opts.size_limit)
- .dfa_size_limit(self.opts.dfa_size_limit)
- .build()
- .map_err(From::from)
- }
-
- /// Parses the underlying pattern and ensures the pattern can never match
- /// the line terminator.
- fn parse(&self) -> Result<Hir> {
- let expr = ParserBuilder::new()
- .allow_invalid_utf8(true)
- .case_insensitive(self.is_case_insensitive()?)
- .multi_line(true)
- .build()
- .parse(&self.pattern)?;
- debug!("original regex HIR pattern:\n{}", expr);
- let expr = nonl::remove(expr, self.opts.line_terminator)?;
- debug!("transformed regex HIR pattern:\n{}", expr);
- Ok(expr)
- }
-
- /// Determines whether the case insensitive flag should be enabled or not.
- fn is_case_insensitive(&self) -> Result<bool> {
- if self.opts.case_insensitive {
- return Ok(true);
- }
- if !self.opts.case_smart {
- return Ok(false);
- }
- let cased = match Cased::from_pattern(&self.pattern) {
- None => return Ok(false),
- Some(cased) => cased,
- };
- Ok(cased.any_literal && !cased.any_uppercase)
- }
-}
-
-impl Grep {
- /// Returns a reference to the underlying regex used by the searcher.
- pub fn regex(&self) -> &Regex {
- &self.re
- }
-
- /// Returns an iterator over all matches in the given buffer.
- pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> {
- Iter {
- searcher: self,
- buf: buf,
- start: 0,
- }
- }
-
- /// Fills in the next line that matches in the given buffer starting at
- /// the position given.
- ///
- /// If no match could be found, `false` is returned, otherwise, `true` is
- /// returned.
- pub fn read_match(
- &self,
- mat: &mut Match,
- buf: &[u8],
- mut start: usize,
- ) -> bool {
- if start >= buf.len() {
- return false;
- }
- if let Some(ref req) = self.required {
- while start < buf.len() {
- let e = match req.shortest_match(&buf[start..]) {
- None => return false,
- Some(e) => start + e,
- };
- let (prevnl, nextnl) = self.find_line(buf, e, e);
- match self.re.shortest_match(&buf[prevnl..nextnl]) {
- None => {
- start = nextnl;
- continue;
- }
- Some(_) => {
- self.fill_match(mat, prevnl, nextnl);
- return true;
- }
- }
- }
- false
- } else {
- let e = match self.re.shortest_match(&buf[start..]) {
- None => return false,
- Some(e) => start + e,
- };
- let (s, e) = self.find_line(buf, e, e);
- self.fill_match(mat, s, e);
- true
- }
- }
-
- fn fill_match(&self, mat: &mut Match, start: usize, end: usize) {
- mat.start = start;
- mat.end = end;
- }
-
- fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) {
- (self.find_line_start(buf, s), self.find_line_end(buf, e))
- }
-
- fn find_line_start(&self, buf: &[u8], pos: usize) -> usize {
- memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1)
- }
-
- fn find_line_end(&self, buf: &[u8], pos: usize) -> usize {
- memchr(self.opts.line_terminator, &buf[pos..])
- .map_or(buf.len(), |i| pos + i + 1)
- }
-}
-
-/// An iterator over all matches in a particular buffer.
-///
-/// `'b` refers to the lifetime of the buffer, and `'s` refers to the lifetime
-/// of the searcher.
-pub struct Iter<'b, 's> {
- searcher: &'s Grep,
- buf: &'b [u8],
- start: usize,
-}
-
-impl<'b, 's> Iterator for Iter<'b, 's> {
- type Item = Match;
-
- fn next(&mut self) -> Option<Match> {
- let mut mat = Match::default();
- if !self.searcher.read_match(&mut mat, self.buf, self.start) {
- self.start = self.buf.len();
- return None;
- }
- self.start = mat.end;
- Some(mat)
- }
-}
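For reference, this is roughly how the removed `Grep` API was driven, per the deleted code above (the sketch assumes the pre-0.2 `grep` crate; `grep-regex` and `grep-searcher` replace it):

```rust
extern crate grep;

use grep::{GrepBuilder, Match};

fn main() {
    // Smart case: an all-lowercase pattern searches case insensitively.
    let searcher = GrepBuilder::new(r"watson")
        .case_smart(true)
        .build()
        .unwrap();
    let buf = b"Sherlock Holmes\nDr. Watson\n";
    let lines: Vec<Match> = searcher.iter(buf).collect();
    // Each `Match` is the byte range of an entire matching line.
    assert_eq!(lines[0].start(), 16);
    assert_eq!(lines[0].end(), buf.len());
}
```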
diff --git a/grep/src/smart_case.rs b/grep/src/smart_case.rs
deleted file mode 100644
index 1379b326..00000000
--- a/