diff options
author | Andrew Gallant <jamslam@gmail.com> | 2016-06-20 16:53:48 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2016-06-20 16:55:13 -0400 |
commit | 0163b39faa14aa328ca93897a17e4d87a87bacc4 (patch) | |
tree | 4fabf9cf0d4883518073674627bd357868214d8c /grep | |
parent | 8d9d60294562bdf37b8a61661c8874d60026f815 (diff) |
refactor progress
Diffstat (limited to 'grep')
-rw-r--r-- | grep/Cargo.toml | 20 | ||||
-rw-r--r-- | grep/src/lib.rs | 72 | ||||
-rw-r--r-- | grep/src/literals.rs | 207 | ||||
-rw-r--r-- | grep/src/nonl.rs | 65 | ||||
-rw-r--r-- | grep/src/search.rs | 307 |
5 files changed, 671 insertions, 0 deletions
diff --git a/grep/Cargo.toml b/grep/Cargo.toml new file mode 100644 index 00000000..5da201e9 --- /dev/null +++ b/grep/Cargo.toml @@ -0,0 +1,20 @@ +[package] +publish = false +name = "grep" +version = "0.1.0" #:version +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = """ +Fast line oriented regex searching as a library. +""" +documentation = "https://github.com/BurntSushi/xrep" +homepage = "https://github.com/BurntSushi/xrep" +repository = "https://github.com/BurntSushi/xrep" +readme = "README.md" +keywords = ["regex", "grep", "egrep", "search", "pattern"] +license = "Unlicense/MIT" + +[dependencies] +memchr = "0.1" +memmap = "0.2" +regex = { version = "0.1", path = "/home/andrew/rust/regex" } +regex-syntax = { version = "0.3.1", path = "/home/andrew/rust/regex/regex-syntax" } diff --git a/grep/src/lib.rs b/grep/src/lib.rs new file mode 100644 index 00000000..d45b142f --- /dev/null +++ b/grep/src/lib.rs @@ -0,0 +1,72 @@ +extern crate memchr; +extern crate regex; +extern crate regex_syntax as syntax; + +use std::error; +use std::fmt; +use std::result; + +pub use search::{Grep, GrepBuilder}; + +mod literals; +mod nonl; +mod search; + +/// Result is a convenient type alias that fixes the type of the error to +/// the `Error` type defined in this crate. +pub type Result<T> = result::Result<T, Error>; + +/// Error enumerates the list of possible error conditions when building or +/// using a `Grep` line searcher. +#[derive(Debug)] +pub enum Error { + /// An error from parsing or compiling a regex. + Regex(regex::Error), + /// This error occurs when an illegal literal was found in the regex + /// pattern. For example, if the line terminator is `\n` and the regex + /// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error. + LiteralNotAllowed(char), + #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + fn description(&self) -> &str { + match *self { + Error::Regex(ref err) => err.description(), + Error::LiteralNotAllowed(_) => "use of forbidden literal", + Error::__Nonexhaustive => unreachable!(), + } + } + + fn cause(&self) -> Option<&error::Error> { + match *self { + Error::Regex(ref err) => err.cause(), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Error::Regex(ref err) => err.fmt(f), + Error::LiteralNotAllowed(chr) => { + write!(f, "Literal '{}' not allowed.", chr) + } + Error::__Nonexhaustive => unreachable!(), + } + } +} + +impl From<regex::Error> for Error { + fn from(err: regex::Error) -> Error { + Error::Regex(err) + } +} + +impl From<syntax::Error> for Error { + fn from(err: syntax::Error) -> Error { + Error::Regex(regex::Error::Syntax(err)) + } +} diff --git a/grep/src/literals.rs b/grep/src/literals.rs new file mode 100644 index 00000000..5408faea --- /dev/null +++ b/grep/src/literals.rs @@ -0,0 +1,207 @@ +use std::cmp; +use std::iter; + +use regex::bytes::Regex; +use syntax::{ + Expr, Literals, Lit, + Repeater, +}; + +#[derive(Debug)] +pub struct LiteralSets { + prefixes: Literals, + suffixes: Literals, + required: Literals, +} + +impl LiteralSets { + pub fn create(expr: &Expr) -> Self { + let mut required = Literals::empty(); + union_required(expr, &mut required); + LiteralSets { + prefixes: expr.prefixes(), + suffixes: expr.suffixes(), + required: required, + } + } + + pub fn to_regex(&self) -> Option<Regex> { + if self.prefixes.all_complete() && !self.prefixes.is_empty() { + // When this is true, the regex engine will do a literal scan. + return None; + } + + // Out of inner required literals, prefixes and suffixes, which one + // is the longest? We pick the longest to do fast literal scan under + // the assumption that a longer literal will have a lower false + // positive rate. + let pre_lcp = self.prefixes.longest_common_prefix(); + let pre_lcs = self.prefixes.longest_common_suffix(); + let suf_lcp = self.suffixes.longest_common_prefix(); + let suf_lcs = self.suffixes.longest_common_suffix(); + + let req_lits = self.required.literals(); + let req = match req_lits.iter().max_by_key(|lit| lit.len()) { + None => &[], + Some(req) => &***req, + }; + + let mut lit = pre_lcp; + if pre_lcs.len() > lit.len() { + lit = pre_lcs; + } + if suf_lcp.len() > lit.len() { + lit = suf_lcp; + } + if suf_lcs.len() > lit.len() { + lit = suf_lcs; + } + if req.len() > lit.len() { + lit = req; + } + if lit.is_empty() { + None + } else { + // Literals always compile. + Some(Regex::new(&bytes_to_regex(lit)).unwrap()) + } + } +} + +fn union_required(expr: &Expr, lits: &mut Literals) { + use syntax::Expr::*; + match *expr { + Literal { ref chars, casei: false } => { + let s: String = chars.iter().cloned().collect(); + lits.cross_add(s.as_bytes()); + } + Literal { casei: true, .. } => { + lits.cut(); + } + LiteralBytes { ref bytes, casei: false } => { + lits.cross_add(bytes); + } + LiteralBytes { casei: true, .. } => { + lits.cut(); + } + Class(_) => { + lits.cut(); + } + ClassBytes(_) => { + lits.cut(); + } + Group { ref e, .. } => { + union_required(&**e, lits); + } + Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(), + Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(), + Repeat { ref e, r: Repeater::OneOrMore, .. } => { + union_required(&**e, lits); + lits.cut(); + } + Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { + repeat_range_literals( + &**e, min, max, greedy, lits, union_required); + } + Concat(ref es) if es.is_empty() => {} + Concat(ref es) if es.len() == 1 => union_required(&es[0], lits), + Concat(ref es) => { + for e in es { + let mut lits2 = lits.to_empty(); + union_required(e, &mut lits2); + if lits2.is_empty() { + lits.cut(); + continue; + } + if lits2.contains_empty() { + lits.cut(); + } + // if !lits.union(lits2) { + if !lits.cross_product(&lits2) { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. + lits.cut(); + break; + } + } + } + Alternate(ref es) => { + alternate_literals(es, lits, union_required); + } + _ => lits.cut(), + } +} + +fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>( + e: &Expr, + min: u32, + max: Option<u32>, + _greedy: bool, + lits: &mut Literals, + mut f: F, +) { + use syntax::Expr::*; + + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + lits.cut(); + } else { + let n = cmp::min(lits.limit_size(), min as usize); + let es = iter::repeat(e.clone()).take(n).collect(); + f(&Concat(es), lits); + if n < min as usize { + lits.cut(); + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals<F: FnMut(&Expr, &mut Literals)>( + es: &[Expr], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + // All we do at the moment is look for prefixes and suffixes. If both + // are empty, then we report nothing. We should be able to do better than + // this, but we'll need something more expressive than just a "set of + // literals." + let lcp = lits2.longest_common_prefix(); + let lcs = lits2.longest_common_suffix(); + if !lcp.is_empty() { + lits.cross_add(lcp); + } + lits.cut(); + if !lcs.is_empty() { + lits.add(Lit::empty()); + lits.add(Lit::new(lcs.to_vec())); + } +} + +/// Converts an arbitrary sequence of bytes to a literal suitable for building +/// a regular expression. +fn bytes_to_regex(bs: &[u8]) -> String { + let mut s = String::with_capacity(bs.len()); + for &b in bs { + s.push_str(&format!("\\x{:02x}", b)); + } + s +} diff --git a/grep/src/nonl.rs b/grep/src/nonl.rs new file mode 100644 index 00000000..e4dad13f --- /dev/null +++ b/grep/src/nonl.rs @@ -0,0 +1,65 @@ +use syntax::Expr; + +use {Error, Result}; + +/// Returns a new expression that is guaranteed to never match the given +/// ASCII character. +/// +/// If the expression contains the literal byte, then an error is returned. +/// +/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this +/// function panics. +pub fn remove(expr: Expr, byte: u8) -> Result<Expr> { + use syntax::Expr::*; + assert!(byte <= 0x7F); + let chr = byte as char; + assert!(chr.len_utf8() == 1); + + Ok(match expr { + Literal { chars, casei } => { + if chars.iter().position(|&c| c == chr).is_some() { + return Err(Error::LiteralNotAllowed(chr)); + } + Literal { chars: chars, casei: casei } + } + LiteralBytes { bytes, casei } => { + if bytes.iter().position(|&b| b == byte).is_some() { + return Err(Error::LiteralNotAllowed(chr)); + } + LiteralBytes { bytes: bytes, casei: casei } + } + AnyChar => AnyCharNoNL, + AnyByte => AnyByteNoNL, + Class(mut cls) => { + cls.remove(chr); + Class(cls) + } + ClassBytes(mut cls) => { + cls.remove(byte); + ClassBytes(cls) + } + Group { e, i, name } => { + Group { + e: Box::new(try!(remove(*e, byte))), + i: i, + name: name, + } + } + Repeat { e, r, greedy } => { + Repeat { + e: Box::new(try!(remove(*e, byte))), + r: r, + greedy: greedy, + } + } + Concat(exprs) => { + Concat(try!( + exprs.into_iter().map(|e| remove(e, byte)).collect())) + } + Alternate(exprs) => { + Alternate(try!( + exprs.into_iter().map(|e| remove(e, byte)).collect())) + } + e => e, + }) +} diff --git a/grep/src/search.rs b/grep/src/search.rs new file mode 100644 index 00000000..eb4ff6fc --- /dev/null +++ b/grep/src/search.rs @@ -0,0 +1,307 @@ +use std::io; + +use memchr::{memchr, memrchr}; +use regex::bytes::{Regex, RegexBuilder}; +use syntax; + +use literals::LiteralSets; +use nonl; +use Result; + +#[derive(Clone, Debug)] +pub struct Grep { + re: Regex, + required: Option<Regex>, + opts: Options, +} + +#[derive(Clone, Debug)] +pub struct GrepBuilder { + pattern: String, + opts: Options, +} + +#[derive(Clone, Debug)] +struct Options { + case_insensitive: bool, + lines: bool, + locations: bool, + line_terminator: u8, + size_limit: usize, + dfa_size_limit: usize, +} + +impl Default for Options { + fn default() -> Options { + Options { + case_insensitive: false, + lines: false, + locations: false, + line_terminator: b'\n', + size_limit: 10 * (1 << 20), + dfa_size_limit: 10 * (1 << 20), + } + } +} + +impl GrepBuilder { + /// Create a new builder for line searching. + /// + /// The pattern given should be a regular expression. The precise syntax + /// supported is documented on the regex crate. + pub fn new(pattern: &str) -> GrepBuilder { + GrepBuilder { + pattern: pattern.to_string(), + opts: Options::default(), + } + } + + /// Sets whether line numbers are reported for each match. + /// + /// When enabled (disabled by default), every matching line is tagged with + /// its corresponding line number accoring to the line terminator that is + /// set. Note that this requires extra processing which can slow down + /// search. + pub fn line_numbers(mut self, yes: bool) -> GrepBuilder { + self.opts.lines = yes; + self + } + + /// Set whether precise match locations are reported for each matching + /// line. + /// + /// When enabled (disabled by default), every match of the regex on each + /// matchling line is reported via byte offsets. Note that this requires + /// extra processing which can slow down search. + pub fn locations(mut self, yes: bool) -> GrepBuilder { + self.opts.locations = yes; + self + } + + /// Set the line terminator. + /// + /// The line terminator can be any ASCII character and serves to delineate + /// the match boundaries in the text searched. + /// + /// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII). + pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder { + assert!(ascii_byte <= 0x7F); + self.opts.line_terminator = ascii_byte; + self + } + + /// Set the case sensitive flag (`i`) on the regex. + pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder { + self.opts.case_insensitive = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a + /// single compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(mut self, limit: usize) -> GrepBuilder { + self.opts.size_limit = limit; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will use + /// while searching. + /// + /// Note that this is a per thread limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simulanteously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder { + self.opts.dfa_size_limit = limit; + self + } + + /// Create a line searcher. + /// + /// If there was a problem parsing or compiling the regex with the given + /// options, then an error is returned. + pub fn create(self) -> Result<Grep> { + let expr = try!(self.parse()); + let literals = LiteralSets::create(&expr); + let re = try!( + RegexBuilder::new(&expr.to_string()) + .case_insensitive(self.opts.case_insensitive) + .multi_line(true) + .unicode(true) + .size_limit(self.opts.size_limit) + .dfa_size_limit(self.opts.dfa_size_limit) + .compile() + ); + Ok(Grep { + re: re, + required: literals.to_regex(), + opts: self.opts, + }) + } + + /// Parses the underlying pattern and ensures the pattern can never match + /// the line terminator. + fn parse(&self) -> Result<syntax::Expr> { + let expr = + try!(syntax::ExprBuilder::new() + .allow_bytes(true) + .unicode(true) + .parse(&self.pattern)); + Ok(try!(nonl::remove(expr, self.opts.line_terminator))) + } +} + +impl Grep { + pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> { + Iter { + searcher: self, + buf: buf, + start: 0, + } + } + + pub fn read_match( + &self, + mat: &mut Match, + buf: &[u8], + mut start: usize, + ) -> bool { + if start >= buf.len() { + return false; + } + if let Some(ref req) = self.required { + while start < buf.len() { + let e = match req.shortest_match(&buf[start..]) { + None => return false, + Some(e) => start + e, + }; + let (prevnl, nextnl) = self.find_line(buf, e, e); + match self.re.shortest_match(&buf[prevnl..nextnl]) { + None => { + start = nextnl + 1; + continue; + } + Some(_) => { + self.fill_match(mat, prevnl, nextnl); + return true; + } + } + } + false + } else { + let e = match self.re.shortest_match(&buf[start..]) { + None => return false, + Some(e) => start + e, + }; + let (s, e) = self.find_line(buf, e, e); + self.fill_match(mat, s, e); + true + } + } + + fn fill_match(&self, mat: &mut Match, start: usize, end: usize) { + mat.start = start; + mat.end = end; + } + + fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) { + (self.find_line_start(buf, s), self.find_line_end(buf, e)) + } + + fn find_line_start(&self, buf: &[u8], pos: usize) -> usize { + memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1) + } + + fn find_line_end(&self, buf: &[u8], pos: usize) -> usize { + memchr(self.opts.line_terminator, &buf[pos..]) + .map_or(buf.len(), |i| pos + i) + } +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Match { + start: usize, + end: usize, + line: Option<usize>, + locations: Vec<(usize, usize)>, +} + +impl Match { + pub fn new() -> Match { + Match::default() + } + + /// Return the starting byte offset of the line that matched. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Return the ending byte offset of the line that matched. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Return the line number that this match corresponds to. + /// + /// Note that this is `None` if line numbers aren't being computed. Line + /// number tracking can be enabled using `GrepBuilder`. + #[inline] + pub fn line(&self) -> Option<usize> { + self.line + } + + /// Return the exact start and end locations (in byte offsets) of every + /// regex match in this line. + /// + /// Note that this always returns an empty slice if exact locations aren't + /// computed. Exact location tracking can be enabled using `GrepBuilder`. + #[inline] + pub fn locations(&self) -> &[(usize, usize)] { + &self.locations + } +} + +pub struct Iter<'b, 's> { + searcher: &'s Grep, + buf: &'b [u8], + start: usize, +} + +impl<'b, 's> Iterator for Iter<'b, 's> { + type Item = Match; + + fn next(&mut self) -> Option<Match> { + let mut mat = Match::default(); + if !self.searcher.read_match(&mut mat, self.buf, self.start) { + self.start = self.buf.len(); + return None; + } + self.start = mat.end + 1; + Some(mat) + } +} + +pub struct GrepBuffered<'g, B> { + grep: &'g Grep, + buf: B, + start: usize, +} + +impl<'g, B: BufRead> GrepBuffered { + pub fn read_match( + &self, + mat: &mut Match, + ) -> io::Result<bool> { + let buf = try!(self.buf.fill_buf()); + if buf.is_empty() { + return Ok(false); + } + Ok(false) + } +} |