diff options
author | Andrew Gallant <jamslam@gmail.com> | 2016-09-06 19:33:03 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2016-09-06 19:33:03 -0400 |
commit | fd3e5069b6a4149e82789d5e1aad9fb7a8b49a77 (patch) | |
tree | 03131328fe423ccf2a674e92faa281cfa62f6a5b /grep | |
parent | 0891b4a3c01ad702ad0f18a6965f52a8fca1dc89 (diff) |
Fix required literal handling and add debug prints.
In particular, if we had an inner literal and were doing a case insensitive
search, then the literals are dropped because we previously only allowed
a single inner literal to have an effect. Now we allow alternations of
inner literals, but still don't quite take full advantage.
Diffstat (limited to 'grep')
-rw-r--r-- | grep/Cargo.toml | 1 | ||||
-rw-r--r-- | grep/src/lib.rs | 2 | ||||
-rw-r--r-- | grep/src/literals.rs | 71 | ||||
-rw-r--r-- | grep/src/search.rs | 1 |
4 files changed, 67 insertions, 8 deletions
diff --git a/grep/Cargo.toml b/grep/Cargo.toml index 18371c94..e01e688b 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" [dependencies] +log = "0.3" memchr = "0.1" memmap = "0.2" regex = "0.1.75" diff --git a/grep/src/lib.rs b/grep/src/lib.rs index d57b8fb0..68f95a4f 100644 --- a/grep/src/lib.rs +++ b/grep/src/lib.rs @@ -4,6 +4,8 @@ A fast line oriented regex searcher. */ +#[macro_use] +extern crate log; extern crate memchr; extern crate regex; extern crate regex_syntax as syntax; diff --git a/grep/src/literals.rs b/grep/src/literals.rs index 5408faea..f1685270 100644 --- a/grep/src/literals.rs +++ b/grep/src/literals.rs @@ -1,13 +1,22 @@ +/*! +The literals module is responsible for extracting *inner* literals out of the +AST of a regular expression. Normally this is the job of the regex engine +itself, but the regex engine doesn't look for inner literals. Since we're doing +line based searching, we can use them, so we need to do it ourselves. + +Note that this implementation is incredibly suspicious. We need something more +principled. +*/ use std::cmp; use std::iter; use regex::bytes::Regex; use syntax::{ Expr, Literals, Lit, - Repeater, + ByteClass, ByteRange, CharClass, ClassRange, Repeater, }; -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct LiteralSets { prefixes: Literals, suffixes: Literals, @@ -27,6 +36,7 @@ impl LiteralSets { pub fn to_regex(&self) -> Option<Regex> { if self.prefixes.all_complete() && !self.prefixes.is_empty() { + debug!("literal prefixes detected: {:?}", self.prefixes); // When this is true, the regex engine will do a literal scan. return None; } @@ -56,13 +66,27 @@ impl LiteralSets { if suf_lcs.len() > lit.len() { lit = suf_lcs; } - if req.len() > lit.len() { + if req_lits.len() == 1 && req.len() > lit.len() { lit = req; } - if lit.is_empty() { + + // Special case: if we detected an alternation of inner required + // literals and its longest literal is bigger than the longest + // prefix/suffix, then choose the alternation. In practice, this + // helps with case insensitive matching, which can generate lots of + // inner required literals. + let any_empty = req_lits.iter().any(|lit| lit.is_empty()); + if req.len() > lit.len() && req_lits.len() > 1 && !any_empty { + debug!("required literals found: {:?}", req_lits); + let alts: Vec<String> = + req_lits.into_iter().map(|x| bytes_to_regex(x)).collect(); + // Literals always compile. + Some(Regex::new(&alts.join("|")).unwrap()) + } else if lit.is_empty() { None } else { // Literals always compile. + debug!("required literal found: {:?}", show(lit)); Some(Regex::new(&bytes_to_regex(lit)).unwrap()) } } @@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) { let s: String = chars.iter().cloned().collect(); lits.cross_add(s.as_bytes()); } - Literal { casei: true, .. } => { - lits.cut(); + Literal { ref chars, casei: true } => { + for &c in chars { + let cls = CharClass::new(vec![ + ClassRange { start: c, end: c }, + ]).case_fold(); + if !lits.add_char_class(&cls) { + lits.cut(); + return; + } + } } LiteralBytes { ref bytes, casei: false } => { lits.cross_add(bytes); } - LiteralBytes { casei: true, .. } => { - lits.cut(); + LiteralBytes { ref bytes, casei: true } => { + for &b in bytes { + let cls = ByteClass::new(vec![ + ByteRange { start: b, end: b }, + ]).case_fold(); + if !lits.add_byte_class(&cls) { + lits.cut(); + return; + } + } } Class(_) => { lits.cut(); @@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String { } s } + +/// Converts arbitrary bytes to a nice string. +fn show(bs: &[u8]) -> String { + // Why aren't we using this to feed to the regex? Doesn't really matter + // I guess. ---AG + use std::ascii::escape_default; + use std::str; + + let mut nice = String::new(); + for &b in bs { + let part: Vec<u8> = escape_default(b).collect(); + nice.push_str(str::from_utf8(&part).unwrap()); + } + nice +} diff --git a/grep/src/search.rs b/grep/src/search.rs index 6451cc55..a26fb151 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -152,6 +152,7 @@ impl GrepBuilder { .unicode(true) .case_insensitive(self.opts.case_insensitive) .parse(&self.pattern)); + debug!("regex ast:\n{:#?}", expr); Ok(try!(nonl::remove(expr, self.opts.line_terminator))) } } |