summaryrefslogtreecommitdiffstats
path: root/grep
diff options
context:
space:
mode:
authorAndrew Gallant <jamslam@gmail.com>2016-09-06 19:33:03 -0400
committerAndrew Gallant <jamslam@gmail.com>2016-09-06 19:33:03 -0400
commitfd3e5069b6a4149e82789d5e1aad9fb7a8b49a77 (patch)
tree03131328fe423ccf2a674e92faa281cfa62f6a5b /grep
parent0891b4a3c01ad702ad0f18a6965f52a8fca1dc89 (diff)
Fix required literal handling and add debug prints.
In particular, if we had an inner literal and were doing a case insensitive search, then the literals are dropped because we previously only allowed a single inner literal to have an effect. Now we allow alternations of inner literals, but still don't quite take full advantage.
Diffstat (limited to 'grep')
-rw-r--r--grep/Cargo.toml1
-rw-r--r--grep/src/lib.rs2
-rw-r--r--grep/src/literals.rs71
-rw-r--r--grep/src/search.rs1
4 files changed, 67 insertions, 8 deletions
diff --git a/grep/Cargo.toml b/grep/Cargo.toml
index 18371c94..e01e688b 100644
--- a/grep/Cargo.toml
+++ b/grep/Cargo.toml
@@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
+log = "0.3"
memchr = "0.1"
memmap = "0.2"
regex = "0.1.75"
diff --git a/grep/src/lib.rs b/grep/src/lib.rs
index d57b8fb0..68f95a4f 100644
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@@ -4,6 +4,8 @@
A fast line oriented regex searcher.
*/
+#[macro_use]
+extern crate log;
extern crate memchr;
extern crate regex;
extern crate regex_syntax as syntax;
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
index 5408faea..f1685270 100644
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@@ -1,13 +1,22 @@
+/*!
+The literals module is responsible for extracting *inner* literals out of the
+AST of a regular expression. Normally this is the job of the regex engine
+itself, but the regex engine doesn't look for inner literals. Since we're doing
+line based searching, we can use them, so we need to do it ourselves.
+
+Note that this implementation is incredibly suspicious. We need something more
+principled.
+*/
use std::cmp;
use std::iter;
use regex::bytes::Regex;
use syntax::{
Expr, Literals, Lit,
- Repeater,
+ ByteClass, ByteRange, CharClass, ClassRange, Repeater,
};
-#[derive(Debug)]
+#[derive(Clone, Debug)]
pub struct LiteralSets {
prefixes: Literals,
suffixes: Literals,
@@ -27,6 +36,7 @@ impl LiteralSets {
pub fn to_regex(&self) -> Option<Regex> {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
+ debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan.
return None;
}
@@ -56,13 +66,27 @@ impl LiteralSets {
if suf_lcs.len() > lit.len() {
lit = suf_lcs;
}
- if req.len() > lit.len() {
+ if req_lits.len() == 1 && req.len() > lit.len() {
lit = req;
}
- if lit.is_empty() {
+
+ // Special case: if we detected an alternation of inner required
+ // literals and its longest literal is bigger than the longest
+ // prefix/suffix, then choose the alternation. In practice, this
+ // helps with case insensitive matching, which can generate lots of
+ // inner required literals.
+ let any_empty = req_lits.iter().any(|lit| lit.is_empty());
+ if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
+ debug!("required literals found: {:?}", req_lits);
+ let alts: Vec<String> =
+ req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
+ // Literals always compile.
+ Some(Regex::new(&alts.join("|")).unwrap())
+ } else if lit.is_empty() {
None
} else {
// Literals always compile.
+ debug!("required literal found: {:?}", show(lit));
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
}
}
@@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
let s: String = chars.iter().cloned().collect();
lits.cross_add(s.as_bytes());
}
- Literal { casei: true, .. } => {
- lits.cut();
+ Literal { ref chars, casei: true } => {
+ for &c in chars {
+ let cls = CharClass::new(vec![
+ ClassRange { start: c, end: c },
+ ]).case_fold();
+ if !lits.add_char_class(&cls) {
+ lits.cut();
+ return;
+ }
+ }
}
LiteralBytes { ref bytes, casei: false } => {
lits.cross_add(bytes);
}
- LiteralBytes { casei: true, .. } => {
- lits.cut();
+ LiteralBytes { ref bytes, casei: true } => {
+ for &b in bytes {
+ let cls = ByteClass::new(vec![
+ ByteRange { start: b, end: b },
+ ]).case_fold();
+ if !lits.add_byte_class(&cls) {
+ lits.cut();
+ return;
+ }
+ }
}
Class(_) => {
lits.cut();
@@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
}
s
}
+
+/// Converts arbitrary bytes to a nice string.
+fn show(bs: &[u8]) -> String {
+ // Why aren't we using this to feed to the regex? Doesn't really matter
+ // I guess. ---AG
+ use std::ascii::escape_default;
+ use std::str;
+
+ let mut nice = String::new();
+ for &b in bs {
+ let part: Vec<u8> = escape_default(b).collect();
+ nice.push_str(str::from_utf8(&part).unwrap());
+ }
+ nice
+}
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 6451cc55..a26fb151 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -152,6 +152,7 @@ impl GrepBuilder {
.unicode(true)
.case_insensitive(self.opts.case_insensitive)
.parse(&self.pattern));
+ debug!("regex ast:\n{:#?}", expr);
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
}
}