Fix required literal handling and add debug prints.

In particular, if we had an inner literal and were doing a case insensitive search, then the literals are dropped because we previously only allowed a single inner literal to have an effect. Now we allow alternations of inner literals, but still don't quite take full advantage.
author: Andrew Gallant <jamslam@gmail.com> 2016-09-06 19:33:03 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2016-09-06 19:33:03 -0400
commit: fd3e5069b6a4149e82789d5e1aad9fb7a8b49a77 (patch)
tree: 03131328fe423ccf2a674e92faa281cfa62f6a5b /grep
parent: 0891b4a3c01ad702ad0f18a6965f52a8fca1dc89 (diff)
4 files changed, 67 insertions, 8 deletions
diff --git a/grep/Cargo.toml b/grep/Cargo.toml
index 18371c94..e01e688b 100644
--- a/grep/Cargo.toml
+++ b/grep/Cargo.toml
@@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
 license = "Unlicense/MIT"
 
 [dependencies]
+log = "0.3"
 memchr = "0.1"
 memmap = "0.2"
 regex = "0.1.75"
diff --git a/grep/src/lib.rs b/grep/src/lib.rs
index d57b8fb0..68f95a4f 100644
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@@ -4,6 +4,8 @@
 A fast line oriented regex searcher.
 */
 
+#[macro_use]
+extern crate log;
 extern crate memchr;
 extern crate regex;
 extern crate regex_syntax as syntax;
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
index 5408faea..f1685270 100644
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@@ -1,13 +1,22 @@
+/*!
+The literals module is responsible for extracting *inner* literals out of the
+AST of a regular expression. Normally this is the job of the regex engine
+itself, but the regex engine doesn't look for inner literals. Since we're doing
+line based searching, we can use them, so we need to do it ourselves.
+
+Note that this implementation is incredibly suspicious. We need something more
+principled.
+*/
 use std::cmp;
 use std::iter;
 
 use regex::bytes::Regex;
 use syntax::{
     Expr, Literals, Lit,
-    Repeater,
+    ByteClass, ByteRange, CharClass, ClassRange, Repeater,
 };
 
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct LiteralSets {
     prefixes: Literals,
     suffixes: Literals,
@@ -27,6 +36,7 @@ impl LiteralSets {
 
     pub fn to_regex(&self) -> Option<Regex> {
         if self.prefixes.all_complete() && !self.prefixes.is_empty() {
+            debug!("literal prefixes detected: {:?}", self.prefixes);
             // When this is true, the regex engine will do a literal scan.
             return None;
         }
@@ -56,13 +66,27 @@ impl LiteralSets {
         if suf_lcs.len() > lit.len() {
             lit = suf_lcs;
         }
-        if req.len() > lit.len() {
+        if req_lits.len() == 1 && req.len() > lit.len() {
             lit = req;
         }
-        if lit.is_empty() {
+
+        // Special case: if we detected an alternation of inner required
+        // literals and its longest literal is bigger than the longest
+        // prefix/suffix, then choose the alternation. In practice, this
+        // helps with case insensitive matching, which can generate lots of
+        // inner required literals.
+        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
+        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
+            debug!("required literals found: {:?}", req_lits);
+            let alts: Vec<String> =
+                req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
+            // Literals always compile.
+            Some(Regex::new(&alts.join("|")).unwrap())
+        } else if lit.is_empty() {
             None
         } else {
             // Literals always compile.
+            debug!("required literal found: {:?}", show(lit));
             Some(Regex::new(&bytes_to_regex(lit)).unwrap())
         }
     }
@@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
             let s: String = chars.iter().cloned().collect();
             lits.cross_add(s.as_bytes());
         }
-        Literal { casei: true, .. } => {
-            lits.cut();
+        Literal { ref chars, casei: true } => {
+            for &c in chars {
+                let cls = CharClass::new(vec![
+                    ClassRange { start: c, end: c },
+                ]).case_fold();
+                if !lits.add_char_class(&cls) {
+                    lits.cut();
+                    return;
+                }
+            }
         }
         LiteralBytes { ref bytes, casei: false } => {
             lits.cross_add(bytes);
         }
-        LiteralBytes { casei: true, .. } => {
-            lits.cut();
+        LiteralBytes { ref bytes, casei: true } => {
+            for &b in bytes {
+                let cls = ByteClass::new(vec![
+                    ByteRange { start: b, end: b },
+                ]).case_fold();
+                if !lits.add_byte_class(&cls) {
+                    lits.cut();
+                    return;
+                }
+            }
         }
         Class(_) => {
             lits.cut();
@@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
     }
     s
 }
+
+/// Converts arbitrary bytes to a nice string.
+fn show(bs: &[u8]) -> String {
+    // Why aren't we using this to feed to the regex? Doesn't really matter
+    // I guess. ---AG
+    use std::ascii::escape_default;
+    use std::str;
+
+    let mut nice = String::new();
+    for &b in bs {
+        let part: Vec<u8> = escape_default(b).collect();
+        nice.push_str(str::from_utf8(&part).unwrap());
+    }
+    nice
+}
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 6451cc55..a26fb151 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -152,6 +152,7 @@ impl GrepBuilder {
                  .unicode(true)
                  .case_insensitive(self.opts.case_insensitive)
                  .parse(&self.pattern));
+        debug!("regex ast:\n{:#?}", expr);
         Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
     }
 }
author	Andrew Gallant <jamslam@gmail.com>	2016-09-06 19:33:03 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2016-09-06 19:33:03 -0400
commit	fd3e5069b6a4149e82789d5e1aad9fb7a8b49a77 (patch)
tree	03131328fe423ccf2a674e92faa281cfa62f6a5b /grep
parent	0891b4a3c01ad702ad0f18a6965f52a8fca1dc89 (diff)