Improve detection of upper-case characters by smart-case feature

Fixes #717 (partially) The previous implementation of the smart-case feature was actually *too* smart, in that it inspected the final character ranges in the AST to determine if the pattern contained upper-case characters. This meant that patterns like `foo\w` would not be handled case-insensitively, since `\w` includes the range of upper-case characters A–Z. As a medium-term solution to this problem, we now inspect the input pattern itself for upper-case characters, ignoring any that immediately follow a `\`. This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though it still has problems with more complex features like `\p{Ll}`. Handling those correctly will require improvements to the AST.
author: dana <dana@dana.is> 2017-12-18 14:56:01 -0600
committer: Andrew Gallant <jamslam@gmail.com> 2017-12-18 17:58:26 -0500
commit: 86c890bcecf885619565ce949ddb9099d55dd43c (patch)
tree: 6e2c66ac70271b74820fc38c697692910457bafd /grep
parent: d775259ed9399037e1a1da0ff65dd377bc8b82d1 (diff)
1 files changed, 39 insertions, 42 deletions
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 498df0c1..9942f58a 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -102,9 +102,9 @@ impl GrepBuilder {
 
     /// Whether to enable smart case search or not (disabled by default).
     ///
-    /// Smart case uses case insensitive search if the regex is contains all
-    /// lowercase literal characters. Otherwise, a case sensitive search is
-    /// used instead.
+    /// Smart case uses case insensitive search if the pattern contains only
+    /// lowercase characters (ignoring any characters which immediately follow
+    /// a '\'). Otherwise, a case sensitive search is used instead.
     ///
     /// Enabling the case_insensitive flag overrides this.
     pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
@@ -197,8 +197,6 @@ impl GrepBuilder {
     }
 
     /// Determines whether the case insensitive flag should be enabled or not.
-    ///
-    /// An error is returned if the regex could not be parsed.
     fn is_case_insensitive(&self) -> Result<bool> {
         if self.opts.case_insensitive {
             return Ok(true);
@@ -206,12 +204,7 @@ impl GrepBuilder {
         if !self.opts.case_smart {
             return Ok(false);
         }
-        let expr =
-            try!(syntax::ExprBuilder::new()
-                 .allow_bytes(true)
-                 .unicode(true)
-                 .parse(&self.pattern));
-        Ok(!has_uppercase_literal(&expr))
+        Ok(!has_uppercase_literal(&self.pattern))
     }
 }
 
@@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
     }
 }
 
-fn has_uppercase_literal(expr: &Expr) -> bool {
-    use syntax::Expr::*;
-    fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' }
-    match *expr {
-        Literal { ref chars, casei } => {
-            casei || chars.iter().any(|c| c.is_uppercase())
-        }
-        LiteralBytes { ref bytes, casei } => {
-            casei || bytes.iter().any(|&b| byte_is_upper(b))
-        }
-        Class(ref ranges) => {
-            for r in ranges {
-                if r.start.is_uppercase() || r.end.is_uppercase() {
-                    return true;
-                }
-            }
-            false
-        }
-        ClassBytes(ref ranges) => {
-            for r in ranges {
-                if byte_is_upper(r.start) || byte_is_upper(r.end) {
-                    return true;
-                }
-            }
-            false
+/// Determine whether the pattern contains an uppercase character which should
+/// negate the effect of the smart-case option.
+///
+/// Ideally we would be able to check the AST in order to correctly handle
+/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
+/// cased), but we don't currently have that option. For now, our 'good enough'
+/// solution is to simply perform a semi-naïve scan of the input pattern and
+/// ignore all characters following a '\'. The ExprBuilder will handle any
+/// actual errors, and this at least lets us support the most common cases,
+/// like 'foo\w' and 'foo\S', in an intuitive manner.
+fn has_uppercase_literal(pattern: &str) -> bool {
+    let mut chars = pattern.chars();
+    while let Some(c) = chars.next() {
+        if c == '\\' {
+            chars.next();
+        } else if c.is_uppercase() {
+            return true;
         }
-        Group { ref e, .. } => has_uppercase_literal(e),
-        Repeat { ref e, .. } => has_uppercase_literal(e),
-        Concat(ref es) => es.iter().any(has_uppercase_literal),
-        Alternate(ref es) => es.iter().any(has_uppercase_literal),
-        _ => false,
     }
+    false
 }
 
 #[cfg(test)]
@@ -358,7 +339,7 @@ mod tests {
     use memchr::{memchr, memrchr};
     use regex::bytes::Regex;
 
-    use super::{GrepBuilder, Match};
+    use super::{GrepBuilder, Match, has_uppercase_literal};
 
     static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
 
@@ -395,4 +376,20 @@ mod tests {
         assert_eq!(expected.len(), got.len());
         assert_eq!(expected, got);
     }
+
+    #[test]
+    fn pattern_case() {
+        assert_eq!(has_uppercase_literal(&"".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
+        assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
+        assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
+    }
 }
author	dana <dana@dana.is>	2017-12-18 14:56:01 -0600
committer	Andrew Gallant <jamslam@gmail.com>	2017-12-18 17:58:26 -0500
commit	86c890bcecf885619565ce949ddb9099d55dd43c (patch)
tree	6e2c66ac70271b74820fc38c697692910457bafd /grep
parent	d775259ed9399037e1a1da0ff65dd377bc8b82d1 (diff)