diff options
author | dana <dana@dana.is> | 2017-12-18 14:56:01 -0600 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2017-12-18 17:58:26 -0500 |
commit | 86c890bcecf885619565ce949ddb9099d55dd43c (patch) | |
tree | 6e2c66ac70271b74820fc38c697692910457bafd /grep | |
parent | d775259ed9399037e1a1da0ff65dd377bc8b82d1 (diff) |
Improve detection of upper-case characters by smart-case feature
Fixes #717 (partially)
The previous implementation of the smart-case feature was actually *too* smart,
in that it inspected the final character ranges in the AST to determine if the
pattern contained upper-case characters. This meant that patterns like `foo\w`
would not be handled case-insensitively, since `\w` includes the range of
upper-case characters A–Z.
As a medium-term solution to this problem, we now inspect the input pattern
itself for upper-case characters, ignoring any that immediately follow a `\`.
This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though
it still has problems with more complex features like `\p{Ll}`. Handling those
correctly will require improvements to the AST.
Diffstat (limited to 'grep')
-rw-r--r-- | grep/src/search.rs | 81 |
1 files changed, 39 insertions, 42 deletions
diff --git a/grep/src/search.rs b/grep/src/search.rs index 498df0c1..9942f58a 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -102,9 +102,9 @@ impl GrepBuilder { /// Whether to enable smart case search or not (disabled by default). /// - /// Smart case uses case insensitive search if the regex is contains all - /// lowercase literal characters. Otherwise, a case sensitive search is - /// used instead. + /// Smart case uses case insensitive search if the pattern contains only + /// lowercase characters (ignoring any characters which immediately follow + /// a '\'). Otherwise, a case sensitive search is used instead. /// /// Enabling the case_insensitive flag overrides this. pub fn case_smart(mut self, yes: bool) -> GrepBuilder { @@ -197,8 +197,6 @@ impl GrepBuilder { } /// Determines whether the case insensitive flag should be enabled or not. - /// - /// An error is returned if the regex could not be parsed. fn is_case_insensitive(&self) -> Result<bool> { if self.opts.case_insensitive { return Ok(true); @@ -206,12 +204,7 @@ impl GrepBuilder { if !self.opts.case_smart { return Ok(false); } - let expr = - try!(syntax::ExprBuilder::new() - .allow_bytes(true) - .unicode(true) - .parse(&self.pattern)); - Ok(!has_uppercase_literal(&expr)) + Ok(!has_uppercase_literal(&self.pattern)) } } @@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> { } } -fn has_uppercase_literal(expr: &Expr) -> bool { - use syntax::Expr::*; - fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' } - match *expr { - Literal { ref chars, casei } => { - casei || chars.iter().any(|c| c.is_uppercase()) - } - LiteralBytes { ref bytes, casei } => { - casei || bytes.iter().any(|&b| byte_is_upper(b)) - } - Class(ref ranges) => { - for r in ranges { - if r.start.is_uppercase() || r.end.is_uppercase() { - return true; - } - } - false - } - ClassBytes(ref ranges) => { - for r in ranges { - if byte_is_upper(r.start) || byte_is_upper(r.end) { - return true; - } - } - false +/// Determine whether the pattern contains an uppercase character which should +/// negate the effect of the smart-case option. +/// +/// Ideally we would be able to check the AST in order to correctly handle +/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly +/// cased), but we don't currently have that option. For now, our 'good enough' +/// solution is to simply perform a semi-naïve scan of the input pattern and +/// ignore all characters following a '\'. The ExprBuilder will handle any +/// actual errors, and this at least lets us support the most common cases, +/// like 'foo\w' and 'foo\S', in an intuitive manner. +fn has_uppercase_literal(pattern: &str) -> bool { + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + if c == '\\' { + chars.next(); + } else if c.is_uppercase() { + return true; } - Group { ref e, .. } => has_uppercase_literal(e), - Repeat { ref e, .. } => has_uppercase_literal(e), - Concat(ref es) => es.iter().any(has_uppercase_literal), - Alternate(ref es) => es.iter().any(has_uppercase_literal), - _ => false, } + false } #[cfg(test)] @@ -358,7 +339,7 @@ mod tests { use memchr::{memchr, memrchr}; use regex::bytes::Regex; - use super::{GrepBuilder, Match}; + use super::{GrepBuilder, Match, has_uppercase_literal}; static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt"); @@ -395,4 +376,20 @@ mod tests { assert_eq!(expected.len(), got.len()); assert_eq!(expected, got); } + + #[test] + fn pattern_case() { + assert_eq!(has_uppercase_literal(&"".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo".to_string()), false); + assert_eq!(has_uppercase_literal(&"Foo".to_string()), true); + assert_eq!(has_uppercase_literal(&"foO".to_string()), true); + assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true); + assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true); + assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false); + assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true); + } } |