summaryrefslogtreecommitdiffstats
path: root/grep
diff options
context:
space:
mode:
authordana <dana@dana.is>2017-12-18 14:56:01 -0600
committerAndrew Gallant <jamslam@gmail.com>2017-12-18 17:58:26 -0500
commit86c890bcecf885619565ce949ddb9099d55dd43c (patch)
tree6e2c66ac70271b74820fc38c697692910457bafd /grep
parentd775259ed9399037e1a1da0ff65dd377bc8b82d1 (diff)
Improve detection of upper-case characters by smart-case feature
Fixes #717 (partially) The previous implementation of the smart-case feature was actually *too* smart, in that it inspected the final character ranges in the AST to determine if the pattern contained upper-case characters. This meant that patterns like `foo\w` would not be handled case-insensitively, since `\w` includes the range of upper-case characters A–Z. As a medium-term solution to this problem, we now inspect the input pattern itself for upper-case characters, ignoring any that immediately follow a `\`. This neatly handles all of the most basic cases like `\w`, `\S`, and `É`, though it still has problems with more complex features like `\p{Ll}`. Handling those correctly will require improvements to the AST.
Diffstat (limited to 'grep')
-rw-r--r--grep/src/search.rs81
1 files changed, 39 insertions, 42 deletions
diff --git a/grep/src/search.rs b/grep/src/search.rs
index 498df0c1..9942f58a 100644
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@@ -102,9 +102,9 @@ impl GrepBuilder {
/// Whether to enable smart case search or not (disabled by default).
///
- /// Smart case uses case insensitive search if the regex is contains all
- /// lowercase literal characters. Otherwise, a case sensitive search is
- /// used instead.
+ /// Smart case uses case insensitive search if the pattern contains only
+ /// lowercase characters (ignoring any characters which immediately follow
+ /// a '\'). Otherwise, a case sensitive search is used instead.
///
/// Enabling the case_insensitive flag overrides this.
pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
@@ -197,8 +197,6 @@ impl GrepBuilder {
}
/// Determines whether the case insensitive flag should be enabled or not.
- ///
- /// An error is returned if the regex could not be parsed.
fn is_case_insensitive(&self) -> Result<bool> {
if self.opts.case_insensitive {
return Ok(true);
@@ -206,12 +204,7 @@ impl GrepBuilder {
if !self.opts.case_smart {
return Ok(false);
}
- let expr =
- try!(syntax::ExprBuilder::new()
- .allow_bytes(true)
- .unicode(true)
- .parse(&self.pattern));
- Ok(!has_uppercase_literal(&expr))
+ Ok(!has_uppercase_literal(&self.pattern))
}
}
@@ -317,38 +310,26 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
}
}
-fn has_uppercase_literal(expr: &Expr) -> bool {
- use syntax::Expr::*;
- fn byte_is_upper(b: u8) -> bool { b'A' <= b && b <= b'Z' }
- match *expr {
- Literal { ref chars, casei } => {
- casei || chars.iter().any(|c| c.is_uppercase())
- }
- LiteralBytes { ref bytes, casei } => {
- casei || bytes.iter().any(|&b| byte_is_upper(b))
- }
- Class(ref ranges) => {
- for r in ranges {
- if r.start.is_uppercase() || r.end.is_uppercase() {
- return true;
- }
- }
- false
- }
- ClassBytes(ref ranges) => {
- for r in ranges {
- if byte_is_upper(r.start) || byte_is_upper(r.end) {
- return true;
- }
- }
- false
+/// Determine whether the pattern contains an uppercase character which should
+/// negate the effect of the smart-case option.
+///
+/// Ideally we would be able to check the AST in order to correctly handle
+/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
+/// cased), but we don't currently have that option. For now, our 'good enough'
+/// solution is to simply perform a semi-naïve scan of the input pattern and
+/// ignore all characters following a '\'. The ExprBuilder will handle any
+/// actual errors, and this at least lets us support the most common cases,
+/// like 'foo\w' and 'foo\S', in an intuitive manner.
+fn has_uppercase_literal(pattern: &str) -> bool {
+ let mut chars = pattern.chars();
+ while let Some(c) = chars.next() {
+ if c == '\\' {
+ chars.next();
+ } else if c.is_uppercase() {
+ return true;
}
- Group { ref e, .. } => has_uppercase_literal(e),
- Repeat { ref e, .. } => has_uppercase_literal(e),
- Concat(ref es) => es.iter().any(has_uppercase_literal),
- Alternate(ref es) => es.iter().any(has_uppercase_literal),
- _ => false,
}
+ false
}
#[cfg(test)]
@@ -358,7 +339,7 @@ mod tests {
use memchr::{memchr, memrchr};
use regex::bytes::Regex;
- use super::{GrepBuilder, Match};
+ use super::{GrepBuilder, Match, has_uppercase_literal};
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
@@ -395,4 +376,20 @@ mod tests {
assert_eq!(expected.len(), got.len());
assert_eq!(expected, got);
}
+
+ #[test]
+ fn pattern_case() {
+ assert_eq!(has_uppercase_literal(&"".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
+ assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
+ assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
+ assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
+ assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
+ assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
+ }
}