From 42b8132d0ad1918c1c0dc677015d87c12819fa26 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 13 Mar 2018 21:43:23 -0400 Subject: grep: add "perfect" smart case detection This commit removes the previous smart case detection logic and replaces it with detection based on the regex AST. This particular AST is a faithful representation of the concrete syntax, which lets us be very precise in how we handle it. Closes #851 --- CHANGELOG.md | 2 + grep/src/lib.rs | 1 + grep/src/search.rs | 58 +++------------ grep/src/smart_case.rs | 191 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 203 insertions(+), 49 deletions(-) create mode 100644 grep/src/smart_case.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index c3be4888..0eb35e18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,8 @@ Bug fixes: Support backslash escapes in globs. * [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832): Clarify usage instructions for `-f/--file` flag. +* [BUG #851](https://github.com/BurntSushi/ripgrep/issues/851): + Fix `-S/--smart-case` detection once and for all. * [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852): Be robust with respect to `ENOMEM` errors returned by `mmap`. * [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853): diff --git a/grep/src/lib.rs b/grep/src/lib.rs index 3b2f0ebd..023cd64a 100644 --- a/grep/src/lib.rs +++ b/grep/src/lib.rs @@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match}; mod literals; mod nonl; mod search; +mod smart_case; mod word_boundary; /// Result is a convenient type alias that fixes the type of the error to diff --git a/grep/src/search.rs b/grep/src/search.rs index 1d5d7e29..49ddf1f8 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -1,10 +1,11 @@ use memchr::{memchr, memrchr}; +use syntax::ParserBuilder; +use syntax::hir::Hir; use regex::bytes::{Regex, RegexBuilder}; use literals::LiteralSets; use nonl; -use syntax::ParserBuilder; -use syntax::hir::Hir; +use smart_case::Cased; use word_boundary::strip_unicode_word_boundaries; use Result; @@ -205,7 +206,11 @@ impl GrepBuilder { if !self.opts.case_smart { return Ok(false); } - Ok(!has_uppercase_literal(&self.pattern)) + let cased = match Cased::from_pattern(&self.pattern) { + None => return Ok(false), + Some(cased) => cased, + }; + Ok(cased.any_literal && !cased.any_uppercase) } } @@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> { } } -/// Determine whether the pattern contains an uppercase character which should -/// negate the effect of the smart-case option. -/// -/// Ideally we would be able to check the AST in order to correctly handle -/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly -/// cased), but we don't currently have that option. For now, our 'good enough' -/// solution is to simply perform a semi-naïve scan of the input pattern and -/// ignore all characters following a '\'. The ExprBuilder will handle any -/// actual errors, and this at least lets us support the most common cases, -/// like 'foo\w' and 'foo\S', in an intuitive manner. -fn has_uppercase_literal(pattern: &str) -> bool { - let mut chars = pattern.chars(); - while let Some(c) = chars.next() { - if c == '\\' { - chars.next(); - } else if c.is_uppercase() { - return true; - } - } - false -} - #[cfg(test)] mod tests { - #![allow(unused_imports)] - use memchr::{memchr, memrchr}; use regex::bytes::Regex; - use super::{GrepBuilder, Match, has_uppercase_literal}; + use super::{GrepBuilder, Match}; static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt"); - #[allow(dead_code)] - fn s(bytes: &[u8]) -> String { - String::from_utf8(bytes.to_vec()).unwrap() - } - fn find_lines(pat: &str, haystack: &[u8]) -> Vec { let re = Regex::new(pat).unwrap(); let mut lines = vec![]; @@ -377,20 +353,4 @@ mod tests { assert_eq!(expected.len(), got.len()); assert_eq!(expected, got); } - - #[test] - fn pattern_case() { - assert_eq!(has_uppercase_literal(&"".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo".to_string()), false); - assert_eq!(has_uppercase_literal(&"Foo".to_string()), true); - assert_eq!(has_uppercase_literal(&"foO".to_string()), true); - assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true); - assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true); - assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false); - assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true); - } } diff --git a/grep/src/smart_case.rs b/grep/src/smart_case.rs new file mode 100644 index 00000000..1379b326 --- /dev/null +++ b/grep/src/smart_case.rs @@ -0,0 +1,191 @@ +use syntax::ast::{self, Ast}; +use syntax::ast::parse::Parser; + +/// The results of analyzing a regex for cased literals. +#[derive(Clone, Debug, Default)] +pub struct Cased { + /// True if and only if a literal uppercase character occurs in the regex. + /// + /// A regex like `\pL` contains no uppercase literals, even though `L` + /// is uppercase and the `\pL` class contains uppercase characters. + pub any_uppercase: bool, + /// True if and only if the regex contains any literal at all. A regex like + /// `\pL` has this set to false. + pub any_literal: bool, +} + +impl Cased { + /// Returns a `Cased` value by doing analysis on the AST of `pattern`. + /// + /// If `pattern` is not a valid regular expression, then `None` is + /// returned. + pub fn from_pattern(pattern: &str) -> Option { + Parser::new() + .parse(pattern) + .map(|ast| Cased::from_ast(&ast)) + .ok() + } + + fn from_ast(ast: &Ast) -> Cased { + let mut cased = Cased::default(); + cased.from_ast_impl(ast); + cased + } + + fn from_ast_impl(&mut self, ast: &Ast) { + if self.done() { + return; + } + match *ast { + Ast::Empty(_) + | Ast::Flags(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => {} + Ast::Literal(ref x) => { + self.from_ast_literal(x); + } + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.from_ast_class_set(&x.kind); + } + Ast::Repetition(ref x) => { + self.from_ast_impl(&x.ast); + } + Ast::Group(ref x) => { + self.from_ast_impl(&x.ast); + } + Ast::Alternation(ref alt) => { + for x in &alt.asts { + self.from_ast_impl(x); + } + } + Ast::Concat(ref alt) => { + for x in &alt.asts { + self.from_ast_impl(x); + } + } + } + } + + fn from_ast_class_set(&mut self, ast: &ast::ClassSet) { + if self.done() { + return; + } + match *ast { + ast::ClassSet::Item(ref item) => { + self.from_ast_class_set_item(item); + } + ast::ClassSet::BinaryOp(ref x) => { + self.from_ast_class_set(&x.lhs); + self.from_ast_class_set(&x.rhs); + } + } + } + + fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) { + if self.done() { + return; + } + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => {} + ast::ClassSetItem::Literal(ref x) => { + self.from_ast_literal(x); + } + ast::ClassSetItem::Range(ref x) => { + self.from_ast_literal(&x.start); + self.from_ast_literal(&x.end); + } + ast::ClassSetItem::Bracketed(ref x) => { + self.from_ast_class_set(&x.kind); + } + ast::ClassSetItem::Union(ref union) => { + for x in &union.items { + self.from_ast_class_set_item(x); + } + } + } + } + + fn from_ast_literal(&mut self, ast: &ast::Literal) { + self.any_literal = true; + self.any_uppercase = self.any_uppercase || ast.c.is_uppercase(); + } + + /// Returns true if and only if the attributes can never change no matter + /// what other AST it might see. + fn done(&self) -> bool { + self.any_uppercase && self.any_literal + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn cased(pattern: &str) -> Cased { + Cased::from_pattern(pattern).unwrap() + } + + #[test] + fn various() { + let x = cased(""); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + + let x = cased("foo"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased("Foo"); + assert!(x.any_uppercase); + assert!(x.any_literal); + + let x = cased("foO"); + assert!(x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo\\"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo\w"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo\S"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo\p{Ll}"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo[a-z]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo[A-Z]"); + assert!(x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo[\S\t]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"foo\\S"); + assert!(x.any_uppercase); + assert!(x.any_literal); + + let x = cased(r"\p{Ll}"); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + + let x = cased(r"aBc\w"); + assert!(x.any_uppercase); + assert!(x.any_literal); + } +} -- cgit v1.2.3