diff options
author | Andrew Gallant <jamslam@gmail.com> | 2018-03-13 20:38:50 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2018-03-13 22:55:39 -0400 |
commit | cd08707c7c82058559bd5557efb3c1d0379dbf1d (patch) | |
tree | 37b7094ef1b753f1a2dc090b10badeef693e3c69 /grep | |
parent | c2e97cd858fd5a18bcfe31979dbd5d859cb240aa (diff) |
grep: upgrade to regex-syntax 0.5
This update brings with it many bug fixes:
* Better error messages are printed overall. We also include
explicit call out for unsupported features like backreferences
and look-around.
* Regexes like `\s*{` no longer emit incomprehensible errors.
* Unicode escape sequences, such as `\u{..}` are now supported.
For the most part, this upgrade was done in a straight-forward way. We
resist the urge to refactor the `grep` crate, in anticipation of it
being rewritten anyway.
Note that we removed the `--fixed-strings` suggestion whenever a regex
syntax error occurs. In practice, I've found that it results in a lot of
false positives, and I believe that its use is not as paramount now that
regex parse errors are much more readable.
Closes #268, Closes #395, Closes #702, Closes #853
Diffstat (limited to 'grep')
-rw-r--r-- | grep/Cargo.toml | 2 | ||||
-rw-r--r-- | grep/src/literals.rs | 119 | ||||
-rw-r--r-- | grep/src/nonl.rs | 85 | ||||
-rw-r--r-- | grep/src/search.rs | 19 | ||||
-rw-r--r-- | grep/src/word_boundary.rs | 31 |
5 files changed, 131 insertions, 125 deletions
diff --git a/grep/Cargo.toml b/grep/Cargo.toml index 541d3260..b404a86a 100644 --- a/grep/Cargo.toml +++ b/grep/Cargo.toml @@ -16,4 +16,4 @@ license = "Unlicense/MIT" log = "0.4" memchr = "2" regex = "0.2.9" -regex-syntax = "0.4.0" +regex-syntax = "0.5.3" diff --git a/grep/src/literals.rs b/grep/src/literals.rs index eebeac4c..3e1c385b 100644 --- a/grep/src/literals.rs +++ b/grep/src/literals.rs @@ -10,10 +10,8 @@ principled. use std::cmp; use regex::bytes::RegexBuilder; -use syntax::{ - Expr, Literals, Lit, - ByteClass, ByteRange, CharClass, ClassRange, Repeater, -}; +use syntax::hir::{self, Hir, HirKind}; +use syntax::hir::literal::{Literal, Literals}; #[derive(Clone, Debug)] pub struct LiteralSets { @@ -23,12 +21,12 @@ pub struct LiteralSets { } impl LiteralSets { - pub fn create(expr: &Expr) -> Self { + pub fn create(expr: &Hir) -> Self { let mut required = Literals::empty(); union_required(expr, &mut required); LiteralSets { - prefixes: expr.prefixes(), - suffixes: expr.suffixes(), + prefixes: Literals::prefixes(expr), + suffixes: Literals::suffixes(expr), required: required, } } @@ -93,60 +91,52 @@ impl LiteralSets { } } -fn union_required(expr: &Expr, lits: &mut Literals) { - use syntax::Expr::*; - match *expr { - Literal { ref chars, casei: false } => { - let s: String = chars.iter().cloned().collect(); - lits.cross_add(s.as_bytes()); +fn union_required(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); } - Literal { ref chars, casei: true } => { - for &c in chars { - let cls = CharClass::new(vec![ - ClassRange { start: c, end: c }, - ]).case_fold(); - if !lits.add_char_class(&cls) { - lits.cut(); - return; - } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) { + lits.cut(); } } - LiteralBytes { ref bytes, casei: false } => { - lits.cross_add(bytes); + HirKind::Group(hir::Group { ref hir, .. }) => { + union_required(&**hir, lits); } - LiteralBytes { ref bytes, casei: true } => { - for &b in bytes { - let cls = ByteClass::new(vec![ - ByteRange { start: b, end: b }, - ]).case_fold(); - if !lits.add_byte_class(&cls) { + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => lits.cut(), + hir::RepetitionKind::ZeroOrMore => lits.cut(), + hir::RepetitionKind::OneOrMore => { + union_required(&x.hir, lits); lits.cut(); - return; + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, union_required); } } } - Class(_) => { - lits.cut(); - } - ClassBytes(_) => { - lits.cut(); + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => { + union_required(&es[0], lits) } - Group { ref e, .. } => { - union_required(&**e, lits); - } - Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(), - Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(), - Repeat { ref e, r: Repeater::OneOrMore, .. } => { - union_required(&**e, lits); - lits.cut(); - } - Repeat { ref e, r: Repeater::Range { min, max }, greedy } => { - repeat_range_literals( - &**e, min, max, greedy, lits, union_required); - } - Concat(ref es) if es.is_empty() => {} - Concat(ref es) if es.len() == 1 => union_required(&es[0], lits), - Concat(ref es) => { + HirKind::Concat(ref es) => { for e in es { let mut lits2 = lits.to_empty(); union_required(e, &mut lits2); @@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) { if lits2.contains_empty() { lits.cut(); } - // if !lits.union(lits2) { if !lits.cross_product(&lits2) { // If this expression couldn't yield any literal that // could be extended, then we need to quit. Since we're @@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) { } } } - Alternate(ref es) => { + HirKind::Alternation(ref es) => { alternate_literals(es, lits, union_required); } _ => lits.cut(), } } -fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>( - e: &Expr, +fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>( + e: &Hir, min: u32, max: Option<u32>, _greedy: bool, @@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>( } } -fn alternate_literals<F: FnMut(&Expr, &mut Literals)>( - es: &[Expr], +fn alternate_literals<F: FnMut(&Hir, &mut Literals)>( + es: &[Hir], lits: &mut Literals, mut f: F, ) { @@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>( } lits.cut(); if !lcs.is_empty() { - lits.add(Lit::empty()); - lits.add(Lit::new(lcs.to_vec())); + lits.add(Literal::empty()); + lits.add(Literal::new(lcs.to_vec())); } } +/// Return the number of characters in the given class. +fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +/// Return the number of bytes in the given class. +fn count_byte_class(cls: &hir::ClassBytes) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + /// Converts an arbitrary sequence of bytes to a literal suitable for building /// a regular expression. fn bytes_to_regex(bs: &[u8]) -> String { diff --git a/grep/src/nonl.rs b/grep/src/nonl.rs index 361b0b00..3beb5f61 100644 --- a/grep/src/nonl.rs +++ b/grep/src/nonl.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; use {Error, Result}; @@ -9,59 +9,66 @@ use {Error, Result}; /// /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this /// function panics. -pub fn remove(expr: Expr, byte: u8) -> Result<Expr> { - // TODO(burntsushi): There is a bug in this routine where only `\n` is - // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated - // to proper character classes instead of the special `AnyCharNoNL` and - // `AnyByteNoNL` classes. - use syntax::Expr::*; +pub fn remove(expr: Hir, byte: u8) -> Result<Hir> { assert!(byte <= 0x7F); let chr = byte as char; assert!(chr.len_utf8() == 1); - Ok(match expr { - Literal { chars, casei } => { - if chars.iter().position(|&c| c == chr).is_some() { + Ok(match expr.into_kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal::Unicode(c)) => { + if c == chr { return Err(Error::LiteralNotAllowed(chr)); } - Literal { chars: chars, casei: casei } + Hir::literal(hir::Literal::Unicode(c)) } - LiteralBytes { bytes, casei } => { - if bytes.iter().position(|&b| b == byte).is_some() { + HirKind::Literal(hir::Literal::Byte(b)) => { + if b as char == chr { return Err(Error::LiteralNotAllowed(chr)); } - LiteralBytes { bytes: bytes, casei: casei } + Hir::literal(hir::Literal::Byte(b)) } - AnyChar => AnyCharNoNL, - AnyByte => AnyByteNoNL, - Class(mut cls) => { - cls.remove(chr); - Class(cls) - } - ClassBytes(mut cls) => { - cls.remove(byte); - ClassBytes(cls) - } - Group { e, i, name } => { - Group { - e: Box::new(remove(*e, byte)?), - i: i, - name: name, + HirKind::Class(hir::Class::Unicode(mut cls)) => { + let remove = hir::ClassUnicode::new(Some( + hir::ClassUnicodeRange::new(chr, chr), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Unicode(cls)) } - Repeat { e, r, greedy } => { - Repeat { - e: Box::new(remove(*e, byte)?), - r: r, - greedy: greedy, + HirKind::Class(hir::Class::Bytes(mut cls)) => { + let remove = hir::ClassBytes::new(Some( + hir::ClassBytesRange::new(byte, byte), + )); + cls.difference(&remove); + if cls.iter().next().is_none() { + return Err(Error::LiteralNotAllowed(chr)); } + Hir::class(hir::Class::Bytes(cls)) + } + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(remove(*x.hir, byte)?); + Hir::group(x) } - Concat(exprs) => { - Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?) + HirKind::Concat(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::<Result<Vec<Hir>>>()?; + Hir::concat(xs) } - Alternate(exprs) => { - Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?) + HirKind::Alternation(xs) => { + let xs = xs.into_iter() + .map(|e| remove(e, byte)) + .collect::<Result<Vec<Hir>>>()?; + Hir::alternation(xs) } - e => e, }) } diff --git a/grep/src/search.rs b/grep/src/search.rs index 8d056796..1d5d7e29 100644 --- a/grep/src/search.rs +++ b/grep/src/search.rs @@ -1,10 +1,10 @@ use memchr::{memchr, memrchr}; use regex::bytes::{Regex, RegexBuilder}; -use syntax; use literals::LiteralSets; use nonl; -use syntax::Expr; +use syntax::ParserBuilder; +use syntax::hir::Hir; use word_boundary::strip_unicode_word_boundaries; use Result; @@ -166,7 +166,7 @@ impl GrepBuilder { /// Creates a new regex from the given expression with the current /// configuration. - fn regex(&self, expr: &Expr) -> Result<Regex> { + fn regex(&self, expr: &Hir) -> Result<Regex> { let mut builder = RegexBuilder::new(&expr.to_string()); builder.unicode(true); self.regex_build(builder) @@ -184,15 +184,16 @@ impl GrepBuilder { /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. - fn parse(&self) -> Result<syntax::Expr> { - let expr = - syntax::ExprBuilder::new() - .allow_bytes(true) - .unicode(true) + fn parse(&self) -> Result<Hir> { + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) .case_insensitive(self.is_case_insensitive()?) + .multi_line(true) + .build() .parse(&self.pattern)?; + debug!("original regex HIR pattern:\n{}", expr); let expr = nonl::remove(expr, self.opts.line_terminator)?; - debug!("regex ast:\n{:#?}", expr); + debug!("transformed regex HIR pattern:\n{}", expr); Ok(expr) } diff --git a/grep/src/word_boundary.rs b/grep/src/word_boundary.rs index 6df5c657..8e6b86d1 100644 --- a/grep/src/word_boundary.rs +++ b/grep/src/word_boundary.rs @@ -1,4 +1,4 @@ -use syntax::Expr; +use syntax::hir::{self, Hir, HirKind}; /// Strips Unicode word boundaries from the given expression. /// @@ -8,7 +8,7 @@ use syntax::Expr; /// false negatives. /// /// If no word boundaries could be stripped, then None is returned. -pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> { +pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> { // The real reason we do this is because Unicode word boundaries are the // one thing that Rust's regex DFA engine can't handle. When it sees a // Unicode word boundary among non-ASCII text, it falls back to one of the @@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> { // a regex to find candidate matches without a Unicode word boundary. We'll // only then use the full (and slower) regex to confirm a candidate as a // match or not during search. - use syntax::Expr::*; - - match *expr { - Concat(ref es) if !es.is_empty() => { + // + // It looks like we only check the outer edges for `\b`? I guess this is + // an attempt to optimize for the `-w/--word-regexp` flag? ---AG + match *expr.kind() { + HirKind::Concat(ref es) if !es.is_empty() => { let first = is_unicode_word_boundary(&es[0]); let last = is_unicode_word_boundary(es.last().unwrap()); // Be careful not to strip word boundaries if there are no other // expressions to match. match (first, last) { (true, false) if es.len() > 1 => { - Some(Concat(es[1..].to_vec())) + Some(Hir::concat(es[1..].to_vec())) } (false, true) if es.len() > 1 => { - Some(Concat(es[..es.len() - 1].to_vec())) + Some(Hir::concat(es[..es.len() - 1].to_vec())) } (true, true) if es.len() > 2 => { - Some(Concat(es[1..es.len() - 1].to_vec())) + Some(Hir::concat(es[1..es.len() - 1].to_vec())) } _ => None, } @@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> { } /// Returns true if the given expression is a Unicode word boundary. -fn is_unicode_word_boundary(expr: &Expr) -> bool { - use syntax::Expr::*; - - match *expr { - WordBoundary => true, - NotWordBoundary => true, - Group { ref e, .. } => is_unicode_word_boundary(e), +fn is_unicode_word_boundary(expr: &Hir) -> bool { + match *expr.kind() { + HirKind::WordBoundary(hir::WordBoundary::Unicode) => true, + HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true, + HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir), _ => false, } } |