diff options
author | Neal H. Walfield <neal@pep.foundation> | 2021-03-23 09:44:20 +0100 |
---|---|---|
committer | Neal H. Walfield <neal@pep.foundation> | 2021-03-23 10:18:43 +0100 |
commit | ac992b942e57fb68131a72d238c188e30f306941 (patch) | |
tree | a9e0f989a5937331b9a43dba75dac9b0be7a5512 | |
parent | e55f5ab5440962f705ec47da7bf8283e1d6b660c (diff) |
openpgp: Short-circuit regex alternations with empty branches.
- The regex 'a|b|' is an alternation of three branches: 'a', 'b',
and ''. The last branch matches anything, so the alternation
matches anything, and therefore the whole thing can be
elided.
- This is required for regex <= 1.3.7, which doesn't support empty
alternations.
- Unfortunately, this is the version in Debian Bullseye.
- Fixes #694.
-rw-r--r-- | openpgp/src/regex/grammar.lalrpop | 92 | ||||
-rw-r--r-- | openpgp/src/regex/mod.rs | 25 |
2 files changed, 92 insertions, 25 deletions
diff --git a/openpgp/src/regex/grammar.lalrpop b/openpgp/src/regex/grammar.lalrpop index e9e619b5..46fd7a70 100644 --- a/openpgp/src/regex/grammar.lalrpop +++ b/openpgp/src/regex/grammar.lalrpop @@ -17,7 +17,19 @@ pub(crate) Regex : Hir = { <l:LBranch> <r:RBranch*> => { let mut r = r; r.insert(0, l); - Hir::alternation(r) + + // If any of the branches are empty, then that branch matches + // everything, and we can just short circuit the whole + // alternation. + // + // This is actually required for version 1.3.7 of the regex + // crate, which is the version that is in Debian Bullseye. + // See issue #694 for details. + if r.iter().any(|b| b.kind().is_empty()) { + hir::Hir::empty() + } else { + Hir::alternation(r) + } }, } @@ -30,45 +42,75 @@ RBranch : Hir = { } Branch : Hir = { - <p:Piece*> => { - hir::Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, - hir: Box::new(hir::Hir::concat(p)), - }) + => { + hir::Hir::empty() + }, + <p:Piece+> => { + if p.iter().all(|p| p.kind().is_empty()) { + // All pieces are empty. Just return empty. + hir::Hir::empty() + } else { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(hir::Hir::concat(p)), + }) + } }, } Piece : Hir = { <a:Atom> => a, <a:Atom> STAR => { - hir::Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: true, - hir: Box::new(a) - }) + if a.kind().is_empty() { + // Piece is empty. This is equivalent to empty so just + // return it. + a + } else { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: true, + hir: Box::new(a) + }) + } }, <a:Atom> PLUS => { - hir::Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::OneOrMore, - greedy: true, - hir: Box::new(a) - }) + if a.kind().is_empty() { + // Piece is empty. This is equivalent to empty so just + // return it. + a + } else { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: true, + hir: Box::new(a) + }) + } }, <a:Atom> QUESTION => { - hir::Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrOne, - greedy: true, - hir: Box::new(a) - }) + if a.kind().is_empty() { + // Piece is empty. This is equivalent to empty so just + // return it. + a + } else { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(a) + }) + } }, } Atom : Hir = { LPAREN <r:Regex> RPAREN => { - hir::Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, - hir: Box::new(r), - }) + if r.kind().is_empty() { + r + } else { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(r), + }) + } }, Range, diff --git a/openpgp/src/regex/mod.rs b/openpgp/src/regex/mod.rs index c61878a1..0d4fd7b3 100644 --- a/openpgp/src/regex/mod.rs +++ b/openpgp/src/regex/mod.rs @@ -1197,6 +1197,31 @@ mod tests { (true, "d"), (true, "eeee"), ]); + // A nested empty. + a("(a|)|b", &[ + (true, "a"), + (true, "b"), + ]); + // empty+ + a("(a|b|()+)", &[ + (true, "a"), + (true, "b"), + ]); + // (empty)+ + a("(a|b|(())+)", &[ + (true, "a"), + (true, "b"), + ]); + // Multiple empty branches. + a("(a|b|(()())())", &[ + (true, "a"), + (true, "b"), + ]); + a("(a|b|(()())())|", &[ + (true, "a"), + (true, "b"), + ]); + // This is: "ab" or "cd", not a followed by b or c followed by d: // // A regular expression is zero or more branches, separated by '|'. |