From 3367828e487fa79d2ec0a615e6b15dd5fbee5c9a Mon Sep 17 00:00:00 2001 From: "Neal H. Walfield" Date: Fri, 8 Jan 2021 14:29:22 +0100 Subject: openpgp: Add regex support. - Fixes #188. --- Cargo.lock | 1 + openpgp/Cargo.toml | 1 + openpgp/src/lib.rs | 1 + openpgp/src/regex/grammar.lalrpop | 204 ++++ openpgp/src/regex/lexer.rs | 222 +++++ openpgp/src/regex/mod.rs | 1943 +++++++++++++++++++++++++++++++++++++ 6 files changed, 2372 insertions(+) create mode 100644 openpgp/src/regex/grammar.lalrpop create mode 100644 openpgp/src/regex/lexer.rs create mode 100644 openpgp/src/regex/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 4bbfe62d..9d5a238f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2027,6 +2027,7 @@ dependencies = [ "quickcheck", "rand", "regex", + "regex-syntax", "rpassword", "sha1collisiondetection", "thiserror", diff --git a/openpgp/Cargo.toml b/openpgp/Cargo.toml index a56ce401..35827295 100644 --- a/openpgp/Cargo.toml +++ b/openpgp/Cargo.toml @@ -38,6 +38,7 @@ libc = "0.2.66" memsec = { version = ">=0.5", default-features = false } nettle = { version = "7", optional = true } regex = "1" +regex-syntax = "0.6" sha1collisiondetection = { version = "0.2.3", default-features = false, features = ["std"] } thiserror = "1.0.2" backtrace = "0.3.3" diff --git a/openpgp/src/lib.rs b/openpgp/src/lib.rs index 7670f736..da44fb4c 100644 --- a/openpgp/src/lib.rs +++ b/openpgp/src/lib.rs @@ -159,6 +159,7 @@ mod keyid; pub use keyid::KeyID; mod keyhandle; pub use keyhandle::KeyHandle; +pub mod regex; pub mod policy; pub(crate) mod seal; diff --git a/openpgp/src/regex/grammar.lalrpop b/openpgp/src/regex/grammar.lalrpop new file mode 100644 index 00000000..e9e619b5 --- /dev/null +++ b/openpgp/src/regex/grammar.lalrpop @@ -0,0 +1,204 @@ +// -*- mode: Rust; -*- + +use super::generate_class; +use super::lexer; +use super::lexer::{Token, LexicalError}; +use regex_syntax::hir::{self, Hir}; + +// Pass in the original, untokenized input to facilitate error +// recovery. +grammar<'input>(input: &'input str); + +// This is a straightforward translation of the regular expression +// grammar from section 8 of RFC 4880. +// +// https://tools.ietf.org/html/rfc4880#section-8 +pub(crate) Regex : Hir = { + => { + let mut r = r; + r.insert(0, l); + Hir::alternation(r) + }, +} + +LBranch : Hir = { + Branch, +} + +RBranch : Hir = { + PIPE , +} + +Branch : Hir = { + => { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(hir::Hir::concat(p)), + }) + }, +} + +Piece : Hir = { + => a, + STAR => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: true, + hir: Box::new(a) + }) + }, + PLUS => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: true, + hir: Box::new(a) + }) + }, + QUESTION => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(a) + }) + }, +} + +Atom : Hir = { + LPAREN RPAREN => { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(r), + }) + }, + + Range, + + DOT => { + hir::Hir::any(false) + }, + CARET => { + hir::Hir::anchor(hir::Anchor::StartText) + }, + DOLLAR => { + hir::Hir::anchor(hir::Anchor::EndText) + }, + + BACKSLASH => { + hir::Hir::literal(hir::Literal::Unicode(t.to_char())) + }, + + => { + hir::Hir::literal(hir::Literal::Unicode(t.to_char())) + }, + +} + +Range : Hir = { + LBRACKET RBRACKET => { + generate_class(c.is_some(), + std::iter::once(class1.to_char()) + .chain(class2.into_iter().map(|t| t.to_char()))) + }, + LBRACKET CARET RBRACKET => { + generate_class(true, + class.into_iter().map(|t| t.to_char())) + }, + LBRACKET RBRACKET => { + generate_class(false, + std::iter::once(class1.to_char()) + .chain(class2.into_iter().map(|t| t.to_char()))) + }, +} + +NotRBracket : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + // RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +NotCaretNotRBracket : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + // CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + // RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +AnyChar : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +extern { + type Location = usize; + type Error = LexicalError; + + enum lexer::Token { + PIPE => lexer::Token::PIPE, + + STAR => lexer::Token::STAR, + PLUS => lexer::Token::PLUS, + QUESTION => lexer::Token::QUESTION, + + LPAREN => lexer::Token::LPAREN, + RPAREN => lexer::Token::RPAREN, + + DOT => lexer::Token::DOT, + CARET => lexer::Token::CARET, + DOLLAR => lexer::Token::DOLLAR, + BACKSLASH => lexer::Token::BACKSLASH, + + LBRACKET => lexer::Token::LBRACKET, + RBRACKET => lexer::Token::RBRACKET, + DASH => lexer::Token::DASH, + + OTHER => lexer::Token::OTHER(_), + } +} diff --git a/openpgp/src/regex/lexer.rs b/openpgp/src/regex/lexer.rs new file mode 100644 index 00000000..18300d90 --- /dev/null +++ b/openpgp/src/regex/lexer.rs @@ -0,0 +1,222 @@ +use std::fmt; + +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum LexicalError { +} + +impl fmt::Display for LexicalError { + // This trait requires `fmt` with this exact signature. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", "{}") + } +} + +pub type Spanned + = Result<(Loc, Token, Loc), LexicalError>; + +// The type of the parser's input. +// +// The parser iterators over tuples consisting of the token's starting +// position, the token itself, and the token's ending position. +pub(crate) type LexerItem + = Spanned; + +/// The components of an OpenPGP Message. +#[derive(Debug, Clone, PartialEq)] +pub enum Token { + PIPE, + + STAR, + PLUS, + QUESTION, + + LPAREN, + RPAREN, + + DOT, + CARET, + DOLLAR, + BACKSLASH, + + LBRACKET, + RBRACKET, + DASH, + + OTHER(char), +} +assert_send_and_sync!(Token); + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(&format!("{:?}", self)[..]) + } +} + +impl From for String { + fn from(t: Token) -> String { + use self::Token::*; + match t { + PIPE => '|'.to_string(), + STAR => '*'.to_string(), + PLUS => '+'.to_string(), + QUESTION => '?'.to_string(), + LPAREN => '('.to_string(), + RPAREN => ')'.to_string(), + DOT => '.'.to_string(), + CARET => '^'.to_string(), + DOLLAR => '$'.to_string(), + BACKSLASH => '\\'.to_string(), + LBRACKET => '['.to_string(), + RBRACKET => ']'.to_string(), + DASH => '-'.to_string(), + OTHER(c) => c.to_string(), + } + } +} + +impl Token { + pub fn to_string(self) -> String { + self.into() + } + + pub fn to_char(&self) -> char { + use self::Token::*; + match self { + PIPE => '|', + STAR => '*', + PLUS => '+', + QUESTION => '?', + LPAREN => '(', + RPAREN => ')', + DOT => '.', + CARET => '^', + DOLLAR => '$', + BACKSLASH => '\\', + LBRACKET => '[', + RBRACKET => ']', + DASH => '-', + OTHER(c) => *c, + } + } +} + +pub(crate) struct Lexer<'input> { + offset: usize, + input: &'input str, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Lexer { offset: 0, input } + } +} + +impl<'input> Iterator for Lexer<'input> { + type Item = LexerItem; + + fn next(&mut self) -> Option { + use self::Token::*; + + tracer!(super::TRACE, "regex::Lexer::next"); + + // Returns the length of the first character in s in bytes. + // If s is empty, returns 0. + fn char_bytes(s: &str) -> usize { + if let Some(c) = s.chars().next() { + c.len_utf8() + } else { + 0 + } + } + + let one = |input: &'input str| -> Option { + let c = input.chars().next()?; + Some(match c { + '|' => PIPE, + '*' => STAR, + '+' => PLUS, + '?' => QUESTION, + '(' => LPAREN, + ')' => RPAREN, + '.' => DOT, + '^' => CARET, + '$' => DOLLAR, + '\\' => BACKSLASH, + '[' => LBRACKET, + ']' => RBRACKET, + '-' => DASH, + _ => OTHER(c), + }) + }; + + let l = char_bytes(self.input); + let t = match one(self.input) { + Some(t) => t, + None => return None, + }; + + self.input = &self.input[l..]; + + let start = self.offset; + let end = start + l; + self.offset += l; + + t!("Returning token at offset {}: '{:?}'", + start, t); + + Some(Ok((start, t, end))) + } +} + +impl<'input> From<&'input str> for Lexer<'input> { + fn from(i: &'input str) -> Lexer<'input> { + Lexer::new(i) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn lexer() { + fn lex(s: &str, expected: &[Token]) { + let tokens: Vec = Lexer::new(s) + .map(|t| t.unwrap().1) + .collect(); + + assert_eq!(&tokens[..], expected, + "{}", s); + } + + use Token::*; + lex("|", &[ PIPE ]); + lex("*", &[ STAR ]); + lex("+", &[ PLUS ]); + lex("?", &[ QUESTION ]); + lex("(", &[ LPAREN ]); + lex(")", &[ RPAREN ]); + lex(".", &[ DOT ]); + lex("^", &[ CARET ]); + lex("$", &[ DOLLAR ]); + lex("\\", &[ BACKSLASH ]); + lex("[", &[ LBRACKET ]); + lex("]", &[ RBRACKET ]); + lex("-", &[ DASH ]); + lex("a", &[ OTHER('a') ]); + lex("aa", &[ OTHER('a'), OTHER('a') ]); + lex("foo", &[ OTHER('f'), OTHER('o'), OTHER('o') ]); + + lex("foo\\bar", &[ OTHER('f'), OTHER('o'), OTHER('o'), + BACKSLASH, + OTHER('b'), OTHER('a'), OTHER('r') ]); + lex("*?!", &[ STAR, QUESTION, OTHER('!') ]); + + // Multi-byte UTF-8. + lex("ΓŸβ„πŸ’£", &[ OTHER('ß'), OTHER('ℝ'), OTHER('πŸ’£'), ]); + lex("(ß|ℝ|πŸ’£", + &[ LPAREN, OTHER('ß'), PIPE, OTHER('ℝ'), PIPE, OTHER('πŸ’£') ]); + lex("東京", &[ OTHER('東'), OTHER('δΊ¬') ]); + } +} diff --git a/openpgp/src/regex/mod.rs b/openpgp/src/regex/mod.rs new file mode 100644 index 00000000..626e5190 --- /dev/null +++ b/openpgp/src/regex/mod.rs @@ -0,0 +1,1943 @@ +//! OpenPGP regex parser. +//! +//! OpenPGP defines a [regular expression language]. It is used with +//! [trust signatures] to scope the trust that they extend. +//! +//! [regular expression language]: https://tools.ietf.org/html/rfc4880#section-8 +//! [trust signatures]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +//! +//! Compared with most regular expression lanugages, OpenPGP's is +//! quite simple. In particular, it only includes the following +//! features: +//! +//! - Alternations using `|`, +//! - Grouping using `(` and `)`, +//! - The `*`, `+`, and `?` glob operators, +//! - The `^`, and `$` anchors, +//! - The '.' operator, positive *non-empty* ranges +//! (e.g. `[a-zA-Z]`) and negative *non-empty* ranges (`[^@]`), and +//! - The backslash operator to escape special characters (except +//! in ranges). +//! +//! The regular expression engine defined in this module implements +//! that language with two differences. The first difference is that +//! the compiler only works on UTF-8 strings (not bytes). The second +//! difference is that ranges in character classes are between UTF-8 +//! characters, not just ASCII characters. +//! +//! # Data Structures +//! +//! This module defines two data structures. [`Regex`] encapsulates a +//! valid regular expression, and provides methods to check whether +//! the regular expression matches a string or a [`UserID`]. +//! [`RegexSet`] is similar, but encapsulates zero or more regular +//! expressions, which may or may not be valid. Its match methods +//! return `true` if there are no regular expressions, or, if there is +//! at least one regular expression, they return whether at least one +//! of the regular expressions matches it. `RegexSet`'s matcher +//! handles invalid regular expressions by considering them to be +//! regular expressions that don't match anything. These semantics +//! are consistent with a trust signature's scoping rules. Further, +//! strings that contain control characters never match. This +//! behavior can be overridden using [`Regex::disable_sanitizations`] +//! and [`RegexSet::disable_sanitizations`]. +//! +//! [`Regex`]: struct.Regex.html +//! [`UserID`]: ../packet/struct.UserID.html +//! [`RegexSet`]: struct.RegexSet.html +//! [`Regex::disable_sanitizations`]: struct.Regex.html#method.disable_sanitizations +//! [`RegexSet::disable_sanitizations`]: struct.RegexSet.html#method.disable_sanitizations +//! +//! # Scoped Trust Signatures +//! +//! To create a trust signature, you create a signature whose [type] +//! is either [GenericCertification], [PersonaCertification], +//! [CasualCertification], or [PositiveCertification], and add a +//! [Trust Signature] subpacket using, for instance, the +//! [`SignatureBuilder::set_trust_signature`] method. +//! +//! [type]: https://tools.ietf.org/html/rfc4880#section-5.2.1 +//! [GenericCertification]: ../types/enum.SignatureType.html#variant.GenericCertification +//! [PersonaCertification]: ../types/enum.SignatureType.html#variant.PersonaCertification +//! [CasualCertification]: ../types/enum.SignatureType.html#variant.CasualCertification +//! [PositiveCertification]: ../types/enum.SignatureType.html#variant.PositiveCertification +//! [Trust Signature]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +//! [`SignatureBuilder::set_trust_signature`]: ../packet/signature/struct.SignatureBuilder.html#method.set_trust_signature +//! +//! To scope a trust signature, you add a [Regular Expression +//! subpacket] to it using +//! [`SignatureBuilder::set_regular_expression`] or +//! [`SignatureBuilder::add_regular_expression`]. +//! +//! To extract any regular expressions, you can use +//! [`SignatureBuilder::regular_expressions`]. +//! +//! [Regular Expression subpacket]: https://tools.ietf.org/html/rfc4880#section-5.2.3.14 +//! [`SignatureBuilder::set_regular_expression`]: ../packet/signature/struct.SignatureBuilder.html#method.set_regular_expression +//! [`SignatureBuilder::add_regular_expression`]: ../packet/signature/struct.SignatureBuilder.html#method.add_regular_expression +//! [`SignatureBuilder::regular_expressions`]: ../packet/signature/struct.SignatureBuilder.html#method.regular_expressions +//! +//! # Caveat Emptor +//! +//! Note: GnuPG has [very limited regular expression support]. In +//! particular, it only recognizes regular expressions with the +//! following form: +//! +//! [very limited regular expression support]: https://dev.gnupg.org/source/gnupg/browse/master/g10/trustdb.c;15e065dee891eef9545556f210b4199107999869$1558 +//! +//! ```text +//! <[^>]+[@.]example\.com>$ +//! ``` +//! +//! Further, it escapes any operators between the `<[^>]+[@.]` and the +//! `>$` except `.` and `\`. Otherwise, GnuPG treats the regular +//! expression as a literal domain (e.g., `example.com`). +//! +//! Further, until [version 2.2.22] (released in August 2020), GnuPG +//! did not support regular expressions on Windows, and other systems +//! that don't include `regcomp`. On these systems, if a trust +//! signature included a regular expression, GnuPG conservatively +//! considered the whole trust signature to match nothing. +//! +//! [version 2.2.22]: https://dev.gnupg.org/T5030 +//! +//! # Examples +//! +//! A CA signs two certificates, one for Alice, who works at +//! `example.com`, and one for Bob, who is associated with `some.org`. +//! Carol then creates a trust signature for the CA, which she scopes +//! to `example.org` and `example.com`. We then confirm that Carol +//! can use the CA to authenticate Alice, but not Bob. +//! +//! ``` +//! use sequoia_openpgp as openpgp; +//! use openpgp::cert::prelude::*; +//! use openpgp::packet::prelude::*; +//! use openpgp::policy::StandardPolicy; +//! use openpgp::regex::RegexSet; +//! use openpgp::types::SignatureType; +//! +//! # fn main() -> openpgp::Result<()> { +//! let p = &StandardPolicy::new(); +//! +//! let (ca, _) +//! = CertBuilder::general_purpose(None, Some("OpenPGP CA ")) +//! .generate()?; +//! let mut ca_signer = ca.primary_key().key().clone() +//! .parts_into_secret()?.into_keypair()?; +//! let ca_userid = ca.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! +//! // The CA certifies "Alice ". +//! let (alice, _) +//! = CertBuilder::general_purpose(None, Some("Alice ")) +//! .generate()?; +//! let alice_userid = alice.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! let alice_certification = SignatureBuilder::new(SignatureType::GenericCertification) +//! .sign_userid_binding( +//! &mut ca_signer, +//! alice.primary_key().component(), +//! alice_userid)?; +//! let alice = alice.insert_packets(alice_certification.clone())?; +//! # assert!(alice.clone().into_packets().any(|p| { +//! # match p { +//! # Packet::Signature(sig) => sig == alice_certification, +//! # _ => false, +//! # } +//! # })); +//! +//! // The CA certifies "Bob ". +//! let (bob, _) +//! = CertBuilder::general_purpose(None, Some("Bob ")) +//! .generate()?; +//! let bob_userid = bob.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! let bob_certification = SignatureBuilder::new(SignatureType::GenericCertification) +//! .sign_userid_binding( +//! &mut ca_signer, +//! bob.primary_key().component(), +//! bob_userid)?; +//! let bob = bob.insert_packets(bob_certification.clone())?; +//! # assert!(bob.clone().into_packets().any(|p| { +//! # match p { +//! # Packet::Signature(sig) => sig == bob_certification, +//! # _ => false, +//! # } +//! # })); +//! +//! +//! // Carol tsigns the CA's certificate. +//! let (carol, _) +//! = CertBuilder::general_purpose(None, Some("Carol ")) +//! .generate()?; +//! let mut carol_signer = carol.primary_key().key().clone() +//! .parts_into_secret()?.into_keypair()?; +//! +//! let ca_tsig = SignatureBuilder::new(SignatureType::GenericCertification) +//! .set_trust_signature(2, 120)? +//! .set_regular_expression("<[^>]+[@.]example\\.org>$")? +//! .add_regular_expression("<[^>]+[@.]example\\.com>$")? +//! .sign_userid_binding( +//! &mut carol_signer, +//! ca.primary_key().component(), +//! ca_userid)?; +//! let ca = ca.insert_packets(ca_tsig.clone())?; +//! # assert!(ca.clone().into_packets().any(|p| { +//! # match p { +//! # Packet::Signature(sig) => sig == ca_tsig, +//! # _ => false, +//! # } +//! # })); +//! +//! +//! // Carol now tries to authenticate Alice and Bob's certificates +//! // using the CA as a trusted introducer based on `ca_tsig`. +//! let res = RegexSet::from_signature(&ca_tsig)?; +//! +//! // Should should be able to authenticate Alice. +//! let alice_ua = alice.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID"); +//! # assert!(res.matches_userid(&alice_ua)); +//! let mut authenticated = false; +//! for c in alice_ua.certifications() { +//! if c.get_issuers().into_iter().any(|h| h.aliases(ca.key_handle())) { +//! if c.clone().verify_userid_binding( +//! ca.primary_key().key(), +//! alice.primary_key().key(), +//! alice_ua.userid()).is_ok() +//! { +//! authenticated |= res.matches_userid(&alice_ua); +//! } +//! } +//! } +//! assert!(authenticated); +//! +//! // But, although the CA has certified Bob's key, Carol doesn't rely +//! // on it, because Bob's email address ("bob@some.org") is out of +//! // scope (some.org, not example.com). +//! let bob_ua = bob.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID"); +//! # assert!(! res.matches_userid(&bob_ua)); +//! let mut have_certification = false; +//! let mut authenticated = false; +//! for c in bob_ua.certifications() { +//! if c.get_issuers().into_iter().any(|h| h.aliases(ca.key_handle())) { +//! if c.clone().verify_userid_binding( +//! ca.primary_key().key(), +//! bob.primary_key().key(), +//! bob_ua.userid()).is_ok() +//! { +//! have_certification = true; +//! authenticated |= res.matches_userid(&bob_ua); +//! } +//! } +//! } +//! assert!(have_certification); +//! assert!(! authenticated); +//! # Ok(()) } +//! ``` + +use std::borrow::Borrow; + +use lalrpop_util::ParseError; +use regex_syntax::hir::{self, Hir}; +use regex; + +use crate::Error; +use crate::Result; +use crate::packet::prelude::*; +use crate::types::SignatureType; + +pub(crate) mod lexer; +lalrpop_util::lalrpop_mod!( + #[allow(clippy::all)] + #[allow(unused_parens)] + grammar, + "/regex/grammar.rs" +); + +pub(crate) use self::lexer::Token; +pub(crate) use self::lexer::{Lexer, LexicalError}; + +const TRACE: bool = false; + +// Convert tokens into strings. +// +// Unfortunately, we can't implement From, because we don't define +// ParseError in this crate. +pub(crate) fn parse_error_downcast(e: ParseError) + -> ParseError +{ + match e { + ParseError::UnrecognizedToken { + token: (start, t, end), + expected, + } => ParseError::UnrecognizedToken { + token: (start, t.into(), end), + expected, + }, + + ParseError::ExtraToken { + token: (start, t, end), + } => ParseError::ExtraToken { + token: (start, t.into(), end), + }, + + ParseError::InvalidToken { location } + => ParseError::InvalidToken { location }, + + ParseError::User { error } + => ParseError::User { error }, + + ParseError::UnrecognizedEOF { location, expected } + => ParseError::UnrecognizedEOF { location, expected }, + } +} + +// Used by grammar.lalrpop to generate a regex class (e.g. '[a-ce]'). +fn generate_class(caret: bool, chars: impl Iterator) -> Hir +{ + tracer!(TRACE, "generate_class"); + + // Dealing with ranges is a bit tricky. We need to examine three + // tokens. If the middle one is a dash, it's a range. + + let chars: Vec> = chars + // Pad it out so what we can use windows to get three + // characters at a time, and be sure to process all + // characters. + .map(|c| Some(c)) + .chain(std::iter::once(None)) + .chain(std::iter::once(None)) + .collect(); + if chars.len() == 2 { + // The grammar doesn't allow an empty class. + unreachable!(); + } else { + let r = chars + .windows(3) + .scan(0, + |skip: &mut usize, x: &[Option]| + // Scan stops if the result is None. + // filter_map keeps only those elements that + // are Some. + -> Option> + { + if *skip > 0 { + *skip -= 1; + t!("Skipping: {:?} (skip now: {})", x, skip); + Some(None) + } else { + match (x[0], x[1], x[2]) { + (Some(a), Some('-'), Some(c)) => { + // We've got a real range. + *skip = 2; + t!("range for '{}-{}'", a, c); + Some(Some(hir::ClassUnicodeRange::new(a, c))) + } + (Some(a), _, _) => { + t!("range for '{}'", a); + Some(Some(hir::ClassUnicodeRange::new(a, a))) + } + (None, _, _) => unreachable!(), + } + } + }) + .filter_map(|r| r); + let mut class = hir::Class::Unicode(hir::ClassUnicode::new(r)); + if caret { + class.negate(); + } + Hir::class(class) + } +} + +/// A compiled OpenPGP regular expression for matching UTF-8 encoded +/// strings. +/// +/// A `Regex` contains a regular expression compiled according to the +/// rules defined in [Section 8 of RFC 4880] modulo two differences. +/// First, the compiler only works on UTF-8 strings (not bytes). +/// Second, ranges in character classes are between UTF-8 characters, +/// not just ASCII characters. Further, by default, strings that +/// don't pass a sanity check (in particular, include Unicode control +/// characters) never match. This behavior can be customized using +/// [`Regex::disable_sanitizations`]. +/// +/// [Section 8 of RFC 4880]: https://tools.ietf.org/html/rfc4880#section-8 +/// [trust signatures]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +/// [`Regex::disable_sanitizations`]: #method.disable_sanitizations +/// +/// Regular expressions are used to scope the trust that [trust +/// signatures] extend. +/// +/// When working with trust signatures, you'll usually want to use the +/// [`RegexSet`] data structure, which already implements the correct +/// semantics. +/// +/// [`RegexSet`]: struct.RegexSet.html +/// +/// See the [module-level documentation] for more details. +/// +/// [module-level documentation]: index.html +#[derive(Clone, Debug)] +pub struct Regex { + regex: regex::Regex, + disable_sanitizations: bool, +} + +impl Regex { + /// Parses and compiles the regular expression. + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`Regex::disable_sanitizations`]. + /// + /// [`Regex::disable_sanitizations`]: #method.disable_sanitizations + pub fn new(re: &str) -> Result + { + let re = re.as_ref(); + let lexer = Lexer::new(re); + let hir = match grammar::RegexParser::new().parse(re, lexer) { + Ok(hir) => hir, + Err(err) => return Err(parse_error_downcast(err).into()), + }; + + // Converting the Hir to a string and the compiling that is + // apparently the canonical way to convert a Hir to a Regex + // (at least it is what rip-grep does, which the author of + // regex also wrote. See + // ripgrep/crates/regex/src/config.rs:ConfiguredHir::regex. + let regex = regex::RegexBuilder::new(&hir.to_string()) + .build()?; + + Ok(Self { + regex, + disable_sanitizations: false, + }) + } + + /// Parses and compiles the regular expression. + /// + /// Returns an error if `re` is not a valid UTF-8 string. + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`Regex::disable_sanitizations`]. + /// + /// [`Regex::disable_sanitizations`]: #method.disable_sanitizations + pub fn from_bytes(re: &[u8]) -> Result { + Self::new(std::str::from_utf8(re)?) + } + + /// Controls whether matched strings must pass a sanity check. + /// + /// If `false` (the default), i.e., sanity checks are enabled, and + /// the string doesn't pass the sanity check (in particular, it + /// contains a Unicode control character according to + /// [`char::is_control`], including newlines and an embedded `NUL` + /// byte), this returns `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + pub fn disable_sanitizations(&mut self, disabled: bool) { + self.disable_sanitizations = disabled; + } + + /// Returns whether the regular expression matches the string. + /// + /// If sanity checks are enabled (the default) and the string + /// doesn't pass the sanity check (in particular, it contains a + /// Unicode control character according to [`char::is_control`], + /// including newlines and an embedded `NUL` byte), this returns + /// `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + pub fn is_match(&self, s: &str) -> bool { + if ! self.disable_sanitizations { + if s.chars().any(char::is_control) { + return false; + } + } + + self.is_match_clean(s) + } + + // is_match, but without the sanity check. + fn is_match_clean(&self, s: &str) -> bool { + self.regex.is_match(s) + } + + /// Returns whether the regular expression matches the User ID. + /// + /// If the User ID is not a valid UTF-8 string, this returns + /// `false`. + /// + /// If sanity checks are enabled (the default) and the string + /// doesn't pass the sanity check (in particular, it contains a + /// Unicode control character according to [`char::is_control`], + /// including newlines and an embedded `NUL` byte), this returns + /// `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + pub fn matches_userid(&self, u: &UserID) -> bool { + if let Ok(u) = std::str::from_utf8(u.value()) { + self.is_match(u) + } else { + false + } + } +} + +#[derive(Clone, Debug)] +enum RegexSet_ { + Regex(Regex), + Invalid, +} + +/// A set of regular expressions. +/// +/// A `RegexSet` encapsulates a set of regular expressions. The +/// regular expressions are compiled according to the rules defined in +/// [Section 8 of RFC 4880] modulo two differences. First, the +/// compiler only works on UTF-8 strings (not bytes). Second, ranges +/// in character classes are between UTF-8 characters, not just ASCII +/// characters. Further, by default, strings that don't pass a sanity +/// check (in particular, include Unicode control characters) never +/// match. This behavior can be customized using +/// [`RegexSet::disable_sanitizations`]. +/// +/// [Section 8 of RFC 4880]: https://tools.ietf.org/html/rfc4880#section-8 +/// [`RegexSet::disable_sanitizations`]: #method.disable_sanitizations +/// +/// `RegexSet` implements the semantics of [regular expression]s used +/// in [Trust Signatures]. In particular, a `RegexSet` makes it +/// easier to deal with trust signatures that: +/// +/// - Contain multiple Regular Expression subpackts, +/// - Have no Regular Expression subpackets, and/or +/// - Include one or more Regular Expression subpackets that are invalid. +/// +/// [regular expressions]: https://tools.ietf.org/html/rfc4880#section-5.2.3.14 +/// [Trust Signatures]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +/// +/// `RegexSet` compiles each regular expression individually. If +/// there are no regular expressions, the `RegexSet` matches +/// everything. If a regular expression is invalid, `RegexSet` treats +/// it as if it doesn't match anything. Thus, if all regular +/// expressions are invalid, the `RegexSet` matches nothing (not +/// everything!). +/// +/// See the [module-level documentation] for more details. +/// +/// [module-level documentation]: index.html +#[derive(Clone, Debug)] +pub struct RegexSet { + re_set: RegexSet_, + disable_sanitizations: bool, +} + +impl RegexSet { + /// Parses and compiles the regular expressions. + /// + /// Invalid regular expressions do not cause this to fail. See + /// [`RegexSet`]'s top-level documentation for details. + /// + /// [`RegexSet`]: struct.RegexSet.html + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`RegexSet::disable_sanitizations`]. + /// + /// [`RegexSet::disable_sanitizations`]: #method.disable_sanitizations + /// + /// # Examples + /// + /// ``` + /// use sequoia_openpgp as openpgp; + /// use openpgp::regex::RegexSet; + /// + /// # fn main() -> openpgp::Result<()> { + /// // Extract the regex and compile it. + /// let res = &[ + /// "<[^>]+[@.]example\\.org>$", + /// // Invalid. + /// "[..", + /// ]; + /// + /// let res = RegexSet::new(res.iter())?; + /// + /// assert!(res.is_match("Alice ")); + /// assert!(! res.is_match("Bob ")); + /// # Ok(()) } + /// ``` + pub fn new<'a, RE>(res: impl Iterator) -> Result + where RE: Borrow<&'a str> + { + tracer!(TRACE, "RegexSet::new"); + + let mut regexes = Vec::with_capacity(2); + let mut had_good = false; + let mut had_bad = false; + + for re in res { + let re = re.borrow(); + let lexer = Lexer::new(re); + match grammar::RegexParser::new().parse(re, lexer) { + Ok(hir) => { + had_good = true; + let hir = hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(hir), + }); + regexes.push(hir); + } + Err(err) => { + had_bad = true; + t!("Compiling {:?}: {}", re, err); + } + } + } + + if had_bad && ! had_good { + t!("All regular expressions were invalid."); + Ok(RegexSet { + re_set: RegexSet_::Invalid, + disable_sanitizations: false, + }) + } else if ! had_bad && ! had_good { + // Match everything. + t!("No regular expressions provided."); + Ok(RegexSet { + re_set: RegexSet_::Regex( + Regex { + regex: regex::RegexBuilder::new( + &Hir::empty().to_string()) + .build()?, + disable_sanitizations: false, + }), + disable_sanitizations: false, + }) + } else { + // Match any of the regular expressions. + Ok(RegexSet { + re_set: RegexSet_::Regex( + Regex { + regex: regex::RegexBuilder::new( + &Hir::alternation(regexes).to_string()) + .build()?, + disable_sanitizations: false, + }), + disable_sanitizations: false, + }) + } + } + + /// Parses and compiles the regular expressions. + /// + /// The regular expressions are first converted to UTF-8 strings. + /// Byte sequences that are not valid UTF-8 strings are considered + /// to be invalid regular expressions. Invalid regular + /// expressions do not cause this to fail. See [`RegexSet`]'s + /// top-level documentation for details. + /// + /// [`RegexSet`]: struct.RegexSet.html + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`RegexSet::disable_sanitizations`]. + /// + /// [`RegexSet::disable_sanitizations`]: #method.disable_sanitizations + /// + /// # Examples + /// + /// ``` + /// use sequoia_openpgp as openpgp; + /// use openpgp::regex::RegexSet; + /// + /// # fn main() -> openpgp::Result<()> { + /// // A valid and an invalid UTF-8 byte sequence. The invalid + /// // sequence doesn't match anything. But, that doesn't impact + /// // the other regular expressions. + /// let res: &[ &[u8] ] = &[ + /// &b"<[^>]+[@.]example\\.org>$"[..], + /// // Invalid UTF-8. + /// &b"\xC3\x28"[..], + /// ]; + /// assert!(std::str::from_utf8(res[0]).is_ok()); + /// assert!(std::str::from_utf8(res[1]).is_err()); + /// + /// let re_set = RegexSet::from_bytes(res.into_iter())?; + /// + /// assert!(re_set.is_match("Alice ")); + /// assert!(! re_set.is_match("Bob ")); + /// + /// // If we only have invalid UTF-8 strings, then nothing + /// // matches. + /// let res: &[ &[u8] ] = &[ + /// // Invalid UTF-8. + /// &b"\xC3\x28"[..], + /// ]; + /// assert!(std::str::from_utf8(res[0]).is_err()); + /// + /// let re_set = RegexSet::from_bytes(res.into_iter())?; + /// + /// assert!(! re_set.is_match("Alice ")); + /// assert!(! re_set.is_match("Bob ")); + /// + /// + /// // But, if we have no regular expressions, everything matches. + /// let res: &[ &[u8] ] = &[]; + /// let re_set = RegexSet::from_bytes(res.into_iter())?; + /// + /// assert!(re_set.is_match("Alice ")); + /// assert!(re_set.is_match("Bob ")); + /// # Ok(()) } + /// ``` + pub fn from_bytes<'a, RE>(res: impl Iterator) -> Result + where RE: Borrow<&'a [u8]> + { + let mut have_valid_utf8 = false; + let mut have_invalid_utf8 = false; + let re_set = Self::new( + res + .scan((&mut have_valid_utf8, &mut have_invalid_utf8), + |(valid, invalid), re| + { + if let Ok(re) = std::str::from_utf8(re.borrow()) { + **valid = true; + Some(Some(re)) + } else { + **invalid = true; + Some(None) + } + }) + .filter_map(|re| re)); + + if !have_valid_utf8 && have_invalid_utf8 { + // None of the strings were valid UTF-8. Reject + // everything. + Ok(RegexSet { + re_set: RegexSet_::Invalid, + disable_sanitizations: false, + }) + } else { + // We had nothing or at least one string was valid UTF-8. + // RegexSet::new did the right thing. + re_set + } + } + + /// Creates a `RegexSet` from the regular expressions stored in a + /// trust signature. + /// + /// This method is a convenience function, which extracts any + /// regular expressions from a [Trust Signature] and wraps them in a + /// `RegexSet`. + /// + /// [Trust Signature]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 + /// + /// If the signature is not a valid trust signature (its [type] is + /// [GenericCertification], [PersonaCertification], + /// [CasualCertification], or [PositiveCertification], and the + /// [Trust Signature] subpacket is present), this returns an + /// error. + /// + /// [type]: https://tools.ietf.org/html/rfc4880#section-5.2.1 + /// [GenericCertification]: ../types/enum.SignatureType.html#variant.GenericCertification + /// [PersonaCertification]: ../types/enum.SignatureType.html#variant.PersonaCertification + /// [CasualCertification]: ../types/enum.SignatureType.html#variant.CasualCertification + /// [PositiveCertification]: ../types/enum.SignatureType.html#variant.PositiveCertification + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`RegexSet::disable_sanitizations`]. + /// + /// [`RegexSet::disable_sanitizations`]: #method.disable_sanitizations + /// + /// # Examples + /// + /// ``` + /// use sequoia_openpgp as openpgp; + /// # use openpgp::cert::prelude::*; + /// # use openpgp::packet::prelude::*; + /// # use openpgp::policy::StandardPolicy; + /// use openpgp::regex::RegexSet; + /// # use openpgp::types::SignatureType; + /// # + /// # fn main() -> openpgp::Result<()> { + /// # let p = &StandardPolicy::new(); + /// # + /// # let (alice, _) + /// # = CertBuilder::general_purpose(None, Some("Alice ")) + /// # .generate()?; + /// # let mut alices_signer = alice.primary_key().key().clone() + /// # .parts_into_secret()?.into_keypair()?; + /// # + /// # let (example_com, _) + /// # = CertBuilder::general_purpose(None, Some("OpenPGP CA ")) + /// # .generate()?; + /// # let example_com_userid = example_com.with_policy(p, None)? + /// # .userids().nth(0).expect("Added a User ID").userid(); + /// # + /// # let certification = SignatureBuilder::new(SignatureType::GenericCertification) + /// # .set_trust_signature(1, 120)? + /// # .set_regular_expression("<[^>]+[@.]example\\.org>$")? + /// # .add_regular_expression("<[^>]+[@.]example\\.com>$")? + /// # .sign_userid_binding( + /// # &mut alices_signer, + /// # example_com.primary_key().component(), + /// # example_com_userid)?; + /// + /// // certification is a trust signature, which contains two regular + /// // expressions: one that matches all mail addresses for 'example.org' + /// // and another that matches all mail addresses for 'example.com'. + /// let certification: &Signature = // ...; + /// # &certification; + /// + /// // Extract the regex and compile it. + /// let res = RegexSet::from_signature(certification)?; + /// + /// // Some positive examples. + /// assert!(res.is_match("Alice ")); + /// assert!(res.is_match("Bob ")); + /// + /// // Wrong domain. + /// assert!(! res.is_match("Carol ")); + /// + /// // The standard regex, "<[^>]+[@.]example\\.org>$" only matches + /// // email addresses wrapped in <>. + /// assert!(! res.is_match("dave@example.com")); + /// + /// // And, it is case sensitive. + /// assert!(res.is_match("Ellen ")); + /// assert!(! res.is_match("Ellen ")); + /// # Ok(()) } + /// ``` + pub fn from_signature(sig: &Signature) -> Result + { + use SignatureType::*; + match sig.typ() { + GenericCertification => (), + PersonaCertification => (), + CasualCertification => (), + PositiveCertification => (), + t => return Err( + Error::InvalidArgument( + format!( + "Expected a certification signature, found a {}", + t).into()) + .into()), + } + + if sig.trust_signature().is_none() { + return Err( + Error::InvalidArgument( + "Expected a trust signature, \ + but the signature does not include \ + a valid Trust Signature subpacket".into()) + .into()); + } + + Self::from_bytes(sig.regular_expressions()) + } + + /// Controls whether strings with control characters are allowed. + /// + /// If `false` (the default), i.e., sanity checks are enabled, and + /// the string doesn't pass the sanity check (in particular, it + /// contains a Unicode control character according to + /// [`char::is_control`], including newlines and an embedded `NUL` + /// byte), this returns `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + pub fn disable_sanitizations(&mut self, allowed: bool) { + self.disable_sanitizations = allowed; + if let RegexSet_::Regex(ref mut re) = self.re_set { + re.disable_sanitizations(allowed); + } + } + + /// Returns whether the regular expression set matches the string. + /// + /// If sanity checks are enabled (the default) and the string + /// doesn't pass the sanity check (in particular, it contains a + /// Unicode control character according to [`char::is_control`], + /// including newlines and an embedded `NUL` byte), this returns + /// `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + /// + /// If the `RegexSet` contains one or more regular expressions, + /// this method returns whether at least one of the regular + /// expressions matches. Invalid regular expressions never match. + /// + /// If the `RegexSet` does not contain any regular expressions + /// (valid or otherwise), this method returns `true`. + /// + /// # Examples + /// + /// ``` + /// use sequoia_openpgp as openpgp; + /// use openpgp::regex::RegexSet; + /// + /// # fn main() -> openpgp::Result<()> { + /// // A regular expression that matches anything. (Note: this is + /// // equivalent to providing no regular expressions.) + /// let res: &[ &str ] = &[ + /// &""[..], + /// ]; + /// let re_set = RegexSet::new(res.into_iter())?; + /// + /// assert!(re_set.is_match("Alice Lovelace ")); + /// + /// // If a User ID has an embedded control character, it doesn't + /// // match. + /// assert!(! re_set.is_match("Alice \0")); + /// # Ok(()) } + /// ``` + pub fn is_match(&self, s: &str) -> bool { + if ! self.disable_sanitizations { + if s.chars().any(char::is_control) { + return false; + } + } + + match self.re_set { + RegexSet_::Regex(ref re) => + re.is_match_clean(s), + RegexSet_::Invalid => + false, + } + } + + /// Returns whether the regular expression matches the User ID. + /// + /// If the User ID is not a valid UTF-8 string, this returns `false`. + /// + /// If sanity checks are enabled (the default) and the string + /// doesn't pass the sanity check (in particular, it contains a + /// Unicode control character according to [`char::is_control`], + /// including newlines and an embedded `NUL` byte), this returns + /// `false`. + /// + /// [`char::is_control`]: https://doc.rust-lang.org/std/primitive.char.html#method.is_control + /// + /// If the `RegexSet` contains one or more regular expressions, + /// this method returns whether at least one of the regular + /// expressions matches. Invalid regular expressions never match. + /// + /// If the `RegexSet` does not contain any regular expressions + /// (valid or otherwise), this method returns `true`. + /// + /// # Examples + /// + /// ``` + /// use sequoia_openpgp as openpgp; + /// use openpgp::packet::UserID; + /// use openpgp::regex::RegexSet; + /// + /// # fn main() -> openpgp::Result<()> { + /// // A regular expression that matches anything. (Note: this is + /// // equivalent to providing no regular expressions.) + /// let res: &[ &str ] = &[ + /// "", + /// ]; + /// let re_set = RegexSet::new(res.into_iter())?; + /// + /// assert!(re_set.matches_userid( + /// &UserID::from(&b"Alice Lovelace "[..]))); + /// + /// // If a User ID is not valid UTF-8, it never matches. + /// assert!(! re_set.matches_userid( + /// &UserID::from(&b"Alice \xC3\x28 Lovelace "[..]))); + /// + /// // If a User ID has an embedded control character, it doesn't + /// // match. + /// assert!(! re_set.matches_userid( + /// &UserID::from(&b"Alice \0"[..]))); + /// # Ok(()) } + /// ``` + pub fn matches_userid(&self, u: &UserID) -> bool + { + let u = u.borrow(); + if let Ok(u) = std::str::from_utf8(u.value()) { + self.is_match(u) + } else { + false + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn regex() -> Result<()> { + fn a(regex: &str, matches: &[(bool, &str)]) { + eprint!("{} -> ", regex); + let mut compiled = Regex::new(regex).unwrap(); + compiled.disable_sanitizations(true); + eprintln!("{:?}", compiled); + for &(matches, text) in matches { + assert_eq!(matches, compiled.is_match(text), + "regex: {}\n text: {:?} should{} match", + regex, text, if matches { "" } else { " not" }); + } + } + fn f(regex: &str) { + eprint!("{} -> ", regex); + let compiled = Regex::new(regex); + assert!(compiled.is_err()); + eprintln!("failed (expected)"); + } + + // Test an important corner case: the + should only apply to + // the b! See: https://github.com/rust-lang/regex/issues/731 + a("xab+y", &[ + (true, "xaby"), + (true, "xabby"), + (false, "xababy"), + ]); + a("x(ab+)y", &[ + (false, "xy"), + (false, "xay"), + (true, "xaby"), + (true, "xabby"), + (true, "xabbby"), + (false, "xababy"), + ]); + // But here the + matches "ab", not just the "b". + a("x(ab)+y", &[ + (false, "xy"), + (true, "xaby"), + (false, "xabby"), + (true, "xababy"), + (true, "xabababy"), + (false, "x(ab)y"), + ]); + + + + a("", &[ + (true, "s"), + (true, "ss"), + ]); + a("s", &[ + (true, "s"), + (true, "ss"), + (false, "a"), + (true, "hello, my prettiessss"), + (false, "S"), + ]); + a("ss", &[ + (false, "s"), + (true, "ss"), + (true, "sss"), + (false, "this has lots of ses, but not two ses together"), + (true, "halloss"), + ]); + + a("a|b", &[ + (true, "a"), + (true, "b"), + (false, "c"), + (true, "xxxaxxxbxxx"), + ]); + a("a|b|c", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (false, "d"), + (true, "xxxaxxxbxxx"), + ]); + // This should match anything. + a("|a", &[ + (true, "a"), + (true, "b"), + ]); + a("a|", &[ + (true, "a"), + (true, "b"), + ]); + a("|a|b", &[ + (true, "a"), + (true, "b"), + (true, "c"), + ]); + a("|a|b|c|d", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "d"), + (true, "eeee"), + ]); + a("a|b|", &[ + (true, "a"), + (true, "b"), + (true, "c"), + ]); + a("a|b|c|", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "d"), + (true, "eeee"), + ]); + a("|", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "d"), + (true, "eeee"), + ]); + a("|a|", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "d"), + (true, "eeee"), + ]); + a("|a|b|", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "d"), + (true, "eeee"), + ]); + // This is: "ab" or "cd", not a followed by b or c followed by d: + // + // A regular expression is zero or more branches, separated by '|'. + // ... + // A branch is zero or more pieces, concatenated. + // ... + // A piece is an atom + // ... + // An atom is... a single character. + a("ab|cd", &[ + (true, "abd"), + (true, "acd"), + (true, "abcd"), + (false, "ad"), + (false, "b"), + (false, "c"), + (false, "bb"), + ]); + + a("a*", &[ + (true, ""), + (true, "a"), + (true, "aa"), + (true, "b"), + ]); + a("xa*y", &[ + (true, "xy"), + (true, "xay"), + (true, "xaay"), + (false, "y"), + (false, "ay"), + (false, "aay"), + (false, "x y"), + (false, "x ay"), + (false, "x aay"), + ]); + f("*"); + + a("a+", &[ + (false, ""), + (true, "a"), + (true, "aa"), + (false, "b"), + (true, "baab"), + (true, "by ab"), + (true, "baa b"), + ]); + a("ab+", &[ + (false, ""), + (false, "a"), + (false, "b"), + (true, "ab"), + (false, "bb"), + (true, "baab"), + (true, "by ab"), + (false, "baa b"), + ]); + f("+"); + + a("a?", &[ + (true, ""), + (true, "a"), + (true, "aa"), + (true, "aaa"), + (true, "b"), + (true, "baab"), + (true, "by ab"), + (true, "baa b"), + ]); + a("xa?y", &[ + (false, ""), + (true, "xy"), + (false, "a"), + (true, "xay"), + (false, "aa"), + (false, "xaay"), + (false, "b"), + (false, "bxaayb"), + (true, "by xayb"), + (true, "baxay b"), + ]); + f("?"); + + f("a*?"); + a("a*b?c+", &[ + (false, ""), + (true, "c"), + (true, "abc"), + (true, "aabbcc"), + (false, "aab"), + (true, "aaaaaabcccccccc"), + ]); + f("a?*+"); + + a("a?|b+", &[ + (true, ""), + (true, "aaa"), + (true, "bbb"), + (true, "abaa"), + ]); + a("a+|b+", &[ + (false, ""), + (true, "a"), + (true, "aaa"), + (true, "b"), + (true, "bbb"), + (true, "abaa"), + ]); + a("a+|b+|c+", &[ + (false, ""), + (true, "a"), + (true, "aaa"), + (true, "b"), + (true, "bbb"), + (true, "abaa"), + (true, "c"), + (true, "ccc"), + (true, "abaaccc"), + ]); + a("xa+|b+|c+y", &[ + (false, ""), + (true, "xa"), + (true, "xaa"), + (true, "b"), + (true, "bb"), + (true, "cy"), + (true, "ccy"), + + (false, "a"), + (false, "aaa"), + (false, "c"), + (false, "ccc"), + ]); + a("xa+y|sb+u", &[ + (false, ""), + (true, "xay"), + (true, "xaay"), + (true, "sbu"), + (true, "sbbu"), + (true, "xysbu"), + + (false, "a"), + (false, "aaa"), + (false, "xyu"), + (false, "ccc"), + ]); + a("a*|a+|ab+cd+|", &[ + (true, ""), + ]); + + a("()", &[ + (true, ""), + (true, "xyzzy"), + ]); + a("(())", &[ + (true, ""), + (true, "xyzzy"), + ]); + a("((()))", &[ + (true, ""), + (true, "xyzzy"), + ]); + f("((())"); + f("((())))"); + a("(a)", &[ + (true, "a"), + (true, "(a)"), + (false, "b"), + ]); + a("x(a)y", &[ + (false, "xy"), + (true, "xay"), + (false, "x(a)y"), + (true, "(xay)"), + (false, "a"), + (false, "yax"), + ]); + a("x(ab)y", &[ + (false, "xy"), + (false, "xay"), + (false, "xby"), + (true, "xaby"), + (false, "x(ab)y"), + (true, "(xaby)"), + ]); + a("x(ab)(cd)y", &[ + (true, "xabcdy"), + (true, "zxabcdyz"), + ]); + a("a(bc)d(ef)g", &[ + (true, "abcdefg"), + (true, "xabcdefgy"), + (false, "xa(bc)d(ef)gy"), + ]); + a("a((bc))d((ef))g", &[ + (true, "abcdefg"), + (true, "xabcdefgy"), + (false, "xa(bc)d(ef)gy"), + ]); + a("a(b(c)d)e", &[ + (true, "abcde"), + (true, "xabcdey"), + (false, "xa(b(c)d)ey"), + ]); + a("x(a+|b+)y", &[ + (false, "xy"), + (true, "xay"), + (true, "xby"), + (true, "xaay"), + (true, "xbby"), + (false, "xaby"), + (false, "xaaby"), + (false, "xabby"), + (false, "xaabby"), + (false, "xcy"), + ]); + + a(".", &[ + (false, ""), + (true, "a"), + (true, "ab"), + (true, "ab\nc"), + (true, "ab.c"), + ]); + a("x.y", &[ + (false, ""), + (false, "xy"), + (true, "xay"), + (true, "x\ny"), + (true, "x.y"), + (false, "x..y"), + ]); + + a("^", &[ + (true, ""), + (true, "xx"), + ]); + a("^abc", &[ + (false, ""), + (true, "abcdef"), + (false, "xabcdef"), + (false, "\nabcdef"), + ]); + a("(^abc|^def)", &[ + (false, ""), + (true, "abcd"), + (true, "defg"), + (false, "xabcd"), + (false, "xdefg"), + (false, "^abc"), + (false, "^(abc|def)"), + (false, "\nabcdef"), + ]); + a("(^abc|def)", &[ + (false, ""), + (true, "abcd"), + (true, "defg"), + (false, "xabcd"), + (true, "xdefg"), + (false, "^abc"), + (true, "^(abc|def)"), + (false, "\nabcde"), + ]); + a("^^", &[ + (true, ""), + (true, "abcdef"), + ]); + a("^abc^", &[ + (false, ""), + (false, "abcdef"), + (false, "xabcdef"), + (false, "abc\n"), + (false, "\nabc\n"), + (false, "^abc^"), + ]); + + a("$", &[ + (true, ""), + (true, "abc"), + ]); + a("abc$", &[ + (false, ""), + (true, "abc"), + (false, "abcx"), + (false, "abc\n"), + (false, "abc$"), + ]); + a("abc$$", &[ + (false, ""), + (true, "abc"), + (false, "abcx"), + (false, "abc\n"), + (false, "abc$"), + ]); + a("(abc$)x", &[ + (false, ""), + (false, "abc"), + (false, "abcx"), + (false, "abc\nx"), + (false, "abc$x"), + ]); + a("abc$|def$", &[ + (false, ""), + (true, "abc"), + (false, "abcx"), + (false, "abc\n"), + (false, "abc$"), + (true, "def"), + (false, "defx"), + (false, "def\n"), + (false, "def$"), + (true, "abcdef"), + ]); + + a("\\|", &[ + (true, "|"), + (false, ""), + (false, "a"), + ]); + a("\\*", &[ + (true, "*"), + (false, ""), + (false, "a"), + ]); + a("\\+", &[ + (true, "+"), + (false, ""), + (false, "a"), + ]); + a("\\?", &[ + (true, "?"), + (false, ""), + (false, "a"), + ]); + a("\\.", &[ + (true, "."), + (false, ""), + (false, "a"), + ]); + a("\\^", &[ + (true, "^"), + (false, ""), + (false, "a"), + ]); + a("\\$", &[ + (true, "$"), + (false, ""), + (false, "a"), + ]); + a("\\\\", &[ + (true, "\\"), + (false, ""), + (false, "a"), + ]); + a("\\[", &[ + (true, "["), + (false, ""), + (false, "a"), + ]); + a("\\]", &[ + (true, "]"), + (false, ""), + (false, "a"), + ]); + a("\\-", &[ + (true, "-"), + (false, ""), + (false, "a"), + ]); + f("\\"); + + a("[a]", &[ + (true, "a"), + (false, "b"), + ]); + a("[abc]", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (false, "d"), + ]); + a("[a-c]", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (false, "d"), + ]); + a("[xa-c]", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "x"), + (false, "d"), + ]); + a("[a-cxyz]", &[ + (true, "a"), + (true, "b"), + (true, "c"), + (true, "x"), + (false, "d"), + ]); + a("[a-c]x", &[ +