diff options
Diffstat (limited to 'openpgp/src/regex')
-rw-r--r-- | openpgp/src/regex/grammar.lalrpop | 204 | ||||
-rw-r--r-- | openpgp/src/regex/lexer.rs | 222 | ||||
-rw-r--r-- | openpgp/src/regex/mod.rs | 1943 |
3 files changed, 2369 insertions, 0 deletions
diff --git a/openpgp/src/regex/grammar.lalrpop b/openpgp/src/regex/grammar.lalrpop new file mode 100644 index 00000000..e9e619b5 --- /dev/null +++ b/openpgp/src/regex/grammar.lalrpop @@ -0,0 +1,204 @@ +// -*- mode: Rust; -*- + +use super::generate_class; +use super::lexer; +use super::lexer::{Token, LexicalError}; +use regex_syntax::hir::{self, Hir}; + +// Pass in the original, untokenized input to facilitate error +// recovery. +grammar<'input>(input: &'input str); + +// This is a straightforward translation of the regular expression +// grammar from section 8 of RFC 4880. +// +// https://tools.ietf.org/html/rfc4880#section-8 +pub(crate) Regex : Hir = { + <l:LBranch> <r:RBranch*> => { + let mut r = r; + r.insert(0, l); + Hir::alternation(r) + }, +} + +LBranch : Hir = { + Branch, +} + +RBranch : Hir = { + PIPE <Branch>, +} + +Branch : Hir = { + <p:Piece*> => { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(hir::Hir::concat(p)), + }) + }, +} + +Piece : Hir = { + <a:Atom> => a, + <a:Atom> STAR => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrMore, + greedy: true, + hir: Box::new(a) + }) + }, + <a:Atom> PLUS => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::OneOrMore, + greedy: true, + hir: Box::new(a) + }) + }, + <a:Atom> QUESTION => { + hir::Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: true, + hir: Box::new(a) + }) + }, +} + +Atom : Hir = { + LPAREN <r:Regex> RPAREN => { + hir::Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(r), + }) + }, + + Range, + + DOT => { + hir::Hir::any(false) + }, + CARET => { + hir::Hir::anchor(hir::Anchor::StartText) + }, + DOLLAR => { + hir::Hir::anchor(hir::Anchor::EndText) + }, + + BACKSLASH <t:AnyChar> => { + hir::Hir::literal(hir::Literal::Unicode(t.to_char())) + }, + + <t:OTHER> => { + hir::Hir::literal(hir::Literal::Unicode(t.to_char())) + }, + +} + +Range : Hir = { + 
LBRACKET <c:CARET?> <class1:RBRACKET> <class2:NotRBracket*> RBRACKET => { + generate_class(c.is_some(), + std::iter::once(class1.to_char()) + .chain(class2.into_iter().map(|t| t.to_char()))) + }, + LBRACKET CARET <class:NotRBracket+> RBRACKET => { + generate_class(true, + class.into_iter().map(|t| t.to_char())) + }, + LBRACKET <class1:NotCaretNotRBracket> <class2:NotRBracket*> RBRACKET => { + generate_class(false, + std::iter::once(class1.to_char()) + .chain(class2.into_iter().map(|t| t.to_char()))) + }, +} + +NotRBracket : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + // RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +NotCaretNotRBracket : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + // CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + // RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +AnyChar : Token = { + PIPE => Token::OTHER('|'), + + STAR => Token::OTHER('*'), + PLUS => Token::OTHER('+'), + QUESTION => Token::OTHER('?'), + + LPAREN => Token::OTHER('('), + RPAREN => Token::OTHER(')'), + + DOT => Token::OTHER('.'), + CARET => Token::OTHER('^'), + DOLLAR => Token::OTHER('$'), + BACKSLASH => Token::OTHER('\\'), + + LBRACKET => Token::OTHER('['), + RBRACKET => Token::OTHER(']'), + DASH => Token::OTHER('-'), + + OTHER, +} + +extern { + type Location = usize; + type Error = LexicalError; + + enum lexer::Token { + PIPE => lexer::Token::PIPE, + + 
STAR => lexer::Token::STAR, + PLUS => lexer::Token::PLUS, + QUESTION => lexer::Token::QUESTION, + + LPAREN => lexer::Token::LPAREN, + RPAREN => lexer::Token::RPAREN, + + DOT => lexer::Token::DOT, + CARET => lexer::Token::CARET, + DOLLAR => lexer::Token::DOLLAR, + BACKSLASH => lexer::Token::BACKSLASH, + + LBRACKET => lexer::Token::LBRACKET, + RBRACKET => lexer::Token::RBRACKET, + DASH => lexer::Token::DASH, + + OTHER => lexer::Token::OTHER(_), + } +} diff --git a/openpgp/src/regex/lexer.rs b/openpgp/src/regex/lexer.rs new file mode 100644 index 00000000..18300d90 --- /dev/null +++ b/openpgp/src/regex/lexer.rs @@ -0,0 +1,222 @@ +use std::fmt; + +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum LexicalError { +} + +impl fmt::Display for LexicalError { + // This trait requires `fmt` with this exact signature. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", "{}") + } +} + +pub type Spanned<Token, Loc, LexicalError> + = Result<(Loc, Token, Loc), LexicalError>; + +// The type of the parser's input. +// +// The parser iterates over tuples consisting of the token's starting +// position, the token itself, and the token's ending position. +pub(crate) type LexerItem<Token, Loc, LexicalError> + = Spanned<Token, Loc, LexicalError>; + +/// The tokens of an OpenPGP regular expression. 
+#[derive(Debug, Clone, PartialEq)] +pub enum Token { + PIPE, + + STAR, + PLUS, + QUESTION, + + LPAREN, + RPAREN, + + DOT, + CARET, + DOLLAR, + BACKSLASH, + + LBRACKET, + RBRACKET, + DASH, + + OTHER(char), +} +assert_send_and_sync!(Token); + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(&format!("{:?}", self)[..]) + } +} + +impl From<Token> for String { + fn from(t: Token) -> String { + use self::Token::*; + match t { + PIPE => '|'.to_string(), + STAR => '*'.to_string(), + PLUS => '+'.to_string(), + QUESTION => '?'.to_string(), + LPAREN => '('.to_string(), + RPAREN => ')'.to_string(), + DOT => '.'.to_string(), + CARET => '^'.to_string(), + DOLLAR => '$'.to_string(), + BACKSLASH => '\\'.to_string(), + LBRACKET => '['.to_string(), + RBRACKET => ']'.to_string(), + DASH => '-'.to_string(), + OTHER(c) => c.to_string(), + } + } +} + +impl Token { + pub fn to_string(self) -> String { + self.into() + } + + pub fn to_char(&self) -> char { + use self::Token::*; + match self { + PIPE => '|', + STAR => '*', + PLUS => '+', + QUESTION => '?', + LPAREN => '(', + RPAREN => ')', + DOT => '.', + CARET => '^', + DOLLAR => '$', + BACKSLASH => '\\', + LBRACKET => '[', + RBRACKET => ']', + DASH => '-', + OTHER(c) => *c, + } + } +} + +pub(crate) struct Lexer<'input> { + offset: usize, + input: &'input str, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Lexer { offset: 0, input } + } +} + +impl<'input> Iterator for Lexer<'input> { + type Item = LexerItem<Token, usize, LexicalError>; + + fn next(&mut self) -> Option<Self::Item> { + use self::Token::*; + + tracer!(super::TRACE, "regex::Lexer::next"); + + // Returns the length of the first character in s in bytes. + // If s is empty, returns 0. 
+ fn char_bytes(s: &str) -> usize { + if let Some(c) = s.chars().next() { + c.len_utf8() + } else { + 0 + } + } + + let one = |input: &'input str| -> Option<Token> { + let c = input.chars().next()?; + Some(match c { + '|' => PIPE, + '*' => STAR, + '+' => PLUS, + '?' => QUESTION, + '(' => LPAREN, + ')' => RPAREN, + '.' => DOT, + '^' => CARET, + '$' => DOLLAR, + '\\' => BACKSLASH, + '[' => LBRACKET, + ']' => RBRACKET, + '-' => DASH, + _ => OTHER(c), + }) + }; + + let l = char_bytes(self.input); + let t = match one(self.input) { + Some(t) => t, + None => return None, + }; + + self.input = &self.input[l..]; + + let start = self.offset; + let end = start + l; + self.offset += l; + + t!("Returning token at offset {}: '{:?}'", + start, t); + + Some(Ok((start, t, end))) + } +} + +impl<'input> From<&'input str> for Lexer<'input> { + fn from(i: &'input str) -> Lexer<'input> { + Lexer::new(i) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn lexer() { + fn lex(s: &str, expected: &[Token]) { + let tokens: Vec<Token> = Lexer::new(s) + .map(|t| t.unwrap().1) + .collect(); + + assert_eq!(&tokens[..], expected, + "{}", s); + } + + use Token::*; + lex("|", &[ PIPE ]); + lex("*", &[ STAR ]); + lex("+", &[ PLUS ]); + lex("?", &[ QUESTION ]); + lex("(", &[ LPAREN ]); + lex(")", &[ RPAREN ]); + lex(".", &[ DOT ]); + lex("^", &[ CARET ]); + lex("$", &[ DOLLAR ]); + lex("\\", &[ BACKSLASH ]); + lex("[", &[ LBRACKET ]); + lex("]", &[ RBRACKET ]); + lex("-", &[ DASH ]); + lex("a", &[ OTHER('a') ]); + lex("aa", &[ OTHER('a'), OTHER('a') ]); + lex("foo", &[ OTHER('f'), OTHER('o'), OTHER('o') ]); + + lex("foo\\bar", &[ OTHER('f'), OTHER('o'), OTHER('o'), + BACKSLASH, + OTHER('b'), OTHER('a'), OTHER('r') ]); + lex("*?!", &[ STAR, QUESTION, OTHER('!') ]); + + // Multi-byte UTF-8. 
+ lex("ßℝ💣", &[ OTHER('ß'), OTHER('ℝ'), OTHER('💣'), ]); + lex("(ß|ℝ|💣", + &[ LPAREN, OTHER('ß'), PIPE, OTHER('ℝ'), PIPE, OTHER('💣') ]); + lex("東京", &[ OTHER('東'), OTHER('京') ]); + } +} diff --git a/openpgp/src/regex/mod.rs b/openpgp/src/regex/mod.rs new file mode 100644 index 00000000..626e5190 --- /dev/null +++ b/openpgp/src/regex/mod.rs @@ -0,0 +1,1943 @@ +//! OpenPGP regex parser. +//! +//! OpenPGP defines a [regular expression language]. It is used with +//! [trust signatures] to scope the trust that they extend. +//! +//! [regular expression language]: https://tools.ietf.org/html/rfc4880#section-8 +//! [trust signatures]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +//! +//! Compared with most regular expression languages, OpenPGP's is +//! quite simple. In particular, it only includes the following +//! features: +//! +//! - Alternations using `|`, +//! - Grouping using `(` and `)`, +//! - The `*`, `+`, and `?` glob operators, +//! - The `^`, and `$` anchors, +//! - The '.' operator, positive *non-empty* ranges +//! (e.g. `[a-zA-Z]`) and negative *non-empty* ranges (`[^@]`), and +//! - The backslash operator to escape special characters (except +//! in ranges). +//! +//! The regular expression engine defined in this module implements +//! that language with two differences. The first difference is that +//! the compiler only works on UTF-8 strings (not bytes). The second +//! difference is that ranges in character classes are between UTF-8 +//! characters, not just ASCII characters. +//! +//! # Data Structures +//! +//! This module defines two data structures. [`Regex`] encapsulates a +//! valid regular expression, and provides methods to check whether +//! the regular expression matches a string or a [`UserID`]. +//! [`RegexSet`] is similar, but encapsulates zero or more regular +//! expressions, which may or may not be valid. Its match methods +//! return `true` if there are no regular expressions, or, if there is +//! 
at least one regular expression, they return whether at least one +//! of the regular expressions matches it. `RegexSet`'s matcher +//! handles invalid regular expressions by considering them to be +//! regular expressions that don't match anything. These semantics +//! are consistent with a trust signature's scoping rules. Further, +//! strings that contain control characters never match. This +//! behavior can be overridden using [`Regex::disable_sanitizations`] +//! and [`RegexSet::disable_sanitizations`]. +//! +//! [`Regex`]: struct.Regex.html +//! [`UserID`]: ../packet/struct.UserID.html +//! [`RegexSet`]: struct.RegexSet.html +//! [`Regex::disable_sanitizations`]: struct.Regex.html#method.disable_sanitizations +//! [`RegexSet::disable_sanitizations`]: struct.RegexSet.html#method.disable_sanitizations +//! +//! # Scoped Trust Signatures +//! +//! To create a trust signature, you create a signature whose [type] +//! is either [GenericCertification], [PersonaCertification], +//! [CasualCertification], or [PositiveCertification], and add a +//! [Trust Signature] subpacket using, for instance, the +//! [`SignatureBuilder::set_trust_signature`] method. +//! +//! [type]: https://tools.ietf.org/html/rfc4880#section-5.2.1 +//! [GenericCertification]: ../types/enum.SignatureType.html#variant.GenericCertification +//! [PersonaCertification]: ../types/enum.SignatureType.html#variant.PersonaCertification +//! [CasualCertification]: ../types/enum.SignatureType.html#variant.CasualCertification +//! [PositiveCertification]: ../types/enum.SignatureType.html#variant.PositiveCertification +//! [Trust Signature]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +//! [`SignatureBuilder::set_trust_signature`]: ../packet/signature/struct.SignatureBuilder.html#method.set_trust_signature +//! +//! To scope a trust signature, you add a [Regular Expression +//! subpacket] to it using +//! [`SignatureBuilder::set_regular_expression`] or +//! 
[`SignatureBuilder::add_regular_expression`]. +//! +//! To extract any regular expressions, you can use +//! [`SignatureBuilder::regular_expressions`]. +//! +//! [Regular Expression subpacket]: https://tools.ietf.org/html/rfc4880#section-5.2.3.14 +//! [`SignatureBuilder::set_regular_expression`]: ../packet/signature/struct.SignatureBuilder.html#method.set_regular_expression +//! [`SignatureBuilder::add_regular_expression`]: ../packet/signature/struct.SignatureBuilder.html#method.add_regular_expression +//! [`SignatureBuilder::regular_expressions`]: ../packet/signature/struct.SignatureBuilder.html#method.regular_expressions +//! +//! # Caveat Emptor +//! +//! Note: GnuPG has [very limited regular expression support]. In +//! particular, it only recognizes regular expressions with the +//! following form: +//! +//! [very limited regular expression support]: https://dev.gnupg.org/source/gnupg/browse/master/g10/trustdb.c;15e065dee891eef9545556f210b4199107999869$1558 +//! +//! ```text +//! <[^>]+[@.]example\.com>$ +//! ``` +//! +//! Further, it escapes any operators between the `<[^>]+[@.]` and the +//! `>$` except `.` and `\`. Otherwise, GnuPG treats the regular +//! expression as a literal domain (e.g., `example.com`). +//! +//! Further, until [version 2.2.22] (released in August 2020), GnuPG +//! did not support regular expressions on Windows, and other systems +//! that don't include `regcomp`. On these systems, if a trust +//! signature included a regular expression, GnuPG conservatively +//! considered the whole trust signature to match nothing. +//! +//! [version 2.2.22]: https://dev.gnupg.org/T5030 +//! +//! # Examples +//! +//! A CA signs two certificates, one for Alice, who works at +//! `example.com`, and one for Bob, who is associated with `some.org`. +//! Carol then creates a trust signature for the CA, which she scopes +//! to `example.org` and `example.com`. We then confirm that Carol +//! can use the CA to authenticate Alice, but not Bob. +//! +//! 
``` +//! use sequoia_openpgp as openpgp; +//! use openpgp::cert::prelude::*; +//! use openpgp::packet::prelude::*; +//! use openpgp::policy::StandardPolicy; +//! use openpgp::regex::RegexSet; +//! use openpgp::types::SignatureType; +//! +//! # fn main() -> openpgp::Result<()> { +//! let p = &StandardPolicy::new(); +//! +//! let (ca, _) +//! = CertBuilder::general_purpose(None, Some("OpenPGP CA <openpgp-ca@example.com>")) +//! .generate()?; +//! let mut ca_signer = ca.primary_key().key().clone() +//! .parts_into_secret()?.into_keypair()?; +//! let ca_userid = ca.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! +//! // The CA certifies "Alice <alice@example.com>". +//! let (alice, _) +//! = CertBuilder::general_purpose(None, Some("Alice <alice@example.com>")) +//! .generate()?; +//! let alice_userid = alice.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! let alice_certification = SignatureBuilder::new(SignatureType::GenericCertification) +//! .sign_userid_binding( +//! &mut ca_signer, +//! alice.primary_key().component(), +//! alice_userid)?; +//! let alice = alice.insert_packets(alice_certification.clone())?; +//! # assert!(alice.clone().into_packets().any(|p| { +//! # match p { +//! # Packet::Signature(sig) => sig == alice_certification, +//! # _ => false, +//! # } +//! # })); +//! +//! // The CA certifies "Bob <bob@some.org>". +//! let (bob, _) +//! = CertBuilder::general_purpose(None, Some("Bob <bob@some.org>")) +//! .generate()?; +//! let bob_userid = bob.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID").userid(); +//! let bob_certification = SignatureBuilder::new(SignatureType::GenericCertification) +//! .sign_userid_binding( +//! &mut ca_signer, +//! bob.primary_key().component(), +//! bob_userid)?; +//! let bob = bob.insert_packets(bob_certification.clone())?; +//! # assert!(bob.clone().into_packets().any(|p| { +//! # match p { +//! 
# Packet::Signature(sig) => sig == bob_certification, +//! # _ => false, +//! # } +//! # })); +//! +//! +//! // Carol tsigns the CA's certificate. +//! let (carol, _) +//! = CertBuilder::general_purpose(None, Some("Carol <carol@another.net>")) +//! .generate()?; +//! let mut carol_signer = carol.primary_key().key().clone() +//! .parts_into_secret()?.into_keypair()?; +//! +//! let ca_tsig = SignatureBuilder::new(SignatureType::GenericCertification) +//! .set_trust_signature(2, 120)? +//! .set_regular_expression("<[^>]+[@.]example\\.org>$")? +//! .add_regular_expression("<[^>]+[@.]example\\.com>$")? +//! .sign_userid_binding( +//! &mut carol_signer, +//! ca.primary_key().component(), +//! ca_userid)?; +//! let ca = ca.insert_packets(ca_tsig.clone())?; +//! # assert!(ca.clone().into_packets().any(|p| { +//! # match p { +//! # Packet::Signature(sig) => sig == ca_tsig, +//! # _ => false, +//! # } +//! # })); +//! +//! +//! // Carol now tries to authenticate Alice and Bob's certificates +//! // using the CA as a trusted introducer based on `ca_tsig`. +//! let res = RegexSet::from_signature(&ca_tsig)?; +//! +//! // Carol should be able to authenticate Alice. +//! let alice_ua = alice.with_policy(p, None)? +//! .userids().nth(0).expect("Added a User ID"); +//! # assert!(res.matches_userid(&alice_ua)); +//! let mut authenticated = false; +//! for c in alice_ua.certifications() { +//! if c.get_issuers().into_iter().any(|h| h.aliases(ca.key_handle())) { +//! if c.clone().verify_userid_binding( +//! ca.primary_key().key(), +//! alice.primary_key().key(), +//! alice_ua.userid()).is_ok() +//! { +//! authenticated |= res.matches_userid(&alice_ua); +//! } +//! } +//! } +//! assert!(authenticated); +//! +//! // But, although the CA has certified Bob's key, Carol doesn't rely +//! // on it, because Bob's email address ("bob@some.org") is out of +//! // scope (some.org, not example.com). +//! let bob_ua = bob.with_policy(p, None)? +//! 
.userids().nth(0).expect("Added a User ID"); +//! # assert!(! res.matches_userid(&bob_ua)); +//! let mut have_certification = false; +//! let mut authenticated = false; +//! for c in bob_ua.certifications() { +//! if c.get_issuers().into_iter().any(|h| h.aliases(ca.key_handle())) { +//! if c.clone().verify_userid_binding( +//! ca.primary_key().key(), +//! bob.primary_key().key(), +//! bob_ua.userid()).is_ok() +//! { +//! have_certification = true; +//! authenticated |= res.matches_userid(&bob_ua); +//! } +//! } +//! } +//! assert!(have_certification); +//! assert!(! authenticated); +//! # Ok(()) } +//! ``` + +use std::borrow::Borrow; + +use lalrpop_util::ParseError; +use regex_syntax::hir::{self, Hir}; +use regex; + +use crate::Error; +use crate::Result; +use crate::packet::prelude::*; +use crate::types::SignatureType; + +pub(crate) mod lexer; +lalrpop_util::lalrpop_mod!( + #[allow(clippy::all)] + #[allow(unused_parens)] + grammar, + "/regex/grammar.rs" +); + +pub(crate) use self::lexer::Token; +pub(crate) use self::lexer::{Lexer, LexicalError}; + +const TRACE: bool = false; + +// Convert tokens into strings. +// +// Unfortunately, we can't implement From, because we don't define +// ParseError in this crate. +pub(crate) fn parse_error_downcast(e: ParseError<usize, Token, LexicalError>) + -> ParseError<usize, String, LexicalError> +{ + match e { + ParseError::UnrecognizedToken { + token: (start, t, end), + expected, + } => ParseError::UnrecognizedToken { + token: (start, t.into(), end), + expected, + }, + + ParseError::ExtraToken { + token: (start, t, end), + } => ParseError::ExtraToken { + token: (start, t.into(), end), + }, + + ParseError::InvalidToken { location } + => ParseError::InvalidToken { location }, + + ParseError::User { error } + => ParseError::User { error }, + + ParseError::UnrecognizedEOF { location, expected } + => ParseError::UnrecognizedEOF { location, expected }, + } +} + +// Used by grammar.lalrpop to generate a regex class (e.g. '[a-ce]'). 
+fn generate_class(caret: bool, chars: impl Iterator<Item=char>) -> Hir +{ + tracer!(TRACE, "generate_class"); + + // Dealing with ranges is a bit tricky. We need to examine three + // tokens. If the middle one is a dash, it's a range. + + let chars: Vec<Option<char>> = chars + // Pad it out so what we can use windows to get three + // characters at a time, and be sure to process all + // characters. + .map(|c| Some(c)) + .chain(std::iter::once(None)) + .chain(std::iter::once(None)) + .collect(); + if chars.len() == 2 { + // The grammar doesn't allow an empty class. + unreachable!(); + } else { + let r = chars + .windows(3) + .scan(0, + |skip: &mut usize, x: &[Option<char>]| + // Scan stops if the result is None. + // filter_map keeps only those elements that + // are Some. + -> Option<Option<hir::ClassUnicodeRange>> + { + if *skip > 0 { + *skip -= 1; + t!("Skipping: {:?} (skip now: {})", x, skip); + Some(None) + } else { + match (x[0], x[1], x[2]) { + (Some(a), Some('-'), Some(c)) => { + // We've got a real range. + *skip = 2; + t!("range for '{}-{}'", a, c); + Some(Some(hir::ClassUnicodeRange::new(a, c))) + } + (Some(a), _, _) => { + t!("range for '{}'", a); + Some(Some(hir::ClassUnicodeRange::new(a, a))) + } + (None, _, _) => unreachable!(), + } + } + }) + .filter_map(|r| r); + let mut class = hir::Class::Unicode(hir::ClassUnicode::new(r)); + if caret { + class.negate(); + } + Hir::class(class) + } +} + +/// A compiled OpenPGP regular expression for matching UTF-8 encoded +/// strings. +/// +/// A `Regex` contains a regular expression compiled according to the +/// rules defined in [Section 8 of RFC 4880] modulo two differences. +/// First, the compiler only works on UTF-8 strings (not bytes). +/// Second, ranges in character classes are between UTF-8 characters, +/// not just ASCII characters. Further, by default, strings that +/// don't pass a sanity check (in particular, include Unicode control +/// characters) never match. 
This behavior can be customized using +/// [`Regex::disable_sanitizations`]. +/// +/// [Section 8 of RFC 4880]: https://tools.ietf.org/html/rfc4880#section-8 +/// [trust signatures]: https://tools.ietf.org/html/rfc4880#section-5.2.3.13 +/// [`Regex::disable_sanitizations`]: #method.disable_sanitizations +/// +/// Regular expressions are used to scope the trust that [trust +/// signatures] extend. +/// +/// When working with trust signatures, you'll usually want to use the +/// [`RegexSet`] data structure, which already implements the correct +/// semantics. +/// +/// [`RegexSet`]: struct.RegexSet.html +/// +/// See the [module-level documentation] for more details. +/// +/// [module-level documentation]: index.html +#[derive(Clone, Debug)] +pub struct Regex { + regex: regex::Regex, + disable_sanitizations: bool, +} + +impl Regex { + /// Parses and compiles the regular expression. + /// + /// By default, strings that don't pass a sanity check (in + /// particular, include Unicode control characters) never match. + /// This behavior can be customized using + /// [`Regex::disable_sanitizations |