diff options
author | Neal H. Walfield <neal@pep.foundation> | 2019-04-11 23:31:18 +0200 |
---|---|---|
committer | Neal H. Walfield <neal@pep.foundation> | 2019-04-12 11:39:37 +0200 |
commit | 5c7e4274102748b287e81ffb195a918f442e2e13 (patch) | |
tree | 60487c5edc7af1e1fc2a1e62db03b760443549c9 | |
parent | e3b30cebc5fabcd9abc1b4bd109c4816e06a2f01 (diff) |
New crate sequoia-rfc2822.
- An RFC 2822 mail name-addr parser.
-rw-r--r-- | Cargo.toml | 1 | ||||
-rw-r--r-- | rfc2822/Cargo.toml | 27 | ||||
-rw-r--r-- | rfc2822/README.md | 44 | ||||
-rw-r--r-- | rfc2822/build.rs | 11 | ||||
-rw-r--r-- | rfc2822/src/component.rs | 212 | ||||
-rw-r--r-- | rfc2822/src/grammar.lalrpop | 698 | ||||
-rw-r--r-- | rfc2822/src/grammar.rs | 2 | ||||
-rw-r--r-- | rfc2822/src/lexer.rs | 212 | ||||
-rw-r--r-- | rfc2822/src/lib.rs | 898 | ||||
-rw-r--r-- | rfc2822/src/macros.rs | 19 | ||||
-rw-r--r-- | rfc2822/src/strings.rs | 57 | ||||
-rw-r--r-- | rfc2822/src/trace.rs | 63 |
12 files changed, 2244 insertions, 0 deletions
@@ -23,6 +23,7 @@ maintenance = { status = "actively-developed" } [dependencies] buffered-reader = { path = "buffered-reader", version = "0.5" } +sequoia-rfc2822 = { path = "rfc2822", version = "0.1" } sequoia-openpgp = { path = "openpgp", version = "0.5" } sequoia-openpgp-ffi = { path = "openpgp-ffi", version = "0.5" } sequoia-core = { path = "core", version = "0.5" } diff --git a/rfc2822/Cargo.toml b/rfc2822/Cargo.toml new file mode 100644 index 00000000..82888ba8 --- /dev/null +++ b/rfc2822/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "sequoia-rfc2822" +description = "An RFC 2822 name-addr parser" +version = "0.1.0" +authors = [ + "Justus Winter <justus@sequoia-pgp.org>", + "Neal H. Walfield <neal@sequoia-pgp.org>", +] +build = "build.rs" +documentation = "https://docs.sequoia-pgp.org/0.1.0/rfc2822" +homepage = "https://sequoia-pgp.org/" +repository = "https://gitlab.com/sequoia-pgp/sequoia" +readme = "README.md" +license = "GPL-3.0" +keywords = ["rfc2822", "rfc822", "name-addr", "email"] +categories = ["email"] + +[badges] +gitlab = { repository = "sequoia-pgp/sequoia" } +maintenance = { status = "actively-developed" } + +[dependencies] +failure = "0.1.2" +lalrpop-util = "0.16" + +[build-dependencies] +lalrpop = "0.16" diff --git a/rfc2822/README.md b/rfc2822/README.md new file mode 100644 index 00000000..db6140a8 --- /dev/null +++ b/rfc2822/README.md @@ -0,0 +1,44 @@ +An [RFC 2822] parser. + + [RFC 2822]: https://tools.ietf.org/html/rfc2822 + +Currently, this crate only recognizes the [RFC 2822] [name-addr] and +[addr-spec] productions, i.e., things of the form: + + [name-addr]: https://tools.ietf.org/html/rfc2822#section-3.4 + [addr-spec]: https://tools.ietf.org/html/rfc2822#section-3.4.1 + +``` +Name (Comment) <email@example.org> +``` + +and + +``` +email@example.org +``` + +Although the above appear simple to parse, [RFC 2822]'s whitespace and +comment rules are rather complex. This crate implements the whole +grammar. 
+ +As an extension, in addition to ASCII, we also recognize all UTF-8 +text. + +Further, we also allow dots in the name-addr Name. That is: + +``` +Professor Pippy P. Poopypants <pippy@jerome-horwitz.k12.oh.us> +``` + +is recognized. But [RFC 2822] strictly requires that the name be +quoted: + +``` +"Professor Pippy P. Poopypants" <pippy@jerome-horwitz.k12.oh.us> +``` + + +This crate does not (yet) implement the new [RFC 5322]. + + [RFC 5322]: https://tools.ietf.org/html/rfc5322 diff --git a/rfc2822/build.rs b/rfc2822/build.rs new file mode 100644 index 00000000..24051d93 --- /dev/null +++ b/rfc2822/build.rs @@ -0,0 +1,11 @@ +extern crate lalrpop; + +// Rerun if any of these files change: +#[allow(dead_code)] +const SOURCE: [ &'static str; 1 ] + = [ include_str!("src/grammar.lalrpop"), + ]; + +fn main() { + lalrpop::process_root().unwrap(); +} diff --git a/rfc2822/src/component.rs b/rfc2822/src/component.rs new file mode 100644 index 00000000..808f497f --- /dev/null +++ b/rfc2822/src/component.rs @@ -0,0 +1,212 @@ +/// A UserID value typically looks something like: +/// +/// Text (Comment) <name@example.org> +/// +/// That is, it contains three components: a text string, a comment, +/// and an email address. +/// +/// The actual format allows for lots of interleaved comments and +/// multiple texts. Thus, when parsing we build up a vector of +/// Components in the order that they were encountered. +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum Component { + // A text string. + Text(String), + // A comment. + // + // The outermost parens are removed. That is, if the comment is: + // "(foo(bar)bam)", then "foo(bar)bam" is stored. + Comment(String), + // An email address. + Address(String), + // White space. 
+ WS, +} + +impl From<Component> for Vec<Component> { + fn from(c: Component) -> Self { + vec![c] + } +} + +impl From<Component> for Option<Vec<Component>> { + fn from(c: Component) -> Self { + Some(vec![c]) + } +} + +// Collect the `Component`s to the vector `v`. +// +// The Components can be anything that can be turned into an +// Option<Vec<Component>>. This currently includes `Component`, and +// `Vec<Component>`. +macro_rules! components_concat_into { + ( $v:expr, $c:expr ) => {{ + let v: &mut Vec<Component> = $v; + let c : Option<Vec<Component>> = $c.into(); + if let Some(mut c) = c { + // If v ends in a WS and c starts with a WS, then collapse + // them. + if destructures_to!(Some(Component::WS) = v.last()) + && destructures_to!(Some(Component::WS) = c.first()) + { + v.pop(); + } + v.append(&mut c); + } + }}; + ( $v:expr, $car:expr, $($cdr:expr),* ) => {{ + let v: &mut Vec<Component> = $v; + let car : Option<Vec<Component>> = $car.into(); + if let Some(mut car) = car { + if destructures_to!(Some(Component::WS) = v.last()) + && destructures_to!(Some(Component::WS) = car.first()) + { + v.pop(); + } + v.append(&mut car) + } + components_concat_into!(v, $($cdr),*); + }}; +} + +// Collect the `Component`s into a vector `v`. +// +// The Components can be anything that can be turned into an +// Option<Vec<Component>>. This currently includes `Component`, and +// `Vec<Component>`. +macro_rules! components_concat { + ( $( $args:expr ),*) => {{ + let mut v : Vec<Component> = Vec::new(); + components_concat_into!(&mut v, $($args),*); + v + }}; +} + +// Kills leading (`left`) and/or trailing (`right`) whitespace +// (`Component::WS`). 
+pub(crate) fn components_kill_ws(v: Option<Vec<Component>>, + left: bool, right: bool) + -> Vec<Component> +{ + tracer!(::TRACE, "components_kill_ws"); + t!("v: {:?}, left: {}, right: {}", v, left, right); + + let v = if let Some(mut v) = v { + if v.len() > 0 && right { + let mut kill = false; + if let Component::WS = v[v.len() - 1] { + kill = true; + } + if kill { + v.pop(); + } + } + if v.len() > 0 && left { + let mut kill = false; + if let Component::WS = v[0] { + kill = true; + } + if kill { + v.remove(0); + } + } + v + } else { + vec![] + }; + t!("=> {:?}", v); + v +} + +// Merge the components in the vector. +pub(crate) fn components_merge(components: Vec<Component>) + -> Vec<Component> +{ + tracer!(::TRACE, "components_merge", 0); + t!("{:?}", components); + + let mut iter = components.into_iter(); + let mut components = vec![]; + + let mut left = if let Some(left) = iter.next() { + left + } else { + return components; + }; + let mut middleo = iter.next(); + let mut righto = iter.next(); + + while let Some(mut middle) = middleo { + enum Kill { + None, + Middle, + MiddleRight, + Right, + }; + let mut kill = Kill::None; + + match (&mut left, &mut middle, righto.as_mut()) { + (Component::Text(ref mut l), + Component::Text(ref mut m), + _) => { + t!("Merging '{}' and '{}'", l, m); + l.push_str(m); + kill = Kill::Middle; + }, + + (Component::Text(ref mut l), + Component::WS, + Some(Component::Text(ref mut r))) => { + t!("Merging '{}', WS and '{}'", l, r); + l.push(' '); + l.push_str(r); + kill = Kill::MiddleRight; + }, + (_, + Component::WS, + Some(Component::WS)) => { + // This can happen when we have a local-part of the + // following form: + // + // (comment) foo (comment) + // + // The local-part is produced by the dot_atom_left + // production, which puts the dot_atom_text (foo) to + // the right: + // + // COMMENT WS WS COMMENT TEXT + kill = Kill::Right; + }, + _ => (), + } + + match kill { + Kill::Middle => { + middleo = righto; + righto = iter.next(); + 
} + Kill::MiddleRight => { + middleo = iter.next(); + righto = iter.next(); + } + Kill::Right => { + middleo = Some(middle); + righto = iter.next(); + } + Kill::None => { + components.push(left); + left = middle; + middleo = righto; + righto = iter.next(); + } + } + } + + components.push(left); + if let Some(middle) = middleo { + components.push(middle); + } + + components +} diff --git a/rfc2822/src/grammar.lalrpop b/rfc2822/src/grammar.lalrpop new file mode 100644 index 00000000..f20735ed --- /dev/null +++ b/rfc2822/src/grammar.lalrpop @@ -0,0 +1,698 @@ +// -*- mode: Rust; -*- +use Error; + +use strings::{ + strings_flatten_into, + strings_flatten2, + strings_flatten, +}; +use component::{ + Component, + components_kill_ws, + components_merge, +}; +use lexer; +use lexer::Token; + +grammar<'input>; + +// RFC 4880 says: +// +// 5.11. User ID Packet (Tag 13) +// +// A User ID packet consists of UTF-8 text that is intended to represent +// the name and email address of the key holder. By convention, it +// includes an RFC 2822 [RFC2822] mail name-addr, but there are no +// restrictions on its content. +// +// At least today, the convention is more along the lines of RFC +// 2822's mailbox instead of its name-addr. The only different is +// that the mailbox production allows for a bare email address i.e., +// one without angle brackets whereas the name-addr production +// requires angle brackets. +// +// A further convention is an ssh-host-uri production: +// +// ssh-host-uri = "ssh://" dns-hostname + + +CRLF: () = { + CR LF +} + +// text = %d1-9 / ; Characters excluding CR and LF +// %d11 / +// %d12 / +// %d14-127 / +// obs-text +text : Token<'input> = { + WSP, + NO_WS_CTL, + specials, + OTHER, +} + +// specials = "(" / ")" / ; Special characters used in +// "<" / ">" / ; other parts of the syntax +// "[" / "]" / +// ":" / ";" / +// "@" / "\" / +// "," / "." 
/ +// DQUOTE +specials : Token<'input> = { + LPAREN, + RPAREN, + LANGLE, + RANGLE, + LBRACKET, + RBRACKET, + COLON, + SEMICOLON, + AT, + BACKSLASH, + COMMA, + DOT, + DQUOTE, +}; + + +// 3.2.2. Quoted characters + +// quoted-pair = ("\" text) / obs-qp +// +// In RFC 2822, text is a single character and the BACKSLAH is +// followed by exactly one character. As an optimization, our lexer +// groups runs of 'text' characters into a single token, Token::OTHER. +// Since a quoted pair can always be followed by a run of OTHER +// characters, the semantics are preserved. +quoted_pair : Token<'input> = { + BACKSLASH <text>, +} + +// 3.2.3. Folding white space and comments + +// Folding white space +// +// FWS = ([*WSP CRLF] 1*WSP) / ; +// obs-FWS +// +// Runs of FWS, comment or CFWS that occur between lexical tokens in +// a structured field header are semantically interpreted as a +// single space character. +#[inline] +FWS : Component = { + (WSP* CRLF)? WSP+ => Component::WS, +} + +// ctext = NO-WS-CTL / ; Non white space controls +// %d33-39 / ; The rest of the US-ASCII +// %d42-91 / ; characters not including "(", +// %d93-126 ; ")", or "\" +ctext : Token<'input> = { + NO_WS_CTL, + + // LPAREN, + // RPAREN, + LANGLE, + RANGLE, + LBRACKET, + RBRACKET, + COLON, + SEMICOLON, + AT, + // BACKSLASH, + COMMA, + DOT, + DQUOTE, + + OTHER, +} + +// ccontent = ctext / quoted-pair / comment +ccontent : String = { + <c:ctext> => c.to_string(), + <c:quoted_pair> => c.to_string(), + <c:comment> => { + let mut s = String::new(); + s.push('('); + if let Component::Comment(comment) = c { + s.push_str(&comment[..]); + } else { + panic!("Expected a Component::Comment"); + } + s.push(')'); + s + }, +} + +// comment = "(" *([FWS] ccontent) [FWS] ")" +pub(crate) Comment : Component = { + <comment> +} + +comment : Component = { + LPAREN <c:(<FWS?> <ccontent>)*> <d:FWS?> RPAREN => { + let mut s = strings_flatten2( + c.into_iter().map(|(fws, c)| (fws.is_some(), c)), " "); + + if d.is_some() { + 
s.push(' '); + } + + Component::Comment(s) + }, +} + +// CFWS = *([FWS] comment) (([FWS] comment) / FWS) +pub(crate) Cfws : Vec<Component> = { + <c:CFWS> => { + components_merge(c) + } +} + +CFWS : Vec<Component> = { + // <c:(FWS? <comment>)*> FWS? <d:comment> => ..., + // <c:(FWS? <comment>)*> FWS => ..., + + // The following is equivalent to the above, but the actions are a + // bit simpler. + <c:(<FWS?> <comment>)+> => { + let v : Vec<Component> = c.into_iter() + .map(|(w, c)| { + if let Some(w) = w { + vec![w, c] + } else { + vec![c] + } + }) + .flatten() + .collect(); + v + }, + <c:(<FWS?> <comment>)*> <w2:FWS> => { + let mut v : Vec<Component> = c.into_iter() + .map(|(w, c)| { + if let Some(w) = w { + vec![w, c] + } else { + vec![c] + } + }) + .flatten() + .collect(); + v.push(w2); + v + } +} + +// 3.2.4. Atom + +// atext = ALPHA / DIGIT / ; Any character except controls, +// "!" / "#" / ; SP, and specials. +// "$" / "%" / ; Used for atoms +// "&" / "'" / +// "*" / "+" / +// "-" / "/" / +// "=" / "?" / +// "^" / "_" / +// "`" / "{" / +// "|" / "}" / +// "~" +// +// As an optimization the lexer collects atexts, i.e., Token::OTHER is +// 1*atext. +atext_plus : String = { + <a:OTHER> => { + let a = a.to_string(); + assert!(a.len() > 0); + a + }, +} + + +// The display-name in a name-addr production often includes a ., but +// is not quoted. The RFC even recommends supporting this variation. +other_or_dot : String = { + <a:OTHER> => a.to_string(), + <d:DOT> => d.to_string(), +} + +atext_dot_plus : String = { + <a:other_or_dot+> => strings_flatten(a.into_iter(), ""), +} + +// atom = [CFWS] 1*atext [CFWS] +// +// "Both atom and dot-atom are interpreted as a single unit, comprised +// of the string of characters that make it up. 
Semantically, the +// optional comments and FWS surrounding the rest of the characters +// are not part of the atom" +pub(crate) Atom : Vec<Component> = { + <a:atom> => components_merge(a), +} + +atom : Vec<Component> = { + <c1:CFWS?> <a:atext_dot_plus> <c2:CFWS?> => + components_concat!( + components_kill_ws(c1, false, true), + Component::Text(a), + components_kill_ws(c2, true, false)), +} + +// See the phrase production for this variant of the 'atom' production +// exists, and why the 'CFWS?'es are not included. +atom_prime : Component = { + <a:atext_dot_plus> => Component::Text(a), +} + +// dot-atom = [CFWS] dot-atom-text [CFWS] +// +// "Both atom and dot-atom are interpreted as a single unit, comprised +// of the string of characters that make it up. Semantically, the +// optional comments and FWS surrounding the rest of the characters +// are not part of the atom" +pub(crate) DotAtom : Vec<Component> = { + <d:dot_atom> => components_merge(d), +} + +dot_atom : Vec<Component> = { + <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => + components_concat!( + components_kill_ws(c1, false, true), + a, + components_kill_ws(c2, true, false)), +} + +// A variant of dot_atom that places all comments to the left. +dot_atom_left : Vec<Component> = { + <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => + components_concat!( + components_kill_ws( + Some(components_concat!(c1, c2)), false, true), + a), +} + +// A variant of dot_atom that places all comments to the right. +dot_atom_right : Vec<Component> = { + <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => + components_concat!( + a, + components_kill_ws( + Some(components_concat!(c1, c2)), true, false)), +} + +// dot-atom-text = 1*atext *("." 1*atext) +dot_atom_text : Component = { + <v:atext_plus> <w:(DOT <atext_plus>)*> => { + let mut v = v; + if w.len() > 0 { + v.push('.'); + } + Component::Text( + strings_flatten_into(v, w.into_iter(), ".")) + }, +} + +// 3.2.5. 
Quoted strings + +// qtext = NO-WS-CTL / ; Non white space controls +// %d33 / ; The rest of the US-ASCII +// %d35-91 / ; characters not including "\" +// %d93-126 ; or the quote character +qtext : Token<'input> = { + NO_WS_CTL, + + LPAREN, + RPAREN, + LANGLE, + RANGLE, + LBRACKET, + RBRACKET, + COLON, + SEMICOLON, + AT, + // BACKSLASH, + COMMA, + DOT, + // DQUOTE, + + OTHER, +} + +// qcontent = qtext / quoted-pair +qcontent : Component = { + <c:qtext> => Component::Text(c.to_string()), + <c:quoted_pair> => Component::Text(c.to_string()), +} + +// quoted-string = [CFWS] +// DQUOTE *([FWS] qcontent) [FWS] DQUOTE +// [CFWS] +pub(crate) QuotedString : Vec<Component> = { + <q:quoted_string> => components_merge(q), +} + +quoted_string : Vec<Component> = { + <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => { + // Make sure any leading and trailing whitespace *inside* the + // quotes is turned into Component::Text. + components_concat!( + // c1 is an Option<Vec<Component>>. + c1, + // c is a Vec<(Option<Component>, Component)>. Turn it + // into a Vec<Component>. + c.into_iter() + .map(|(fws, c)| { + if let Some(_) = fws { + vec![Component::Text(" ".to_string()), c] + } else { + vec![c] + } + }) + .flatten() + .collect::<Vec<Component>>(), + // d is an Option<Component>, turn it into a + // Option<Vec<Component>>. + d.map(|_| vec![Component::Text(" ".to_string())]), + c2) + }, +} + +// Variant of quoted_string that moves all comments to the left. +quoted_string_left : Vec<Component> = { + <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => { + // Make sure any leading and trailing whitespace *inside* the + // quotes is turned into Component::Text. + components_concat!( + // c1 is an Option<Vec<Component>>. + components_kill_ws(Some(components_concat!(c1, c2)), false, true), + // c is a Vec<(Option<Component>, Component)>. Turn it + // into a Vec<Component>. 
+ c.into_iter() + .map(|(fws, c)| { + if let Some(_) = fws { + vec![Component::Text(" ".to_string()), c] + } else { + vec![c] + } + }) + .flatten() + .collect::<Vec<Component>>(), + // d is an Option<Component>, turn it into a + // Option<Vec<Component>>. + d.map(|_| vec![Component::Text(" ".to_string())])) + }, +} + +// See the phrase production for this variant of the 'quoted_string' +// production exists, and why the 'CFWS?'es are not included. +quoted_string_prime : Vec<Component> = { + DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE => { + // Make sure any leading and trailing whitespace *inside* the + // quotes is turned into Component::Text. + components_concat!( + // c is a Vec<(Option<Component>, Component)>. Turn it + // into a Vec<Component>. + c.into_iter() + .map(|(fws, c)| { + if let Some(_) = fws { + vec![Component::Text(" ".to_string()), c] + } else { + vec![c] + } + }) + .flatten() + .collect::<Vec<Component>>(), + // d is an Option<Component>, turn it into a + // Option<Vec<Component>>. + d.map(|_| vec![Component::Text(" ".to_string())])) + }, +} + +// 3.2.6. Miscellaneous tokens + +// word = atom / quoted-string +pub(crate) Word : Vec<Component> = { + <w:word> => components_merge(w), +} + +word : Vec<Component> = { + atom, + quoted_string, +} + +// phrase = 1*word / obs-phrase + +pub(crate) Phrase : Vec<Component> = { + <p:phrase> => components_merge(p), +} + +// phrase : String = { +// <v:word+> => strings_flatten(v, ""), +// } +// +// Note: consider the following parse tree: +// +// phrase +// / \ +// word word +// / \ +// atom atom +// / | \ / | \ +// CFWS+? atext+ CFWS? CFWS+? atext+ CFWS? +// +// This has an ambiguity! Does a CFWS immediate after the first +// atext+ belong to the first atom or the second? And, if there are +// no CFWSes, how do we split the atext? 
+// +// To avoid these problems, we modify the grammar as presented in the +// RFC as follows: +atom_or_quoted_string : Vec<Component> = { + <a:atom_prime> <r:cfws_or_quoted_string?> => { + // Note: it's not possible to have multiple atoms in a row. + // The following: + // + // foo bar + // + // is 'atom_prime CFWS atom_prime'. + + components_concat!(a, r) |