diff options
Diffstat (limited to 'rfc2822')
-rw-r--r-- | rfc2822/README.md | 67 | ||||
-rw-r--r-- | rfc2822/build.rs | 5 | ||||
-rw-r--r-- | rfc2822/src/component.rs | 206 | ||||
-rw-r--r-- | rfc2822/src/grammar.lalrpop | 891 | ||||
-rw-r--r-- | rfc2822/src/grammar.rs | 2 | ||||
-rw-r--r-- | rfc2822/src/lexer.rs | 188 | ||||
-rw-r--r-- | rfc2822/src/lib.rs | 1691 | ||||
-rw-r--r-- | rfc2822/src/macros.rs | 19 | ||||
-rw-r--r-- | rfc2822/src/roundtrip.rs | 1094 | ||||
-rw-r--r-- | rfc2822/src/strings.rs | 57 | ||||
-rw-r--r-- | rfc2822/src/trace.rs | 63 |
11 files changed, 0 insertions, 4283 deletions
diff --git a/rfc2822/README.md b/rfc2822/README.md deleted file mode 100644 index 9467daee..00000000 --- a/rfc2822/README.md +++ /dev/null @@ -1,67 +0,0 @@ -An [RFC 2822] parser. - - [RFC 2822]: https://tools.ietf.org/html/rfc2822 - -Currently, this crate only recognizes the [RFC 2822] [name-addr] and -[addr-spec] productions, i.e., things of the form: - - [name-addr]: https://tools.ietf.org/html/rfc2822#section-3.4 - [addr-spec]: https://tools.ietf.org/html/rfc2822#section-3.4.1 - -``` -Name (Comment) <email@example.org> -``` - -and - -``` -email@example.org -``` - -Although the above appear simple to parse, [RFC 2822]'s whitespace and -comment rules are rather complex. This crate implements the whole -grammar. - -As an extension, in addition to ASCII, we also recognize all UTF-8 -code points. NUL, controls, and specials retain their meaning as -defined in RFC 2822. Other UTF-8 code points are considered to be -text like `a`. - -Further, we also allow dots (`.`) and at symbols (`@`) in the `atom` -production. That is, the `atom` production is extended from: - -``` -atom = [CFWS] 1*atext [CFWS] -``` - -to: - -``` -atom = [CFWS] 1*atext_or_dot_or_at [CFWS] -atext_or_dot_or_at = atext | DOT | AT -``` - -And, as such: - -``` -Professor Pippy P. Poopypants <pippy@jerome-horwitz.k12.oh.us> -``` - -is recognized as a `name-addr` even though [RFC 2822] strictly -requires that the `display-name` be quoted like: - -``` -"Professor Pippy P. Poopypants" <pippy@jerome-horwitz.k12.oh.us> -``` - -Likewise, - -``` -foo@bar.com <foo@bar.com> -``` - -is recognized as a `name-addr` even though the `@` should be quoted. - -This crate does not (yet) implement the new [RFC 5322]. - - [RFC 5322]: https://tools.ietf.org/html/rfc5322 diff --git a/rfc2822/build.rs b/rfc2822/build.rs deleted file mode 100644 index 23c7d3f8..00000000 --- a/rfc2822/build.rs +++ /dev/null @@ -1,5 +0,0 @@ -extern crate lalrpop; - -fn main() { - lalrpop::process_root().unwrap(); -} diff --git a/rfc2822/src/component.rs b/rfc2822/src/component.rs deleted file mode 100644 index 37ea4cbd..00000000 --- a/rfc2822/src/component.rs +++ /dev/null @@ -1,206 +0,0 @@ -use lalrpop_util::ParseError; -use crate::lexer::LexicalError; - -/// A UserID value typically looks something like: -/// -/// Text (Comment) <name@example.org> -/// -/// That is, it contains three components: a text string, a comment, -/// and an email address. -/// -/// The actual format allows for lots of interleaved comments and -/// multiple texts. Thus, when parsing we build up a vector of -/// Components in the order that they were encountered. -#[derive(Debug, Clone)] -pub enum Component { - // A text string. - Text(String), - // A comment. - // - // The outermost parens are removed. That is, if the comment is: - // "(foo(bar)bam)", then "foo(bar)bam" is stored. - Comment(String), - // An email address. - Address(String), - - // The text found where an address was expected. - InvalidAddress(ParseError<usize, String, LexicalError>, String), - - // White space. - WS, -} - -// When comparing two `Component::InvalidAddress`es, we consider them -// equal if the values match; we don't compare the saved errors. This -// is because the parser will always generate the same error for the -// same input. And, the PartialEq implementation is only used to -// support comparing two `Component`s in assertions. -impl PartialEq for Component { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Component::Text(a), Component::Text(b)) => a == b, - (Component::Comment(a), Component::Comment(b)) => a == b, - (Component::Address(a), Component::Address(b)) => a == b, - (Component::InvalidAddress(_, a), Component::InvalidAddress(_, b)) => - a == b, - (Component::WS, Component::WS) => true, - (_, _) => false, - } - } -} - -impl Eq for Component { -} - -impl From<Component> for Vec<Component> { - fn from(c: Component) -> Self { - vec![c] - } -} - -impl From<Component> for Option<Vec<Component>> { - fn from(c: Component) -> Self { - Some(vec![c]) - } -} - -// Collect the `Component`s to the vector `v`. -// -// The Components can be anything that can be turned into an -// Option<Vec<Component>>. This currently includes `Component`, and -// `Vec<Component>`. -macro_rules! components_concat_into { - ( $v:expr, $c:expr ) => {{ - let v: &mut Vec<Component> = $v; - let c : Option<Vec<Component>> = $c.into(); - if let Some(mut c) = c { - // If v ends in a WS and c starts with a WS, then collapse - // them. - if destructures_to!(Some(Component::WS) = v.last()) - && destructures_to!(Some(Component::WS) = c.first()) - { - v.pop(); - } - v.append(&mut c); - } - }}; - ( $v:expr, $car:expr, $($cdr:expr),* ) => {{ - let v: &mut Vec<Component> = $v; - let car : Option<Vec<Component>> = $car.into(); - if let Some(mut car) = car { - if destructures_to!(Some(Component::WS) = v.last()) - && destructures_to!(Some(Component::WS) = car.first()) - { - v.pop(); - } - v.append(&mut car) - } - components_concat_into!(v, $($cdr),*); - }}; -} - -// Collect the `Component`s into a vector `v`. -// -// The Components can be anything that can be turned into an -// Option<Vec<Component>>. This currently includes `Component`, and -// `Vec<Component>`. -macro_rules! components_concat { - ( $( $args:expr ),*) => {{ - let mut v : Vec<Component> = Vec::new(); - components_concat_into!(&mut v, $($args),*); - v - }}; -} - -// Merge the components in the vector. -pub(crate) fn components_merge(components: Vec<Component>) - -> Vec<Component> -{ - tracer!(crate::TRACE, "components_merge", 0); - t!("{:?}", components); - - let mut iter = components.into_iter(); - let mut components = vec![]; - - let mut left = if let Some(left) = iter.next() { - left - } else { - return components; - }; - let mut middleo = iter.next(); - let mut righto = iter.next(); - - while let Some(mut middle) = middleo { - enum Kill { - None, - Middle, - MiddleRight, - }; - let mut kill = Kill::None; - - match (&mut left, &mut middle, righto.as_mut()) { - (Component::Text(ref mut l), - Component::Text(ref mut m), - _) => { - t!("Merging '{}' and '{}'", l, m); - l.push_str(m); - kill = Kill::Middle; - }, - - (Component::Text(ref mut l), - Component::WS, - Some(Component::Text(ref mut r))) => { - t!("Merging '{}', WS and '{}'", l, r); - l.push(' '); - l.push_str(r); - kill = Kill::MiddleRight; - }, - (Component::WS, - Component::WS, - _) => { - // This can happen when we have a local-part of the - // following form: - // - // (comment) foo (comment) - // - // The local-part is produced by the dot_atom_left - // production, which puts the dot_atom_text (foo) to - // the right: - // - // COMMENT WS WS COMMENT TEXT - // - // It is also possible to have: - // - // WS WS COMMENT TEXT - // - // as CFWS can expand to just a WS. - kill = Kill::Middle; - }, - _ => (), - } - - match kill { - Kill::Middle => { - middleo = righto; - righto = iter.next(); - } - Kill::MiddleRight => { - middleo = iter.next(); - righto = iter.next(); - } - Kill::None => { - components.push(left); - left = middle; - middleo = righto; - righto = iter.next(); - } - } - } - - components.push(left); - if let Some(middle) = middleo { - components.push(middle); - } - - components -} diff --git a/rfc2822/src/grammar.lalrpop b/rfc2822/src/grammar.lalrpop deleted file mode 100644 index 1b1d2b94..00000000 --- a/rfc2822/src/grammar.lalrpop +++ /dev/null @@ -1,891 +0,0 @@ -// -*- mode: Rust; -*- -use crate::parse_error_downcast; -use crate::strings::{ - strings_flatten_into, - strings_flatten2, - strings_flatten, -}; -use crate::component::{ - Component, - components_merge, -}; -use crate::lexer; -use crate::lexer::{Token, LexicalError}; - -// Pass in the original, untokenized input to facilitate error -// recovery. See, for instance, the `addr-spec-or-other` production. -grammar<'input>(input: &'input str); - -// RFC 4880 says: -// -// 5.11. User ID Packet (Tag 13) -// -// A User ID packet consists of UTF-8 text that is intended to represent -// the name and email address of the key holder. By convention, it -// includes an RFC 2822 [RFC2822] mail name-addr, but there are no -// restrictions on its content. -// -// At least today, the convention is more along the lines of RFC -// 2822's mailbox instead of its name-addr. The only different is -// that the mailbox production allows for a bare email address i.e., -// one without angle brackets whereas the name-addr production -// requires angle brackets. -// -// A further convention is an ssh-host-uri production: -// -// ssh-host-uri = "ssh://" dns-hostname -// -// Support for this should be added in the future. - -CRLF: () = { - CR LF -} - -// text = %d1-9 / ; Characters excluding CR and LF -// %d11 / -// %d12 / -// %d14-127 / -// obs-text -pub(crate) Text : Token<'input> = { - text, -} - -text : Token<'input> = { - WSP, - NO_WS_CTL, - specials, - OTHER, -} - -// To reduce the size of this grammar, we have a list of common tokens -// that is reused throughout this grammar. -common_xtext : Token<'input> = { - // LPAREN, - // RPAREN, - LANGLE, - RANGLE, - // LBRACKET, - // RBRACKET, - COLON, - SEMICOLON, - AT, - // BACKSLASH, - COMMA, - DOT, - // DQUOTE, -} - -// specials = "(" / ")" / ; Special characters used in -// "<" / ">" / ; other parts of the syntax -// "[" / "]" / -// ":" / ";" / -// "@" / "\" / -// "," / "." / -// DQUOTE -specials : Token<'input> = { - LPAREN, - RPAREN, - LBRACKET, - RBRACKET, - BACKSLASH, - DQUOTE, - common_xtext, -}; - - -// 3.2.2. Quoted characters - -// quoted-pair = ("\" text) / obs-qp -// -// In RFC 2822, text is a single character and the BACKSLASH is -// followed by exactly one character. As an optimization, our lexer -// groups runs of 'text' characters into a single token, Token::OTHER. -// Since a quoted pair can always be followed by a run of OTHER -// characters, the semantics are preserved. -quoted_pair : Token<'input> = { - BACKSLASH <text>, -} - -// 3.2.3. Folding white space and comments - -// Folding white space -// -// FWS = ([*WSP CRLF] 1*WSP) / ; -// obs-FWS -// -// Runs of FWS, comment or CFWS that occur between lexical tokens in -// a structured field header are semantically interpreted as a -// single space character. - -// FWS can't be exported, because it uses inline. -pub(crate) FWS_ : Component = { - FWS -} - -#[inline] -FWS : Component = { - (WSP* CRLF)? WSP+ => Component::WS, -} - -// ctext = NO-WS-CTL / ; Non white space controls -// %d33-39 / ; The rest of the US-ASCII -// %d42-91 / ; characters not including "(", -// %d93-126 ; ")", or "\" -pub(crate) CText : Token<'input> = { - ctext -} - -ctext : Token<'input> = { - NO_WS_CTL, - - // LPAREN, - // RPAREN, - LBRACKET, - RBRACKET, - // BACKSLASH, - DQUOTE, - common_xtext, - - OTHER, -} - -// ccontent = ctext / quoted-pair / comment -ccontent : String = { - <c:ctext> => c.to_string(), - <c:quoted_pair> => c.to_string(), - <c:comment> => { - let mut s = String::new(); - s.push('('); - if let Component::Comment(comment) = c { - s.push_str(&comment[..]); - } else { - panic!("Expected a Component::Comment"); - } - s.push(')'); - s - }, -} - -// comment = "(" *([FWS] ccontent) [FWS] ")" -pub(crate) Comment : Component = { - <comment> -} - -comment : Component = { - LPAREN <c:(<FWS?> <ccontent>)*> <d:FWS?> RPAREN => { - let mut s = strings_flatten2( - c.into_iter().map(|(fws, c)| (fws.is_some(), c)), " "); - - if d.is_some() { - s.push(' '); - } - - Component::Comment(s) - }, -} - -// CFWS = *([FWS] comment) (([FWS] comment) / FWS) -pub(crate) Cfws : Vec<Component> = { - <c:CFWS> => { - components_merge(c) - } -} - -CFWS : Vec<Component> = { - // <c:(FWS? <comment>)*> FWS? <d:comment> => ..., - // <c:(FWS? <comment>)*> FWS => ..., - - // The following is equivalent to the above, but the actions are a - // bit simpler. - <c:(<FWS?> <comment>)+> => { - let v : Vec<Component> = c.into_iter() - .map(|(w, c)| { - if let Some(w) = w { - vec![w, c] - } else { - vec![c] - } - }) - .flatten() - .collect(); - v - }, - <c:(<FWS?> <comment>)*> <w2:FWS> => { - let mut v : Vec<Component> = c.into_iter() - .map(|(w, c)| { - if let Some(w) = w { - vec![w, c] - } else { - vec![c] - } - }) - .flatten() - .collect(); - v.push(w2); - v - } -} - -// 3.2.4. Atom - -// atext = ALPHA / DIGIT / ; Any character except controls, -// "!" / "#" / ; SP, and specials. -// "$" / "%" / ; Used for atoms -// "&" / "'" / -// "*" / "+" / -// "-" / "/" / -// "=" / "?" / -// "^" / "_" / -// "`" / "{" / -// "|" / "}" / -// "~" -// -// As an optimization the lexer collects atexts, i.e., Token::OTHER is -// 1*atext. -atext_plus : String = { - <a:OTHER> => { - let a = a.to_string(); - assert!(a.len() > 0); - a - }, -} - - -// The display-name in a name-addr production often includes a ., but -// is not quoted. The RFC even recommends supporting this variant. -// Also some OpenPGP implementations create User IDs that look like: -// -// foo@bar.com <foo@bar.com> -// -// That is, with an unquoted at! Support that too. -other_or_dot_or_at : String = { - <a:OTHER> => a.to_string(), - <d:DOT> => d.to_string(), - <a:AT> => a.to_string(), -} - -atext_dot_at_plus : String = { - <a:other_or_dot_or_at+> => strings_flatten(a.into_iter(), ""), -} - -// atom = [CFWS] 1*atext [CFWS] -// -// "Both atom and dot-atom are interpreted as a single unit, comprised -// of the string of characters that make it up. Semantically, the -// optional comments and FWS surrounding the rest of the characters -// are not part of the atom" -pub(crate) Atom : Vec<Component> = { - <a:atom> => components_merge(a), -} - -atom : Vec<Component> = { - <c1:CFWS?> <a:atext_dot_at_plus> <c2:CFWS?> => - components_concat!( - c1, - Component::Text(a), - c2), -} - -// See the phrase production for why this variant of the 'atom' -// production exists, and why the 'CFWS?'es are not included. -atom_prime : Component = { - <a:atext_dot_at_plus> => Component::Text(a), -} - -// dot-atom = [CFWS] dot-atom-text [CFWS] -// -// "Both atom and dot-atom are interpreted as a single unit, comprised -// of the string of characters that make it up. Semantically, the -// optional comments and FWS surrounding the rest of the characters -// are not part of the atom" -pub(crate) DotAtom : Vec<Component> = { - <d:dot_atom> => components_merge(d), -} - -dot_atom : Vec<Component> = { - <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!(c1, a, c2), -} - -// A variant of dot_atom that places all comments to the left. -dot_atom_left : Vec<Component> = { - <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!(c1, c2, a), -} - -// A variant of dot_atom that places all comments to the right. -dot_atom_right : Vec<Component> = { - <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!(a, c1, c2), -} - -// dot-atom-text = 1*atext *("." 1*atext) -dot_atom_text : Component = { - <v:atext_plus> <w:(DOT <atext_plus>)*> => { - let mut v = v; - if w.len() > 0 { - v.push('.'); - } - Component::Text( - strings_flatten_into(v, w.into_iter(), ".")) - }, -} - -// 3.2.5. Quoted strings - -// qtext = NO-WS-CTL / ; Non white space controls -// %d33 / ; The rest of the US-ASCII -// %d35-91 / ; characters not including "\" -// %d93-126 ; or the quote character -qtext : Token<'input> = { - NO_WS_CTL, - - LPAREN, - RPAREN, - LBRACKET, - RBRACKET, - // BACKSLASH, - // DQUOTE, - common_xtext, - - OTHER, -} - -// qcontent = qtext / quoted-pair -pub(crate) QContent : Vec<Component> = { - <q:qcontent> => components_merge(vec![ q ]), -} - -qcontent : Component = { - <c:qtext> => Component::Text(c.to_string()), - <c:quoted_pair> => Component::Text(c.to_string()), -} - -// quoted-string = [CFWS] -// DQUOTE *([FWS] qcontent) [FWS] DQUOTE -// [CFWS] -pub(crate) QuotedString : Vec<Component> = { - <q:quoted_string> => components_merge(q), -} - -quoted_string : Vec<Component> = { - <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => { - // Make sure any leading and trailing whitespace *inside* the - // quotes is turned into Component::Text. - components_concat!( - // c1 is an Option<Vec<Component>>. - c1, - // If we have "" make sure we return Component::Text("") - // instead of nothing. - Component::Text("".into()), - // c is a Vec<(Option<Component>, Component)>. Turn it - // into a Vec<Component>. - c.into_iter() - .map(|(fws, c)| { - if let Some(_) = fws { - vec![Component::Text(" ".to_string()), c] - } else { - vec![c] - } - }) - .flatten() - .collect::<Vec<Component>>(), - // d is an Option<Component>, turn it into a - // Option<Vec<Component>>. - d.map(|_| vec![Component::Text(" ".to_string())]), - c2) - }, -} - -// Variant of quoted_string that moves all comments to the left. -quoted_string_left : Vec<Component> = { - <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => { - // Make sure any leading and trailing whitespace *inside* the - // quotes is turned into Component::Text. - components_concat!( - // c1 is an Option<Vec<Component>>. - c1, - c2, - // If we have "" make sure we return Component::Text("") - // instead of nothing. - Component::Text("".into()), - // c is a Vec<(Option<Component>, Component)>. Turn it - // into a Vec<Component>. - c.into_iter() - .map(|(fws, c)| { - if let Some(_) = fws { - vec![Component::Text(" ".to_string()), c] - } else { - vec![c] - } - }) - .flatten() - .collect::<Vec<Component>>(), - // d is an Option<Component>, turn it into a - // Option<Vec<Component>>. - d.map(|_| vec![Component::Text(" ".to_string())])) - }, -} - -// See the phrase production for this variant of the 'quoted_string' -// production exists, and why the 'CFWS?'es are not included. -quoted_string_prime : Vec<Component> = { - DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE => { - // Make sure any leading and trailing whitespace *inside* the - // quotes is turned into Component::Text. - components_concat!( - // If we have "" make sure we return Component::Text("") - // instead of nothing. - Component::Text("".into()), - // c is a Vec<(Option<Component>, Component)>. Turn it - // into a Vec<Component>. - c.into_iter() - .map(|(fws, c)| { - if let Some(_) = fws { - vec![Component::Text(" ".to_string()), c] - } else { - vec![c] - } - }) - .flatten() - .collect::<Vec<Component>>(), - // d is an Option<Component>, turn it into a - // Option<Vec<Component>>. - d.map(|_| vec![Component::Text(" ".to_string())])) - }, -} - -// 3.2.6. Miscellaneous tokens - -// word = atom / quoted-string -pub(crate) Word : Vec<Component> = { - <w:word> => components_merge(w), -} - -word : Vec<Component> = { - atom, - quoted_string, -} - -// phrase = 1*word / obs-phrase - -pub(crate) Phrase : Vec<Component> = { - <p:phrase> => components_merge(p), -} - -// phrase : String = { -// <v:word+> => strings_flatten(v, ""), -// } -// -// Note: consider the following parse tree: -// -// phrase -// / \ -// word word -// / \ -// atom atom -// / | \ / | \ -// CFWS? atext+ CFWS? CFWS? atext+ CFWS? -// -// This has an ambiguity! Does a CFWS immediate after the first -// atext+ belong to the first atom or the second? And, if there are -// no CFWSes, how do we split the atext+? -// -// To avoid these problems, we modify the grammar as presented in the -// RFC as follows: -atom_or_quoted_string : Vec<Component> = { - <a:atom_prime> <r:cfws_or_quoted_string?> => { - // Note: it's not possible to have multiple atoms in a row. - // The following: - // - // foo bar - // - // is 'atom_prime CFWS atom_prime'. - - components_concat!(a, r) - }, - <q:quoted_string_prime+> <r:cfws_or_atom?> => { - // But, it's possible to have multiple quoted strings in a - // row, e.g.: - // - // "foo""bar" - // - // Note that '"foo" "bar"' would match quoted_string_prime, - // CFWS, quoted_string_prime. - - components_concat!( - q.into_iter().flatten().collect::<Vec<Component>>(), r) - }, -} - -cfws_or_quoted_string : Vec<Component> = { - <c:CFWS> <r:atom_or_quoted_string?> => components_concat!(c, r), - <q:quoted_string_prime+> <r:cfws_or_atom?> => - components_concat!( - q.into_iter().flatten().collect::<Vec<Component>>(), r), -} - -cfws_or_atom : Vec<Component> = { - <c:CFWS> <r:atom_or_quoted_string?> => components_concat!(c, r), - <a:atom_prime> <r:cfws_or_quoted_string?> => components_concat!(a, r), < |