summaryrefslogtreecommitdiffstats
path: root/rfc2822
diff options
context:
space:
mode:
Diffstat (limited to 'rfc2822')
-rw-r--r-- rfc2822/README.md 67
-rw-r--r-- rfc2822/build.rs 5
-rw-r--r-- rfc2822/src/component.rs 206
-rw-r--r-- rfc2822/src/grammar.lalrpop 891
-rw-r--r-- rfc2822/src/grammar.rs 2
-rw-r--r-- rfc2822/src/lexer.rs 188
-rw-r--r-- rfc2822/src/lib.rs 1691
-rw-r--r-- rfc2822/src/macros.rs 19
-rw-r--r-- rfc2822/src/roundtrip.rs 1094
-rw-r--r-- rfc2822/src/strings.rs 57
-rw-r--r-- rfc2822/src/trace.rs 63
11 files changed, 0 insertions, 4283 deletions
diff --git a/rfc2822/README.md b/rfc2822/README.md
deleted file mode 100644
index 9467daee..00000000
--- a/rfc2822/README.md
+++ /dev/null
@@ -1,67 +0,0 @@
-An [RFC 2822] parser.
-
- [RFC 2822]: https://tools.ietf.org/html/rfc2822
-
-Currently, this crate only recognizes the [RFC 2822] [name-addr] and
-[addr-spec] productions, i.e., things of the form:
-
- [name-addr]: https://tools.ietf.org/html/rfc2822#section-3.4
- [addr-spec]: https://tools.ietf.org/html/rfc2822#section-3.4.1
-
-```
-Name (Comment) <email@example.org>
-```
-
-and
-
-```
-email@example.org
-```
-
-Although the above appear simple to parse, [RFC 2822]'s whitespace and
-comment rules are rather complex. This crate implements the whole
-grammar.
-
-As an extension, in addition to ASCII, we also recognize all Unicode
-code points (encoded as UTF-8). NUL, controls, and specials retain
-their meaning as defined in RFC 2822. All other code points are
-treated as ordinary text, like `a`.
-
-Further, we also allow dots (`.`) and at symbols (`@`) in the `atom`
-production. That is, the `atom` production is extended from:
-
-```
-atom = [CFWS] 1*atext [CFWS]
-```
-
-to:
-
-```
-atom = [CFWS] 1*atext_or_dot_or_at [CFWS]
-atext_or_dot_or_at = atext | DOT | AT
-```
-
-And, as such:
-
-```
-Professor Pippy P. Poopypants <pippy@jerome-horwitz.k12.oh.us>
-```
-
-is recognized as a `name-addr` even though [RFC 2822] strictly
-requires that the `display-name` be quoted like:
-
-```
-"Professor Pippy P. Poopypants" <pippy@jerome-horwitz.k12.oh.us>
-```
-
-Likewise,
-
-```
-foo@bar.com <foo@bar.com>
-```
-
-is recognized as a `name-addr` even though the `@` should be quoted.
-
-This crate does not (yet) implement the new [RFC 5322].
-
- [RFC 5322]: https://tools.ietf.org/html/rfc5322
diff --git a/rfc2822/build.rs b/rfc2822/build.rs
deleted file mode 100644
index 23c7d3f8..00000000
--- a/rfc2822/build.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-extern crate lalrpop;
-
-fn main() {
- lalrpop::process_root().unwrap();
-}
diff --git a/rfc2822/src/component.rs b/rfc2822/src/component.rs
deleted file mode 100644
index 37ea4cbd..00000000
--- a/rfc2822/src/component.rs
+++ /dev/null
@@ -1,206 +0,0 @@
-use lalrpop_util::ParseError;
-use crate::lexer::LexicalError;
-
-/// A UserID value typically looks something like:
-///
-/// Text (Comment) <name@example.org>
-///
-/// That is, it contains three components: a text string, a comment,
-/// and an email address.
-///
-/// The actual format allows for lots of interleaved comments and
-/// multiple texts. Thus, when parsing we build up a vector of
-/// Components in the order that they were encountered.
-#[derive(Debug, Clone)]
-pub enum Component {
- // A text string.
- Text(String),
- // A comment.
- //
- // The outermost parens are removed. That is, if the comment is:
- // "(foo(bar)bam)", then "foo(bar)bam" is stored.
- Comment(String),
- // An email address.
- Address(String),
-
- // The text found where an address was expected.
- InvalidAddress(ParseError<usize, String, LexicalError>, String),
-
- // White space.
- WS,
-}
-
-// When comparing two `Component::InvalidAddress`es, we consider them
-// equal if the values match; we don't compare the saved errors. This
-// is because the parser will always generate the same error for the
-// same input. And, the PartialEq implementation is only used to
-// support comparing two `Component`s in assertions.
-impl PartialEq for Component {
- fn eq(&self, other: &Self) -> bool {
- match (self, other) {
- (Component::Text(a), Component::Text(b)) => a == b,
- (Component::Comment(a), Component::Comment(b)) => a == b,
- (Component::Address(a), Component::Address(b)) => a == b,
- (Component::InvalidAddress(_, a), Component::InvalidAddress(_, b)) =>
- a == b,
- (Component::WS, Component::WS) => true,
- (_, _) => false,
- }
- }
-}
-
-impl Eq for Component {
-}
-
-impl From<Component> for Vec<Component> {
- fn from(c: Component) -> Self {
- vec![c]
- }
-}
-
-impl From<Component> for Option<Vec<Component>> {
- fn from(c: Component) -> Self {
- Some(vec![c])
- }
-}
-
-// Collect the `Component`s to the vector `v`.
-//
-// The Components can be anything that can be turned into an
-// Option<Vec<Component>>. This currently includes `Component`,
-// `Vec<Component>`, and `Option<Vec<Component>>` itself.
-macro_rules! components_concat_into {
- ( $v:expr, $c:expr ) => {{
- let v: &mut Vec<Component> = $v;
- let c : Option<Vec<Component>> = $c.into();
- if let Some(mut c) = c {
- // If v ends in a WS and c starts with a WS, then collapse
- // them.
- if destructures_to!(Some(Component::WS) = v.last())
- && destructures_to!(Some(Component::WS) = c.first())
- {
- v.pop();
- }
- v.append(&mut c);
- }
- }};
- ( $v:expr, $car:expr, $($cdr:expr),* ) => {{
- let v: &mut Vec<Component> = $v;
- let car : Option<Vec<Component>> = $car.into();
- if let Some(mut car) = car {
- if destructures_to!(Some(Component::WS) = v.last())
- && destructures_to!(Some(Component::WS) = car.first())
- {
- v.pop();
- }
- v.append(&mut car)
- }
- components_concat_into!(v, $($cdr),*);
- }};
-}
-
-// Collect the `Component`s into a vector `v`.
-//
-// The Components can be anything that can be turned into an
-// Option<Vec<Component>>. This currently includes `Component`,
-// `Vec<Component>`, and `Option<Vec<Component>>` itself.
-macro_rules! components_concat {
- ( $( $args:expr ),*) => {{
- let mut v : Vec<Component> = Vec::new();
- components_concat_into!(&mut v, $($args),*);
- v
- }};
-}
-
-// Merge adjacent components: Text+Text, Text+WS+Text (joined by a space), and WS+WS.
-pub(crate) fn components_merge(components: Vec<Component>)
- -> Vec<Component>
-{
- tracer!(crate::TRACE, "components_merge", 0);
- t!("{:?}", components);
-
- let mut iter = components.into_iter();
- let mut components = vec![];
-
- let mut left = if let Some(left) = iter.next() {
- left
- } else {
- return components;
- };
- let mut middleo = iter.next();
- let mut righto = iter.next();
-
- while let Some(mut middle) = middleo {
- enum Kill {
- None,
- Middle,
- MiddleRight,
- };
- let mut kill = Kill::None;
-
- match (&mut left, &mut middle, righto.as_mut()) {
- (Component::Text(ref mut l),
- Component::Text(ref mut m),
- _) => {
- t!("Merging '{}' and '{}'", l, m);
- l.push_str(m);
- kill = Kill::Middle;
- },
-
- (Component::Text(ref mut l),
- Component::WS,
- Some(Component::Text(ref mut r))) => {
- t!("Merging '{}', WS and '{}'", l, r);
- l.push(' ');
- l.push_str(r);
- kill = Kill::MiddleRight;
- },
- (Component::WS,
- Component::WS,
- _) => {
- // This can happen when we have a local-part of the
- // following form:
- //
- // (comment) foo (comment)
- //
- // The local-part is produced by the dot_atom_left
- // production, which puts the dot_atom_text (foo) to
- // the right:
- //
- // COMMENT WS WS COMMENT TEXT
- //
- // It is also possible to have:
- //
- // WS WS COMMENT TEXT
- //
- // as CFWS can expand to just a WS.
- kill = Kill::Middle;
- },
- _ => (),
- }
-
- match kill {
- Kill::Middle => {
- middleo = righto;
- righto = iter.next();
- }
- Kill::MiddleRight => {
- middleo = iter.next();
- righto = iter.next();
- }
- Kill::None => {
- components.push(left);
- left = middle;
- middleo = righto;
- righto = iter.next();
- }
- }
- }
-
- components.push(left);
- if let Some(middle) = middleo {
- components.push(middle);
- }
-
- components
-}
diff --git a/rfc2822/src/grammar.lalrpop b/rfc2822/src/grammar.lalrpop
deleted file mode 100644
index 1b1d2b94..00000000
--- a/rfc2822/src/grammar.lalrpop
+++ /dev/null
@@ -1,891 +0,0 @@
-// -*- mode: Rust; -*-
-use crate::parse_error_downcast;
-use crate::strings::{
- strings_flatten_into,
- strings_flatten2,
- strings_flatten,
-};
-use crate::component::{
- Component,
- components_merge,
-};
-use crate::lexer;
-use crate::lexer::{Token, LexicalError};
-
-// Pass in the original, untokenized input to facilitate error
-// recovery. See, for instance, the `addr-spec-or-other` production.
-grammar<'input>(input: &'input str);
-
-// RFC 4880 says:
-//
-// 5.11. User ID Packet (Tag 13)
-//
-// A User ID packet consists of UTF-8 text that is intended to represent
-// the name and email address of the key holder. By convention, it
-// includes an RFC 2822 [RFC2822] mail name-addr, but there are no
-// restrictions on its content.
-//
-// At least today, the convention is more along the lines of RFC
-// 2822's mailbox instead of its name-addr. The only difference is
-// that the mailbox production allows for a bare email address, i.e.,
-// one without angle brackets, whereas the name-addr production
-// requires angle brackets.
-//
-// A further convention is an ssh-host-uri production:
-//
-// ssh-host-uri = "ssh://" dns-hostname
-//
-// Support for this should be added in the future.
-
-CRLF: () = {
- CR LF
-}
-
-// text = %d1-9 / ; Characters excluding CR and LF
-// %d11 /
-// %d12 /
-// %d14-127 /
-// obs-text
-pub(crate) Text : Token<'input> = {
- text,
-}
-
-text : Token<'input> = {
- WSP,
- NO_WS_CTL,
- specials,
- OTHER,
-}
-
-// To reduce the size of this grammar, we have a list of common tokens
-// that is reused throughout this grammar.
-common_xtext : Token<'input> = {
- // LPAREN,
- // RPAREN,
- LANGLE,
- RANGLE,
- // LBRACKET,
- // RBRACKET,
- COLON,
- SEMICOLON,
- AT,
- // BACKSLASH,
- COMMA,
- DOT,
- // DQUOTE,
-}
-
-// specials = "(" / ")" / ; Special characters used in
-// "<" / ">" / ; other parts of the syntax
-// "[" / "]" /
-// ":" / ";" /
-// "@" / "\" /
-// "," / "." /
-// DQUOTE
-specials : Token<'input> = {
- LPAREN,
- RPAREN,
- LBRACKET,
- RBRACKET,
- BACKSLASH,
- DQUOTE,
- common_xtext,
-};
-
-
-// 3.2.2. Quoted characters
-
-// quoted-pair = ("\" text) / obs-qp
-//
-// In RFC 2822, text is a single character and the BACKSLASH is
-// followed by exactly one character. As an optimization, our lexer
-// groups runs of 'text' characters into a single token, Token::OTHER.
-// Since a quoted pair can always be followed by a run of OTHER
-// characters, the semantics are preserved.
-quoted_pair : Token<'input> = {
- BACKSLASH <text>,
-}
-
-// 3.2.3. Folding white space and comments
-
-// Folding white space
-//
-// FWS = ([*WSP CRLF] 1*WSP) / ;
-// obs-FWS
-//
-// Runs of FWS, comment or CFWS that occur between lexical tokens in
-// a structured field header are semantically interpreted as a
-// single space character.
-
-// FWS can't be exported, because it uses inline.
-pub(crate) FWS_ : Component = {
- FWS
-}
-
-#[inline]
-FWS : Component = {
- (WSP* CRLF)? WSP+ => Component::WS,
-}
-
-// ctext = NO-WS-CTL / ; Non white space controls
-// %d33-39 / ; The rest of the US-ASCII
-// %d42-91 / ; characters not including "(",
-// %d93-126 ; ")", or "\"
-pub(crate) CText : Token<'input> = {
- ctext
-}
-
-ctext : Token<'input> = {
- NO_WS_CTL,
-
- // LPAREN,
- // RPAREN,
- LBRACKET,
- RBRACKET,
- // BACKSLASH,
- DQUOTE,
- common_xtext,
-
- OTHER,
-}
-
-// ccontent = ctext / quoted-pair / comment
-ccontent : String = {
- <c:ctext> => c.to_string(),
- <c:quoted_pair> => c.to_string(),
- <c:comment> => {
- let mut s = String::new();
- s.push('(');
- if let Component::Comment(comment) = c {
- s.push_str(&comment[..]);
- } else {
- panic!("Expected a Component::Comment");
- }
- s.push(')');
- s
- },
-}
-
-// comment = "(" *([FWS] ccontent) [FWS] ")"
-pub(crate) Comment : Component = {
- <comment>
-}
-
-comment : Component = {
- LPAREN <c:(<FWS?> <ccontent>)*> <d:FWS?> RPAREN => {
- let mut s = strings_flatten2(
- c.into_iter().map(|(fws, c)| (fws.is_some(), c)), " ");
-
- if d.is_some() {
- s.push(' ');
- }
-
- Component::Comment(s)
- },
-}
-
-// CFWS = *([FWS] comment) (([FWS] comment) / FWS)
-pub(crate) Cfws : Vec<Component> = {
- <c:CFWS> => {
- components_merge(c)
- }
-}
-
-CFWS : Vec<Component> = {
- // <c:(FWS? <comment>)*> FWS? <d:comment> => ...,
- // <c:(FWS? <comment>)*> FWS => ...,
-
- // The following is equivalent to the above, but the actions are a
- // bit simpler.
- <c:(<FWS?> <comment>)+> => {
- let v : Vec<Component> = c.into_iter()
- .map(|(w, c)| {
- if let Some(w) = w {
- vec![w, c]
- } else {
- vec![c]
- }
- })
- .flatten()
- .collect();
- v
- },
- <c:(<FWS?> <comment>)*> <w2:FWS> => {
- let mut v : Vec<Component> = c.into_iter()
- .map(|(w, c)| {
- if let Some(w) = w {
- vec![w, c]
- } else {
- vec![c]
- }
- })
- .flatten()
- .collect();
- v.push(w2);
- v
- }
-}
-
-// 3.2.4. Atom
-
-// atext = ALPHA / DIGIT / ; Any character except controls,
-// "!" / "#" / ; SP, and specials.
-// "$" / "%" / ; Used for atoms
-// "&" / "'" /
-// "*" / "+" /
-// "-" / "/" /
-// "=" / "?" /
-// "^" / "_" /
-// "`" / "{" /
-// "|" / "}" /
-// "~"
-//
-// As an optimization the lexer collects atexts, i.e., Token::OTHER is
-// 1*atext.
-atext_plus : String = {
- <a:OTHER> => {
- let a = a.to_string();
- assert!(a.len() > 0);
- a
- },
-}
-
-
-// The display-name in a name-addr production often includes a ., but
-// is not quoted. The RFC even recommends supporting this variant.
-// Also some OpenPGP implementations create User IDs that look like:
-//
-// foo@bar.com <foo@bar.com>
-//
-// That is, with an unquoted at! Support that too.
-other_or_dot_or_at : String = {
- <a:OTHER> => a.to_string(),
- <d:DOT> => d.to_string(),
- <a:AT> => a.to_string(),
-}
-
-atext_dot_at_plus : String = {
- <a:other_or_dot_or_at+> => strings_flatten(a.into_iter(), ""),
-}
-
-// atom = [CFWS] 1*atext [CFWS]
-//
-// "Both atom and dot-atom are interpreted as a single unit, comprised
-// of the string of characters that make it up. Semantically, the
-// optional comments and FWS surrounding the rest of the characters
-// are not part of the atom"
-pub(crate) Atom : Vec<Component> = {
- <a:atom> => components_merge(a),
-}
-
-atom : Vec<Component> = {
- <c1:CFWS?> <a:atext_dot_at_plus> <c2:CFWS?> =>
- components_concat!(
- c1,
- Component::Text(a),
- c2),
-}
-
-// See the phrase production for why this variant of the 'atom'
-// production exists, and why the 'CFWS?'es are not included.
-atom_prime : Component = {
- <a:atext_dot_at_plus> => Component::Text(a),
-}
-
-// dot-atom = [CFWS] dot-atom-text [CFWS]
-//
-// "Both atom and dot-atom are interpreted as a single unit, comprised
-// of the string of characters that make it up. Semantically, the
-// optional comments and FWS surrounding the rest of the characters
-// are not part of the atom"
-pub(crate) DotAtom : Vec<Component> = {
- <d:dot_atom> => components_merge(d),
-}
-
-dot_atom : Vec<Component> = {
- <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
- components_concat!(c1, a, c2),
-}
-
-// A variant of dot_atom that places all comments to the left.
-dot_atom_left : Vec<Component> = {
- <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
- components_concat!(c1, c2, a),
-}
-
-// A variant of dot_atom that places all comments to the right.
-dot_atom_right : Vec<Component> = {
- <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
- components_concat!(a, c1, c2),
-}
-
-// dot-atom-text = 1*atext *("." 1*atext)
-dot_atom_text : Component = {
- <v:atext_plus> <w:(DOT <atext_plus>)*> => {
- let mut v = v;
- if w.len() > 0 {
- v.push('.');
- }
- Component::Text(
- strings_flatten_into(v, w.into_iter(), "."))
- },
-}
-
-// 3.2.5. Quoted strings
-
-// qtext = NO-WS-CTL / ; Non white space controls
-// %d33 / ; The rest of the US-ASCII
-// %d35-91 / ; characters not including "\"
-// %d93-126 ; or the quote character
-qtext : Token<'input> = {
- NO_WS_CTL,
-
- LPAREN,
- RPAREN,
- LBRACKET,
- RBRACKET,
- // BACKSLASH,
- // DQUOTE,
- common_xtext,
-
- OTHER,
-}
-
-// qcontent = qtext / quoted-pair
-pub(crate) QContent : Vec<Component> = {
- <q:qcontent> => components_merge(vec![ q ]),
-}
-
-qcontent : Component = {
- <c:qtext> => Component::Text(c.to_string()),
- <c:quoted_pair> => Component::Text(c.to_string()),
-}
-
-// quoted-string = [CFWS]
-// DQUOTE *([FWS] qcontent) [FWS] DQUOTE
-// [CFWS]
-pub(crate) QuotedString : Vec<Component> = {
- <q:quoted_string> => components_merge(q),
-}
-
-quoted_string : Vec<Component> = {
- <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => {
- // Make sure any leading and trailing whitespace *inside* the
- // quotes is turned into Component::Text.
- components_concat!(
- // c1 is an Option<Vec<Component>>.
- c1,
- // If we have "" make sure we return Component::Text("")
- // instead of nothing.
- Component::Text("".into()),
- // c is a Vec<(Option<Component>, Component)>. Turn it
- // into a Vec<Component>.
- c.into_iter()
- .map(|(fws, c)| {
- if let Some(_) = fws {
- vec![Component::Text(" ".to_string()), c]
- } else {
- vec![c]
- }
- })
- .flatten()
- .collect::<Vec<Component>>(),
- // d is an Option<Component>, turn it into a
- // Option<Vec<Component>>.
- d.map(|_| vec![Component::Text(" ".to_string())]),
- c2)
- },
-}
-
-// Variant of quoted_string that moves all comments to the left.
-quoted_string_left : Vec<Component> = {
- <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => {
- // Make sure any leading and trailing whitespace *inside* the
- // quotes is turned into Component::Text.
- components_concat!(
- // c1 is an Option<Vec<Component>>.
- c1,
- c2,
- // If we have "" make sure we return Component::Text("")
- // instead of nothing.
- Component::Text("".into()),
- // c is a Vec<(Option<Component>, Component)>. Turn it
- // into a Vec<Component>.
- c.into_iter()
- .map(|(fws, c)| {
- if let Some(_) = fws {
- vec![Component::Text(" ".to_string()), c]
- } else {
- vec![c]
- }
- })
- .flatten()
- .collect::<Vec<Component>>(),
- // d is an Option<Component>, turn it into a
- // Option<Vec<Component>>.
- d.map(|_| vec![Component::Text(" ".to_string())]))
- },
-}
-
-// See the phrase production for why this variant of the 'quoted_string'
-// production exists, and why the 'CFWS?'es are not included.
-quoted_string_prime : Vec<Component> = {
- DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE => {
- // Make sure any leading and trailing whitespace *inside* the
- // quotes is turned into Component::Text.
- components_concat!(
- // If we have "" make sure we return Component::Text("")
- // instead of nothing.
- Component::Text("".into()),
- // c is a Vec<(Option<Component>, Component)>. Turn it
- // into a Vec<Component>.
- c.into_iter()
- .map(|(fws, c)| {
- if let Some(_) = fws {
- vec![Component::Text(" ".to_string()), c]
- } else {
- vec![c]
- }
- })
- .flatten()
- .collect::<Vec<Component>>(),
- // d is an Option<Component>, turn it into a
- // Option<Vec<Component>>.
- d.map(|_| vec![Component::Text(" ".to_string())]))
- },
-}
-
-// 3.2.6. Miscellaneous tokens
-
-// word = atom / quoted-string
-pub(crate) Word : Vec<Component> = {
- <w:word> => components_merge(w),
-}
-
-word : Vec<Component> = {
- atom,
- quoted_string,
-}
-
-// phrase = 1*word / obs-phrase
-
-pub(crate) Phrase : Vec<Component> = {
- <p:phrase> => components_merge(p),
-}
-
-// phrase : String = {
-// <v:word+> => strings_flatten(v, ""),
-// }
-//
-// Note: consider the following parse tree:
-//
-// phrase
-// / \
-// word word
-// / \
-// atom atom
-// / | \ / | \
-// CFWS? atext+ CFWS? CFWS? atext+ CFWS?
-//
-// This has an ambiguity! Does a CFWS immediately after the first
-// atext+ belong to the first atom or the second? And, if there are
-// no CFWSes, how do we split the atext+?
-//
-// To avoid these problems, we modify the grammar as presented in the
-// RFC as follows:
-atom_or_quoted_string : Vec<Component> = {
- <a:atom_prime> <r:cfws_or_quoted_string?> => {
- // Note: it's not possible to have multiple atoms in a row.
- // The following:
- //
- // foo bar
- //
- // is 'atom_prime CFWS atom_prime'.
-
- components_concat!(a, r)
- },
- <q:quoted_string_prime+> <r:cfws_or_atom?> => {
- // But, it's possible to have multiple quoted strings in a
- // row, e.g.:
- //
- // "foo""bar"
- //
- // Note that '"foo" "bar"' would match quoted_string_prime,
- // CFWS, quoted_string_prime.
-
- components_concat!(
- q.into_iter().flatten().collect::<Vec<Component>>(), r)
- },
-}
-
-cfws_or_quoted_string : Vec<Component> = {
- <c:CFWS> <r:atom_or_quoted_string?> => components_concat!(c, r),
- <q:quoted_string_prime+> <r:cfws_or_atom?> =>
- components_concat!(
- q.into_iter().flatten().collect::<Vec<Component>>(), r),
-}
-
-cfws_or_atom : Vec<Component> = {
- <c:CFWS> <r:atom_or_quoted_string?> => components_concat!(c, r),
- <a:atom_prime> <r:cfws_or_quoted_string?> => components_concat!(a, r),
<