// -*- mode: Rust; -*-

use super::parse_error_downcast;
use super::strings::{
    strings_flatten_into,
    strings_flatten2,
    strings_flatten,
};
use super::component::{
    Component,
    components_merge,
};
use super::lexer;
use super::lexer::{Token, LexicalError};

// Pass in the original, untokenized input to facilitate error
// recovery.  See, for instance, the `addr-spec-or-other` production.
grammar<'input>(input: &'input str);

// RFC 4880 says:
//
//   5.11.  User ID Packet (Tag 13)
//
//   A User ID packet consists of UTF-8 text that is intended to represent
//   the name and email address of the key holder.  By convention, it
//   includes an RFC 2822 [RFC2822] mail name-addr, but there are no
//   restrictions on its content.
//
// At least today, the convention is more along the lines of RFC
// 2822's mailbox instead of its name-addr.  The only difference is
// that the mailbox production allows for a bare email address i.e.,
// one without angle brackets whereas the name-addr production
// requires angle brackets.
//
// A further convention is an ssh-host-uri production:
//
//   ssh-host-uri = "ssh://" dns-hostname
//
// Support for this should be added in the future.

// A carriage return immediately followed by a line feed.
CRLF: () = {
    CR LF
}

// text            =       %d1-9 /         ; Characters excluding CR and LF
//                         %d11 /
//                         %d12 /
//                         %d14-127 /
//                         obs-text

// Exported wrapper around `text`.
pub(crate) Text : Token<'input> = {
    text,
}

text : Token<'input> = {
    WSP,
    NO_WS_CTL,
    specials,
    OTHER,
}

// To reduce the size of this grammar, we have a list of common tokens
// that is reused throughout this grammar.
common_xtext : Token<'input> = {
    // LPAREN,
    // RPAREN,
    LANGLE,
    RANGLE,
    // LBRACKET,
    // RBRACKET,
    COLON,
    SEMICOLON,
    AT,
    // BACKSLASH,
    COMMA,
    DOT,
    // DQUOTE,
}

// specials        =       "(" / ")" /     ; Special characters used in
//                         "<" / ">" /     ;  other parts of the syntax
//                         "[" / "]" /
//                         ":" / ";" /
//                         "@" / "\" /
//                         "," / "." /
//                         DQUOTE
specials : Token<'input> = {
    LPAREN,
    RPAREN,
    LBRACKET,
    RBRACKET,
    BACKSLASH,
    DQUOTE,
    common_xtext,
};

// 3.2.2.
// Quoted characters
//
// NOTE(review): in the copy under review the `<name:symbol>` bindings
// and the `<Component>` type parameters of this section were missing.
// They are restored here from the ABNF comments and from the variable
// names each action uses; confirm against upstream.

// quoted-pair     =       ("\" text) / obs-qp
//
// In RFC 2822, text is a single character and the BACKSLASH is
// followed by exactly one character.  As an optimization, our lexer
// groups runs of 'text' characters into a single token, Token::OTHER.
// Since a quoted pair can always be followed by a run of OTHER
// characters, the semantics are preserved.
quoted_pair : Token<'input> = {
    BACKSLASH <text>,
}

// 3.2.3. Folding white space and comments

// Folding white space
//
// FWS             =       ([*WSP CRLF] 1*WSP) /   ;
//                         obs-FWS
//
// Runs of FWS, comment or CFWS that occur between lexical tokens in
// a structured field header are semantically interpreted as a
// single space character.

// FWS can't be exported, because it uses inline.
pub(crate) FWS_ : Component = {
    FWS
}

// Any amount of folding white space collapses to a single
// Component::WS.
#[inline]
FWS : Component = {
    (WSP* CRLF)? WSP+ => Component::WS,
}

// ctext           =       NO-WS-CTL /     ; Non white space controls
//                         %d33-39 /       ; The rest of the US-ASCII
//                         %d42-91 /       ;  characters not including "(",
//                         %d93-126        ;  ")", or "\"
pub(crate) CText : Token<'input> = {
    ctext
}

ctext : Token<'input> = {
    NO_WS_CTL,
    // LPAREN,
    // RPAREN,
    LBRACKET,
    RBRACKET,
    // BACKSLASH,
    DQUOTE,
    common_xtext,
    OTHER,
}

// ccontent        =       ctext / quoted-pair / comment
//
// A nested comment is rendered back to text with its surrounding
// parentheses.
ccontent : String = {
    <c:ctext> => c.to_string(),
    <c:quoted_pair> => c.to_string(),
    <c:comment> => {
        let mut s = String::new();
        s.push('(');
        if let Component::Comment(comment) = c {
            s.push_str(&comment[..]);
        } else {
            panic!("Expected a Component::Comment");
        }
        s.push(')');
        s
    },
}

// comment         =       "(" *([FWS] ccontent) [FWS] ")"
pub(crate) Comment : Component = {
    <comment>
}

comment : Component = {
    LPAREN <c:(<FWS?> <ccontent>)*> <d:FWS?> RPAREN => {
        // Each FWS preceding a ccontent is passed on as a flag
        // together with the " " separator.
        let mut s = strings_flatten2(
            c.into_iter().map(|(fws, c)| (fws.is_some(), c)), " ");

        // Trailing FWS becomes a trailing space.
        if d.is_some() {
            s.push(' ');
        }

        Component::Comment(s)
    },
}

// CFWS            =       *([FWS] comment) (([FWS] comment) / FWS)
pub(crate) Cfws : Vec<Component> = {
    <c:CFWS> => {
        components_merge(c)
    }
}

CFWS : Vec<Component> = {
    // <c:(<FWS?> <comment>)*> FWS? <comment> => ...,
    // <c:(<FWS?> <comment>)*> FWS => ...,

    // The following is equivalent to the above, but the actions are a
    // bit simpler.

    // One or more comments, each optionally preceded by FWS.
    <c:(<FWS?> <comment>)+> => {
        c.into_iter()
            .flat_map(|(w, c)| w.into_iter().chain(Some(c)))
            .collect::<Vec<Component>>()
    },

    // Zero or more comments followed by trailing FWS.
    <c:(<FWS?> <comment>)*> <w2:FWS> => {
        let mut v : Vec<Component> = c.into_iter()
            .flat_map(|(w, c)| w.into_iter().chain(Some(c)))
            .collect();
        v.push(w2);
        v
    }
}

// 3.2.4. Atom

// atext           =       ALPHA / DIGIT / ; Any character except controls,
//                         "!" / "#" /     ;  SP, and specials.
//                         "$" / "%" /     ;  Used for atoms
//                         "&" / "'" /
//                         "*" / "+" /
//                         "-" / "/" /
//                         "=" / "?" /
//                         "^" / "_" /
//                         "`" / "{" /
//                         "|" / "}" /
//                         "~"
//
// As an optimization the lexer collects atexts, i.e., Token::OTHER is
// 1*atext.
atext_plus : String = {
    <a:OTHER> => {
        let a = a.to_string();
        assert!(!a.is_empty());
        a
    },
}

// The display-name in a name-addr production often includes a ., but
// is not quoted.  The RFC even recommends supporting this variant.
// Also some OpenPGP implementations create User IDs that look like:
//
//   foo@bar.com <foo@bar.com>
//
// That is, with an unquoted at!  Support that too.
other_or_dot_or_at : String = {
    <a:OTHER> => a.to_string(),
    <d:DOT> => d.to_string(),
    <a:AT> => a.to_string(),
}

atext_dot_at_plus : String = {
    <a:other_or_dot_or_at+> => strings_flatten(a.into_iter(), ""),
}

// atom            =       [CFWS] 1*atext [CFWS]
//
// "Both atom and dot-atom are interpreted as a single unit, comprised
// of the string of characters that make it up.  Semantically, the
// optional comments and FWS surrounding the rest of the characters
// are not part of the atom"
pub(crate) Atom : Vec<Component> = {
    <a:atom> => components_merge(a),
}

// NOTE(review): `atext_plus` is chosen to match the `1*atext` ABNF
// above; confirm that upstream does not use `atext_dot_at_plus` here.
atom : Vec<Component> = {
    <c1:CFWS?> <a:atext_plus> <c2:CFWS?> =>
        components_concat!(
            c1,
            Component::Text(a),
            c2),
}

// See the phrase production for why this variant of the 'atom'
// production exists, and why the 'CFWS?'es are not included.
//
// NOTE(review): presumably this is where the dot/at-tolerant
// `atext_dot_at_plus` is consumed (display names); confirm.
atom_prime : Component = {
    <a:atext_dot_at_plus> => Component::Text(a),
}

// dot-atom        =       [CFWS] dot-atom-text [CFWS]
//
// "Both atom and dot-atom are interpreted as a single unit, comprised
// of the string of characters that make it up.
// Semantically, the
// optional comments and FWS surrounding the rest of the characters
// are not part of the atom"
//
// NOTE(review): in the copy under review the `<name:symbol>` bindings
// and the `<Component>` type parameters of this section were missing.
// They are restored here from the ABNF comments and from the variable
// names each action uses; confirm against upstream.
pub(crate) DotAtom : Vec<Component> = {
    <d:dot_atom> => components_merge(d),
}

dot_atom : Vec<Component> = {
    <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
        components_concat!(c1, a, c2),
}

// A variant of dot_atom that places all comments to the left.
dot_atom_left : Vec<Component> = {
    <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
        components_concat!(c1, c2, a),
}

// A variant of dot_atom that places all comments to the right.
dot_atom_right : Vec<Component> = {
    <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> =>
        components_concat!(a, c1, c2),
}

// dot-atom-text   =       1*atext *("." 1*atext)
dot_atom_text : Component = {
    <v:atext_plus> <w:(DOT <atext_plus>)*> => {
        let mut v = v;
        // The separator between the first atext run and the
        // remaining, "."-joined runs.
        if !w.is_empty() {
            v.push('.');
        }
        Component::Text(
            strings_flatten_into(v, w.into_iter(), "."))
    },
}

// 3.2.5. Quoted strings

// qtext           =       NO-WS-CTL /     ; Non white space controls
//                         %d33 /          ; The rest of the US-ASCII
//                         %d35-91 /       ;  characters not including "\"
//                         %d93-126        ;  or the quote character
qtext : Token<'input> = {
    NO_WS_CTL,
    LPAREN,
    RPAREN,
    LBRACKET,
    RBRACKET,
    // BACKSLASH,
    // DQUOTE,
    common_xtext,
    OTHER,
}

// qcontent        =       qtext / quoted-pair
pub(crate) QContent : Vec<Component> = {
    <q:qcontent> => components_merge(vec![ q ]),
}

qcontent : Component = {
    <c:qtext> => Component::Text(c.to_string()),
    <c:quoted_pair> => Component::Text(c.to_string()),
}

// quoted-string   =       [CFWS]
//                         DQUOTE *([FWS] qcontent) [FWS] DQUOTE
//                         [CFWS]
pub(crate) QuotedString : Vec<Component> = {
    <q:quoted_string> => components_merge(q),
}

quoted_string : Vec<Component> = {
    <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => {
        // Make sure any leading and trailing whitespace *inside* the
        // quotes is turned into Component::Text.
        components_concat!(
            // c1 is an Option<Vec<Component>>.
            c1,
            // If we have "" make sure we return Component::Text("")
            // instead of nothing.
            Component::Text("".into()),
            // c is a Vec<(Option<Component>, Component)>.  Turn it
            // into a Vec<Component>, replacing each FWS by a single
            // space of text.
            c.into_iter()
                .flat_map(|(fws, c)| {
                    fws.map(|_| Component::Text(" ".to_string()))
                        .into_iter()
                        .chain(Some(c))
                })
                .collect::<Vec<Component>>(),
            // d is an Option<Component>, turn it into a
            // Option<Vec<Component>>.
            d.map(|_| vec![Component::Text(" ".to_string())]),
            c2)
    },
}

// Variant of quoted_string that moves all comments to the left.
// NOTE(review): in the copy under review the `<name:symbol>` bindings
// and the `<Component>` type parameters of this section were missing.
// They are restored here from the ABNF comments and from the variable
// names each action uses; confirm against upstream.
quoted_string_left : Vec<Component> = {
    <c1:CFWS?> DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE <c2:CFWS?> => {
        // Make sure any leading and trailing whitespace *inside* the
        // quotes is turned into Component::Text.
        components_concat!(
            // c1 and c2 are each an Option<Vec<Component>>; both go
            // in front of the text.
            c1,
            c2,
            // If we have "" make sure we return Component::Text("")
            // instead of nothing.
            Component::Text("".into()),
            // c is a Vec<(Option<Component>, Component)>.  Turn it
            // into a Vec<Component>, replacing each FWS by a single
            // space of text.
            c.into_iter()
                .flat_map(|(fws, c)| {
                    fws.map(|_| Component::Text(" ".to_string()))
                        .into_iter()
                        .chain(Some(c))
                })
                .collect::<Vec<Component>>(),
            // d is an Option<Component>, turn it into a
            // Option<Vec<Component>>.
            d.map(|_| vec![Component::Text(" ".to_string())]))
    },
}

// See the phrase production for why this variant of the
// 'quoted_string' production exists, and why the 'CFWS?'es are not
// included.
quoted_string_prime : Vec<Component> = {
    DQUOTE <c:(<FWS?> <qcontent>)*> <d:FWS?> DQUOTE => {
        // Make sure any leading and trailing whitespace *inside* the
        // quotes is turned into Component::Text.
        components_concat!(
            // If we have "" make sure we return Component::Text("")
            // instead of nothing.
            Component::Text("".into()),
            // c is a Vec<(Option<Component>, Component)>.  Turn it
            // into a Vec<Component>, replacing each FWS by a single
            // space of text.
            c.into_iter()
                .flat_map(|(fws, c)| {
                    fws.map(|_| Component::Text(" ".to_string()))
                        .into_iter()
                        .chain(Some(c))
                })
                .collect::<Vec<Component>>(),
            // d is an Option<Component>, turn it into a
            // Option<Vec<Component>>.
            d.map(|_| vec![Component::Text(" ".to_string())]))
    },
}

// 3.2.6. Miscellaneous tokens

// word            =       atom / quoted-string
pub(crate) Word : Vec<Component> = {
    <w:word> => components_merge(w),
}

word : Vec<Component> = {
    atom,
    quoted_string,
}

// phrase          =       1*word / obs-phrase
pub(crate) Phrase : Vec<Component> = {
    <p:phrase> => components_merge(p),
}

// phrase : String = {
//     <v:word+> => strings_flatten(v, ""),
// }
//
// Note: consider the following parse tree:
//
//                      phrase
//                     /      \
//                 word        word
//                /                \
//            atom                  atom
//          /  |   \              /  |   \
//     CFWS? atext+ CFWS?    CFWS? atext+ CFWS?
//
// This has an ambiguity!  Does a CFWS immediate after the first
// atext+ belong to the first atom or the second?  And, if there are
// no CFWSes, how do we split the atext+?
//
// To avoid these problems, we modify the grammar as presented in the
// RFC as follows:
//
// NOTE(review): in the copy under review the right-hand sides of the
// productions in this section had lost their `<name:symbol>`
// bindings.  They are reconstructed from the comments inside the
// actions and from the variables (`a`, `q`, `c`, `r`, `n`) the actions
// use; the exact symbols must be confirmed against upstream.

// A phrase tail that starts with an atom or with one or more quoted
// strings.
atom_or_quoted_string : Vec<Component> = {
    <a:atom_prime> <r:cfws_or_quoted_string?> => {
        // Note: it's not possible to have multiple atoms in a row.
        // The following:
        //
        //   foo bar
        //
        // is 'atom_prime CFWS atom_prime'.
        components_concat!(a, r)
    },

    <q:quoted_string_prime+> <r:cfws_or_atom?> => {
        // But, it's possible to have multiple quoted strings in a
        // row, e.g.:
        //
        //   "foo""bar"
        //
        // Note that '"foo" "bar"' would match quoted_string_prime,
        // CFWS, quoted_string_prime.
        components_concat!(
            q.into_iter().flatten().collect::<Vec<Component>>(), r)
    },
}

// A phrase tail that may follow an atom: CFWS or quoted strings.
cfws_or_quoted_string : Vec<Component> = {
    <c:CFWS> <r:atom_or_quoted_string?> =>
        components_concat!(c, r),
    <q:quoted_string_prime+> <r:cfws_or_atom?> =>
        components_concat!(
            q.into_iter().flatten().collect::<Vec<Component>>(), r),
}

// A phrase tail that may follow quoted strings: CFWS or an atom.
cfws_or_atom : Vec<Component> = {
    <c:CFWS> <r:atom_or_quoted_string?> =>
        components_concat!(c, r),
    <a:atom_prime> <r:cfws_or_quoted_string?> =>
        components_concat!(a, r),
}

phrase : Vec<Component> = {
    <c:CFWS?> <r:atom_or_quoted_string> =>
        components_concat!(c, r),
}

// 3.4. Address Specification

// mailbox         =       name-addr / addr-spec
// pub(crate) Mailbox : Vec<Component> = {
//     mailbox,
// }
//
// mailbox : Vec<Component> = {
//     name_addr,
//     addr_spec,
// }

// name-addr       =       [display-name] angle-addr
pub(crate) NameAddr : Vec<Component> = {
    <n:name_addr> => components_merge(n),
}

// The display_name ends in an optional CFWS and the angle_addr starts
// with one.  This causes an ambiguity.  We resolve the ambiguity by
// introducing the angle_addr_prime production, which doesn't match a
// leading CFWS non-terminal.  But, this creates another small
// problem.  Consider:
//
//   "  <foo@bar.com>"
//
// This is: [CFWS angle-addr-prime].  The CFWS isn't folded into the
// angle-addr-prime to fix the aforementioned ambiguity.  But it also
// doesn't reduce to a display-name, because there are no phrases, and
// display-name requires at least one phrase!  Thus, we special case
// this.
name_addr : Vec<Component> = {
    <n:display_name?> <a:angle_addr_prime> =>
        components_concat!(n, a),
    <c:CFWS> <a:angle_addr_prime> =>
        components_concat!(c, a),
}

// An extension.  See addr-spec-or-other for details.
// name-addr-or-other = [display-name] angle-addr-or-other
//
// NOTE(review): in the copy under review the `<name:symbol>` bindings
// and the `<Component>` type parameters of this section were missing.
// They are restored here from the ABNF comments and from the variable
// names each action uses; confirm against upstream.
pub(crate) NameAddrOrOther : Vec<Component> = {
    <n:name_addr_or_other> => components_merge(n),
}

name_addr_or_other : Vec<Component> = {
    <n:display_name?> <a:angle_addr_or_other_prime> =>
        components_concat!(n, a),
    <c:CFWS> <a:angle_addr_or_other_prime> =>
        components_concat!(c, a),
}

// angle-addr      =       [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
pub(crate) AngleAddr : Vec<Component> = {
    <a:angle_addr> => components_merge(a),
}

angle_addr : Vec<Component> = {
    <c1:CFWS?> LANGLE <a:addr_spec> RANGLE <c2:CFWS?> =>
        components_concat!(c1, a, c2),
}

// Variant of angle_addr without the leading CFWS.
angle_addr_prime : Vec<Component> = {
    LANGLE <a:addr_spec> RANGLE <c2:CFWS?> =>
        components_concat!(a, c2),
}

// An extension.  See addr-spec-or-other for details.
// angle-addr-or-other = [CFWS] "<" addr-spec-or-other ">" [CFWS]
pub(crate) AngleAddrOrOther : Vec<Component> = {
    <a:angle_addr_or_other> => components_merge(a),
}

angle_addr_or_other : Vec<Component> = {
    <c1:CFWS?> LANGLE <a:addr_spec_or_other> RANGLE <c2:CFWS?> =>
        components_concat!(c1, a, c2),
}

// Variant of angle_addr_or_other without the leading CFWS.
angle_addr_or_other_prime : Vec<Component> = {
    LANGLE <a:addr_spec_or_other> RANGLE <c2:CFWS?> =>
        components_concat!(a, c2),
}

// display-name    =       phrase
pub(crate) DisplayName : Vec<Component> = {
    <d:display_name> => components_merge(d),
}

display_name : Vec<Component> = {
    <p:phrase> => p,
}

// 3.4.1. Addr-spec specification

// addr-spec       =       local-part "@" domain
pub(crate) AddrSpec : Vec<Component> = {
    <a:addr_spec> => components_merge(a),
}

addr_spec : Vec<Component> = {
    <l:local_part> AT <d:domain> => {
        let mut l = components_merge(l);
        let mut d = components_merge(d);

        // local_part and domain can both be preceded or followed by
        // comment-folding whitespace.  So, something like:
        //
        //   "<(comment) (comment) \r\n foo (comment)@ (comment) bar.com (comment)>"
        //
        // is valid (it's foo@bar.com).

        // The local part may start with comments and the domain part
        // may end with comments.  Hence the text of the local part is
        // the last component of `l` and the text of the domain is the
        // first component of `d`.
        let local_part = l.pop()
            // Only build the message if the invariant is violated.
            .unwrap_or_else(|| panic!("empty local-part ({:?})", input));
        let domain = d.remove(0);

        // Merging the three pieces is expected to yield exactly one
        // Component::Text, which becomes the address.
        let mut v = components_merge(
            vec![local_part, Component::Text("@".into()), domain]);
        assert_eq!(v.len(), 1, "Expected 1 component, got: {:?}", v);
        let addr = match v.pop() {
            Some(Component::Text(addr)) => Component::Address(addr),
            Some(c) => panic!("addr_spec production failed: {:?}", c),
            None => panic!("addr_spec production failed"),
        };

        components_concat!(l, addr, d)
    },
}

// An extension to extract anything where an email address is
// expected.  This is useful for OpenPGP, because sometimes the
// address is not an email address, but a URI.  Instead of adding a
// whole URI parser, we just return the raw content.
//
// addr-spec-or-other = local-part "@" domain
//                    | anything
pub(crate) AddrSpecOrOther : Vec<Component> = {
    <a:addr_spec_or_other> => components_merge(a),
}

addr_spec_or_other : Vec<Component> = {
    addr_spec,

    // Error recovery: keep the error and the raw input between the
    // start and end locations.
    <start:@L> <e:!> <end:@R> => {
        match e {
            lalrpop_util::ErrorRecovery { error, .. } => {
                vec![
                    Component::InvalidAddress(
                        parse_error_downcast(error),
                        input[start..end].to_string()
                    )
                ]
            }
        }
    }
}

// local-part      =       dot-atom / quoted-string / obs-local-part
pub(crate) LocalPart : Vec<Component> = {
    <l:local_part> => components_merge(l),
}

local_part : Vec<Component> = {
    dot_atom_left,
    quoted_string_left,
}

// domain          =       dot-atom / domain-literal / obs-domain
pub(crate) Domain : Vec<Component> = {
    <d:domain> => components_merge(d),
}

domain : Vec<Component> = {
    dot_atom_right,
    domain_literal_right,
}

// domain-literal  =       [CFWS] "[" *([FWS] dcontent) [FWS] "]" [CFWS]
pub(crate) DomainLiteral : Vec<Component> = {
    <d:domain_literal> => components_merge(d),
}

domain_literal : Vec<Component> = {
    <c1:CFWS?> LBRACKET <c:(<FWS?> <dcontent>)*> <d:FWS?> RBRACKET <c2:CFWS?> => {
        components_concat!(
            // c1 is an Option<Vec<Component>>.
            c1,
            Component::Text("[".into()),
            // c is a Vec<(Option<Component>, Token)>.  Turn it
            // into a Vec<Component>.
            c.into_iter()
                .flat_map(|(fws, c)| {
                    fws.into_iter()
                        .chain(Some(Component::Text(c.to_string())))
                })
                .collect::<Vec<Component>>(),
            // d is an Option<Component>, turn it into an
            // Option<Vec<Component>>.
            d.map(|x| vec![x]),
            Component::Text("]".into()),
            c2)
    }
}

// A variant of domain_literal that places all comments to the right.
domain_literal_right : Vec<Component> = {
    <c1:CFWS?> LBRACKET <c:(<FWS?> <dcontent>)*> <d:FWS?> RBRACKET <c2:CFWS?> => {
        components_concat!(
            Component::Text("[".into()),
            // c is a Vec<(Option<Component>, Token)>.  Turn it
            // into a Vec<Component>.
            c.into_iter()
                .flat_map(|(fws, c)| {
                    fws.into_iter()
                        .chain(Some(Component::Text(c.to_string())))
                })
                .collect::<Vec<Component>>(),
            // d is an Option<Component>, turn it into an
            // Option<Vec<Component>>.
            d.map(|x| vec![x]),
            Component::Text("]".into()),
            c1,
            c2)
    }
}

// dcontent        =       dtext / quoted-pair
pub(crate) DContent : Vec<Component> = {
    <d:dcontent> => components_merge(vec![ Component::Text(d.to_string()) ]),
}

dcontent : Token<'input> = {
    dtext,
    quoted_pair,
}

// dtext           =       NO-WS-CTL /     ; Non white space controls
//                         %d33-90 /       ; The rest of the US-ASCII
//                         %d94-126        ;  characters not including "[",
//                                         ;  "]", or "\"
dtext : Token<'input> = {
    NO_WS_CTL,
    LPAREN,
    RPAREN,
    //LBRACKET,
    //RBRACKET,
    //BACKSLASH,
    DQUOTE,
    common_xtext,
    OTHER,
}

// A production to escape a display name.
//
// Note: all characters are allowed in display names except NUL, CR and LF.
pub(crate) EscapedDisplayName : String = {
    escaped_display_name
}

escaped_display_name : String = {
    escaped_display_name_token* => {
        // Fold state: (quotes needed so far, previous token was a
        // single space, text accumulated so far).  `last_was_space`
        // starts out true, so a leading space and an empty input both
        // end up quoted.
        let (need_quote, last_was_space, s) = <>.into_iter()
            .fold((false, true, String::new()),
                  |(need_quote, last_was_space, mut s), (_need_quote, t)| {
                      s.push_str(&t);
                      let is_space = t == " ";
                      ((need_quote || _need_quote
                        || (last_was_space && is_space)),
                       is_space,
                       s)
                  });

        // A trailing space also forces quoting.
        if need_quote || last_was_space {
            format!("\"{}\"", s)
        } else {
            s
        }
    }
}

// The bool means whether to put the whole thing in quotes.
escaped_display_name_token : (bool, String) = {
    WSP => (false, <>.to_string()),

    // Needs to be put in quotes and escaped.
    NO_WS_CTL => (true, format!("\\{}", <>)),

    // CR and LF are invalid in a display name.

    // Except for DQUOTE, specials can be put in quotes.
    LPAREN => (true, <>.to_string()),
    RPAREN => (true, <>.to_string()),
    LANGLE => (true, <>.to_string()),
    RANGLE => (true, <>.to_string()),
    LBRACKET => (true, <>.to_string()),
    RBRACKET => (true, <>.to_string()),
    COLON => (true, <>.to_string()),
    SEMICOLON => (true, <>.to_string()),
    AT => (true, <>.to_string()),
    BACKSLASH => (true, <>.to_string()),
    COMMA => (true, <>.to_string()),
    DOT => (true, <>.to_string()),

    // Needs to be put in quotes and escaped.
    DQUOTE => (true, "\\\"".to_string()),

    OTHER => (false, <>.to_string()),
}

// Maps the grammar's terminals onto the tokens of the external lexer.
extern {
    type Location = usize;
    type Error = LexicalError;

    enum lexer::Token<'input> {
        WSP => lexer::Token::WSP(_),
        NO_WS_CTL => lexer::Token::NO_WS_CTL(_),
        CR => lexer::Token::CR,
        LF => lexer::Token::LF,
        LPAREN => lexer::Token::LPAREN,
        RPAREN => lexer::Token::RPAREN,
        LANGLE => lexer::Token::LANGLE,
        RANGLE => lexer::Token::RANGLE,
        LBRACKET => lexer::Token::LBRACKET,
        RBRACKET => lexer::Token::RBRACKET,
        COLON => lexer::Token::COLON,
        SEMICOLON => lexer::Token::SEMICOLON,
        AT => lexer::Token::AT,
        BACKSLASH => lexer::Token::BACKSLASH,
        COMMA => lexer::Token::COMMA,
        DOT => lexer::Token::DOT,
        DQUOTE => lexer::Token::DQUOTE,
        OTHER => lexer::Token::OTHER(_),
    }
}