diff options
author | Neal H. Walfield <neal@pep.foundation> | 2019-04-30 09:11:57 +0200 |
---|---|---|
committer | Neal H. Walfield <neal@pep.foundation> | 2019-04-30 09:11:57 +0200 |
commit | 9f0a3e7e043024ea2ee2ef546c1832ac8a579b4e (patch) | |
tree | 2cc9d2ad032bfd6197e16044cc6969b3db7ced2e | |
parent | d6eb647ef27bff222bb1bc04b8a7307a864a63cc (diff) |
rfc2822: Improve comments
-rw-r--r-- | rfc2822/src/grammar.lalrpop | 33 | ||||
-rw-r--r-- | rfc2822/src/lexer.rs | 38 |
2 files changed, 17 insertions, 54 deletions
diff --git a/rfc2822/src/grammar.lalrpop b/rfc2822/src/grammar.lalrpop index e400a0aa..4cc571f3 100644 --- a/rfc2822/src/grammar.lalrpop +++ b/rfc2822/src/grammar.lalrpop @@ -33,7 +33,8 @@ grammar<'input>; // A further convention is an ssh-host-uri production: // // ssh-host-uri = "ssh://" dns-hostname - +// +// Support for this should be added in the future. CRLF: () = { CR LF @@ -83,7 +84,7 @@ specials : Token<'input> = { // quoted-pair = ("\" text) / obs-qp // -// In RFC 2822, text is a single character and the BACKSLAH is +// In RFC 2822, text is a single character and the BACKSLASH is // followed by exactly one character. As an optimization, our lexer // groups runs of 'text' characters into a single token, Token::OTHER. // Since a quoted pair can always be followed by a run of OTHER @@ -272,8 +273,8 @@ atom : Vec<Component> = { c2), } -// See the phrase production for this variant of the 'atom' production -// exists, and why the 'CFWS?'es are not included. +// See the phrase production for why this variant of the 'atom' +// production exists, and why the 'CFWS?'es are not included. atom_prime : Component = { <a:atext_dot_plus> => Component::Text(a), } @@ -544,18 +545,18 @@ pub(crate) NameAddr : Vec<Component> = { } // The display_name ends in an optional CFWS and the angle_addr starts -// with one. This causes an ambiguity. The angle_addr_prime -// production removes the optional leading CFWS non-terminal. -// -// But, this creates a small problem. Consider: +// with one. This causes an ambiguity. We resolve the ambiguity by +// introducing the angle_addr_prime production, which doesn't match a +// leading CFWS non-terminal. But, this creates another small +// problem. Consider: // // " <email@example.org>" // -// This is: [CFWS angle-addr]. Now, we are using angle-addr-prime so -// that it won't match leading CFWSes, because they are matched by -// display-name. But display-name doesn't match in this case because -// there are no phrases, and it requires at least on phrase! The -// second rule below covers this edge case. +// This is: [CFWS angle-addr-prime]. The CFWS isn't folded into the +// angle-addr-prime to fix the aforementioned ambiguity. But it also +// doesn't reduce to a display-name, because there are no phrases, and +// display-name requires at least one phrase! Thus, we special case +// this. name_addr : Vec<Component> = { <n:display_name?> <a:angle_addr_prime> => components_concat!(n, a), @@ -609,7 +610,7 @@ addr_spec : Vec<Component> = { // // is valid (it's foo@bar.com). - // The local part may start with commends and the domain part + // The local part may start with comments and the domain part // may end with comments. let local_part = l.pop().expect("empty local_part"); let domain = d.remove(0); @@ -675,7 +676,7 @@ domain_literal : Vec<Component> = { }) .flatten() .collect::<Vec<Component>>(), - // d is an Option<Component>, turn it into a + // d is an Option<Component>, turn it into an // Option<Vec<Component>>. d.map(|x| vec![x]), Component::Text("]".into()), @@ -700,7 +701,7 @@ domain_literal_right : Vec<Component> = { }) .flatten() .collect::<Vec<Component>>(), - // d is an Option<Component>, turn it into a + // d is an Option<Component>, turn it into an // Option<Vec<Component>>. d.map(|x| vec![x]), Component::Text("]".into()), diff --git a/rfc2822/src/lexer.rs b/rfc2822/src/lexer.rs index 5d7eb049..eadd4aa2 100644 --- a/rfc2822/src/lexer.rs +++ b/rfc2822/src/lexer.rs @@ -85,44 +85,6 @@ impl<'input> Lexer<'input> { // 3.2.1. Primitive Tokens -// The symbols. The default tokenizer returns &str, but we want -// chars. So, we need to do a little dance. -// -// match { -// // All unicode white space. -// // 2.2.2. says that whitespace is only ' ' and '\t'. -// r" \t" => WSP_TOKEN, -// -// r"(?x) -// [\x01-\x08 # %d1-8 / -// \x0b # %d11 / -// \x0c # %d12 / -// \x0e-\x1f # %d14-31 / -// \x7f # %d127 -// ]" => NO_WS_CTL_TOKEN, -// -// "\r" => CR_TOKEN, -// "\n" => LF_TOKEN, -// -// // specials -// "(" => LPAREN_TOKEN, -// ")" => RPAREN_TOKEN, -// "<" => LANGLE_TOKEN, -// ">" => RANGLE_TOKEN, -// "[" => LBRACKET_TOKEN, -// "]" => RBRACKET_TOKEN, -// ":" => COLON_TOKEN, -// ";" => SEMICOLON_TOKEN, -// "@" => AT_TOKEN, -// "\\" => BACKSLASH_TOKEN, -// "," => COMMA_TOKEN, -// "." => DOT_TOKEN, -// "\"" => DQUOTE_TOKEN, -// } else { -// // Everything else. -// r"." => OTHER_TOKEN -// } - impl<'input> Iterator for Lexer<'input> { type Item = LexerItem<Token<'input>, usize, Error>; |