diff options
author | Neal H. Walfield <neal@pep.foundation> | 2019-04-29 17:58:46 +0200 |
---|---|---|
committer | Neal H. Walfield <neal@pep.foundation> | 2019-04-29 18:06:33 +0200 |
commit | d6eb647ef27bff222bb1bc04b8a7307a864a63cc (patch) | |
tree | 1cf2d51da320c0724adb814ea4265dcc7f10fda7 /rfc2822 | |
parent | 1fa018acc90194e1cd1ddf2bd0c6706000a3bc9e (diff) |
rfc2822: Use quickcheck. Fix edge cases.
- Use quickcheck to test the parser.
- To fix a number of edge cases that it uncovered, don't discard
Component::WS, and compress them in components_merge more
carefully intelligently.
- Update the test vectors, and fix the edge cases.
Diffstat (limited to 'rfc2822')
-rw-r--r-- | rfc2822/Cargo.toml | 5 | ||||
-rw-r--r-- | rfc2822/src/component.rs | 53 | ||||
-rw-r--r-- | rfc2822/src/grammar.lalrpop | 113 | ||||
-rw-r--r-- | rfc2822/src/lib.rs | 92 | ||||
-rw-r--r-- | rfc2822/src/roundtrip.rs | 1081 |
5 files changed, 1265 insertions, 79 deletions
diff --git a/rfc2822/Cargo.toml b/rfc2822/Cargo.toml index 0b149513..20849a01 100644 --- a/rfc2822/Cargo.toml +++ b/rfc2822/Cargo.toml @@ -23,5 +23,10 @@ maintenance = { status = "actively-developed" } failure = "0.1.2" lalrpop-util = "0.16" +[dev-dependencies] +lazy_static = "1.3" +quickcheck = "0.8" +rand = "0.6" + [build-dependencies] lalrpop = "0.16" diff --git a/rfc2822/src/component.rs b/rfc2822/src/component.rs index 808f497f..f7b42e4b 100644 --- a/rfc2822/src/component.rs +++ b/rfc2822/src/component.rs @@ -83,42 +83,6 @@ macro_rules! components_concat { }}; } -// Kills leading (`left`) and/or trailing (`right`) whitespace -// (`Component::WS`). -pub(crate) fn components_kill_ws(v: Option<Vec<Component>>, - left: bool, right: bool) - -> Vec<Component> -{ - tracer!(::TRACE, "components_kill_ws"); - t!("v: {:?}, left: {}, right: {}", v, left, right); - - let v = if let Some(mut v) = v { - if v.len() > 0 && right { - let mut kill = false; - if let Component::WS = v[v.len() - 1] { - kill = true; - } - if kill { - v.pop(); - } - } - if v.len() > 0 && left { - let mut kill = false; - if let Component::WS = v[0] { - kill = true; - } - if kill { - v.remove(0); - } - } - v - } else { - vec![] - }; - t!("=> {:?}", v); - v -} - // Merge the components in the vector. pub(crate) fn components_merge(components: Vec<Component>) -> Vec<Component> @@ -142,7 +106,6 @@ pub(crate) fn components_merge(components: Vec<Component>) None, Middle, MiddleRight, - Right, }; let mut kill = Kill::None; @@ -163,9 +126,9 @@ pub(crate) fn components_merge(components: Vec<Component>) l.push_str(r); kill = Kill::MiddleRight; }, - (_, + (Component::WS, Component::WS, - Some(Component::WS)) => { + _) => { // This can happen when we have a local-part of the // following form: // @@ -176,7 +139,13 @@ pub(crate) fn components_merge(components: Vec<Component>) // the right: // // COMMENT WS WS COMMENT TEXT - kill = Kill::Right; + // + // It is also possible to have: + // + // WS WS COMMENT TEXT + // + // as CFWS can expand to just a WS. + kill = Kill::Middle; }, _ => (), } @@ -190,10 +159,6 @@ pub(crate) fn components_merge(components: Vec<Component>) middleo = iter.next(); righto = iter.next(); } - Kill::Right => { - middleo = Some(middle); - righto = iter.next(); - } Kill::None => { components.push(left); left = middle; diff --git a/rfc2822/src/grammar.lalrpop b/rfc2822/src/grammar.lalrpop index f20735ed..e400a0aa 100644 --- a/rfc2822/src/grammar.lalrpop +++ b/rfc2822/src/grammar.lalrpop @@ -8,7 +8,6 @@ use strings::{ }; use component::{ Component, - components_kill_ws, components_merge, }; use lexer; @@ -45,6 +44,10 @@ CRLF: () = { // %d12 / // %d14-127 / // obs-text +pub(crate) Text : Token<'input> = { + text, +} + text : Token<'input> = { WSP, NO_WS_CTL, @@ -99,6 +102,12 @@ quoted_pair : Token<'input> = { // Runs of FWS, comment or CFWS that occur between lexical tokens in // a structured field header are semantically interpreted as a // single space character. + +// FWS can't be exported, because it uses inline. +pub(crate) FWS_ : Component = { + FWS +} + #[inline] FWS : Component = { (WSP* CRLF)? WSP+ => Component::WS, @@ -108,6 +117,10 @@ FWS : Component = { // %d33-39 / ; The rest of the US-ASCII // %d42-91 / ; characters not including "(", // %d93-126 ; ")", or "\" +pub(crate) CText : Token<'input> = { + ctext +} + ctext : Token<'input> = { NO_WS_CTL, @@ -254,9 +267,9 @@ pub(crate) Atom : Vec<Component> = { atom : Vec<Component> = { <c1:CFWS?> <a:atext_dot_plus> <c2:CFWS?> => components_concat!( - components_kill_ws(c1, false, true), + c1, Component::Text(a), - components_kill_ws(c2, true, false)), + c2), } // See the phrase production for this variant of the 'atom' production @@ -277,28 +290,19 @@ pub(crate) DotAtom : Vec<Component> = { dot_atom : Vec<Component> = { <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!( - components_kill_ws(c1, false, true), - a, - components_kill_ws(c2, true, false)), + components_concat!(c1, a, c2), } // A variant of dot_atom that places all comments to the left. dot_atom_left : Vec<Component> = { <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!( - components_kill_ws( - Some(components_concat!(c1, c2)), false, true), - a), + components_concat!(c1, c2, a), } // A variant of dot_atom that places all comments to the right. dot_atom_right : Vec<Component> = { <c1:CFWS?> <a:dot_atom_text> <c2:CFWS?> => - components_concat!( - a, - components_kill_ws( - Some(components_concat!(c1, c2)), true, false)), + components_concat!(a, c1, c2), } // dot-atom-text = 1*atext *("." 1*atext) @@ -340,6 +344,10 @@ qtext : Token<'input> = { } // qcontent = qtext / quoted-pair +pub(crate) QContent : Vec<Component> = { + <q:qcontent> => components_merge(vec![ q ]), +} + qcontent : Component = { <c:qtext> => Component::Text(c.to_string()), <c:quoted_pair> => Component::Text(c.to_string()), @@ -359,6 +367,9 @@ quoted_string : Vec<Component> = { components_concat!( // c1 is an Option<Vec<Component>>. c1, + // If we have "" make sure we return Component::Text("") + // instead of nothing. + Component::Text("".into()), // c is a Vec<(Option<Component>, Component)>. Turn it // into a Vec<Component>. c.into_iter() @@ -385,7 +396,11 @@ quoted_string_left : Vec<Component> = { // quotes is turned into Component::Text. components_concat!( // c1 is an Option<Vec<Component>>. - components_kill_ws(Some(components_concat!(c1, c2)), false, true), + c1, + c2, + // If we have "" make sure we return Component::Text("") + // instead of nothing. + Component::Text("".into()), // c is a Vec<(Option<Component>, Component)>. Turn it // into a Vec<Component>. c.into_iter() @@ -411,6 +426,9 @@ quoted_string_prime : Vec<Component> = { // Make sure any leading and trailing whitespace *inside* the // quotes is turned into Component::Text. components_concat!( + // If we have "" make sure we return Component::Text("") + // instead of nothing. + Component::Text("".into()), // c is a Vec<(Option<Component>, Component)>. Turn it // into a Vec<Component>. c.into_iter() @@ -459,11 +477,11 @@ pub(crate) Phrase : Vec<Component> = { // / \ // atom atom // / | \ / | \ -// CFWS+? atext+ CFWS? CFWS+? atext+ CFWS? +// CFWS? atext+ CFWS? CFWS? atext+ CFWS? // // This has an ambiguity! Does a CFWS immediate after the first // atext+ belong to the first atom or the second? And, if there are -// no CFWSes, how do we split the atext? +// no CFWSes, how do we split the atext+? // // To avoid these problems, we modify the grammar as presented in the // RFC as follows: @@ -528,9 +546,22 @@ pub(crate) NameAddr : Vec<Component> = { // The display_name ends in an optional CFWS and the angle_addr starts // with one. This causes an ambiguity. The angle_addr_prime // production removes the optional leading CFWS non-terminal. +// +// But, this creates a small problem. Consider: +// +// " <email@example.org>" +// +// This is: [CFWS angle-addr]. Now, we are using angle-addr-prime so +// that it won't match leading CFWSes, because they are matched by +// display-name. But display-name doesn't match in this case because +// there are no phrases, and it requires at least on phrase! The +// second rule below covers this edge case. name_addr : Vec<Component> = { <n:display_name?> <a:angle_addr_prime> => components_concat!(n, a), + + <c:CFWS> <a:angle_addr_prime> => + components_concat!(c, a), } @@ -551,8 +582,12 @@ angle_addr_prime : Vec<Component> = { // display-name = phrase +pub(crate) DisplayName : Vec<Component> = { + <d:display_name> => components_merge(d), +} + display_name : Vec<Component> = { - <p:phrase> => components_kill_ws(Some(p), true, true), + <p:phrase> => p, } // 3.4.1. Addr-spec specification @@ -596,15 +631,23 @@ addr_spec : Vec<Component> = { } // local-part = dot-atom / quoted-string / obs-local-part +pub(crate) LocalPart : Vec<Component> = { + <l:local_part> => components_merge(l), +} + local_part : Vec<Component> = { dot_atom_left, quoted_string_left, } // domain = dot-atom / domain-literal / obs-domain +pub(crate) Domain : Vec<Component> = { + <d:domain> => components_merge(d), +} + domain : Vec<Component> = { dot_atom_right, - domain_literal, + domain_literal_right, } @@ -640,8 +683,38 @@ domain_literal : Vec<Component> = { } } +domain_literal_right : Vec<Component> = { + <c1:CFWS?> LBRACKET <c:(<FWS?> <dcontent>)*> <d:FWS?> RBRACKET <c2:CFWS?> => { + components_concat!( + Component::Text("[".into()), + // c is a Vec<(Option<Component>, Component)>. Turn it + // into a Vec<Component>. + c.into_iter() + .map(|(fws, c)| { + let c = Component::Text(c.to_string()); + if let Some(fws) = fws { + vec![fws, c] + } else { + vec![c] + } + }) + .flatten() + .collect::<Vec<Component>>(), + // d is an Option<Component>, turn it into a + // Option<Vec<Component>>. + d.map(|x| vec![x]), + Component::Text("]".into()), + c1, + c2) + } +} + // dcontent = dtext / quoted-pair +pub(crate) DContent : Vec<Component> = { + <d:dcontent> => components_merge(vec![ Component::Text(d.to_string()) ]), +} + dcontent : Token<'input> = { dtext, quoted_pair, diff --git a/rfc2822/src/lib.rs b/rfc2822/src/lib.rs index 0008a928..5f348814 100644 --- a/rfc2822/src/lib.rs +++ b/rfc2822/src/lib.rs @@ -1,6 +1,10 @@ extern crate failure; extern crate lalrpop_util; +#[cfg(test)] #[macro_use] extern crate lazy_static; +#[cfg(test)] #[macro_use] extern crate quickcheck; +#[cfg(test)] extern crate rand; + use lalrpop_util::ParseError; #[macro_use] mod macros; @@ -22,6 +26,9 @@ mod grammar; #[allow(unused_imports, dead_code)] mod grammar; +#[cfg(test)] +mod roundtrip; + const TRACE : bool = false; pub type Result<T> = ::std::result::Result<T, Error>; @@ -488,9 +495,17 @@ mod tests { // If a CFWS precedes an atom, any comments are retained, but // trailing white space is removed. c("\r\n foobar \r\n ", - Some(vec![Component::Text("foobar".into())])); + Some(vec![ + Component::WS, + Component::Text("foobar".into()), + Component::WS, + ])); c(" \r\n foobar ", - Some(vec![Component::Text("foobar".into())])); + Some(vec![ + Component::WS, + Component::Text("foobar".into()), + Component::WS, + ])); c("(some comment)foobar", Some(vec![ @@ -500,11 +515,13 @@ mod tests { c("(some comment) foobar", Some(vec![ Component::Comment("some comment".into()), + Component::WS, Component::Text("foobar".into()) ])); c("(so\r\n m \r\n e co\r\n mme \r\n nt\r\n ) \r\n foobar", Some(vec![ Component::Comment("so m e co mme nt ".into()), + Component::WS, Component::Text("foobar".into()) ])); @@ -525,8 +542,9 @@ mod tests { Component::Comment("b".into()), Component::WS, Component::Comment("c".into()), - // Whitespace between a comment and an atom is elided. + Component::WS, Component::Text("foobar".into()), + Component::WS, Component::Comment("d".into()), Component::Comment("e".into()), Component::WS, @@ -685,37 +703,55 @@ mod tests { // Internal space is not allowed. c("foo bar", None); - // But lead and trailing space is okay (and it is removed). + // But leading and trailing space is okay. c(" f", - Some(vec![Component::Text("f".into())])); + Some(vec![ + Component::WS, + Component::Text("f".into()), + ])); c(" f", - Some(vec![Component::Text("f".into())])); + Some(vec![ + Component::WS, + Component::Text("f".into()), + ])); c("f ", - Some(vec![Component::Text("f".into())])); + Some(vec![ + Component::Text("f".into()), + Component::WS, + ])); c("f ", - Some(vec![Component::Text("f".into())])); + Some(vec![ + Component::Text("f".into()), + Component::WS, + ])); // Comments are also okay. c("(comment) f", Some(vec![ Component::Comment("comment".into()), + Component::WS, Component::Text("f".into()), ])); c(" (comment) f", Some(vec![ Component::WS, Component::Comment("comment".into()), + Component::WS, Component::Text("f".into()), ])); c(" f (comment) ", Some(vec![ + Component::WS, Component::Text("f".into()), + Component::WS, Component::Comment("comment".into()), Component::WS, ])); c(" f (comment)", Some(vec![ + Component::WS, Component::Text("f".into()), + Component::WS, Component::Comment("comment".into()), ])); } @@ -782,21 +818,34 @@ mod tests { // White space is ignored at the beginning and ending of the // local part, but not in the middle. c("< \r\n foo.bar@x>", - Some(vec![Component::Address("foo.bar@x".into())])); + Some(vec![ + Component::WS, + Component::Address("foo.bar@x".into()), + ])); c("< \r\n foo.bar \r\n @x>", - Some(vec![Component::Address("foo.bar@x".into())])); + Some(vec![ + Component::WS, + Component::Address("foo.bar@x".into()) + ])); c("< (quuz) \r\n foo.bar@x>", Some(vec![ Component::WS, Component::Comment("quuz".into()), + Component::WS, Component::Address("foo.bar@x".into()), ])); c("< \r\n foo.bar \r\n @x>", - Some(vec![Component::Address("foo.bar@x".into())])); + Some(vec![ + Component::WS, + Component::Address("foo.bar@x".into()), + ])); c("<f \r\n oo.bar@x>", None); c("<foo.bar@x \r\n >", - Some(vec![Component::Address("foo.bar@x".into())])); + Some(vec![ + Component::Address("foo.bar@x".into()), + Component::WS, + ])); c("<f \r\n oo.bar@x \r\n y>", None); c(" <foo.bar@x> ", @@ -811,7 +860,10 @@ mod tests { Some(vec![ Component::WS, Component::Comment("Hello!".into()), - Component::Address("foo.bar@x".into())])); + Component::WS, + Component::Address("foo.bar@x".into()), + Component::WS, + ])); // Comments in the local part are always moved left. c("< (Hello!) foo.bar (bye?) \r\n @x \r\n >", Some(vec![ @@ -819,17 +871,24 @@ mod tests { Component::Comment("Hello!".into()), Component::WS, Component::Comment("bye?".into()), + Component::WS, Component::Address("foo.bar@x".into()), + Component::WS, ])); c("< (Hello!) foo.bar@x \r\n >", Some(vec![ Component::WS, Component::Comment("Hello!".into()), - Component::Address("foo.bar@x".into())])); + Component::WS, + Component::Address("foo.bar@x".into()), + Component::WS, + ])); // Comments in the domain part are always moved right. c("< x@ (Hello!) foo.bar (bye?) >", Some(vec![ + Component::WS, Component::Address("x@foo.bar".into()), + Component::WS, Component::Comment("Hello!".into()), Component::WS, Component::Comment("bye?".into()), @@ -841,7 +900,10 @@ mod tests { Some(vec![ Component::WS, Component::Comment("Hello!".into()), - Component::Address("f oo.bar@x".into())])); + Component::WS, + Component::Address("f oo.bar@x".into()), + Component::WS, + ])); } #[test] diff --git a/rfc2822/src/roundtrip.rs b/rfc2822/src/roundtrip.rs new file mode 100644 index 00000000..b9ecfea5 --- /dev/null +++ b/rfc2822/src/roundtrip.rs @@ -0,0 +1,1081 @@ +use std::cmp; + +use lalrpop_util::ParseError; +use quickcheck::{Arbitrary, Gen}; +use rand::Rng; +use lexer; +use grammar; +use component::{Component, components_merge}; + +// We put each type of token in its own struct, which contains exactly +// one element, a String (e.g., 'Foo(String)'). +macro_rules! token { + ( $name:ident, $g:ident, $arbitrary:block ) => { + #[derive(Debug, Clone)] + #[allow(non_camel_case_types)] + struct $name(String); + + impl $name { + fn to_string(self) -> String { + self.0.to_string() + } + } + + impl Arbitrary for $name { + fn arbitrary<G: Gen>(g: &mut G) -> Self { + let $g = g; + // eprintln!("{} -> generating...", stringify!($name)); + let r = $arbitrary; + // eprintln!("{} <- {:?}", stringify!($name), r); + $name(r.to_string()) + } + } + }; + ( $name:ident, $g:ident, $arbitrary:block, $self:ident, $shrink:block ) => { + #[derive(Debug, Clone)] + #[allow(non_camel_case_types)] + struct $name(String); + + impl $name { + fn to_string(self) -> String { + self.0.to_string() + } + } + + impl Arbitrary for $name { + fn arbitrary<G: Gen>(g: &mut G) -> Self { + let $g = g; + // eprintln!("{} -> generating...", stringify!($name)); + let r = $arbitrary; + // eprintln!("{} <- {:?}", stringify!($name), r); + $name(r.to_string()) + } + + fn shrink(&$self) -> Box<Iterator<Item=Self>> { + $shrink + } + } + }; +} + +trait Production { + fn components(&self) -> Vec<Component>; + fn input(&self) -> String; + + fn inner(&self) -> (Vec<Component>, String) { + (self.components(), self.input()) + } + + fn to_input(&self) -> Input { + Input::from(self.components(), self.input()) + } +} + +// Input (s) and its expected parse (c). +#[derive(Debug, Clone)] +struct Input { + c: Vec<Component>, + s: String, +} + +impl Production for Input { + fn components(&self) -> Vec<Component> { + self.c.clone() + } + + fn input(&self) -> String { + self.s.clone() + } +} + +impl Input { + fn new() -> Self { + Input { + c: vec![], + s: String::new(), + } + } + + fn from<S: AsRef<str>>(c: Vec<Component>, s: S) -> Self { + Input { + c: c, + s: s.as_ref().to_string(), + } + } + + fn push<S: AsRef<str>>(&mut self, c: Component, s: S) { + self.c.push(c); + self.s.push_str(s.as_ref()); + } + + fn append<S: AsRef<str>>(&mut self, mut c: Vec<Component>, s: S) { + self.c.append(&mut c); + self.s.push_str(s.as_ref()); + } + + fn concat<P: Production>(&mut self, other: P) { + self.c.append(&mut other.components()); + self.s.push_str(&other.input()[..]); + } + + fn push_input<S: AsRef<str>>(&mut self, s: S) { + self.s.push_str(s.as_ref()) + } +} + +macro_rules! production { + ( $name:ident, $g:ident, $arbitrary:block ) => { + #[derive(Debug, Clone)] + struct $name(Input); + + impl Production for $name { + fn components(&self) -> Vec<Component> { + self.0.components() + } + + fn input(&self) -> String { + self.0.input() + } + } + + impl Arbitrary for $name { + fn arbitrary<G: Gen>(g: &mut G) -> Self { + let $g = g; + eprintln!("{} -> generating...", stringify!($name)); + let r = $arbitrary; + eprintln!("{} <- {:?}", stringify!($name), r); + $name(r) + } + } + }; +} + +macro_rules! parser_quickcheck { + ( $type:ident, $parser:ident ) => { + quickcheck! { + fn $parser(t: $type) -> bool { + let s = t.clone().input(); + let lexer = lexer::Lexer::new(s.as_ref()); + + match grammar::$parser::new().parse(lexer) { + Ok(components) => { + let got = components; + let expected = t.components(); + + if got == expected { + true + } else { + eprintln!(" Got: {:?}\nExpected: {:?}", + got, expected); + for (i, (got, expected)) + in got.iter().zip(expected).enumerate() + { + if *got != expected { + eprintln!("First difference at offset {}: \ + got: {:?}; expected: {:?}", + i, got, expected); + break; + } + } + false + } + } + Err(err) => { + eprintln!("Parsing: {:?}: {:?}", t, err); + if let ParseError::UnrecognizedToken { + token: Some((start, _, end)), .. + } = err + { + eprintln!("Context:"); + // XXX: This won't work with multi-byte + // characters. But at that point we've + // already printed the error message. + for i in (cmp::max(5, start) - 5) + ..cmp::min(end + 5, s.len() - 1) + { + eprintln!("{} {}: {:?}", + if i == start { "*" } else { " " }, + i, &s[i..i+1]); + } + } + false + }, + } + } + } + }; +} + +// ' ' or tab. +token!(WSP, g, { + lazy_static! { + static ref WSP_CHARS : Vec<char> = vec![ ' ', '\t' ]; + } + + WSP_CHARS[g.gen_range(0, WSP_CHARS.len())] +}); + +// NO-WS-CTL = %d1-8 / ; US-ASCII control characters +// %d11 / 0xB ; that do not include the +// %d12 / 0xC ; carriage return, line feed, +// %d14-31 / 0x |