// src/addrparse.rs
//
// Provenance: commit db44b0bf457dfdd67f48d3f05f3706c36a871162
// (Kartikaya Gupta, 2019-02-12), "Add the addrparse module, to help parse
// address/mailbox lists. Fixes #34."
//
// The same commit also wires this module into src/lib.rs with:
//     mod addrparse;
//     pub use addrparse::{addrparse, GroupInfo, MailAddr, SingleInfo};

/// A single mailbox: an address with an optional display name.
#[derive(Debug, PartialEq)]
pub struct SingleInfo {
    /// The display name, if one was given (quoted or unquoted).
    pub display_name: Option<String>,
    /// The address text (the part inside `<...>`, or the bare token).
    pub addr: String,
}

impl SingleInfo {
    fn new(name: Option<String>, addr: String) -> Self {
        SingleInfo {
            display_name: name,
            addr,
        }
    }
}

/// A named group of mailboxes, written as `name: member, member;`.
#[derive(Debug, PartialEq)]
pub struct GroupInfo {
    /// The text before the `:`.
    pub group_name: String,
    /// The mailboxes listed between `:` and `;` (possibly empty).
    pub addrs: Vec<SingleInfo>,
}

impl GroupInfo {
    fn new(name: String, addrs: Vec<SingleInfo>) -> Self {
        GroupInfo {
            group_name: name,
            addrs,
        }
    }
}

/// One entry of a parsed address list: either a group or a single mailbox.
#[derive(Debug, PartialEq)]
pub enum MailAddr {
    Group(GroupInfo),
    Single(SingleInfo),
}

/// States of the character-driven parser in `addrparse_inner`.
#[derive(Debug, Clone, Copy)]
enum AddrParseState {
    /// Between entries; nothing consumed for the next entry yet.
    Initial,
    /// Inside a `"..."` display name.
    QuotedName,
    /// Just after a `\` inside a quoted display name.
    EscapedChar,
    /// After the closing `"` of a display name.
    AfterQuotedName,
    /// Inside `<...>`.
    BracketedAddr,
    /// After the closing `>`.
    AfterBracketedAddr,
    /// Inside a bare token (an address, a display name, or a group name).
    Unquoted,
    /// Inside a `(...)` comment that follows a bracketed address.
    TrailerComment,
}

/// Parses a string containing a list of addresses and/or groups.
///
/// Entries are separated by `,`. A group is `name: member, member;`.
///
/// # Errors
///
/// Returns a static message when the input ends inside a quoted name,
/// bracketed address, trailer comment or group, when a group is nested inside
/// another group, when a `;` shows up at the start of an entry outside of a
/// group, or when an unexpected character follows a bracketed address.
pub fn addrparse(addrs: &str) -> Result<Vec<MailAddr>, &'static str> {
    let mut it = addrs.chars();
    addrparse_inner(&mut it, false)
}

/// Returns the buffer stored in `slot`, creating an empty one if needed, so the
/// parser never has to `unwrap()` while handling caller-supplied text.
fn buf(slot: &mut Option<String>) -> &mut String {
    slot.get_or_insert_with(String::new)
}

/// Builds a display-name-less mailbox from an unquoted token, dropping
/// trailing whitespace.
fn unquoted_single(raw: Option<String>) -> MailAddr {
    MailAddr::Single(SingleInfo::new(
        None,
        raw.unwrap_or_default().trim_end().to_owned(),
    ))
}

/// Parses the members of a group (everything after the `:` up to and including
/// the terminating `;`) and wraps them in a `MailAddr::Group`.
fn parse_group(it: &mut std::str::Chars, group_name: String) -> Result<MailAddr, &'static str> {
    let mut members = Vec::new();
    for member in addrparse_inner(it, true)? {
        match member {
            MailAddr::Single(info) => members.push(info),
            // The inner parser rejects nested groups itself, so this is only a
            // safety net; report it as an error instead of panicking.
            MailAddr::Group(_) => return Err("Found unexpected nested group"),
        }
    }
    Ok(MailAddr::Group(GroupInfo::new(group_name, members)))
}

/// Core state machine shared by the top-level list and by group bodies.
///
/// When `in_group` is true the function returns as soon as it consumes the
/// terminating `;`, leaving the rest of `it` for the caller.
fn addrparse_inner(
    it: &mut std::str::Chars,
    in_group: bool,
) -> Result<Vec<MailAddr>, &'static str> {
    let mut result = vec![];

    // Nothing left to read yields an empty list, even inside a group (so
    // "name:" with nothing after the colon is accepted). Kept as-is for
    // backward compatibility.
    if it.as_str().is_empty() {
        return Ok(result);
    }

    let mut state = AddrParseState::Initial;
    let mut name: Option<String> = None;
    let mut addr: Option<String> = None;
    // Whitespace seen after a closing quote. It is only appended to the name
    // if more name text follows.
    let mut post_quote_ws: Option<String> = None;

    // `while let` rather than `for`: the iterator is handed to `parse_group`
    // inside the loop body, so it must not stay borrowed across iterations.
    while let Some(c) = it.next() {
        match state {
            AddrParseState::Initial => match c {
                _ if c.is_whitespace() => {}
                '"' => {
                    state = AddrParseState::QuotedName;
                    name = Some(String::new());
                }
                '<' => {
                    state = AddrParseState::BracketedAddr;
                    addr = Some(String::new());
                }
                ';' => {
                    if !in_group {
                        return Err("Unexpected group terminator found in initial list");
                    }
                    return Ok(result);
                }
                _ => {
                    state = AddrParseState::Unquoted;
                    addr = Some(c.to_string());
                }
            },
            AddrParseState::QuotedName => match c {
                '\\' => state = AddrParseState::EscapedChar,
                '"' => state = AddrParseState::AfterQuotedName,
                _ => buf(&mut name).push(c),
            },
            AddrParseState::EscapedChar => {
                state = AddrParseState::QuotedName;
                buf(&mut name).push(c);
            }
            AddrParseState::AfterQuotedName => match c {
                _ if c.is_whitespace() => buf(&mut post_quote_ws).push(c),
                '<' => {
                    state = AddrParseState::BracketedAddr;
                    addr = Some(String::new());
                    // Drop the pending whitespace so it cannot leak into the
                    // display name of a later entry.
                    post_quote_ws = None;
                }
                ':' => {
                    if in_group {
                        return Err("Found unexpected nested group");
                    }
                    let group_name = name.take().unwrap_or_default();
                    result.push(parse_group(it, group_name)?);
                    state = AddrParseState::Initial;
                    // Same reason as above: do not carry whitespace over.
                    post_quote_ws = None;
                }
                _ => {
                    // More name text after the closing quote. Technically not
                    // valid, but it occurs in real-world mail, so fold it into
                    // the display name (including the whitespace in between).
                    if let Some(ws) = post_quote_ws.take() {
                        buf(&mut name).push_str(&ws);
                    }
                    if c == '"' {
                        state = AddrParseState::QuotedName;
                    } else {
                        buf(&mut name).push(c);
                    }
                }
            },
            AddrParseState::BracketedAddr => {
                if c == '>' {
                    state = AddrParseState::AfterBracketedAddr;
                    result.push(MailAddr::Single(SingleInfo::new(
                        name.take(),
                        addr.take().unwrap_or_default(),
                    )));
                } else {
                    buf(&mut addr).push(c);
                }
            }
            AddrParseState::AfterBracketedAddr => match c {
                _ if c.is_whitespace() => {}
                ',' => state = AddrParseState::Initial,
                ';' => {
                    if in_group {
                        return Ok(result);
                    }
                    // Technically not valid, but a similar case occurs in
                    // real-world mail, so treat it as a separator.
                    state = AddrParseState::Initial;
                }
                '(' => state = AddrParseState::TrailerComment,
                _ => return Err("Unexpected char found after bracketed address"),
            },
            AddrParseState::Unquoted => match c {
                '<' => {
                    // The bare token turned out to be a display name.
                    state = AddrParseState::BracketedAddr;
                    name = addr.take().map(|s| s.trim_end().to_owned());
                    addr = Some(String::new());
                }
                ',' => {
                    state = AddrParseState::Initial;
                    result.push(unquoted_single(addr.take()));
                }
                ';' => {
                    result.push(unquoted_single(addr.take()));
                    if in_group {
                        return Ok(result);
                    }
                    // Technically not valid, but occurs in real-world mail, so
                    // treat it as a separator.
                    state = AddrParseState::Initial;
                }
                ':' => {
                    if in_group {
                        return Err("Found unexpected nested group");
                    }
                    // The bare token turned out to be a group name.
                    let group_name = addr.take().unwrap_or_default().trim_end().to_owned();
                    result.push(parse_group(it, group_name)?);
                    state = AddrParseState::Initial;
                }
                _ => buf(&mut addr).push(c),
            },
            AddrParseState::TrailerComment => {
                if c == ')' {
                    state = AddrParseState::AfterBracketedAddr;
                }
            }
        }
    }

    // Input ran out before the group's `;` was seen.
    if in_group {
        return Err("Found unterminated group address");
    }

    match state {
        AddrParseState::QuotedName
        | AddrParseState::EscapedChar
        | AddrParseState::AfterQuotedName
        | AddrParseState::BracketedAddr
        | AddrParseState::TrailerComment => Err("Address string unexpected terminated"),
        AddrParseState::Unquoted => {
            result.push(unquoted_single(addr.take()));
            Ok(result)
        }
        AddrParseState::Initial | AddrParseState::AfterBracketedAddr => Ok(result),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parse_basic() {
        assert_eq!(
            addrparse("foo bar <foo@bar.com>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]
        );
        assert_eq!(
            addrparse("\"foo bar\" <foo@bar.com>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]
        );
        assert_eq!(
            addrparse("foo@bar.com ").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]
        );
        assert_eq!(
            addrparse("foo <bar>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]
        );
        assert_eq!(
            addrparse("\"foo\" <bar>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]
        );
        assert_eq!(
            addrparse("\"foo \" <bar>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("foo ".to_string()), "bar".to_string()))]
        );
    }

    #[test]
    fn parse_backslashes() {
        assert_eq!(
            addrparse(r#" "First \"nick\" Last" <user@host.tld> "#).unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("First \"nick\" Last".to_string()), "user@host.tld".to_string()))]
        );
        assert_eq!(
            addrparse(r#" First \"nick\" Last <user@host.tld> "#).unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("First \\\"nick\\\" Last".to_string()), "user@host.tld".to_string()))]
        );
    }

    #[test]
    fn parse_multi() {
        assert_eq!(
            addrparse("foo <bar>, joe, baz <quux>").unwrap(),
            vec![
                MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string())),
                MailAddr::Single(SingleInfo::new(None, "joe".to_string())),
                MailAddr::Single(SingleInfo::new(Some("baz".to_string()), "quux".to_string())),
            ]
        );
    }

    #[test]
    fn parse_empty_group() {
        assert_eq!(
            addrparse("empty-group:;").unwrap(),
            vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]
        );
        assert_eq!(
            addrparse(" empty-group : ; ").unwrap(),
            vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]
        );
    }

    #[test]
    fn parse_simple_group() {
        assert_eq!(
            addrparse("bar-group: foo <foo@bar.com>;").unwrap(),
            vec![MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
            ]))]
        );
        assert_eq!(
            addrparse("bar-group: foo <foo@bar.com>, baz@bar.com;").unwrap(),
            vec![MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
                SingleInfo::new(None, "baz@bar.com".to_string()),
            ]))]
        );
    }

    #[test]
    fn parse_mixed() {
        assert_eq!(
            addrparse("joe@bloe.com, bar-group: foo <foo@bar.com>;").unwrap(),
            vec![
                MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
                MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                    SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
                ])),
            ]
        );
        assert_eq!(
            addrparse("bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(),
            vec![
                MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                    SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
                ])),
                MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
            ]
        );
        assert_eq!(
            addrparse("flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(),
            vec![
                MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())),
                MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                    SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
                ])),
                MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
            ]
        );
        assert_eq!(
            addrparse("first-group:; flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com, final-group: zip, zap, \"Zaphod\" <zaphod@beeblebrox>;").unwrap(),
            vec![
                MailAddr::Group(GroupInfo::new("first-group".to_string(), vec![])),
                MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())),
                MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
                    SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
                ])),
                MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
                MailAddr::Group(GroupInfo::new("final-group".to_string(), vec![
                    SingleInfo::new(None, "zip".to_string()),
                    SingleInfo::new(None, "zap".to_string()),
                    SingleInfo::new(Some("Zaphod".to_string()), "zaphod@beeblebrox".to_string()),
                ])),
            ]
        );
    }

    #[test]
    fn real_world_examples() {
        // Taken from a real "From" header. This might not be valid according to the RFC
        // but obviously made it through the internet so we should at least not crash.
        assert_eq!(
            addrparse("\"The Foo of Bar\" Course Staff <foo-no-reply@bar.edx.org>").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("The Foo of Bar Course Staff".to_string()), "foo-no-reply@bar.edx.org".to_string()))]
        );

        // This one has a comment tacked on to the end. Adding proper support for comments seems
        // complicated so only trailer comments are supported.
        assert_eq!(
            addrparse("John Doe <support@github.com> (GitHub Staff)").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(Some("John Doe".to_string()), "support@github.com".to_string()))]
        );

        // Taken from a real world "To" header. It was spam, but still...
        assert_eq!(
            addrparse("foo@bar.com;").unwrap(),
            vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]
        );
    }

    #[test]
    fn post_quote_whitespace_does_not_leak_into_later_names() {
        // Whitespace between a quoted name and its `<addr>` must not be
        // prepended to text that follows a later quoted name.
        assert_eq!(
            addrparse("\"a\" <x>, \"b\" c <y>").unwrap(),
            vec![
                MailAddr::Single(SingleInfo::new(Some("a".to_string()), "x".to_string())),
                MailAddr::Single(SingleInfo::new(Some("b c".to_string()), "y".to_string())),
            ]
        );
    }
}