summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKartikaya Gupta <kats@trevize.staktrace.com>2019-02-12 13:50:54 -0500
committerKartikaya Gupta <kats@trevize.staktrace.com>2019-11-09 04:59:52 -0500
commitdb44b0bf457dfdd67f48d3f05f3706c36a871162 (patch)
tree8ae6db8712346d204adcae8d7830ad18a724335e
parentf4597a3e5460c34ba36753c715061805336abb6e (diff)
downloadmailparse-db44b0bf457dfdd67f48d3f05f3706c36a871162.tar.gz
mailparse-db44b0bf457dfdd67f48d3f05f3706c36a871162.tar.xz
Add the addrparse module, to help parse address/mailbox lists
Fixes #34.
-rw-r--r--src/addrparse.rs394
-rw-r--r--src/lib.rs2
2 files changed, 396 insertions, 0 deletions
diff --git a/src/addrparse.rs b/src/addrparse.rs
new file mode 100644
index 0000000..7b4b5f1
--- /dev/null
+++ b/src/addrparse.rs
@@ -0,0 +1,394 @@
+#[derive(Debug, PartialEq)]
+pub struct SingleInfo {
+ pub display_name: Option<String>,
+ pub addr: String,
+}
+
+impl SingleInfo {
+ fn new(name: Option<String>, addr: String) -> Self {
+ SingleInfo {
+ display_name: name,
+ addr: addr,
+ }
+ }
+}
+
+#[derive(Debug, PartialEq)]
+pub struct GroupInfo {
+ pub group_name: String,
+ pub addrs: Vec<SingleInfo>,
+}
+
+impl GroupInfo {
+ fn new(name: String, addrs: Vec<SingleInfo>) -> Self {
+ GroupInfo {
+ group_name: name,
+ addrs: addrs,
+ }
+ }
+}
+
+#[derive(Debug, PartialEq)]
+pub enum MailAddr {
+ Group(GroupInfo),
+ Single(SingleInfo),
+}
+
+#[derive(Debug)]
+enum AddrParseState {
+ Initial,
+ QuotedName,
+ EscapedChar,
+ AfterQuotedName,
+ BracketedAddr,
+ AfterBracketedAddr,
+ Unquoted,
+ TrailerComment,
+}
+
+pub fn addrparse(addrs: &str) -> Result<Vec<MailAddr>, &'static str> {
+ let mut it = addrs.chars();
+ addrparse_inner(&mut it, false)
+}
+
+fn addrparse_inner(it: &mut std::str::Chars, in_group: bool) -> Result<Vec<MailAddr>, &'static str> {
+ let mut result = vec![];
+ let mut state = AddrParseState::Initial;
+
+ let mut c = match it.next() {
+ None => return Ok(vec![]),
+ Some(v) => v,
+ };
+
+ let mut name = None;
+ let mut addr = None;
+ let mut post_quote_ws = None;
+
+ loop {
+ match state {
+ AddrParseState::Initial => {
+ if c.is_whitespace() {
+ // continue in same state
+ } else if c == '"' {
+ state = AddrParseState::QuotedName;
+ name = Some(String::new());
+ } else if c == '<' {
+ state = AddrParseState::BracketedAddr;
+ addr = Some(String::new());
+ } else if c == ';' {
+ if !in_group {
+ return Err("Unexpected group terminator found in initial list");
+ }
+ return Ok(result);
+ } else {
+ state = AddrParseState::Unquoted;
+ addr = Some(String::new());
+ addr.as_mut().unwrap().push(c);
+ }
+ }
+ AddrParseState::QuotedName => {
+ if c == '\\' {
+ state = AddrParseState::EscapedChar;
+ } else if c == '"' {
+ state = AddrParseState::AfterQuotedName;
+ } else {
+ name.as_mut().unwrap().push(c);
+ }
+ }
+ AddrParseState::EscapedChar => {
+ state = AddrParseState::QuotedName;
+ name.as_mut().unwrap().push(c);
+ }
+ AddrParseState::AfterQuotedName => {
+ if c.is_whitespace() {
+ if post_quote_ws.is_none() {
+ post_quote_ws = Some(String::new());
+ }
+ post_quote_ws.as_mut().unwrap().push(c);
+ } else if c == '<' {
+ state = AddrParseState::BracketedAddr;
+ addr = Some(String::new());
+ } else if c == ':' {
+ if in_group {
+ return Err("Found unexpected nested group");
+ }
+ let group_addrs = try!(addrparse_inner(it, true));
+ state = AddrParseState::Initial;
+ result.push(MailAddr::Group(GroupInfo::new(
+ name.unwrap(),
+ group_addrs.into_iter().map(|addr| {
+ match addr {
+ MailAddr::Single(s) => s,
+ MailAddr::Group(_) => panic!("Unexpected nested group encountered"),
+ }
+ }).collect()
+ )));
+ name = None;
+ } else {
+ // I think technically not valid, but this occurs in real-world corpus, so
+ // handle gracefully
+ if c == '"' {
+ post_quote_ws.map(|ws| name.as_mut().unwrap().push_str(&ws));
+ state = AddrParseState::QuotedName;
+ } else {
+ post_quote_ws.map(|ws| name.as_mut().unwrap().push_str(&ws));
+ name.as_mut().unwrap().push(c);
+ }
+ post_quote_ws = None;
+ }
+ }
+ AddrParseState::BracketedAddr => {
+ if c == '>' {
+ state = AddrParseState::AfterBracketedAddr;
+ result.push(MailAddr::Single(SingleInfo::new(name, addr.unwrap())));
+ name = None;
+ addr = None;
+ } else {
+ addr.as_mut().unwrap().push(c);
+ }
+ }
+ AddrParseState::AfterBracketedAddr => {
+ if c.is_whitespace() {
+ // continue in same state
+ } else if c == ',' {
+ state = AddrParseState::Initial;
+ } else if c == ';' {
+ if in_group {
+ return Ok(result);
+ }
+ // Technically not valid, but a similar case occurs in real-world corpus, so handle it gracefully
+ state = AddrParseState::Initial;
+ } else if c == '(' {
+ state = AddrParseState::TrailerComment;
+ } else {
+ return Err("Unexpected char found after bracketed address");
+ }
+ }
+ AddrParseState::Unquoted => {
+ if c == '<' {
+ state = AddrParseState::BracketedAddr;
+ name = addr.map(|s| s.trim_end().to_owned());
+ addr = Some(String::new());
+ } else if c == ',' {
+ state = AddrParseState::Initial;
+ result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned())));
+ addr = None;
+ } else if c == ';' {
+ result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned())));
+ if in_group {
+ return Ok(result);
+ }
+ // Technically not valid, but occurs in real-world corpus, so handle it gracefully
+ state = AddrParseState::Initial;
+ addr = None;
+ } else if c == ':' {
+ if in_group {
+ return Err("Found unexpected nested group");
+ }
+ let group_addrs = try!(addrparse_inner(it, true));
+ state = AddrParseState::Initial;
+ result.push(MailAddr::Group(GroupInfo::new(
+ addr.unwrap().trim_end().to_owned(),
+ group_addrs.into_iter().map(|addr| {
+ match addr {
+ MailAddr::Single(s) => s,
+ MailAddr::Group(_) => panic!("Unexpected nested group encountered"),
+ }
+ }).collect()
+ )));
+ addr = None;
+ } else {
+ addr.as_mut().unwrap().push(c);
+ }
+ }
+ AddrParseState::TrailerComment => {
+ if c == ')' {
+ state = AddrParseState::AfterBracketedAddr;
+ }
+ }
+ }
+
+ c = match it.next() {
+ None => break,
+ Some(v) => v,
+ };
+ }
+
+ if in_group {
+ return Err("Found unterminated group address");
+ }
+
+ match state {
+ AddrParseState::QuotedName |
+ AddrParseState::EscapedChar |
+ AddrParseState::AfterQuotedName |
+ AddrParseState::BracketedAddr |
+ AddrParseState::TrailerComment => {
+ Err("Address string unexpected terminated")
+ }
+ AddrParseState::Unquoted => {
+ result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned())));
+ Ok(result)
+ }
+ _ => {
+ Ok(result)
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn parse_basic() {
+ assert_eq!(
+ addrparse("foo bar <foo@bar.com>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]
+ );
+ assert_eq!(
+ addrparse("\"foo bar\" <foo@bar.com>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]
+ );
+ assert_eq!(
+ addrparse("foo@bar.com ").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]
+ );
+ assert_eq!(
+ addrparse("foo <bar>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]
+ );
+ assert_eq!(
+ addrparse("\"foo\" <bar>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]
+ );
+ assert_eq!(
+ addrparse("\"foo \" <bar>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("foo ".to_string()), "bar".to_string()))]
+ );
+ }
+
+ #[test]
+ fn parse_backslashes() {
+ assert_eq!(
+ addrparse(r#" "First \"nick\" Last" <user@host.tld> "#).unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("First \"nick\" Last".to_string()), "user@host.tld".to_string()))]
+ );
+ assert_eq!(
+ addrparse(r#" First \"nick\" Last <user@host.tld> "#).unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("First \\\"nick\\\" Last".to_string()), "user@host.tld".to_string()))]
+ );
+ }
+
+ #[test]
+ fn parse_multi() {
+ assert_eq!(
+ addrparse("foo <bar>, joe, baz <quux>").unwrap(),
+ vec![
+ MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string())),
+ MailAddr::Single(SingleInfo::new(None, "joe".to_string())),
+ MailAddr::Single(SingleInfo::new(Some("baz".to_string()), "quux".to_string())),
+ ]
+ );
+ }
+
+ #[test]
+ fn parse_empty_group() {
+ assert_eq!(
+ addrparse("empty-group:;").unwrap(),
+ vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]
+ );
+ assert_eq!(
+ addrparse(" empty-group : ; ").unwrap(),
+ vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]
+ );
+ }
+
+ #[test]
+ fn parse_simple_group() {
+ assert_eq!(
+ addrparse("bar-group: foo <foo@bar.com>;").unwrap(),
+ vec![MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ ]))]
+ );
+ assert_eq!(
+ addrparse("bar-group: foo <foo@bar.com>, baz@bar.com;").unwrap(),
+ vec![MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ SingleInfo::new(None, "baz@bar.com".to_string()),
+ ]))]
+ );
+ }
+
+ #[test]
+ fn parse_mixed() {
+ assert_eq!(
+ addrparse("joe@bloe.com, bar-group: foo <foo@bar.com>;").unwrap(),
+ vec![
+ MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
+ MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ ])),
+ ]
+ );
+ assert_eq!(
+ addrparse("bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(),
+ vec![
+ MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ ])),
+ MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
+ ]
+ );
+ assert_eq!(
+ addrparse("flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(),
+ vec![
+ MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())),
+ MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ ])),
+ MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
+ ]
+ );
+ assert_eq!(
+ addrparse("first-group:; flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com, final-group: zip, zap, \"Zaphod\" <zaphod@beeblebrox>;").unwrap(),
+ vec![
+ MailAddr::Group(GroupInfo::new("first-group".to_string(), vec![])),
+ MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())),
+ MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![
+ SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()),
+ ])),
+ MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())),
+ MailAddr::Group(GroupInfo::new("final-group".to_string(), vec![
+ SingleInfo::new(None, "zip".to_string()),
+ SingleInfo::new(None, "zap".to_string()),
+ SingleInfo::new(Some("Zaphod".to_string()), "zaphod@beeblebrox".to_string()),
+ ])),
+ ]
+ );
+ }
+
+ #[test]
+ fn real_world_examples() {
+ // taken from a real "From" header. This might not be valid according to the RFC
+ // but obviously made it through the internet so we should at least not crash.
+ assert_eq!(
+ addrparse("\"The Foo of Bar\" Course Staff <foo-no-reply@bar.edx.org>").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("The Foo of Bar Course Staff".to_string()), "foo-no-reply@bar.edx.org".to_string()))]
+ );
+
+ // This one has a comment tacked on to the end. Adding proper support for comments seems
+ // complicated so I just added trailer comment support.
+ assert_eq!(
+ addrparse("John Doe <support@github.com> (GitHub Staff)").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(Some("John Doe".to_string()), "support@github.com".to_string()))]
+ );
+
+ // Taken from a real world "To" header. It was spam, but still...
+ assert_eq!(
+ addrparse("foo@bar.com;").unwrap(),
+ vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]
+ );
+ }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 86057ff..893384c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,9 +11,11 @@ use charset::decode_latin1;
use charset::Charset;
pub mod body;
+mod addrparse;
mod dateparse;
use body::Body;
+pub use addrparse::{addrparse, GroupInfo, MailAddr, SingleInfo};
pub use dateparse::dateparse;
/// An error type that represents the different kinds of errors that may be