diff options
author | Matthias Beyer <mail@beyermatthias.de> | 2019-12-23 12:38:50 +0100 |
---|---|---|
committer | Matthias Beyer <mail@beyermatthias.de> | 2019-12-23 13:37:03 +0100 |
commit | 8e7978ca744d0ba02bd28da20caf1a9e4e979968 (patch) | |
tree | e759b9da8dba84622d16f6ad1ce7b85911c523e2 | |
parent | a41ed49c8e4a960319db5212da437ba25e1651c0 (diff) |
Import mailparse code
As the "mailparse" code is licensed as 0BSD, I think I can import this
code here without any further arrangements. (IANAL)
All credit up to here goes to the author of the "mailparse" crate, of
course.
The code was a bit restructured into more modules.
Signed-off-by: Matthias Beyer <mail@beyermatthias.de>
-rw-r--r-- | parser/Cargo.toml | 12 | ||||
-rw-r--r-- | parser/src/addrparse.rs | 624 | ||||
-rw-r--r-- | parser/src/body.rs | 153 | ||||
-rw-r--r-- | parser/src/dateparse.rs | 220 | ||||
-rw-r--r-- | parser/src/error.rs | 70 | ||||
-rw-r--r-- | parser/src/lib.rs | 25 | ||||
-rw-r--r-- | parser/src/parser.rs | 1325 | ||||
-rw-r--r-- | parser/src/util.rs | 47 |
8 files changed, 2469 insertions, 7 deletions
diff --git a/parser/Cargo.toml b/parser/Cargo.toml index 3153989..b2366c6 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -15,3 +15,15 @@ autoexamples = true [dependencies] +mail-core = { version = "0.6.2", features = ["serde-impl"] } +mail-headers = { version = "0.6.6", features = ["serde-impl"] } +mail-internals = "0.2.3" + +failure = "0.1" +vec1 = { version = "1.3.0", features = ["serde"]} +serde = { version = "1", features = ["derive"] } +toml = "0.4" +base64 = "0.11" +quoted_printable = "0.4" +charset = "0.1" + diff --git a/parser/src/addrparse.rs b/parser/src/addrparse.rs new file mode 100644 index 0000000..f5da46a --- /dev/null +++ b/parser/src/addrparse.rs @@ -0,0 +1,624 @@ +use std::fmt; + +/// A representation of a single mailbox. Each mailbox has +/// a routing address `addr` and an optional display name. +#[derive(Clone, Debug, PartialEq)] +pub struct SingleInfo { + pub display_name: Option<String>, + pub addr: String, +} + +impl SingleInfo { + fn new(name: Option<String>, addr: String) -> Self { + SingleInfo { + display_name: name, + addr: addr, + } + } +} + +impl fmt::Display for SingleInfo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(name) = &self.display_name { + write!(f, r#""{}" <{}>"#, name.replace('"', r#"\""#), self.addr) + } else { + write!(f, "{}", self.addr) + } + } +} + +/// A representation of a group address. It has a name and +/// a list of mailboxes. +#[derive(Clone, Debug, PartialEq)] +pub struct GroupInfo { + pub group_name: String, + pub addrs: Vec<SingleInfo>, +} + +impl GroupInfo { + fn new(name: String, addrs: Vec<SingleInfo>) -> Self { + GroupInfo { + group_name: name, + addrs: addrs, + } + } +} + +impl fmt::Display for GroupInfo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, r#""{}":"#, self.group_name.replace('"', r#"\""#))?; + for (i, addr) in self.addrs.iter().enumerate() { + if i == 0 { + write!(f, " ")?; + } else { + write!(f, ", ")?; + } + addr.fmt(f)?; + } + write!(f, ";") + } +} + +/// An abstraction over the two different kinds of top-level addresses allowed +/// in email headers. Group addresses have a name and a list of mailboxes. Single +/// addresses are just a mailbox. Each mailbox consists of what you would consider +/// an email address (e.g. foo@bar.com) and optionally a display name ("Foo Bar"). +/// Groups are represented in email headers with colons and semicolons, e.g. +/// To: my-peeps: foo@peeps.org, bar@peeps.org; +#[derive(Clone, Debug, PartialEq)] +pub enum MailAddr { + Group(GroupInfo), + Single(SingleInfo), +} + +#[derive(Debug)] +enum AddrParseState { + Initial, + QuotedName, + EscapedChar, + AfterQuotedName, + BracketedAddr, + AfterBracketedAddr, + Unquoted, + TrailerComment, +} + +/// A simple wrapper around `Vec<MailAddr>`. This is primarily here so we can +/// implement the Display trait on it, and allow user code to easily convert +/// the return value from `addrparse` back into a string. However there are some +/// additional utility functions on this wrapper as well. +#[derive(Clone, Debug, PartialEq)] +pub struct MailAddrList(Vec<MailAddr>); + +impl std::ops::Deref for MailAddrList { + type Target = Vec<MailAddr>; + + fn deref(&self) -> &Vec<MailAddr> { + &self.0 + } +} + +impl std::ops::DerefMut for MailAddrList { + fn deref_mut(&mut self) -> &mut Vec<MailAddr> { + &mut self.0 + } +} + +impl fmt::Display for MailAddrList { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut last_was_group = false; + for (i, addr) in self.iter().enumerate() { + if i > 0 { + if last_was_group { + write!(f, " ")?; + } else { + write!(f, ", ")?; + } + } + match addr { + MailAddr::Group(g) => { + g.fmt(f)?; + last_was_group = true; + } + MailAddr::Single(s) => { + s.fmt(f)?; + last_was_group = false; + } + } + } + Ok(()) + } +} + +impl From<Vec<MailAddr>> for MailAddrList { + fn from(addrs: Vec<MailAddr>) -> Self { + MailAddrList(addrs) + } +} + +impl MailAddrList { + /// Count the number of `SingleInfo` instances in this list of addresses. + pub fn count_addrs(&self) -> usize { + self.iter().fold(0, |acc, elem| { + match elem { + MailAddr::Single(_) => acc + 1, + MailAddr::Group(g) => acc + g.addrs.len(), + } + }) + } + + /// Convenience function to check if this list of addresses contains exactly + /// one `SingleInfo`, and if it does, to return it. If there is not exactly + /// one `SingleInfo`, this function returns None. + pub fn extract_single_info(self) -> Option<SingleInfo> { + if self.len() == 1 { + match &self[0] { + MailAddr::Group(_) => None, + MailAddr::Single(s) => Some(s.clone()), + } + } else { + None + } + } +} + +/// Convert an address field from an email header into a structured type. +/// This function handles the most common formatting of to/from/cc/bcc fields +/// found in email headers. +/// +/// # Examples +/// ``` +/// use mailparse::{addrparse, MailAddr, SingleInfo}; +/// match &addrparse("John Doe <john@doe.com>").unwrap()[0] { +/// MailAddr::Single(info) => { +/// assert_eq!(info.display_name, Some("John Doe".to_string())); +/// assert_eq!(info.addr, "john@doe.com".to_string()); +/// } +/// _ => panic!() +/// }; +/// ``` +pub fn addrparse(addrs: &str) -> Result<MailAddrList, &'static str> { + let mut it = addrs.chars(); + addrparse_inner(&mut it, false) +} + +fn addrparse_inner(it: &mut std::str::Chars, in_group: bool) -> Result<MailAddrList, &'static str> { + let mut result = vec![]; + let mut state = AddrParseState::Initial; + + let mut c = match it.next() { + None => return Ok(MailAddrList(vec![])), + Some(v) => v, + }; + + let mut name = None; + let mut addr = None; + let mut post_quote_ws = None; + + loop { + match state { + AddrParseState::Initial => { + if c.is_whitespace() { + // continue in same state + } else if c == '"' { + state = AddrParseState::QuotedName; + name = Some(String::new()); + } else if c == '<' { + state = AddrParseState::BracketedAddr; + addr = Some(String::new()); + } else if c == ';' { + if !in_group { + return Err("Unexpected group terminator found in initial list"); + } + return Ok(MailAddrList(result)); + } else { + state = AddrParseState::Unquoted; + addr = Some(String::new()); + addr.as_mut().unwrap().push(c); + } + } + AddrParseState::QuotedName => { + if c == '\\' { + state = AddrParseState::EscapedChar; + } else if c == '"' { + state = AddrParseState::AfterQuotedName; + } else { + name.as_mut().unwrap().push(c); + } + } + AddrParseState::EscapedChar => { + state = AddrParseState::QuotedName; + name.as_mut().unwrap().push(c); + } + AddrParseState::AfterQuotedName => { + if c.is_whitespace() { + if post_quote_ws.is_none() { + post_quote_ws = Some(String::new()); + } + post_quote_ws.as_mut().unwrap().push(c); + } else if c == '<' { + state = AddrParseState::BracketedAddr; + addr = Some(String::new()); + } else if c == ':' { + if in_group { + return Err("Found unexpected nested group"); + } + let group_addrs = addrparse_inner(it, true)?; + state = AddrParseState::Initial; + result.push(MailAddr::Group(GroupInfo::new( + name.unwrap(), + group_addrs.0.into_iter().map(|addr| { + match addr { + MailAddr::Single(s) => s, + MailAddr::Group(_) => panic!("Unexpected nested group encountered"), + } + }).collect() + ))); + name = None; + } else { + // I think technically not valid, but this occurs in real-world corpus, so + // handle gracefully + if c == '"' { + post_quote_ws.map(|ws| name.as_mut().unwrap().push_str(&ws)); + state = AddrParseState::QuotedName; + } else { + post_quote_ws.map(|ws| name.as_mut().unwrap().push_str(&ws)); + name.as_mut().unwrap().push(c); + } + post_quote_ws = None; + } + } + AddrParseState::BracketedAddr => { + if c == '>' { + state = AddrParseState::AfterBracketedAddr; + result.push(MailAddr::Single(SingleInfo::new(name, addr.unwrap()))); + name = None; + addr = None; + } else { + addr.as_mut().unwrap().push(c); + } + } + AddrParseState::AfterBracketedAddr => { + if c.is_whitespace() { + // continue in same state + } else if c == ',' { + state = AddrParseState::Initial; + } else if c == ';' { + if in_group { + return Ok(MailAddrList(result)); + } + // Technically not valid, but a similar case occurs in real-world corpus, so handle it gracefully + state = AddrParseState::Initial; + } else if c == '(' { + state = AddrParseState::TrailerComment; + } else { + return Err("Unexpected char found after bracketed address"); + } + } + AddrParseState::Unquoted => { + if c == '<' { + state = AddrParseState::BracketedAddr; + name = addr.map(|s| s.trim_end().to_owned()); + addr = Some(String::new()); + } else if c == ',' { + state = AddrParseState::Initial; + result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned()))); + addr = None; + } else if c == ';' { + result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned()))); + if in_group { + return Ok(MailAddrList(result)); + } + // Technically not valid, but occurs in real-world corpus, so handle it gracefully + state = AddrParseState::Initial; + addr = None; + } else if c == ':' { + if in_group { + return Err("Found unexpected nested group"); + } + let group_addrs = addrparse_inner(it, true)?; + state = AddrParseState::Initial; + result.push(MailAddr::Group(GroupInfo::new( + addr.unwrap().trim_end().to_owned(), + group_addrs.0.into_iter().map(|addr| { + match addr { + MailAddr::Single(s) => s, + MailAddr::Group(_) => panic!("Unexpected nested group encountered"), + } + }).collect() + ))); + addr = None; + } else { + addr.as_mut().unwrap().push(c); + } + } + AddrParseState::TrailerComment => { + if c == ')' { + state = AddrParseState::AfterBracketedAddr; + } + } + } + + c = match it.next() { + None => break, + Some(v) => v, + }; + } + + if in_group { + return Err("Found unterminated group address"); + } + + match state { + AddrParseState::QuotedName | + AddrParseState::EscapedChar | + AddrParseState::AfterQuotedName | + AddrParseState::BracketedAddr | + AddrParseState::TrailerComment => { + Err("Address string unexpected terminated") + } + AddrParseState::Unquoted => { + result.push(MailAddr::Single(SingleInfo::new(None, addr.unwrap().trim_end().to_owned()))); + Ok(MailAddrList(result)) + } + _ => { + Ok(MailAddrList(result)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_basic() { + assert_eq!( + addrparse("foo bar <foo@bar.com>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]) + ); + assert_eq!( + addrparse("\"foo bar\" <foo@bar.com>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("foo bar".to_string()), "foo@bar.com".to_string()))]) + ); + assert_eq!( + addrparse("foo@bar.com ").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]) + ); + assert_eq!( + addrparse("foo <bar>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]) + ); + assert_eq!( + addrparse("\"foo\" <bar>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string()))]) + ); + assert_eq!( + addrparse("\"foo \" <bar>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("foo ".to_string()), "bar".to_string()))]) + ); + } + + #[test] + fn parse_backslashes() { + assert_eq!( + addrparse(r#" "First \"nick\" Last" <user@host.tld> "#).unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("First \"nick\" Last".to_string()), "user@host.tld".to_string()))]) + ); + assert_eq!( + addrparse(r#" First \"nick\" Last <user@host.tld> "#).unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("First \\\"nick\\\" Last".to_string()), "user@host.tld".to_string()))]) + ); + } + + #[test] + fn parse_multi() { + assert_eq!( + addrparse("foo <bar>, joe, baz <quux>").unwrap(), + MailAddrList(vec![ + MailAddr::Single(SingleInfo::new(Some("foo".to_string()), "bar".to_string())), + MailAddr::Single(SingleInfo::new(None, "joe".to_string())), + MailAddr::Single(SingleInfo::new(Some("baz".to_string()), "quux".to_string())), + ]) + ); + } + + #[test] + fn parse_empty_group() { + assert_eq!( + addrparse("empty-group:;").unwrap(), + MailAddrList(vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]) + ); + assert_eq!( + addrparse(" empty-group : ; ").unwrap(), + MailAddrList(vec![MailAddr::Group(GroupInfo::new("empty-group".to_string(), vec![]))]) + ); + } + + #[test] + fn parse_simple_group() { + assert_eq!( + addrparse("bar-group: foo <foo@bar.com>;").unwrap(), + MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + ])) + ]) + ); + assert_eq!( + addrparse("bar-group: foo <foo@bar.com>, baz@bar.com;").unwrap(), + MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + SingleInfo::new(None, "baz@bar.com".to_string()), + ])) + ]) + ); + } + + #[test] + fn parse_mixed() { + assert_eq!( + addrparse("joe@bloe.com, bar-group: foo <foo@bar.com>;").unwrap(), + MailAddrList(vec![ + MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())), + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + ])), + ]) + ); + assert_eq!( + addrparse("bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(), + MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())), + ]) + ); + assert_eq!( + addrparse("flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com").unwrap(), + MailAddrList(vec![ + MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())), + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())), + ]) + ); + assert_eq!( + addrparse("first-group:; flim@flam.com, bar-group: foo <foo@bar.com>; joe@bloe.com, final-group: zip, zap, \"Zaphod\" <zaphod@beeblebrox>;").unwrap(), + MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("first-group".to_string(), vec![])), + MailAddr::Single(SingleInfo::new(None, "flim@flam.com".to_string())), + MailAddr::Group(GroupInfo::new("bar-group".to_string(), vec![ + SingleInfo::new(Some("foo".to_string()), "foo@bar.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(None, "joe@bloe.com".to_string())), + MailAddr::Group(GroupInfo::new("final-group".to_string(), vec![ + SingleInfo::new(None, "zip".to_string()), + SingleInfo::new(None, "zap".to_string()), + SingleInfo::new(Some("Zaphod".to_string()), "zaphod@beeblebrox".to_string()), + ])), + ]) + ); + } + + #[test] + fn real_world_examples() { + // taken from a real "From" header. This might not be valid according to the RFC + // but obviously made it through the internet so we should at least not crash. + assert_eq!( + addrparse("\"The Foo of Bar\" Course Staff <foo-no-reply@bar.edx.org>").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("The Foo of Bar Course Staff".to_string()), "foo-no-reply@bar.edx.org".to_string()))]) + ); + + // This one has a comment tacked on to the end. Adding proper support for comments seems + // complicated so I just added trailer comment support. + assert_eq!( + addrparse("John Doe <support@github.com> (GitHub Staff)").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(Some("John Doe".to_string()), "support@github.com".to_string()))]) + ); + + // Taken from a real world "To" header. It was spam, but still... + assert_eq!( + addrparse("foo@bar.com;").unwrap(), + MailAddrList(vec![MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string()))]) + ); + } + + #[test] + fn stringify_single() { + let tc = SingleInfo::new(Some("John Doe".to_string()), "john@doe.com".to_string()); + assert_eq!(tc.to_string(), r#""John Doe" <john@doe.com>"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Single(tc)])); + + let tc = SingleInfo::new(Some(r#"John "Jack" Doe"#.to_string()), "john@doe.com".to_string()); + assert_eq!(tc.to_string(), r#""John \"Jack\" Doe" <john@doe.com>"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Single(tc)])); + + let tc = SingleInfo::new(None, "foo@bar.com".to_string()); + assert_eq!(tc.to_string(), r#"foo@bar.com"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Single(tc)])); + } + + #[test] + fn stringify_group() { + let tc = GroupInfo::new("group-name".to_string(), vec![ + SingleInfo::new(None, "foo@bar.com".to_string()), + SingleInfo::new(Some("A".to_string()), "a@b".to_string()), + ]); + assert_eq!(tc.to_string(), r#""group-name": foo@bar.com, "A" <a@b>;"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Group(tc)])); + + let tc = GroupInfo::new("empty-group".to_string(), vec![]); + assert_eq!(tc.to_string(), r#""empty-group":;"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Group(tc)])); + + let tc = GroupInfo::new(r#"group-with"quote"#.to_string(), vec![]); + assert_eq!(tc.to_string(), r#""group-with\"quote":;"#); + assert_eq!(addrparse(&tc.to_string()).unwrap(), MailAddrList(vec![MailAddr::Group(tc)])); + } + + #[test] + fn stringify_list() { + let tc = MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("marvel".to_string(), vec![ + SingleInfo::new(None, "ironman@marvel.com".to_string()), + SingleInfo::new(None, "spiderman@marvel.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(Some("b-man".to_string()), "b@man.com".to_string())), + MailAddr::Group(GroupInfo::new("dc".to_string(), vec![ + SingleInfo::new(None, "batman@dc.com".to_string()), + SingleInfo::new(None, "superman@dc.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(Some("d-woman".to_string()), "d@woman.com".to_string())), + ]); + assert_eq!(tc.to_string(), + r#""marvel": ironman@marvel.com, spiderman@marvel.com; "b-man" <b@man.com>, "dc": batman@dc.com, superman@dc.com; "d-woman" <d@woman.com>"#); + } + + #[test] + fn count_addrs() { + let tc = MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("marvel".to_string(), vec![ + SingleInfo::new(None, "ironman@marvel.com".to_string()), + SingleInfo::new(None, "spiderman@marvel.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(Some("b-man".to_string()), "b@man.com".to_string())), + MailAddr::Group(GroupInfo::new("dc".to_string(), vec![ + SingleInfo::new(None, "batman@dc.com".to_string()), + SingleInfo::new(None, "superman@dc.com".to_string()), + ])), + MailAddr::Single(SingleInfo::new(Some("d-woman".to_string()), "d@woman.com".to_string())), + ]); + assert_eq!(tc.count_addrs(), 6); + assert_eq!(tc.extract_single_info(), None); + + let tc = MailAddrList(vec![]); + assert_eq!(tc.count_addrs(), 0); + assert_eq!(tc.extract_single_info(), None); + + let tc = MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("group".to_string(), vec![ + SingleInfo::new(None, "foo@bar.com".to_string()), + ])), + ]); + assert_eq!(tc.count_addrs(), 1); + assert_eq!(tc.extract_single_info(), None); + + let tc = MailAddrList(vec![ + MailAddr::Single(SingleInfo::new(None, "foo@bar.com".to_string())), + ]); + assert_eq!(tc.count_addrs(), 1); + assert_eq!(tc.extract_single_info(), Some(SingleInfo::new(None, "foo@bar.com".to_string()))); + + let tc = MailAddrList(vec![ + MailAddr::Group(GroupInfo::new("group".to_string(), vec![])), + MailAddr::Group(GroupInfo::new("group".to_string(), vec![])), + ]); + assert_eq!(tc.count_addrs(), 0); + assert_eq!(tc.extract_single_info(), None); + + } +} diff --git a/parser/src/body.rs b/parser/src/body.rs new file mode 100644 index 0000000..4a7fc36 --- /dev/null +++ b/parser/src/body.rs @@ -0,0 +1,153 @@ +use charset::{decode_ascii, Charset}; +use crate::error::MailParseError; +use crate::parser::ParsedContentType; + +/// Represents the body of an email (or mail subpart) +pub enum Body<'a> { + /// A body with 'base64' Content-Transfer-Encoding. + Base64(EncodedBody<'a>), + /// A body with 'quoted-printable' Content-Transfer-Encoding. + QuotedPrintable(EncodedBody<'a>), + /// A body with '7bit' Content-Transfer-Encoding. + SevenBit(TextBody<'a>), + /// A body with '8bit' Content-Transfer-Encoding. + EightBit(TextBody<'a>), + /// A body with 'binary' Content-Transfer-Encoding. + Binary(BinaryBody<'a>), +} + +impl<'a> Body<'a> { + pub fn new( + body: &'a [u8], + ctype: &'a ParsedContentType, + transfer_encoding: &Option<String>, + ) -> Body<'a> { + transfer_encoding + .as_ref() + .map(|encoding| match encoding.as_ref() { + "base64" => Body::Base64(EncodedBody { + decoder: decode_base64, + body, + ctype, + }), + "quoted-printable" => Body::QuotedPrintable(EncodedBody { + decoder: decode_quoted_printable, + body, + ctype, + }), + "7bit" => Body::SevenBit(TextBody { body, ctype }), + "8bit" => Body::EightBit(TextBody { body, ctype }), + "binary" => Body::Binary(BinaryBody { body, ctype }), + _ => Body::get_default(body, ctype), + }) + .unwrap_or_else(|| Body::get_default(body, ctype)) + } + + fn get_default(body: &'a [u8], ctype: &'a ParsedContentType) -> Body<'a> { + Body::SevenBit(TextBody { body, ctype }) + } +} + +/// Struct that holds the encoded body representation of the message (or message subpart). +pub struct EncodedBody<'a> { + decoder: fn(&[u8]) -> Result<Vec<u8>, MailParseError>, + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> EncodedBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } + + /// Get the decoded body of the message (or message subpart). + pub fn get_decoded(&self) -> Result<Vec<u8>, MailParseError> { + (self.decoder)(self.body) + } + + /// Get the body of the message as a Rust string. + /// This function tries to decode the body and then converts + /// the result into a Rust UTF-8 string using the charset in the Content-Type + /// (or "us-ascii" if the charset was missing or not recognized). + /// This operation returns a valid result only if the decoded body + /// has a text format. + pub fn get_decoded_as_string(&self) -> Result<String, MailParseError> { + get_body_as_string(&self.get_decoded()?, &self.ctype) + } +} + +/// Struct that holds the textual body representation of the message (or message subpart). +pub struct TextBody<'a> { + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> TextBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } + + /// Get the body of the message as a Rust string. + /// This function converts the body into a Rust UTF-8 string using the charset + /// in the Content-Type + /// (or "us-ascii" if the charset was missing or not recognized). + pub fn get_as_string(&self) -> Result<String, MailParseError> { + get_body_as_string(self.body, &self.ctype) + } +} + +/// Struct that holds a binary body representation of the message (or message subpart). +pub struct BinaryBody<'a> { + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> BinaryBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } +} + +fn decode_base64(body: &[u8]) -> Result<Vec<u8>, MailParseError> { + let cleaned = body + .iter() + .filter(|c| !c.is_ascii_whitespace()) + .cloned() + .collect::<Vec<u8>>(); + Ok(base64::decode(&cleaned)?) +} + +fn decode_quoted_printable(body: &[u8]) -> Result<Vec<u8>, MailParseError> { + Ok(quoted_printable::decode( + body, + quoted_printable::ParseMode::Robust, + )?) +} + +fn get_body_as_string(body: &[u8], ctype: &ParsedContentType) -> Result<String, MailParseError> { + let cow = if let Some(charset) = Charset::for_label(ctype.charset.as_bytes()) { + let (cow, _, _) = charset.decode(body); + cow + } else { + decode_ascii(body) + }; + Ok(cow.into_owned()) +} diff --git a/parser/src/dateparse.rs b/parser/src/dateparse.rs new file mode 100644 index 0000000..ad735ef --- /dev/null +++ b/parser/src/dateparse.rs @@ -0,0 +1,220 @@ +enum DateParseState { + Date, + Month, + Year, + Hour, + Minute, + Second, + Timezone, +} + +fn days_in_month(month: i64, year: i64) -> i64 { + match month { + 0 | 2 | 4 | 6 | 7 | 9 | 11 => 31, + 3 | 5 | 8 | 10 => 30, + 1 => { + if (year % 400) == 0 { + 29 + } else if (year % 100) == 0 { + 28 + } else if (year % 4) == 0 { + 29 + } else { + 28 + } + } + _ => 0, + } +} + +fn seconds_to_date(year: i64, month: i64, day: i64) -> i64 { + let mut result: i64 = 0; + for y in 1970..2001 { + if y == year { + break; + } + result += 86400 * 365; + if (y % 4) == 0 { + result += 86400; + } + } + let mut y = 2001; + while y < year { + if year - y >= 400 { + result += (86400 * 365 * 400) + (86400 * 97); + y += 400; + continue; + } + if year - y >= 100 { + result += (86400 * 365 * 100) + (86400 * 24); + y += 100; + continue; + } + if year - y >= 4 { + result += (86400 * 365 * 4) + (86400); + y += 4; + continue; + } + result += 86400 * 365; + y += 1; + } + for m in 0..month { + result += 86400 * days_in_month(m, year) + } + result + 86400 * (day - 1) +} + +/// Convert a date field from an email header into a UNIX epoch timestamp. +/// This function handles the most common formatting of date fields found in +/// email headers. It may fail to parse some of the more creative formattings. +/// +/// # Examples +/// ``` +/// use mailparse::dateparse; +/// assert_eq!(dateparse("Sun, 02 Oct 2016 07:06:22 -0700 (PDT)").unwrap(), 1475417182); +/// ``` +pub fn dateparse(date: &str) -> Result<i64, &'static str> { + let mut result = 0; + let mut month = 0; + let mut day_of_month = 0; + let mut state = DateParseState::Date; + for tok in date.split(|c| c == ' ' || c == ':') { + if tok.is_empty() { + continue; + } + match state { + DateParseState::Date => { + if let Ok(v) = tok.parse::<u8>() { + day_of_month = v; + state = DateParseState::Month; + }; + continue; + } + DateParseState::Month => { + month = match tok.to_uppercase().as_str() { + "JAN" | "JANUARY" => 0, + "FEB" | "FEBRUARY" => 1, + |