From a4a0e917580cf465ffd1aa8ae6723860aab23c6f Mon Sep 17 00:00:00 2001 From: ufoscout Date: Tue, 28 May 2019 13:16:13 +0200 Subject: Allow access to encoded body --- src/body.rs | 152 ++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 205 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 331 insertions(+), 26 deletions(-) create mode 100644 src/body.rs (limited to 'src') diff --git a/src/body.rs b/src/body.rs new file mode 100644 index 0000000..ed1b617 --- /dev/null +++ b/src/body.rs @@ -0,0 +1,152 @@ +use charset::{decode_ascii, Charset}; +use {MailParseError, ParsedContentType}; + +/// Represents the body of an email (or mail subpart) +pub enum Body<'a> { + /// A body with 'base64' Content-Transfer-Encoding. + Base64(EncodedBody<'a>), + /// A body with 'quoted-printable' Content-Transfer-Encoding. + QuotedPrintable(EncodedBody<'a>), + /// A body with '7bit' Content-Transfer-Encoding. + SevenBit(TextBody<'a>), + /// A body with '8bit' Content-Transfer-Encoding. + EightBit(TextBody<'a>), + /// A body with 'binary' Content-Transfer-Encoding. + Binary(BinaryBody<'a>), +} + +impl<'a> Body<'a> { + pub fn new( + body: &'a [u8], + ctype: &'a ParsedContentType, + transfer_encoding: &Option, + ) -> Body<'a> { + transfer_encoding + .as_ref() + .map(|encoding| match encoding.as_ref() { + "base64" => Body::Base64(EncodedBody { + decoder: decode_base64, + body, + ctype, + }), + "quoted-printable" => Body::QuotedPrintable(EncodedBody { + decoder: decode_quoted_printable, + body, + ctype, + }), + "7bit" => Body::SevenBit(TextBody { body, ctype }), + "8bit" => Body::EightBit(TextBody { body, ctype }), + "binary" => Body::Binary(BinaryBody { body, ctype }), + _ => Body::get_default(body, ctype), + }) + .unwrap_or_else(|| Body::get_default(body, ctype)) + } + + fn get_default(body: &'a [u8], ctype: &'a ParsedContentType) -> Body<'a> { + Body::SevenBit(TextBody { body, ctype }) + } +} + +/// Struct that holds the encoded body representation of the message (or message subpart). +pub struct EncodedBody<'a> { + decoder: fn(&[u8]) -> Result, MailParseError>, + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> EncodedBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } + + /// Get the decoded body of the message (or message subpart). + pub fn get_decoded(&self) -> Result, MailParseError> { + (self.decoder)(self.body) + } + + /// Get the body of the message as a Rust string. + /// This function tries to decode the body and then converts + /// the result into a Rust UTF-8 string using the charset in the Content-Type + /// (or "us-ascii" if the charset was missing or not recognized). + /// This operation returns a valid result only if the decoded body + /// has a text format. + pub fn get_decoded_as_string(&self) -> Result { + get_body_as_string(&self.get_decoded()?, &self.ctype) + } +} + +/// Struct that holds the textual body representation of the message (or message subpart). +pub struct TextBody<'a> { + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> TextBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } + + /// Get the body of the message as a Rust string. + /// This function converts the body into a Rust UTF-8 string using the charset + /// in the Content-Type + /// (or "us-ascii" if the charset was missing or not recognized). + pub fn get_as_string(&self) -> Result { + get_body_as_string(self.body, &self.ctype) + } +} + +/// Struct that holds a binary body representation of the message (or message subpart). +pub struct BinaryBody<'a> { + ctype: &'a ParsedContentType, + body: &'a [u8], +} + +impl<'a> BinaryBody<'a> { + /// Get the body Content-Type + pub fn get_content_type(&self) -> &'a ParsedContentType { + self.ctype + } + + /// Get the raw body of the message exactly as it is written in the message (or message subpart). + pub fn get_raw(&self) -> &'a [u8] { + self.body + } +} + +fn decode_base64(body: &[u8]) -> Result, MailParseError> { + let cleaned = body + .iter() + .filter(|c| !c.is_ascii_whitespace()) + .cloned() + .collect::>(); + Ok(base64::decode(&cleaned)?) +} + +fn decode_quoted_printable(body: &[u8]) -> Result, MailParseError> { + Ok(quoted_printable::decode( + body, + quoted_printable::ParseMode::Robust, + )?) +} + +fn get_body_as_string(body: &[u8], ctype: &ParsedContentType) -> Result { + let cow = if let Some(charset) = Charset::for_label(ctype.charset.as_bytes()) { + let (cow, _, _) = charset.decode(body); + cow + } else { + decode_ascii(body) + }; + Ok(cow.into_owned()) +} diff --git a/src/lib.rs b/src/lib.rs index c727c58..7af26bc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,12 +7,13 @@ use std::error; use std::fmt; use std::ops::Deref; -use charset::decode_ascii; use charset::decode_latin1; use charset::Charset; +pub mod body; mod dateparse; +use body::Body; pub use dateparse::dateparse; /// An error type that represents the different kinds of errors that may be @@ -650,14 +651,13 @@ impl<'a> ParsedMail<'a> { /// assert_eq!(p.get_body().unwrap(), "This is the body"); /// ``` pub fn get_body(&self) -> Result { - let decoded = self.get_body_raw()?; - let cow = if let Some(charset) = Charset::for_label(self.ctype.charset.as_bytes()) { - let (cow, _, _) = charset.decode(&decoded); - cow - } else { - decode_ascii(&decoded) - }; - Ok(cow.into_owned()) + match self.get_body_encoded()? { + Body::Base64(body) | Body::QuotedPrintable(body) => body.get_decoded_as_string(), + Body::SevenBit(body) | Body::EightBit(body) => body.get_as_string(), + Body::Binary(_) => Err(MailParseError::Generic( + "Message body of type binary body cannot be parsed into a string", + )), + } } /// Get the body of the message as a Rust Vec. This function tries to @@ -675,27 +675,59 @@ impl<'a> ParsedMail<'a> { /// assert_eq!(p.get_body_raw().unwrap(), b"This is the body"); /// ``` pub fn get_body_raw(&self) -> Result, MailParseError> { - let transfer_coding = self + match self.get_body_encoded()? { + Body::Base64(body) | Body::QuotedPrintable(body) => body.get_decoded(), + Body::SevenBit(body) | Body::EightBit(body) => Ok(Vec::::from(body.get_raw())), + Body::Binary(body) => Ok(Vec::::from(body.get_raw())), + } + } + + /// Get the body of the message. + /// This function returns original the body without attempting to + /// unapply the Content-Transfer-Encoding. + /// + /// # Examples + /// ``` + /// use mailparse::parse_mail; + /// use mailparse::body::Body; + /// + /// let mail = parse_mail(b"Content-Transfer-Encoding: base64\r\n\r\naGVsbG 8gd\r\n29ybGQ=").unwrap(); + /// + /// match mail.get_body_encoded().unwrap() { + /// Body::Base64(body) => { + /// assert_eq!(body.get_raw(), b"aGVsbG 8gd\r\n29ybGQ="); + /// assert_eq!(body.get_decoded().unwrap(), b"hello world"); + /// assert_eq!(body.get_decoded_as_string().unwrap(), "hello world"); + /// }, + /// _ => assert!(false), + /// }; + /// + /// + /// // An email whose body encoding is not known upfront + /// let another_mail = parse_mail(b"").unwrap(); + /// + /// match another_mail.get_body_encoded().unwrap() { + /// Body::Base64(body) | Body::QuotedPrintable(body) => { + /// println!("mail body encoded: {:?}", body.get_raw()); + /// println!("mail body decoded: {:?}", body.get_decoded().unwrap()); + /// println!("mail body decoded as string: {}", body.get_decoded_as_string().unwrap()); + /// }, + /// Body::SevenBit(body) | Body::EightBit(body) => { + /// println!("mail body: {:?}", body.get_raw()); + /// println!("mail body as string: {}", body.get_as_string().unwrap()); + /// }, + /// Body::Binary(body) => { + /// println!("mail body binary: {:?}", body.get_raw()); + /// } + /// } + /// ``` + pub fn get_body_encoded(&'a self) -> Result, MailParseError> { + let transfer_encoding = self .headers .get_first_value("Content-Transfer-Encoding")? .map(|s| s.to_lowercase()); - let decoded = match transfer_coding { - Some(ref enc) if enc == "base64" => { - let cleaned = self - .body - .iter() - .filter(|c| !c.is_ascii_whitespace()) - .cloned() - .collect::>(); - base64::decode(&cleaned)? - } - Some(ref enc) if enc == "quoted-printable" => { - quoted_printable::decode(self.body, quoted_printable::ParseMode::Robust)? - } - _ => Vec::::from(self.body), - }; - Ok(decoded) + Ok(Body::new(self.body, &self.ctype, &transfer_encoding)) } /// Returns a struct containing a parsed representation of the @@ -1265,4 +1297,125 @@ mod tests { let parsed = parse_param_content(r#"Content-Type: application/octet-stream; name=""#); assert_eq!(parsed.params["name"], "\""); } + + #[test] + fn test_default_content_encoding() { + let mail = parse_mail(b"Content-Type: text/plain; charset=UTF-7\r\n\r\n+JgM-").unwrap(); + let body = mail.get_body_encoded().unwrap(); + match body { + Body::SevenBit(body) => { + assert_eq!(body.get_raw(), b"+JgM-"); + assert_eq!(body.get_as_string().unwrap(), "\u{2603}"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_7bit_content_encoding() { + let mail = parse_mail(b"Content-Type: text/plain; charset=UTF-7\r\nContent-Transfer-Encoding: 7bit\r\n\r\n+JgM-").unwrap(); + let body = mail.get_body_encoded().unwrap(); + match body { + Body::SevenBit(body) => { + assert_eq!(body.get_raw(), b"+JgM-"); + assert_eq!(body.get_as_string().unwrap(), "\u{2603}"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_8bit_content_encoding() { + let mail = parse_mail(b"Content-Type: text/plain; charset=UTF-7\r\nContent-Transfer-Encoding: 8bit\r\n\r\n+JgM-").unwrap(); + let body = mail.get_body_encoded().unwrap(); + match body { + Body::EightBit(body) => { + assert_eq!(body.get_raw(), b"+JgM-"); + assert_eq!(body.get_as_string().unwrap(), "\u{2603}"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_quoted_printable_content_encoding() { + let mail = parse_mail( + b"Content-Type: text/plain; charset=UTF-7\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n+JgM-", + ).unwrap(); + match mail.get_body_encoded().unwrap() { + Body::QuotedPrintable(body) => { + assert_eq!(body.get_raw(), b"+JgM-"); + assert_eq!(body.get_decoded().unwrap(), b"+JgM-"); + assert_eq!(body.get_decoded_as_string().unwrap(), "\u{2603}"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_base64_content_encoding() { + let mail = + parse_mail(b"Content-Transfer-Encoding: base64\r\n\r\naGVsbG 8gd\r\n29ybGQ=").unwrap(); + match mail.get_body_encoded().unwrap() { + Body::Base64(body) => { + assert_eq!(body.get_raw(), b"aGVsbG 8gd\r\n29ybGQ="); + assert_eq!(body.get_decoded().unwrap(), b"hello world"); + assert_eq!(body.get_decoded_as_string().unwrap(), "hello world"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_binary_content_encoding() { + let mail = parse_mail(b"Content-Transfer-Encoding: binary\r\n\r\n######").unwrap(); + let body = mail.get_body_encoded().unwrap(); + match body { + Body::Binary(body) => { + assert_eq!(body.get_raw(), b"######"); + } + _ => assert!(false), + }; + } + + #[test] + fn test_body_content_encoding_with_multipart() { + let mail_filepath = "./tests/files/test_email_01.txt"; + let mail = std::fs::read(mail_filepath) + .expect(&format!("Unable to open the file [{}]", mail_filepath)); + let mail = parse_mail(&mail).unwrap(); + + let subpart_0 = mail.subparts.get(0).unwrap(); + match subpart_0.get_body_encoded().unwrap() { + Body::SevenBit(body) => { + assert_eq!( + body.get_as_string().unwrap().trim(), + "Test with attachments" + ); + } + _ => assert!(false), + }; + + let subpart_1 = mail.subparts.get(1).unwrap(); + match subpart_1.get_body_encoded().unwrap() { + Body::Base64(body) => { + let pdf_filepath = "./tests/files/test_email_01_sample.pdf"; + let original_pdf = std::fs::read(pdf_filepath) + .expect(&format!("Unable to open the file [{}]", pdf_filepath)); + assert_eq!(body.get_decoded().unwrap(), original_pdf); + } + _ => assert!(false), + }; + + let subpart_2 = mail.subparts.get(2).unwrap(); + match subpart_2.get_body_encoded().unwrap() { + Body::Base64(body) => { + assert_eq!( + body.get_decoded_as_string().unwrap(), + "txt file context for email collector\n1234567890987654321\n" + ); + } + _ => assert!(false), + }; + } } -- cgit v1.2.3