// Copyright 2019 Alexandros Frantzis
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// SPDX-License-Identifier: MPL-2.0

//! Normalization of email data for easier processing.
//!
//! Normalization includes:
//!
//! * Placing multi-line header fields on a single line
//! * Decoding base64 or quoted-printable encoded text data, including
//!   MIME encoded-words in the header.
//! * Converting all text data to UTF-8.

use ::regex::bytes::{RegexBuilder, Regex, Captures};
use std::collections::HashMap;
use std::iter::Peekable;
use memchr::{memchr, memchr_iter};
use charset::Charset;
use std::borrow::Cow;
use lazy_static::lazy_static;

use crate::decode::{base64_decode_into_buf, qp_decode_into_buf};

/// An element recognized by the [EmailParser](struct.EmailParser.html).
enum Element {
    HeaderField{data: Vec<u8>},
    Body{
        data: Vec<u8>,
        encoding: Option<String>,
        content_type: Option<String>,
        charset: Option<String>
    },
    Verbatim{data: Vec<u8>},
}

/// Information about a part in a multi-part email message.
/// The top level is also considered a part.
struct Part {
    encoding: Option<String>,
    content_type: Option<String>,
    charset: Option<String>,
    subpart_boundary: Option<Vec<u8>>,
}

impl Part {
    fn new() -> Self {
        Part{
            encoding: None,
            content_type: None,
            charset: None,
            subpart_boundary: None,
        }
    }
}

/// Iterator for the lines contained in a slice of [u8].
pub struct SliceLines<'a> {
    buf: &'a [u8],
    last: usize,
}

impl<'a> Iterator for SliceLines<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<&'a [u8]> {
        match memchr(b'\n', &self.buf[self.last..]) {
            Some(m) => {
                let line = &self.buf[self.last..=(self.last + m)];
                self.last = self.last + m + 1;
                Some(line)
            },
            None => {
                let line = &self.buf[self.last..];
                if line.is_empty() {
                    None
                } else {
                    self.last = self.buf.len();
                    Some(line)
                }
            }
        }
    }
}

/// A parser for the elements contained in an email.
///
/// The parsed elements are accessible by iterating over the parser.
///
/// Every line in the email is contained in a MIME part (which itself may be
/// nested in another part). The top level of the email is also considered
/// to be a part for convenience of processing.
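///
/// A rough sketch of how the parser is driven (illustrative only, not a
/// compiled doctest, since this type is private); it mirrors how
/// `normalize_email` below uses it:
///
/// ```ignore
/// for element in EmailParser::new(data) {
///     match element {
///         Element::HeaderField{..} => { /* decode MIME encoded-words */ },
///         Element::Body{..} => { /* decode base64/quoted-printable text */ },
///         Element::Verbatim{..} => { /* copy through unchanged */ },
///     }
/// }
/// ```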
struct EmailParser<'a> {
    lines: Peekable<SliceLines<'a>>,
    // The stack of nested parts the line we are processing is contained in.
    part_stack: Vec<Part>,
    // Whether we are currently parsing header lines.
    in_header: bool,
    // The active multi-part boundary.
    active_boundary: Vec<u8>,
    content_encoding_regex: Regex,
    content_type_regex: Regex,
    boundary_regex: Regex,
}

impl<'a> EmailParser<'a> {
    fn new(buf: &'a [u8]) -> Self {
        let content_encoding_regex =
            RegexBuilder::new(r"Content-Transfer-Encoding:\s*([[:alnum:]-]+)")
                .case_insensitive(true)
                .build().unwrap();

        let content_type_regex =
            RegexBuilder::new(r#"^Content-Type:\s*([^;]+)\s*(?:;\s*charset\s*=\s*"?([[:alnum:]_:\-\.]+))?"?"#)
                .case_insensitive(true)
                .build().unwrap();

        let boundary_regex =
            RegexBuilder::new(r#"^Content-Type:\s*multipart/.*boundary\s*=\s*"?([[:alnum:]'_,/:=\(\)\+\-\.\?]+)"?"#)
                .case_insensitive(true)
                .build().unwrap();

        EmailParser{
            lines: SliceLines{buf, last: 0}.peekable(),
            // All emails have the top-level part.
            part_stack: vec![Part::new()],
            in_header: true,
            active_boundary: Vec::new(),
            content_encoding_regex: content_encoding_regex,
            content_type_regex: content_type_regex,
            boundary_regex: boundary_regex,
        }
    }

    // Returns the content type of the active part.
    fn active_content_type(&self) -> Option<String> {
        self.part_stack.last()?.content_type.clone()
    }

    // Returns the encoding of the active part.
    fn active_encoding(&self) -> Option<String> {
        self.part_stack.last()?.encoding.clone()
    }

    // Returns the charset of the active part.
    fn active_charset(&self) -> Option<String> {
        self.part_stack.last()?.charset.clone()
    }

    fn begin_part(&mut self) {
        let part = self.part_stack.last().unwrap();

        // We need to differentiate between the first and subsequent parts in a
        // multipart message. The first part creates a new subpart in the
        // part_stack...
        if part.subpart_boundary.as_ref().is_some() &&
           part.subpart_boundary.as_ref().unwrap() == &self.active_boundary {
            self.part_stack.push(Part::new())
        } else {
            // ...whereas subsequent sibling parts just replace the existing
            // part in the stack.
            let part = self.part_stack.last_mut().unwrap();
            *part = Part::new();
        }
    }

    fn end_part(&mut self) {
        match &self.part_stack.last().unwrap().subpart_boundary {
            // If the last part is the top part (i.e., we just had a boundary
            // end line without a preceding boundary start line), do nothing.
            Some(b) if b == &self.active_boundary => {},
            // Otherwise, remove the active part.
            _ => { self.part_stack.pop(); }
        }

        // Remove boundary info from the top part.
        self.part_stack.last_mut().unwrap().subpart_boundary = None;

        // The active boundary reverts to that of the innermost enclosing part
        // that still has one.
        self.active_boundary.clear();
        for p in self.part_stack.iter().rev() {
            if let Some(b) = &p.subpart_boundary {
                self.active_boundary = b.clone();
                break;
            }
        }
    }

    fn update_active_part_from_header_field(&mut self, field: &[u8]) {
        let mut part = self.part_stack.last_mut().unwrap();

        if let Some(captures) = self.content_encoding_regex.captures(&field) {
            let enc_bytes = captures.get(1).unwrap().as_bytes();
            part.encoding = Some(std::str::from_utf8(&enc_bytes).unwrap().to_lowercase());
        } else if let Some(captures) = self.boundary_regex.captures(&field) {
            part.subpart_boundary = Some(captures.get(1).unwrap().as_bytes().to_vec());
            self.active_boundary = part.subpart_boundary.as_ref().unwrap().clone();
        } else if let Some(captures) = self.content_type_regex.captures(&field) {
            let type_bytes = captures.get(1).unwrap().as_bytes();
            part.content_type = Some(std::str::from_utf8(&type_bytes).unwrap().to_lowercase());
            if let Some(charset) = captures.get(2) {
                part.charset = Some(std::str::from_utf8(charset.as_bytes()).unwrap().to_lowercase());
            }
        }
    }
}

/// Removes newline characters from the end of a byte vector.
fn vec_trim_end_newline(line: &mut Vec<u8>) {
    while let Some(&b) = line.last() {
        if b != b'\n' && b != b'\r' {
            break;
        }
        line.pop();
    }
}

/// Returns a new slice not including any newline characters from the
/// end of an existing slice.
fn slice_trim_end_newline(mut line: &[u8]) -> &[u8] {
    while let Some(&b) = line.last() {
        if b != b'\n' && b != b'\r' {
            break;
        }
        line = &line[..line.len()-1];
    }

    line
}

/// Returns whether a line of bytes is a multi-part boundary line for the
/// specified boundary string.
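///
/// For example (illustrative only; not compiled as a doctest, since this
/// function is private), both the opening and the closing form of a boundary
/// line are recognized:
///
/// ```ignore
/// assert!(is_boundary_line(b"--frontier\r\n", b"frontier"));  // part start
/// assert!(is_boundary_line(b"--frontier--\n", b"frontier"));  // part end
/// assert!(!is_boundary_line(b"--other\n", b"frontier"));
/// ```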
fn is_boundary_line(line: &[u8], boundary: &[u8]) -> bool {
    if line.starts_with(b"--") && !boundary.is_empty() {
        let line = slice_trim_end_newline(&line);
        let line =
            if line.ends_with(b"--") {
                &line[..line.len()-2]
            } else {
                &line[..]
            };
        return line.len() > 2 && &line[2..] == boundary;
    }

    false
}

impl Iterator for EmailParser<'_> {
    type Item = Element;

    fn next(&mut self) -> Option<Element> {
        let mut inprogress = Vec::new();
        let mut element = None;

        // Loop until we recognize an element (or reach the end of input).
        loop {
            let line = match self.lines.next() {
                Some(l) => l,
                None => break,
            };

            if self.in_header {
                match line[0] {
                    // Empty lines denote the end of the header.
                    b'\n' | b'\r' => {
                        self.in_header = false;
                        element = Some(Element::Verbatim{data: line.to_vec()});
                        break;
                    },
                    // Lines beginning with whitespace are continuation lines.
                    b' ' | b'\t' => {
                        vec_trim_end_newline(&mut inprogress);
                        inprogress.extend(line);
                    },
                    _ => inprogress = line.to_vec(),
                };

                // If the next line is not a continuation line, break
                // to emit the current header field.
                if let Some(next_line) = self.lines.peek() {
                    if next_line[0] != b' ' && next_line[0] != b'\t' {
                        break;
                    }
                }

                continue;
            }

            if is_boundary_line(&line, &self.active_boundary) {
                if slice_trim_end_newline(&line).ends_with(b"--") {
                    self.end_part();
                } else {
                    self.begin_part();
                    // After a boundary start line we expect a header.
                    self.in_header = true;
                }
                element = Some(Element::Verbatim{data: line.to_vec()});
                break;
            }

            // If we reached this point, this line is a body line. Append
            // it to the in-progress data.
            inprogress.extend(line);

            // If the next line is a boundary line, break to emit the current
            // body.
            if let Some(next_line) = self.lines.peek() {
                if is_boundary_line(next_line, &self.active_boundary) {
                    break;
                }
            }
        }

        // Breaking out of the loop happens in three cases:
        // 1. End of input
        // 2. We have recognized a verbatim element.
        // 3. We have in-progress data that we have recognized as a header
        //    field or body.

        // If we have in-progress data, emit it as a header field or body.
        if !inprogress.is_empty() {
            // We shouldn't have set an element at this point, since we have
            // in-progress data, and this would lead to loss of data.
            assert!(element.is_none());

            if self.in_header {
                element = Some(Element::HeaderField{data: inprogress});
            } else {
                element = Some(
                    Element::Body{
                        data: inprogress,
                        encoding: self.active_encoding(),
                        content_type: self.active_content_type(),
                        charset: self.active_charset(),
                    }
                );
            }
        }

        if let Some(Element::HeaderField{data: field}) = element.as_ref() {
            self.update_active_part_from_header_field(&field);
        }

        element
    }
}

/// Decodes a byte array slice with the specified content encoding and charset
/// to UTF-8 byte data, appending to the specified Vec.
fn decode_text_data_to_buf(
    data: &[u8],
    encoding: Option<&str>,
    charset: Option<&str>,
    mut out: &mut Vec<u8>,
) {
    let should_decode = encoding.is_some();
    let mut should_convert_charset = true;
    let initial_len = out.len();

    if should_decode {
        let result = match encoding.unwrap().as_ref() {
            "base64" => base64_decode_into_buf(&data, &mut out),
            "quoted-printable" => qp_decode_into_buf(&data, &mut out),
            "8bit" | "binary" => {
                out.extend(data);
                Ok(())
            },
            _ => Err("unknown encoding".into()),
        };

        if result.is_ok() {
            // During decoding the final CRLF/LF in the data may be dropped.
            // Restore it to ensure that subsequent lines don't get folded
            // with the decoded data.
            const CRLF: &[u8] = &[b'\r', b'\n'];
            const LF: &[u8] = &[b'\n'];
            if data.ends_with(CRLF) && !out.ends_with(CRLF) {
                out.extend(CRLF);
            } else if data.ends_with(LF) && !out.ends_with(LF) {
                out.extend(LF);
            }
        } else {
            out.resize(initial_len, 0);
            should_convert_charset = false;
        }
    }

    // If nothing was decoded, keep the data as-is.
    if out.len() == initial_len {
        out.extend(data);
    }

    if should_convert_charset {
        if let Some(chr) = Charset::for_label(charset.unwrap_or("us-ascii").as_bytes()) {
            let (cow, _, _) = chr.decode(&out[initial_len..]);
            if let Cow::Owned(c) = cow {
                out.resize(initial_len, 0);
                out.extend(c.bytes());
            }
        }
    }
}

/// Returns whether a byte array slice could contain a MIME encoded-word.
///
/// This function could return a false positive, but never a false negative.
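///
/// For example (illustrative only; not compiled as a doctest, since this
/// function is private), the check only looks for a `?=` sequence:
///
/// ```ignore
/// assert!(maybe_contains_encoded_word(b"Subject: =?utf-8?q?caf=C3=A9?="));
/// assert!(maybe_contains_encoded_word(b"odd?=but not encoded"));  // false positive
/// assert!(!maybe_contains_encoded_word(b"Subject: plain text"));
/// ```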
fn maybe_contains_encoded_word(data: &[u8]) -> bool {
    for pos in memchr_iter(b'?', &data) {
        if pos + 1 < data.len() && data[pos + 1] == b'=' {
            return true;
        }
    }

    false
}

/// Decodes a MIME encoded-word represented as regex captures.
fn decode_encoded_word_from_captures(caps: &Captures) -> Vec<u8> {
    let charset = String::from_utf8_lossy(&caps[1]).to_lowercase();
    let encoding = match &caps[2] {
        b"q" | b"Q" => "quoted-printable",
        b"b" | b"B" => "base64",
        _ => "",
    };

    let mut data = Cow::from(&caps[3]);

    // Quoted-printable in encoded-words may use underscores for spaces.
    // Collect the positions first, so the immutable borrow held by the
    // iterator is released before mutating the data.
    if encoding == "quoted-printable" {
        let space_positions: Vec<_> = memchr_iter(b'_', &data).collect();
        for pos in space_positions {
            data.to_mut()[pos] = b' ';
        }
    }

    let mut decoded = Vec::new();
    decode_text_data_to_buf(&data, Some(encoding), Some(&charset), &mut decoded);
    decoded
}

/// Normalizes an email and parses header fields.
///
/// See the module documentation for what is involved in normalization.
///
/// Returns the normalized data and a map of header field names to values.
pub fn normalize_email(data: &[u8]) -> (Vec<u8>, HashMap<String, Vec<String>>) {
    lazy_static! {
        static ref ENCODED_WORD_REGEX: Regex =
            RegexBuilder::new(r"=\?([^?]+)\?([^?]+)\?([^? \t]+)\?=")
                .case_insensitive(true)
                .build().unwrap();
        static ref ENCODED_WORD_WSP_REGEX: Regex =
            RegexBuilder::new(r"\?([^?]+)\?=\s*=\?([^?]+)\?")
                .case_insensitive(true)
                .build().unwrap();
    }

    let parser = EmailParser::new(&data);

    let mut normalized = Vec::new();
    let mut fields = HashMap::new();

    for element in parser {
        match element {
            Element::HeaderField{data} => {
                let initial_len = normalized.len();

                if maybe_contains_encoded_word(&data) {
                    // First remove whitespace between consecutive encoded-words
                    // as required by RFC 2047, then decode.
                    let data = ENCODED_WORD_WSP_REGEX.replace_all(
                        &data, "?$1?==?$2?".as_bytes());
                    let data = ENCODED_WORD_REGEX.replace_all(
                        &data, decode_encoded_word_from_captures);
                    normalized.extend(data.as_ref());
                } else {
                    normalized.extend(&data);
                }

                // Populate the fields map.
                let field_str = String::from_utf8_lossy(&normalized[initial_len..]);
                let field_str = field_str.trim();
                let mut split = field_str.splitn(2, ':');
                let name = split.next().map(|n| n.to_lowercase()).unwrap();
                let value = split.next().unwrap_or("").to_owned();
                fields.entry(name).or_insert(Vec::new()).push(value);
            },
            Element::Body{data, encoding, content_type, charset} => {
                // Only decode text content.
                match content_type {
                    Some(ref content_type) if !content_type.starts_with("text/") => {
                        normalized.extend(&data);
                    },
                    _ => {
                        decode_text_data_to_buf(
                            &data,
                            encoding.as_ref().map(String::as_str),
                            charset.as_ref().map(String::as_str),
                            &mut normalized);
                    }
                };
            },
            Element::Verbatim{data} => {
                normalized.extend(&data);
            },
        }
    }

    (normalized, fields)
}
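
// A minimal test sketch (added as an illustration; it assumes the standard
// `cargo test` harness, and the sample message and test names are made up).
// The expected values follow directly from the definitions above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn slice_lines_yields_lines_with_their_terminators() {
        let mut lines = SliceLines{buf: b"a\r\nb\nc", last: 0};
        assert_eq!(lines.next(), Some(&b"a\r\n"[..]));
        assert_eq!(lines.next(), Some(&b"b\n"[..]));
        assert_eq!(lines.next(), Some(&b"c"[..]));
        assert_eq!(lines.next(), None);
    }

    #[test]
    fn plain_ascii_email_passes_through_unchanged() {
        let data = b"From: Alice <alice@example.org>\nSubject: Hello\n\nHello world\n";
        let (normalized, fields) = normalize_email(data);
        // No encoded-words, no content-transfer-encoding, ASCII charset:
        // the normalized output is byte-identical to the input.
        assert_eq!(normalized, data.to_vec());
        assert_eq!(fields["subject"][0].trim(), "Hello");
        assert!(fields.contains_key("from"));
    }
}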