diff options
Diffstat (limited to 'src/normalize.rs')
-rw-r--r-- | src/normalize.rs | 477 |
1 files changed, 477 insertions, 0 deletions
diff --git a/src/normalize.rs b/src/normalize.rs new file mode 100644 index 0000000..7c8487d --- /dev/null +++ b/src/normalize.rs @@ -0,0 +1,477 @@ +// Copyright 2019 Alexandros Frantzis +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. +// +// SPDX-License-Identifier: MPL-2.0 + +//! Normalization of email data for easier processing. +//! +//! Normalization includes: +//! +//! * Placing multi-line header fields on a single line +//! * Decoding base64 or quoted-printable encoded text data, including +//! MIME encoded-words in the header. +//! * Converting all text data to UTF-8. + +use ::regex::bytes::{RegexBuilder, Regex, Captures}; +use std::collections::HashMap; +use std::iter::Peekable; +use memchr::{memchr, memchr_iter}; +use charset::Charset; +use std::borrow::Cow; +use lazy_static::lazy_static; + +use crate::decode::{base64_decode_into_buf, qp_decode_into_buf}; + +/// An element recognized by the [EmailParser](struct.EmailParser.html). +enum Element { + HeaderField{data: Vec<u8>}, + Body{ + data: Vec<u8>, + encoding: Option<String>, + content_type: Option<String>, + charset: Option<String> + }, + Verbatim{data: Vec<u8>}, +} + +/// Information about a part in a multi-part email message. +/// The top-level is also considered a part. +struct Part { + encoding: Option<String>, + content_type: Option<String>, + charset: Option<String>, + subpart_boundary: Option<Vec<u8>>, +} + +impl Part { + fn new() -> Self { + Part{ + encoding: None, + content_type: None, + charset: None, + subpart_boundary: None, + } + } +} + +/// Iterator for the lines contained in a slice of [u8]. +pub struct SliceLines<'a> { + buf: &'a [u8], + last: usize, +} + +impl<'a> Iterator for SliceLines<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<&'a [u8]> { + match memchr(b'\n', &self.buf[self.last..]) { + Some(m) => { + let line = &self.buf[self.last..=(self.last + m)]; + self.last = self.last + m + 1; + Some(line) + }, + None => { + let line = &self.buf[self.last..]; + if line.is_empty() { + None + } else { + self.last = self.buf.len(); + Some(line) + } + } + } + } +} + +/// A parser for the elements contained in an email. +/// +/// The parsed elements are accessible by iterating over the parser. +/// +/// Every line in the email is contained in a MIME part (which itself may be +/// nested in another part). The top level of the email is also considered +/// to be a part for convenience of processing. +struct EmailParser<'a> { + lines: Peekable<SliceLines<'a>>, + // The stack of nested parts the line we are processing is contained in. + part_stack: Vec<Part>, + // Whether we currently parsing header lines. + in_header: bool, + // The active multi-part boundary. + active_boundary: Vec<u8>, + content_encoding_regex: Regex, + content_type_regex: Regex, + boundary_regex: Regex, +} + +impl<'a> EmailParser<'a> { + fn new(buf: &'a [u8]) -> Self { + let content_encoding_regex = + RegexBuilder::new(r"Content-Transfer-Encoding:\s*([[:alnum:]-]+)") + .case_insensitive(true) + .build().unwrap(); + let content_type_regex = + RegexBuilder::new(r#"^Content-Type:\s*([^;]+)\s*(?:;\s*charset\s*=\s*"?([[:alnum:]_:\-\.]+))?"?"#) + .case_insensitive(true) + .build().unwrap(); + + let boundary_regex = + RegexBuilder::new(r#"^Content-Type:\s*multipart/.*boundary\s*=\s*"?([[:alnum:]'_,/:=\(\)\+\-\.\?]+)"?"#) + .case_insensitive(true) + .build().unwrap(); + + EmailParser{ + lines: SliceLines{buf, last: 0}.peekable(), + // All emails have the top-level part. + part_stack: vec![Part::new()], + in_header: true, + active_boundary: Vec::new(), + content_encoding_regex: content_encoding_regex, + content_type_regex: content_type_regex, + boundary_regex: boundary_regex, + } + } + + // Returns the content type of the active part. + fn active_content_type(&self) -> Option<String> { + self.part_stack.last()?.content_type.clone() + } + + // Returns the encoding of the active part. + fn active_encoding(&self) -> Option<String> { + self.part_stack.last()?.encoding.clone() + } + + // Returns the charset of the active part. + fn active_charset(&self) -> Option<String> { + self.part_stack.last()?.charset.clone() + } + + fn begin_part(&mut self) { + let part = self.part_stack.last().unwrap(); + + // We need to differentiate between the first and subsequent parts in a + // multipart message. The first part creates a new subpart in the + // part_stack... + if part.subpart_boundary.as_ref().is_some() && + part.subpart_boundary.as_ref().unwrap() == &self.active_boundary { + self.part_stack.push(Part::new()) + } else { + // ...whereas subsequent sibling parts just replace the existing + // part in the stack. + let part = self.part_stack.last_mut().unwrap(); + *part = Part::new(); + } + } + + fn end_part(&mut self) { + self.part_stack.pop(); + if let Some(part) = self.part_stack.last_mut() { + part.subpart_boundary = None; + } + for p in self.part_stack.iter().rev() { + if let Some(b) = &p.subpart_boundary { + self.active_boundary = b.clone(); + } + } + } + + fn update_active_part_from_header_field(&mut self, field: &[u8]) { + let mut part = self.part_stack.last_mut().unwrap(); + + if let Some(captures) = self.content_encoding_regex.captures(&field) { + let enc_bytes = captures.get(1).unwrap().as_bytes(); + part.encoding = Some(std::str::from_utf8(&enc_bytes).unwrap().to_lowercase()); + } else if let Some(captures) = self.boundary_regex.captures(&field) { + part.subpart_boundary = Some(captures.get(1).unwrap().as_bytes().to_vec()); + self.active_boundary = part.subpart_boundary.as_ref().unwrap().clone(); + } + else if let Some(captures) = self.content_type_regex.captures(&field) { + let type_bytes = captures.get(1).unwrap().as_bytes(); + part.content_type = Some(std::str::from_utf8(&type_bytes).unwrap().to_lowercase()); + if let Some(charset) = captures.get(2) { + part.charset = Some(std::str::from_utf8(charset.as_bytes()).unwrap().to_lowercase()); + } + } + } +} + +/// Removes newline characters from the end of a byte vector. +fn vec_trim_end_newline(line: &mut Vec<u8>) { + while let Some(&b) = line.last() { + if b != b'\n' && b != b'\r' { + break; + } + line.pop(); + } +} + +/// Returns a new slice not including any newline characters from the +/// end of an existing slice. +fn slice_trim_end_newline(mut line: &[u8]) -> &[u8] { + while let Some(&b) = line.last() { + if b != b'\n' && b != b'\r' { + break; + } + line = &line[..line.len()-1]; + } + line +} + +/// Returns whether a line of bytes is a multi-part boundary line for the +/// specified boundary string. +fn is_boundary_line(line: &[u8], boundary: &[u8]) -> bool { + line.starts_with(b"--") && + !boundary.is_empty() && + line[2..].starts_with(&boundary) +} + + +impl Iterator for EmailParser<'_> { + type Item = Element; + + fn next(&mut self) -> Option<Element> { + let mut inprogress = Vec::new(); + let mut element = None; + + // Loop until we recognize an element (or reach end of input). + loop { + let line = match self.lines.next() { + Some(l) => l, + None => break, + }; + + if self.in_header { + match line[0] { + // Empty lines denote the end of header. + b'\n' | b'\r' => { + self.in_header = false; + element = Some(Element::Verbatim{data: line.to_vec()}); + break; + }, + // Lines beginning with are continuation lines. + b' ' | b'\t' => { + vec_trim_end_newline(&mut inprogress); + inprogress.extend(line); + }, + _ => inprogress = line.to_vec(), + }; + + // If the next line is not a continuation line, break + // to emit the current header field. + if let Some(next_line) = self.lines.peek() { + if next_line[0] != b' ' && next_line[0] != b'\t' { + break; + } + } + + continue; + } + + if is_boundary_line(&line, &self.active_boundary) { + if slice_trim_end_newline(&line).ends_with(b"--") { + self.end_part(); + } else { + self.begin_part(); + // After a boundary start line we expect a header. + self.in_header = true; + } + + element = Some(Element::Verbatim{data: line.to_vec()}); + break; + } + + // If we reached this point, this line is a body line. Append + // it to the inprogress data. + inprogress.extend(line); + + // If next line is a boundary line, break to emit the current + // body. + if let Some(next_line) = self.lines.peek() { + if is_boundary_line(next_line, &self.active_boundary) { + break; + } + } + } + + // Breaking out the loop happens in three cases: + // 1. End of input + // 2. We have recognized a verbatim element. + // 3. We have inprogress data that we have recognized as a header field + // or body. + + // If we have inprogress data, emit it as header or body. + if !inprogress.is_empty() { + // We shouldn't have set an element at this point, since we have + // inprogress data, and this would lead to loss of data. + assert!(element.is_none()); + + if self.in_header { + element = Some(Element::HeaderField{data: inprogress}); + } else { + element = Some( + Element::Body{ + data: inprogress, + encoding: self.active_encoding(), + content_type: self.active_content_type(), + charset: self.active_charset(), + } + ); + } + } + + if let Some(Element::HeaderField{data: field}) = element.as_ref() { + self.update_active_part_from_header_field(&field); + } + + element + } +} + +/// Decodes a byte array slice with the specified content encoding and charset +/// to utf-8 byte data, appending to the specified Vec<u8>. +fn decode_text_data_to_buf( + data: &[u8], + encoding: Option<&str>, + charset: Option<&str>, + mut out: &mut Vec<u8>, +) { + let should_decode = encoding.is_some(); + let mut should_convert_charset = true; + let initial_len = out.len(); + + if should_decode { + let result = match encoding.unwrap().as_ref() { + "base64" => base64_decode_into_buf(&data, &mut out), + "quoted-printable" => qp_decode_into_buf(&data, &mut out), + "8bit" | "binary" => { out.extend(data); Ok(()) }, + _ => Err("unknown encoding".into()), + }; + + if result.is_err() { + out.resize(initial_len, 0); + should_convert_charset = false; + } + } + + if out.len() == initial_len { + out.extend(data); + } + + if should_convert_charset { + if let Some(chr) = Charset::for_label(charset.unwrap_or("us-ascii").as_bytes()) { + let (cow, _, _) = chr.decode(&out[initial_len..]); + if let Cow::Owned(c) = cow { + out.resize(initial_len, 0); + out.extend(c.bytes()); + } + } + } +} + +/// Returns whether a byte array slice could contain an MIME encoded-word. +/// +/// This function could return a false positive, but never a false negative. +fn maybe_contains_encoded_word(data: &[u8]) -> bool { + for spacepos in memchr_iter(b'?', &data) { + if spacepos + 1 < data.len() && data[spacepos + 1] == b'=' { + return true; + } + } + + false +} + +/// Decodes a MIME encoded-word represented as regex captures. +fn decode_encoded_word_from_captures(caps: &Captures) -> Vec<u8> { + let charset = String::from_utf8_lossy(&caps[1]).to_lowercase(); + let encoding = match &caps[2] { + b"q" | b"Q" => "quoted-printable", + b"b" | b"B" => "base64", + _ => "", + }; + let mut data = Cow::from(&caps[3]); + + // Quoted-printable in encoded-words may use underscores for spaces. + if encoding == "quoted-printable" { + let space_positions: Vec<_> = memchr_iter(b'_', &data).collect(); + for pos in space_positions { + data.to_mut()[pos] = b' '; + } + } + + let mut decoded = Vec::new(); + decode_text_data_to_buf(&data, Some(encoding), Some(&charset), &mut decoded); + decoded +} + +/// Normalizes an email and parses header fields. +/// +/// See module documentation about what is involved in normalization. +/// +/// Returns the normalized data and a map of header field names to values. +pub fn normalize_email(data: &[u8]) -> (Vec<u8>, HashMap<String, Vec<String>>) { + lazy_static! { + static ref ENCODED_WORD_REGEX: Regex = + RegexBuilder::new(r"=\?([^?]+)\?([^?]+)\?([^? \t]+)\?=") + .case_insensitive(true) + .build().unwrap(); + static ref ENCODED_WORD_WSP_REGEX: Regex = + RegexBuilder::new(r"\?([^?]+)\?=\s*=\?([^?]+)\?") + .case_insensitive(true) + .build().unwrap(); + } + let parser = EmailParser::new(&data); + let mut normalized = Vec::new(); + let mut fields = HashMap::new(); + + for element in parser { + match element { + Element::HeaderField{data} => { + let initial_len = normalized.len(); + + if maybe_contains_encoded_word(&data) { + // First remove whitespace between consecutive encoded-words + // as required by the RFC, then decode. + let data = ENCODED_WORD_WSP_REGEX.replace_all( + &data, "?$1?==?$2?".as_bytes()); + let data = ENCODED_WORD_REGEX.replace_all( + &data, decode_encoded_word_from_captures); + normalized.extend(data.as_ref()); + } else { + normalized.extend(&data); + } + + // Populate the fields map. + let field_str = String::from_utf8_lossy(&normalized[initial_len..]); + let field_str = field_str.trim(); + let mut split = field_str.splitn(2, ':'); + let name = split.next().map(|n| n.to_lowercase()).unwrap(); + let value = split.next().unwrap_or("").to_owned(); + fields.entry(name).or_insert(Vec::new()).push(value); + }, + Element::Body{data, encoding, content_type, charset} => { + // Only decode text content. + match content_type { + Some(ref content_type) if !content_type.starts_with("text/") => { + normalized.extend(&data); + }, + _ => { + decode_text_data_to_buf( + &data, + encoding.as_ref().map(String::as_str), + charset.as_ref().map(String::as_str), + &mut normalized); + } + }; + }, + Element::Verbatim{data} => { + normalized.extend(&data); + }, + } + } + + (normalized, fields) +} |