diff options
author | Alexandros Frantzis <alf82@freemail.gr> | 2019-10-01 23:06:38 +0300 |
---|---|---|
committer | Alexandros Frantzis <alf82@freemail.gr> | 2019-10-01 23:36:26 +0300 |
commit | a054789ddb60ed1fab26e6d4e6bd36ed926273f1 (patch) | |
tree | bd3caad78a0e377816b78276889301d9276344b9 /src |
Initial public release
Diffstat (limited to 'src')
-rw-r--r-- | src/decode.rs | 224 | ||||
-rw-r--r-- | src/deliver.rs | 176 | ||||
-rw-r--r-- | src/lib.rs | 373 | ||||
-rw-r--r-- | src/normalize.rs | 477 | ||||
-rw-r--r-- | src/processing.rs | 95 | ||||
-rw-r--r-- | src/regex.rs | 114 |
6 files changed, 1459 insertions, 0 deletions
// diff --git a/src/decode.rs b/src/decode.rs new file mode 100644 index 0000000..8004f18 --- /dev/null +++ b/src/decode.rs @@ -0,0 +1,224 @@
// Copyright 2019 Alexandros Frantzis
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// SPDX-License-Identifier: MPL-2.0

//! Base64 and quoted-printable decoding.

/// The result type returned by the decoders in this module.
///
/// This is the same type as the crate-wide `Result` alias; it is spelled out
/// locally so that the module depends only on the standard library.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

const PAD: u8 = 64; // The pseudo-index of the PAD character.
const INV: u8 = 99; // An invalid index.

/// Maps every possible input byte to its base64 index, to `PAD` for `=`, or
/// to `INV` for bytes outside the base64 alphabet.
///
/// The table is a fixed-size array with one entry per `u8` value, so indexing
/// it with a byte can never go out of bounds.
static BASE64_INDICES: [u8; 256] = [
    //     0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    /* 0 */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* 1 */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* 2 */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, 62, INV, INV, INV, 63,
    /* 3 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, INV, INV, INV, PAD, INV, INV,
    /* 4 */ INV, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    /* 5 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, INV, INV, INV, INV, INV,
    /* 6 */ INV, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 7 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, INV, INV, INV, INV, INV,
    /* 8 */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* 9 */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* A */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* B */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* C */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* D */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* E */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
    /* F */ INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV, INV,
];

/// Decodes base64 encoded data, appending the decoded data to a Vec<u8>.
///
/// During decoding all line breaks and invalid characters are ignored.
/// If an error is encountered during decoding, the already decoded data in the
/// output buffer is left intact. It's up to the caller to deal with the partial
/// decoded data in case of failure.
///
/// # Errors
///
/// Returns an error if a 4-character quantum contains fewer than two data
/// characters, if a data character follows a padding character within the
/// same quantum, or if the input ends in the middle of a quantum.
pub fn base64_decode_into_buf(input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    // Number of characters (data or padding) collected for the current quantum.
    let mut num_chars = 0;
    // Number of data (non-padding) characters collected for the current quantum.
    let mut valid_chars = 0;
    // The (up to) 24 bits accumulated for the current quantum.
    let mut cur_triplet: u32 = 0;

    for &c in input {
        let ci = BASE64_INDICES[usize::from(c)];
        match ci {
            // rfc2045: All line breaks or other characters not
            // found in Table 1 must be ignored by decoding software.
            INV => continue,
            PAD => {}
            _ => {
                // Padding is only meaningful at the end of a quantum. If the
                // counters differ, a padding character has already been seen
                // in this quantum and the data bits would be misaligned.
                if valid_chars != num_chars {
                    return Err("Invalid base64 encoding".into());
                }
                valid_chars += 1;
            }
        }

        // PAD & 0x3f == 0, so padding contributes zero bits.
        cur_triplet = (cur_triplet << 6) | u32::from(ci & 0x3f);
        num_chars += 1;

        if num_chars == 4 {
            // The `as u8` casts intentionally keep only the low 8 bits.
            match valid_chars {
                2 => output.push((cur_triplet >> 16) as u8),
                3 => output.extend_from_slice(&[
                    (cur_triplet >> 16) as u8,
                    (cur_triplet >> 8) as u8,
                ]),
                4 => output.extend_from_slice(&[
                    (cur_triplet >> 16) as u8,
                    (cur_triplet >> 8) as u8,
                    cur_triplet as u8,
                ]),
                _ => return Err("Invalid base64 encoding".into()),
            }

            cur_triplet = 0;
            num_chars = 0;
            valid_chars = 0;
        }
    }

    // rfc2045: A full encoding quantum is always completed at the end of a body.
    if num_chars != 0 {
        return Err("Unpadded input".into());
    }

    Ok(())
}

/// Converts an ascii byte representing a hex digit to its numerical value.
///
/// Both lowercase and uppercase hex digits are accepted. Returns `None` for
/// any byte that is not a hex digit.
fn hexdigit_to_num(a: u8) -> Option<u8> {
    match a {
        b'0'..=b'9' => Some(a - b'0'),
        b'a'..=b'f' => Some(a - b'a' + 10),
        b'A'..=b'F' => Some(a - b'A' + 10),
        _ => None,
    }
}

/// Decodes quoted-printable encoded data, appending the decoded data to a
/// Vec<u8>.
///
/// A `=` followed by two hex digits is replaced by the byte they encode, and
/// a `=` followed by a line break (CRLF or LF) is a soft line break that is
/// dropped. All other input, including `=` sequences that are not valid
/// quoted-printable, is copied to the output unchanged.
///
/// This function currently never fails; the `Result` return type mirrors
/// `base64_decode_into_buf`.
pub fn qp_decode_into_buf(input: &[u8], output: &mut Vec<u8>) -> Result<()> {
    let mut iter = input.iter().copied().peekable();

    while let Some(c) = iter.next() {
        if c != b'=' {
            output.push(c);
            continue;
        }

        // At this point we have encountered a '=', so check
        // to see what follows.
        let first = match iter.next() {
            Some(first) => first,
            None => {
                // Last character in the input was an '=', just emit it.
                output.push(b'=');
                break;
            }
        };

        // A CRLF/LF after '=' marks a line continuation, and
        // is effectively dropped.
        match first {
            b'\n' => continue,
            b'\r' if iter.peek() == Some(&b'\n') => {
                iter.next();
                continue;
            }
            _ => {}
        }

        // A valid pair of hexdigits represents the raw byte value. The second
        // digit is only consumed if the whole pair is valid.
        let second_num = iter.peek().copied().and_then(hexdigit_to_num);
        if let (Some(first_num), Some(second_num)) = (hexdigit_to_num(first), second_num) {
            iter.next();
            output.push(first_num * 16 + second_num);
            continue;
        }

        // Emit the raw sequence if it's not one of the
        // special cases checked above.
        output.extend_from_slice(&[b'=', first]);
    }

    Ok(())
}

#[cfg(test)]
mod test_base64 {
    use super::base64_decode_into_buf;

    #[test]
    fn decodes_full_length() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("YWJj".as_bytes(), &mut decoded).is_ok());
        assert_eq!(decoded, &[b'a', b'b', b'c']);
    }

    #[test]
    fn decodes_with_two_padding() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("YWJjZA==".as_bytes(), &mut decoded).is_ok());
        assert_eq!(decoded, &[b'a', b'b', b'c', b'd']);
    }

    #[test]
    fn decodes_with_one_padding() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("YWJjZGU=".as_bytes(), &mut decoded).is_ok());
        assert_eq!(decoded, &[b'a', b'b', b'c', b'd', b'e']);
    }

    #[test]
    fn error_with_invalid_paddings() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("YWJj====".as_bytes(), &mut decoded).is_err());
        assert!(base64_decode_into_buf("YWJjZ===".as_bytes(), &mut decoded).is_err());
        assert!(base64_decode_into_buf("====".as_bytes(), &mut decoded).is_err());
    }

    #[test]
    fn error_with_misplaced_padding() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("Y=Jj".as_bytes(), &mut decoded).is_err());
        assert!(base64_decode_into_buf("=WJj".as_bytes(), &mut decoded).is_err());
        assert!(decoded.is_empty());
    }

    #[test]
    fn error_with_unpadded_input() {
        let mut decoded = Vec::new();
        assert!(base64_decode_into_buf("YWJjZA=".as_bytes(), &mut decoded).is_err());
    }
}

#[cfg(test)]
mod test_qp {
    use super::qp_decode_into_buf;

    #[test]
    fn decodes_byte() {
        let mut decoded = Vec::new();
        assert!(qp_decode_into_buf("a=62c=64".as_bytes(), &mut decoded).is_ok());
        assert_eq!(decoded, &[b'a', b'b', b'c', b'd']);
    }

    #[test]
    fn decodes_soft_break() {
        let mut decoded = Vec::new();
        assert!(qp_decode_into_buf("a=\r\nb=\nc".as_bytes(), &mut decoded).is_ok());
        assert_eq!(decoded, &[b'a', b'b', b'c']);
    }

    #[test]
    fn invalid_sequences_are_untouched() {
        let mut decoded = Vec::new();
        let invalid_sequence = "a=6t= c=".as_bytes();
        assert!(qp_decode_into_buf(invalid_sequence, &mut decoded).is_ok());
        assert_eq!(decoded, invalid_sequence);
    }
}
// diff --git a/src/deliver.rs
b/src/deliver.rs new file mode 100644 index 0000000..8ade10f --- /dev/null +++ b/src/deliver.rs @@ -0,0 +1,176 @@ +// Copyright 2019 Alexandros Frantzis +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. +// +// SPDX-License-Identifier: MPL-2.0 + +//! Email delivery functionality. + +use std::fs::{self, File}; +use std::io::ErrorKind; +use std::io::prelude::*; +use std::os::unix::prelude::*; +use std::path::{PathBuf, Path}; +use std::process; +use std::sync::{Arc, Mutex}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use crate::{DeliveryDurability, Result}; + +use gethostname::gethostname; +use libc; + +/// A generator for likely unique maildir email filenames. +/// +/// Using it as an iterator gets a filename that can be used in a maildir +/// and is likely to be unique. +pub struct EmailFilenameGenerator { + count: usize, + max_seen_unix_time: u64, + hostname: String, +} + +impl EmailFilenameGenerator { + pub fn new() -> Self { + // From https://cr.yp.to/proto/maildir.html: + // "To deal with invalid host names, replace / with \057 and : with \072" + let hostname = + gethostname() + .to_string_lossy() + .into_owned() + .replace("/", r"\057") + .replace(":", r"\072"); + + EmailFilenameGenerator{ + count: 0, + max_seen_unix_time: 0, + hostname: hostname, + } + } +} + +impl Iterator for EmailFilenameGenerator { + type Item = String; + + fn next(&mut self) -> Option<String> { + let unix_time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(); + let pid = process::id(); + + if self.max_seen_unix_time < unix_time { + self.max_seen_unix_time = unix_time; + self.count = 0; + } else { + self.count += 1; + } + + Some(format!("{}.{}_{}.{}", unix_time, pid, self.count, self.hostname)) + } +} + +/// A representation of a maildir. 
+pub struct Maildir { + root: PathBuf, + email_filename_gen: Arc<Mutex<EmailFilenameGenerator>>, +} + +impl Maildir { + /// Opens, or creates if it doesn't a exist, a maildir directory structure + /// at the specified path. + pub fn open_or_create( + mailbox: &Path, + email_filename_gen: Arc<Mutex<EmailFilenameGenerator>> + ) -> Result<Self> { + let root = PathBuf::from(mailbox); + for s in &["tmp", "new", "cur"] { + let path = root.join(&s); + fs::create_dir_all(&path)?; + } + + Ok(Maildir{root, email_filename_gen}) + } + + /// Delivers an email to the maildir by creating a new file with the email data, + /// and using the specified DeliveryDurability method. + pub fn deliver( + &self, + data: &[u8], + delivery_durability: DeliveryDurability + ) -> Result<PathBuf> { + loop { + let tmp_dir = self.root.join("tmp"); + let new_dir = self.root.join("new"); + + let tmp_email = self.write_email_to_dir(data, &tmp_dir)?; + let new_email = new_dir.join( + tmp_email.file_name().ok_or("")?.to_str().ok_or("")?); + + let result = fs::hard_link(&tmp_email, &new_email); + fs::remove_file(&tmp_email)?; + + match result { + Ok(_) => { + if delivery_durability == DeliveryDurability::FileAndDirSync { + File::open(&new_dir)?.sync_all()?; + File::open(&tmp_dir)?.sync_all()?; + } + return Ok(new_email); + }, + Err(ref err) if err.kind() == ErrorKind::AlreadyExists => {}, + Err(err) => return Err(err.into()), + } + } + } + + /// Delivers an email to the maildir by hard-linking with an existing file, + /// and using the specified DeliveryDurability method. 
+ pub fn deliver_with_hard_link( + &self, + src: &Path, + delivery_durability: DeliveryDurability + ) -> Result<PathBuf> { + loop { + let new_dir = self.root.join("new"); + let new_email = new_dir.join(self.next_email_filename_candidate()?); + + match fs::hard_link(&src, &new_email) { + Ok(_) => { + if delivery_durability == DeliveryDurability::FileAndDirSync { + File::open(&new_dir)?.sync_all()?; + } + return Ok(new_email); + }, + Err(ref err) if err.kind() == ErrorKind::AlreadyExists => {}, + Err(err) => return Err(err.into()), + } + } + } + + /// Writes email data to a new file in the specified directory. + fn write_email_to_dir(&self, data: &[u8], dir: &Path) -> Result<PathBuf> { + loop { + let email = dir.join(self.next_email_filename_candidate()?); + let result = fs::OpenOptions::new() + .create_new(true) + .write(true) + .custom_flags(libc::O_SYNC) + .open(&email); + + match result { + Ok(mut f) => { + f.write_all(&data)?; + return Ok(email); + }, + Err(ref err) if err.kind() == ErrorKind::AlreadyExists => {}, + Err(err) => return Err(err.into()), + } + } + } + + /// Gets the next email filename candidate from the EmailFilenameGenerator. + fn next_email_filename_candidate(&self) -> Result<String> { + let mut gen = self.email_filename_gen.lock().map_err(|_| "")?; + gen.next().ok_or("".into()) + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..bdb8c85 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,373 @@ +// Copyright 2019 Alexandros Frantzis +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. +// +// SPDX-License-Identifier: MPL-2.0 + +//! The mda crate provides a library for writing custom Mail Deliver Agents. It +//! supports local delivery to maildirs, access to normalized email byte +//! data for easier processing, and access to individual header fields. +//! +//! 
Email data normalization involves ensuring header fields are in single +//! lines, decoding text parts of the message that use some kind of transfer +//! encoding (e.g., base64), and converting all text to UTF-8. The original +//! (non-normalized) email data is used during delivery. +//! +//! This crate also exposes convenience methods for regular expression searching +//! and processing/filtering of emails. +//! +//! # Email construction +//! +//! The [Email struct](struct.Email.html) is the basic abstraction of the `mda` +//! crate. To construct an Email use the +//! [Email::from_stdin](struct.Email.html#method.from_stdin) or +//! [Email::from_vec](struct.Email.html#method.from_vec) method. +//! +//! ```no_run +//! use mda::Email; +//! let email = Email::from_stdin()?; +//! let email = Email::from_vec(vec![97, 98, 99])?; +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Email delivery +//! +//! Use the +//! [Email::deliver_to_maildir](struct.Email.html#method.deliver_to_maildir) +//! method to deliver the email to local maildir directories. Note that +//! the original (non-normalized) email data is used during delivery. +//! +//! ```no_run +//! use mda::Email; +//! let email = Email::from_stdin()?; +//! email.deliver_to_maildir("/my/maildir/path")?; +//! email.deliver_to_maildir("/my/other/maildir/path")?; +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Accessing email header fields +//! +//! Use the [Email::header_field](struct.Email.html#method.header_field) and +//! [Email::header_field_all_occurrences](struct.Email.html#method.header_field_all_occurrences) +//! methods to access the email header fields. Any MIME encoded words in the +//! header field values are decoded and the field value is converted to UTF-8. +//! +//! ```no_run +//! use mda::Email; +//! let email = Email::from_stdin()?; +//! let to = email.header_field("To").unwrap_or(""); +//! if to.contains("me@example.com") { +//! 
email.deliver_to_maildir("/my/maildir/path")?; +//! } +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Searching with regular expressions +//! +//! The [EmailRegex](trait.EmailRegex.html) trait provides convenience methods +//! for searching the header, the body or the whole email with regular +//! expressions. The convenience functions use case-insensitive, multi-line +//! search (`^` and `$` match beginning and end of lines). If the above don't +//! match your needs, or you require additional functionality, you can perform +//! manual regex search using the email data. +//! +//! ```no_run +//! use mda::{Email, EmailRegex}; +//! let email = Email::from_stdin()?; +//! if email.header().search(r"^To:.*me@example.com")? { +//! email.deliver_to_maildir("/my/maildir/path")?; +//! } +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Processing and filtering the email with external programs +//! +//! Use the [Email::filter](struct.Email.html#method.filter) and +//! [Email::from_stdin_filtered](struct.Email.html#method.from_stdin_filtered) +//! methods to filter the email, in both cases creating a new email. +//! +//! ```no_run +//! use mda::Email; +//! // Filtering directly from stdin is more efficient. +//! let email = Email::from_stdin_filtered(&["bogofilter", "-ep"])?; +//! let bogosity = email.header_field("X-Bogosity").unwrap_or(""); +//! if bogosity.contains("Spam, tests=bogofilter") { +//! email.deliver_to_maildir("/my/spam/path")?; +//! } +//! // We can also filter at any other time. +//! let email = email.filter(&["bogofilter", "-ep"])?; +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! To perform more general processing use the +//! [Email::process](struct.Email.html#method.process) +//! method: +//! +//! ```no_run +//! use mda::Email; +//! let email = Email::from_stdin()?; +//! let output = email.process(&["bogofilter"])?; +//! if let Some(0) = output.status.code() { +//! 
email.deliver_to_maildir("/my/spam/path")?; +//! } +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Access to byte data +//! +//! Use the [Email::header](struct.Email.html#method.header), +//! [Email::body](struct.Email.html#method.body), +//! [Email::data](struct.Email.html#method.data) methods to access the +//! normalized byte data of the header, body and whole email respectively. +//! +//! Normalization involves ensuring header fields are in single lines, decoding +//! text parts of the message that use some kind of transfer encoding (e.g., +//! base64), and converting all text to UTF-8 character encoding. +//! +//! If for some reason you need access to non-normalized data use +//! [Email::raw_data](struct.Email.html#method.raw_data). +//! +//! ```no_run +//! use std::str; +//! use mda::Email; +//! let email = Email::from_stdin()?; +//! let body_str = String::from_utf8_lossy(email.header()); +//! +//! if body_str.contains("FREE BEER") { +//! email.deliver_to_maildir("/my/spam/path")?; +//! } +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! ``` +//! +//! # Decide delivery durability vs speed trade-off +//! +//! Use the [Email::set_delivery_durability](struct.Email.html#method.set_delivery_durability) +//! to decide which [DeliveryDurability](enum.DeliveryDurability.html) method to use. +//! By default the most durable (but also slower) method is used. +//! +//! ```no_run +//! use mda::{Email, DeliveryDurability}; +//! let mut email = Email::from_stdin()?; +//! email.set_delivery_durability(DeliveryDurability::FileSyncOnly); +//! # Ok::<(), Box<dyn std::error::Error>>(()) +//! 
``` + +mod deliver; +mod regex; +mod processing; +mod normalize; +mod decode; + +use std::io; +use std::io::prelude::*; +use std::path::{PathBuf, Path}; +use std::sync:: {Arc, Mutex, RwLock}; +use std::collections::HashMap; + +use deliver::{Maildir, EmailFilenameGenerator}; +use normalize::normalize_email; + +pub use crate::regex::EmailRegex; + +pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>; + +fn find_empty_line(data: &[u8]) -> Option<usize> { + data.windows(2).position(|w| w[0]== b'\n' && (w[1] == b'\n' || w[1] == b'\r')) +} + +/// The method to use to try to guarantee durable email delivery. +#[derive(PartialEq, Copy, Clone)] +pub enum DeliveryDurability { + /// Perform both file and directory syncing during delivery. + /// This is the default delivery durability method. + FileAndDirSync, + /// Perform only file sync during delivery. This method is + /// potentially much faster, and is used by many existing + /// MDAs, but, depending on the used filesystem, may not + /// provide the required delivery durability guarantees. + FileSyncOnly, +} + +/// A representation of an email. +pub struct Email { + data: Vec<u8>, + normalized_data: Vec<u8>, + body_index: usize, + deliver_path: RwLock<Option<PathBuf>>, + fields: HashMap<String, Vec<String>>, + email_filename_gen: Arc<Mutex<EmailFilenameGenerator>>, + delivery_durability: DeliveryDurability, +} + +impl Email { + /// Creates an `Email` by reading data from stdin. + /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_stdin()?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn from_stdin() -> Result<Self> { + let stdin = io::stdin(); + let mut data = Vec::new(); + stdin.lock().read_to_end(&mut data)?; + Email::from_vec(data) + } + + /// Creates an `Email` by using data passed in a `Vec<u8>`. 
+ /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_vec(vec![1, 2, 3])?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn from_vec(data: Vec<u8>) -> Result<Self> { + let (normalized_data, fields) = normalize_email(&data); + let body_index = find_empty_line(&normalized_data).unwrap_or(normalized_data.len()); + let email_filename_gen = Arc::new(Mutex::new(EmailFilenameGenerator::new())); + + Ok( + Email{ + data: data, + normalized_data: normalized_data, + body_index: body_index, + deliver_path: RwLock::new(None), + fields: fields, + email_filename_gen: email_filename_gen, + delivery_durability: DeliveryDurability::FileAndDirSync, + } + ) + } + + /// Sets the durability method for delivery of this email. + /// + /// # Example + /// + /// ```no_run + /// # use mda::{DeliveryDurability, Email}; + /// let mut email = Email::from_stdin()?; + /// email.set_delivery_durability(DeliveryDurability::FileSyncOnly); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn set_delivery_durability(&mut self, delivery_durability: DeliveryDurability) { + self.delivery_durability = delivery_durability; + } + + /// Returns the value of a header field, if present. If a field occurs + /// multiple times, the value of the first occurrence is returned. + /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_stdin()?; + /// let to = email.header_field("To").unwrap_or(""); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn header_field(&self, name: &str) -> Option<&str> { + self.fields.get(&name.to_lowercase()).map(|v| v[0].as_str()) + } + + /// Returns the values from all occurrences of a header field, if present. 
+ /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_stdin()?; + /// if let Some(all_received) = email.header_field_all_occurrences("Received") { + /// // process all_received + /// } + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn header_field_all_occurrences(&self, name: &str) -> Option<&Vec<String>> { + self.fields.get(&name.to_lowercase()).map(|v| v) + } + + /// Delivers the email to the specified maildir. If the maildir isn't + /// present it is created. + /// + /// The first delivery of an email involves writing the email data to + /// the target file, whereas subsequent deliveries try to use a hard link + /// to the first delivery, falling back to a normal write if needed. + /// + /// The email is delivered durably by syncing both the file and the + /// associated directories (`DeliveryDurability::FileAndDirSync`), + /// unless a different durability method is specified with + /// `set_delivery_durability`. + /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_stdin()?; + /// email.deliver_to_maildir("/path/to/maildir/")?; + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn deliver_to_maildir(&self, path: impl AsRef<Path>) -> Result<PathBuf> { + self.deliver_to_maildir_path(path.as_ref()) + } + + fn deliver_to_maildir_path(&self, path: &Path) -> Result<PathBuf> { + let maildir = Maildir::open_or_create(&path, self.email_filename_gen.clone())?; + + if let Some(deliver_path) = self.deliver_path.read().unwrap().as_ref() { + let email_path_result = + maildir.deliver_with_hard_link( + deliver_path, + self.delivery_durability); + + if email_path_result.is_ok() { + return email_path_result; + } + } + + let email_path = maildir.deliver(&self.data, self.delivery_durability)?; + + *self.deliver_path.write().unwrap() = Some(email_path.clone()); + + Ok(email_path) + } + + /// Returns whether the email has been delivered to at least one maildir. 
+ /// + /// # Example + /// + /// ```no_run + /// # use mda::Email; + /// let email = Email::from_stdin()?; + /// if !email.has_been_delivered() { + /// email.deliver_to_maildir("/fallback/maildir/")?; + /// } + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn has_been_delivered(&self) -> bool { + self.deliver_path.read().unwrap().is_some() + } + + /// Provides access to the normalized email byte data. + pub fn data(&self) -> &[u8] { + &self.normalized_data + } + + /// Provides access to the normalized email header byte data. + pub fn header(&self) -> &[u8] { + &self.normalized_data[..self.body_index] + } + + /// Provides access to the normalized email body byte data. + pub fn body(&self) -> &[u8] { + &self.normalized_data[self.body_index..] + } + + /// Provides access to the raw (non-normalized) email byte data. + pub fn raw_data(&self) -> &[u8] { + &self.data + } +} diff --git a/src/normalize.rs b/src/normalize.rs new file mode 100644 index 0000000..7c8487d --- /dev/null +++ b/src/normalize.rs @@ -0,0 +1,477 @@ +// Copyright 2019 Alexandros Frantzis +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. +// +// SPDX-License-Identifier: MPL-2.0 + +//! Normalization of email data for easier processing. +//! +//! Normalization includes: +//! +//! * Placing multi-line header fields on a single line +//! * Decoding base64 or quoted-printable encoded text data, including +//! MIME encoded-words in the header. +//! * Converting all text data to UTF-8. 
+ +use ::regex::bytes::{RegexBuilder, Regex, Captures}; +use std::collections::HashMap; +use std::iter::Peekable; +use memchr::{memchr, memchr_iter}; +use charset::Charset; +use std::borrow::Cow; +use lazy_static::lazy_static; + +use crate::decode::{base64_decode_into_buf, qp_decode_into_buf}; + +/// An element recognized by the [EmailParser](struct.EmailParser.html). +enum Element { + HeaderField{data: Vec<u8>}, + Body{ + data: Vec<u8>, + encoding: Option<String>, + content_type: Option<String>, + charset: Option<String> + }, + Verbatim{data: Vec<u8>}, +} + +/// Information about a part in a multi-part email message. +/// The top-level is also considered a part. +struct Part { + encoding: Option<String>, + content_type: Option<String>, + charset: Option<String>, + subpart_boundary: Option<Vec<u8>>, +} + +impl Part { + fn new() -> Self { + Part{ + encoding: None, + content_type: None, + charset: None, + subpart_boundary: None, + } + } +} + +/// Iterator for the lines contained in a slice of [u8]. +pub struct SliceLines<'a> { + buf: &'a [u8], + last: usize, +} + +impl<'a> Iterator for SliceLines<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<&'a [u8]> { + match memchr(b'\n', &self.buf[self.last..]) { + Some(m) => { + let line = &self.buf[self.last..=(self.last + m)]; + self.last = self.last + m + 1; + Some(line) + }, + None => { + let line = &self.buf[self.last..]; + if line.is_empty() { + None + } else { + self.last = self.buf.len(); + Some(line) + } + } + } + } +} + +/// A parser for the elements contained in an email. +/// +/// The parsed elements are accessible by iterating over the parser. +/// +/// Every line in the email is contained in a MIME part (which itself may be +/// nested in another part). The top level of the email is also considered +/// to be a part for convenience of processing. +struct EmailParser<'a> { + lines: Peekable<SliceLines<'a>>, + // The stack of nested parts the line we are processing is contained in. 
+ part_stack: Vec<Part>, + // Whether we currently parsing header lines. + in_header: bool, + // The active multi-part boundary. + active_boundary: Vec<u8>, + content_encoding_regex: Regex, + content_type_regex: Regex, + boundary_regex: Regex, +} + +impl<'a> EmailParser<'a> { + fn new(buf: &'a [u8]) -> Self { + let content_encoding_regex = + RegexBuilder::new(r"Content-Transfer-Encoding:\s*([[:alnum:]-]+)") + .case_insensitive(true) + .build().unwrap(); + let content_type_regex = + RegexBuilder::new(r#"^Content-Type:\s*([^;]+)\s*(?:;\s*charset\s*=\s*"?([[:alnum:]_:\-\.]+))?"?"#) + .case_insensitive(true) + .build().unwrap(); + + let boundary_regex = + RegexBuilder::new(r#"^Content-Type:\s*multipart/.*boundary\s*=\s*"?([[:alnum:]'_,/:=\(\)\+\-\.\?]+)"?"#) + .case_insensitive(true) + .build().unwrap(); + + EmailParser{ + lines: SliceLines{buf, last: 0}.peekable(), + // All emails have the top-level part. + part_stack: vec![Part::new()], + in_header: true, + active_boundary: Vec::new(), + content_encoding_regex: content_encoding_regex, + content_type_regex: content_type_regex, + boundary_regex: boundary_regex, + } + } + + // Returns the content type of the active part. + fn active_content_type(&self) -> Option<String> { + self.part_stack.last()?.content_type.clone() + } + + // Returns the encoding of the active part. + fn active_encoding(&self) -> Option<String> { + self.part_stack.last()?.encoding.clone() + } + + // Returns the charset of the active part. + fn active_charset(&self) -> Option<String> { + self.part_stack.last()?.charset.clone() + } + + fn begin_part(&mut self) { + let part = self.part_stack.last().unwrap(); + + // We need to differentiate between the first and subsequent parts in a + // multipart message. The first part creates a new subpart in the + // part_stack... 
+ if part.subpart_boundary.as_ref().is_some() && + part.subpart_boundary.as_ref().unwrap() == &self.active_boundary { + self.part_stack.push(Part::new()) + } else { + // ...whereas subsequent sibling parts just replace the existing + // part in the stack. + let part = self.part_stack.last_mut().unwrap(); + *part = Part::new(); + } + } + + fn end_part(&mut self) { + self.part_stack.pop(); + if let Some(part) = self.part_stack.last_mut() { + part.subpart_boundary = None; + } + for p in self.part_stack.iter().rev() { + if let Some(b) = &p.subpart_boundary { + self.active_boundary = b.clone(); + } + } + } + + fn update_active_part_from_header_field(&mut self, field: &[u8]) { + let mut part = self.part_stack.last_mut().unwrap(); + + if let Some(captures) = self.content_encoding_regex.captures(&field) { + let enc_bytes = captures.get(1).unwrap().as_bytes(); + part.encoding = Some(std::str::from_utf8(&enc_bytes).unwrap().to_lowercase()); + } else if let Some(captures) = self.boundary_regex.captures(&field) { + part.subpart_boundary = Some(captures.get(1).unwrap().as_bytes().to_vec()); + self.active_boundary = part.subpart_boundary.as_ref().unwrap().clone(); + } + else if let Some(captures) = self.content_type_regex.captures(&field) { + let type_bytes = captures.get(1).unwrap().as_bytes(); |