From 5d3418ab24cb0d532d863f16853c93dd757a6217 Mon Sep 17 00:00:00 2001 From: Matthias Beyer Date: Sat, 5 Oct 2019 13:54:18 +0200 Subject: Move header related code to header module Signed-off-by: Matthias Beyer --- src/header.rs | 347 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 342 +-------------------------------------------------------- 2 files changed, 349 insertions(+), 340 deletions(-) create mode 100644 src/header.rs diff --git a/src/header.rs b/src/header.rs new file mode 100644 index 0000000..3447156 --- /dev/null +++ b/src/header.rs @@ -0,0 +1,347 @@ +use charset::decode_latin1; +use charset::Charset; + +use error::*; +use util::is_boundary; +use util::find_from; + +/// A struct that represents a single header in the message. +/// It holds slices into the raw byte array passed to parse_mail, and so the +/// lifetime of this struct must be contained within the lifetime of the raw +/// input. There are additional accessor functions on this struct to extract +/// the data as Rust strings. +#[derive(Debug)] +pub struct MailHeader<'a> { + key: &'a [u8], + value: &'a [u8], +} + +impl<'a> MailHeader<'a> { + /// Get the name of the header. Note that header names are case-insensitive. + pub fn get_key(&self) -> Result { + Ok(decode_latin1(self.key).into_owned()) + } + + fn decode_word(&self, encoded: &str) -> Option { + let ix_delim1 = encoded.find('?')?; + let ix_delim2 = find_from(encoded, ix_delim1 + 1, "?")?; + + let charset = &encoded[0..ix_delim1]; + let transfer_coding = &encoded[ix_delim1 + 1..ix_delim2]; + let input = &encoded[ix_delim2 + 1..]; + + let decoded = match transfer_coding { + "B" | "b" => base64::decode(input.as_bytes()).ok()?, + "Q" | "q" => { + // The quoted_printable module does a trim_end on the input, so if + // that affects the output we should save and restore the trailing + // whitespace + let to_decode = input.replace("_", " "); + let trimmed = to_decode.trim_end(); + let mut d = quoted_printable::decode(&trimmed, quoted_printable::ParseMode::Robust); + if d.is_ok() && to_decode.len() != trimmed.len() { + d.as_mut() + .unwrap() + .extend_from_slice(to_decode[trimmed.len()..].as_bytes()); + } + d.ok()? + } + _ => return None, + }; + let charset = Charset::for_label_no_replacement(charset.as_bytes())?; + let (cow, _) = charset.decode_without_bom_handling(&decoded); + Some(cow.into_owned()) + } + + /// Get the value of the header. Any sequences of newlines characters followed + /// by whitespace are collapsed into a single space. In effect, header values + /// wrapped across multiple lines are compacted back into one line, while + /// discarding the extra whitespace required by the MIME format. Additionally, + /// any quoted-printable words in the value are decoded. + /// + /// # Examples + /// ``` + /// use mailparse::parse_header; + /// let (parsed, _) = parse_header(b"Subject: =?iso-8859-1?Q?=A1Hola,_se=F1or!?=").unwrap(); + /// assert_eq!(parsed.get_key().unwrap(), "Subject"); + /// assert_eq!(parsed.get_value().unwrap(), "\u{a1}Hola, se\u{f1}or!"); + /// ``` + pub fn get_value(&self) -> Result { + let mut result = String::new(); + let chars = decode_latin1(self.value); + let mut lines = chars.lines(); + let mut add_space = false; + while let Some(line) = lines.next().map(str::trim_start) { + if add_space { + result.push(' '); + } + add_space = true; + + let mut ix_search = 0; + loop { + match find_from(line, ix_search, "=?") { + Some(v) => { + let ix_begin = v + 2; + if !is_boundary(line, ix_begin.checked_sub(3)) { + result.push_str(&line[ix_search..ix_begin]); + ix_search = ix_begin; + continue; + } + result.push_str(&line[ix_search..ix_begin - 2]); + let mut ix_end_search = ix_begin; + loop { + match find_from(line, ix_end_search, "?=") { + Some(ix_end) => { + if !is_boundary(line, ix_end.checked_add(2)) { + ix_end_search = ix_end + 2; + continue; + } + match self.decode_word(&line[ix_begin..ix_end]) { + Some(v) => { + result.push_str(&v); + add_space = false; + } + None => result.push_str(&line[ix_begin - 2..ix_end + 2]), + }; + ix_search = ix_end; + } + None => { + result.push_str(&"=?"); + ix_search = ix_begin - 2; + } + }; + break; + } + ix_search += 2; + continue; + } + None => { + result.push_str(&line[ix_search..]); + break; + } + }; + } + } + Ok(result) + } +} + +#[derive(Debug)] +enum HeaderParseState { + Initial, + Key, + PreValue, + Value, + ValueNewline, +} + +/// Parse a single header from the raw data given. +/// This function takes raw byte data, and starts parsing it, expecting there +/// to be a MIME header key-value pair right at the beginning. It parses that +/// header and returns it, along with the index at which the next header is +/// expected to start. If you just want to parse a single header, you can ignore +/// the second component of the tuple, which is the index of the next header. +/// Error values are returned if the data could not be successfully interpreted +/// as a MIME key-value pair. +/// +/// # Examples +/// ``` +/// use mailparse::parse_header; +/// let (parsed, _) = parse_header(concat!( +/// "Subject: Hello, sir,\n", +/// " I am multiline\n", +/// "Next:Header").as_bytes()) +/// .unwrap(); +/// assert_eq!(parsed.get_key().unwrap(), "Subject"); +/// assert_eq!(parsed.get_value().unwrap(), "Hello, sir, I am multiline"); +/// ``` +pub fn parse_header(raw_data: &[u8]) -> Result<(MailHeader, usize), MailParseError> { + let mut it = raw_data.iter(); + let mut ix = 0; + let mut c = match it.next() { + None => return Err(MailParseError::Generic("Empty string provided")), + Some(v) => *v, + }; + + let mut ix_key_end = None; + let mut ix_value_start = 0; + let mut ix_value_end = 0; + + let mut state = HeaderParseState::Initial; + loop { + match state { + HeaderParseState::Initial => { + if c == b' ' { + return Err(MailParseError::Generic( + "Header cannot start with a space; it is \ + likely an overhanging line from a \ + previous header", + )); + }; + state = HeaderParseState::Key; + continue; + } + HeaderParseState::Key => { + if c == b':' { + ix_key_end = Some(ix); + state = HeaderParseState::PreValue; + } else if c == b'\n' { + return Err(MailParseError::Generic("Unexpected newline in header key")); + } + } + HeaderParseState::PreValue => { + if c != b' ' { + ix_value_start = ix; + ix_value_end = ix; + state = HeaderParseState::Value; + continue; + } + } + HeaderParseState::Value => { + if c == b'\n' { + state = HeaderParseState::ValueNewline; + } else { + ix_value_end = ix + 1; + } + } + HeaderParseState::ValueNewline => { + if c == b' ' || c == b'\t' { + state = HeaderParseState::Value; + continue; + } else { + break; + } + } + } + ix += 1; + c = match it.next() { + None => break, + Some(v) => *v, + }; + } + match ix_key_end { + Some(v) => Ok(( + MailHeader { + key: &raw_data[0..v], + value: &raw_data[ix_value_start..ix_value_end], + }, + ix, + )), + + None => Err(MailParseError::Generic( + "Unable to determine end of the header key component", + )), + } +} + +/// A trait that is implemented by the [MailHeader] slice. These functions are +/// also available on Vec which is returned by the parse_headers +/// function. It provides a map-like interface to look up header values by their +/// name. +pub trait MailHeaderMap { + /// Look through the list of headers and return the value of the first one + /// that matches the provided key. It returns Ok(None) if the no matching + /// header was found. Header names are matched case-insensitively. + /// + /// # Examples + /// ``` + /// use mailparse::{parse_mail, MailHeaderMap}; + /// let headers = parse_mail(concat!( + /// "Subject: Test\n", + /// "\n", + /// "This is a test message").as_bytes()) + /// .unwrap().headers; + /// assert_eq!(headers.get_first_value("Subject").unwrap(), Some("Test".to_string())); + /// ``` + fn get_first_value(&self, key: &str) -> Result, MailParseError>; + + /// Look through the list of headers and return the values of all headers + /// matching the provided key. Returns an empty vector if no matching headers + /// were found. The order of the returned values is the same as the order + /// of the matching headers in the message. Header names are matched + /// case-insensitively. + /// + /// # Examples + /// ``` + /// use mailparse::{parse_mail, MailHeaderMap}; + /// let headers = parse_mail(concat!( + /// "Key: Value1\n", + /// "Key: Value2").as_bytes()) + /// .unwrap().headers; + /// assert_eq!(headers.get_all_values("Key").unwrap(), + /// vec!["Value1".to_string(), "Value2".to_string()]); + /// ``` + fn get_all_values(&self, key: &str) -> Result, MailParseError>; +} + +impl<'a> MailHeaderMap for [MailHeader<'a>] { + fn get_first_value(&self, key: &str) -> Result, MailParseError> { + for x in self { + if x.get_key()?.eq_ignore_ascii_case(key) { + return x.get_value().map(Some); + } + } + Ok(None) + } + + fn get_all_values(&self, key: &str) -> Result, MailParseError> { + let mut values: Vec = Vec::new(); + for x in self { + if x.get_key()?.eq_ignore_ascii_case(key) { + values.push(x.get_value()?); + } + } + Ok(values) + } +} + +/// Parses all the headers from the raw data given. +/// This function takes raw byte data, and starts parsing it, expecting there +/// to be zero or more MIME header key-value pair right at the beginning, +/// followed by two consecutive newlines (i.e. a blank line). It parses those +/// headers and returns them in a vector. The normal vector functions can be +/// used to access the headers linearly, or the MailHeaderMap trait can be used +/// to access them in a map-like fashion. Along with this vector, the function +/// returns the index at which the message body is expected to start. If you +/// just care about the headers, you can ignore the second component of the +/// returned tuple. +/// Error values are returned if there was some sort of parsing error. +/// +/// # Examples +/// ``` +/// use mailparse::{parse_headers, MailHeaderMap}; +/// let (headers, _) = parse_headers(concat!( +/// "Subject: Test\n", +/// "From: me@myself.com\n", +/// "To: you@yourself.com").as_bytes()) +/// .unwrap(); +/// assert_eq!(headers[1].get_key().unwrap(), "From"); +/// assert_eq!(headers.get_first_value("To").unwrap(), Some("you@yourself.com".to_string())); +/// ``` +pub fn parse_headers(raw_data: &[u8]) -> Result<(Vec, usize), MailParseError> { + let mut headers: Vec = Vec::new(); + let mut ix = 0; + loop { + if ix >= raw_data.len() { + break; + } else if raw_data[ix] == b'\n' { + ix += 1; + break; + } else if raw_data[ix] == b'\r' { + if ix + 1 < raw_data.len() && raw_data[ix + 1] == b'\n' { + ix += 2; + break; + } else { + return Err(MailParseError::Generic( + "Headers were followed by an unexpected lone \ + CR character!", + )); + } + } + let (header, ix_next) = parse_header(&raw_data[ix..])?; + headers.push(header); + ix += ix_next; + } + Ok((headers, ix)) +} + diff --git a/src/lib.rs b/src/lib.rs index e5c98c4..a81b512 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,351 +7,13 @@ use std::collections::BTreeMap; pub mod body; mod dateparse; mod error; +mod header; mod util; use body::Body; pub use dateparse::dateparse; pub use error::*; - -/// A struct that represents a single header in the message. -/// It holds slices into the raw byte array passed to parse_mail, and so the -/// lifetime of this struct must be contained within the lifetime of the raw -/// input. There are additional accessor functions on this struct to extract -/// the data as Rust strings. -#[derive(Debug)] -pub struct MailHeader<'a> { - key: &'a [u8], - value: &'a [u8], -} - -impl<'a> MailHeader<'a> { - /// Get the name of the header. Note that header names are case-insensitive. - pub fn get_key(&self) -> Result { - Ok(decode_latin1(self.key).into_owned()) - } - - fn decode_word(&self, encoded: &str) -> Option { - let ix_delim1 = encoded.find('?')?; - let ix_delim2 = find_from(encoded, ix_delim1 + 1, "?")?; - - let charset = &encoded[0..ix_delim1]; - let transfer_coding = &encoded[ix_delim1 + 1..ix_delim2]; - let input = &encoded[ix_delim2 + 1..]; - - let decoded = match transfer_coding { - "B" | "b" => base64::decode(input.as_bytes()).ok()?, - "Q" | "q" => { - // The quoted_printable module does a trim_end on the input, so if - // that affects the output we should save and restore the trailing - // whitespace - let to_decode = input.replace("_", " "); - let trimmed = to_decode.trim_end(); - let mut d = quoted_printable::decode(&trimmed, quoted_printable::ParseMode::Robust); - if d.is_ok() && to_decode.len() != trimmed.len() { - d.as_mut() - .unwrap() - .extend_from_slice(to_decode[trimmed.len()..].as_bytes()); - } - d.ok()? - } - _ => return None, - }; - let charset = Charset::for_label_no_replacement(charset.as_bytes())?; - let (cow, _) = charset.decode_without_bom_handling(&decoded); - Some(cow.into_owned()) - } - - /// Get the value of the header. Any sequences of newlines characters followed - /// by whitespace are collapsed into a single space. In effect, header values - /// wrapped across multiple lines are compacted back into one line, while - /// discarding the extra whitespace required by the MIME format. Additionally, - /// any quoted-printable words in the value are decoded. - /// - /// # Examples - /// ``` - /// use mailparse::parse_header; - /// let (parsed, _) = parse_header(b"Subject: =?iso-8859-1?Q?=A1Hola,_se=F1or!?=").unwrap(); - /// assert_eq!(parsed.get_key().unwrap(), "Subject"); - /// assert_eq!(parsed.get_value().unwrap(), "\u{a1}Hola, se\u{f1}or!"); - /// ``` - pub fn get_value(&self) -> Result { - let mut result = String::new(); - let chars = decode_latin1(self.value); - let mut lines = chars.lines(); - let mut add_space = false; - while let Some(line) = lines.next().map(str::trim_start) { - if add_space { - result.push(' '); - } - add_space = true; - - let mut ix_search = 0; - loop { - match find_from(line, ix_search, "=?") { - Some(v) => { - let ix_begin = v + 2; - if !is_boundary(line, ix_begin.checked_sub(3)) { - result.push_str(&line[ix_search..ix_begin]); - ix_search = ix_begin; - continue; - } - result.push_str(&line[ix_search..ix_begin - 2]); - let mut ix_end_search = ix_begin; - loop { - match find_from(line, ix_end_search, "?=") { - Some(ix_end) => { - if !is_boundary(line, ix_end.checked_add(2)) { - ix_end_search = ix_end + 2; - continue; - } - match self.decode_word(&line[ix_begin..ix_end]) { - Some(v) => { - result.push_str(&v); - add_space = false; - } - None => result.push_str(&line[ix_begin - 2..ix_end + 2]), - }; - ix_search = ix_end; - } - None => { - result.push_str(&"=?"); - ix_search = ix_begin - 2; - } - }; - break; - } - ix_search += 2; - continue; - } - None => { - result.push_str(&line[ix_search..]); - break; - } - }; - } - } - Ok(result) - } -} - -#[derive(Debug)] -enum HeaderParseState { - Initial, - Key, - PreValue, - Value, - ValueNewline, -} - -/// Parse a single header from the raw data given. -/// This function takes raw byte data, and starts parsing it, expecting there -/// to be a MIME header key-value pair right at the beginning. It parses that -/// header and returns it, along with the index at which the next header is -/// expected to start. If you just want to parse a single header, you can ignore -/// the second component of the tuple, which is the index of the next header. -/// Error values are returned if the data could not be successfully interpreted -/// as a MIME key-value pair. -/// -/// # Examples -/// ``` -/// use mailparse::parse_header; -/// let (parsed, _) = parse_header(concat!( -/// "Subject: Hello, sir,\n", -/// " I am multiline\n", -/// "Next:Header").as_bytes()) -/// .unwrap(); -/// assert_eq!(parsed.get_key().unwrap(), "Subject"); -/// assert_eq!(parsed.get_value().unwrap(), "Hello, sir, I am multiline"); -/// ``` -pub fn parse_header(raw_data: &[u8]) -> Result<(MailHeader, usize), MailParseError> { - let mut it = raw_data.iter(); - let mut ix = 0; - let mut c = match it.next() { - None => return Err(MailParseError::Generic("Empty string provided")), - Some(v) => *v, - }; - - let mut ix_key_end = None; - let mut ix_value_start = 0; - let mut ix_value_end = 0; - - let mut state = HeaderParseState::Initial; - loop { - match state { - HeaderParseState::Initial => { - if c == b' ' { - return Err(MailParseError::Generic( - "Header cannot start with a space; it is \ - likely an overhanging line from a \ - previous header", - )); - }; - state = HeaderParseState::Key; - continue; - } - HeaderParseState::Key => { - if c == b':' { - ix_key_end = Some(ix); - state = HeaderParseState::PreValue; - } else if c == b'\n' { - return Err(MailParseError::Generic("Unexpected newline in header key")); - } - } - HeaderParseState::PreValue => { - if c != b' ' { - ix_value_start = ix; - ix_value_end = ix; - state = HeaderParseState::Value; - continue; - } - } - HeaderParseState::Value => { - if c == b'\n' { - state = HeaderParseState::ValueNewline; - } else { - ix_value_end = ix + 1; - } - } - HeaderParseState::ValueNewline => { - if c == b' ' || c == b'\t' { - state = HeaderParseState::Value; - continue; - } else { - break; - } - } - } - ix += 1; - c = match it.next() { - None => break, - Some(v) => *v, - }; - } - match ix_key_end { - Some(v) => Ok(( - MailHeader { - key: &raw_data[0..v], - value: &raw_data[ix_value_start..ix_value_end], - }, - ix, - )), - - None => Err(MailParseError::Generic( - "Unable to determine end of the header key component", - )), - } -} - -/// A trait that is implemented by the [MailHeader] slice. These functions are -/// also available on Vec which is returned by the parse_headers -/// function. It provides a map-like interface to look up header values by their -/// name. -pub trait MailHeaderMap { - /// Look through the list of headers and return the value of the first one - /// that matches the provided key. It returns Ok(None) if the no matching - /// header was found. Header names are matched case-insensitively. - /// - /// # Examples - /// ``` - /// use mailparse::{parse_mail, MailHeaderMap}; - /// let headers = parse_mail(concat!( - /// "Subject: Test\n", - /// "\n", - /// "This is a test message").as_bytes()) - /// .unwrap().headers; - /// assert_eq!(headers.get_first_value("Subject").unwrap(), Some("Test".to_string())); - /// ``` - fn get_first_value(&self, key: &str) -> Result, MailParseError>; - - /// Look through the list of headers and return the values of all headers - /// matching the provided key. Returns an empty vector if no matching headers - /// were found. The order of the returned values is the same as the order - /// of the matching headers in the message. Header names are matched - /// case-insensitively. - /// - /// # Examples - /// ``` - /// use mailparse::{parse_mail, MailHeaderMap}; - /// let headers = parse_mail(concat!( - /// "Key: Value1\n", - /// "Key: Value2").as_bytes()) - /// .unwrap().headers; - /// assert_eq!(headers.get_all_values("Key").unwrap(), - /// vec!["Value1".to_string(), "Value2".to_string()]); - /// ``` - fn get_all_values(&self, key: &str) -> Result, MailParseError>; -} - -impl<'a> MailHeaderMap for [MailHeader<'a>] { - fn get_first_value(&self, key: &str) -> Result, MailParseError> { - for x in self { - if x.get_key()?.eq_ignore_ascii_case(key) { - return x.get_value().map(Some); - } - } - Ok(None) - } - - fn get_all_values(&self, key: &str) -> Result, MailParseError> { - let mut values: Vec = Vec::new(); - for x in self { - if x.get_key()?.eq_ignore_ascii_case(key) { - values.push(x.get_value()?); - } - } - Ok(values) - } -} - -/// Parses all the headers from the raw data given. -/// This function takes raw byte data, and starts parsing it, expecting there -/// to be zero or more MIME header key-value pair right at the beginning, -/// followed by two consecutive newlines (i.e. a blank line). It parses those -/// headers and returns them in a vector. The normal vector functions can be -/// used to access the headers linearly, or the MailHeaderMap trait can be used -/// to access them in a map-like fashion. Along with this vector, the function -/// returns the index at which the message body is expected to start. If you -/// just care about the headers, you can ignore the second component of the -/// returned tuple. -/// Error values are returned if there was some sort of parsing error. -/// -/// # Examples -/// ``` -/// use mailparse::{parse_headers, MailHeaderMap}; -/// let (headers, _) = parse_headers(concat!( -/// "Subject: Test\n", -/// "From: me@myself.com\n", -/// "To: you@yourself.com").as_bytes()) -/// .unwrap(); -/// assert_eq!(headers[1].get_key().unwrap(), "From"); -/// assert_eq!(headers.get_first_value("To").unwrap(), Some("you@yourself.com".to_string())); -/// ``` -pub fn parse_headers(raw_data: &[u8]) -> Result<(Vec, usize), MailParseError> { - let mut headers: Vec = Vec::new(); - let mut ix = 0; - loop { - if ix >= raw_data.len() { - break; - } else if raw_data[ix] == b'\n' { - ix += 1; - break; - } else if raw_data[ix] == b'\r' { - if ix + 1 < raw_data.len() && raw_data[ix + 1] == b'\n' { - ix += 2; - break; - } else { - return Err(MailParseError::Generic( - "Headers were followed by an unexpected lone \ - CR character!", - )); - } - } - let (header, ix_next) = parse_header(&raw_data[ix..])?; - headers.push(header); - ix += ix_next; - } - Ok((headers, ix)) -} +pub use header::*; /// A struct to hold a more structured representation of the Content-Type header. /// This is provided mostly as a convenience since this metadata is usually -- cgit v1.2.3