diff options
Diffstat (limited to 'src/header.rs')
-rw-r--r-- | src/header.rs | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/src/header.rs b/src/header.rs new file mode 100644 index 0000000..3447156 --- /dev/null +++ b/src/header.rs @@ -0,0 +1,347 @@ +use charset::decode_latin1; +use charset::Charset; + +use error::*; +use util::is_boundary; +use util::find_from; + +/// A struct that represents a single header in the message. +/// It holds slices into the raw byte array passed to parse_mail, and so the +/// lifetime of this struct must be contained within the lifetime of the raw +/// input. There are additional accessor functions on this struct to extract +/// the data as Rust strings. +#[derive(Debug)] +pub struct MailHeader<'a> { + key: &'a [u8], + value: &'a [u8], +} + +impl<'a> MailHeader<'a> { + /// Get the name of the header. Note that header names are case-insensitive. + pub fn get_key(&self) -> Result<String, MailParseError> { + Ok(decode_latin1(self.key).into_owned()) + } + + fn decode_word(&self, encoded: &str) -> Option<String> { + let ix_delim1 = encoded.find('?')?; + let ix_delim2 = find_from(encoded, ix_delim1 + 1, "?")?; + + let charset = &encoded[0..ix_delim1]; + let transfer_coding = &encoded[ix_delim1 + 1..ix_delim2]; + let input = &encoded[ix_delim2 + 1..]; + + let decoded = match transfer_coding { + "B" | "b" => base64::decode(input.as_bytes()).ok()?, + "Q" | "q" => { + // The quoted_printable module does a trim_end on the input, so if + // that affects the output we should save and restore the trailing + // whitespace + let to_decode = input.replace("_", " "); + let trimmed = to_decode.trim_end(); + let mut d = quoted_printable::decode(&trimmed, quoted_printable::ParseMode::Robust); + if d.is_ok() && to_decode.len() != trimmed.len() { + d.as_mut() + .unwrap() + .extend_from_slice(to_decode[trimmed.len()..].as_bytes()); + } + d.ok()? + } + _ => return None, + }; + let charset = Charset::for_label_no_replacement(charset.as_bytes())?; + let (cow, _) = charset.decode_without_bom_handling(&decoded); + Some(cow.into_owned()) + } + + /// Get the value of the header. Any sequences of newlines characters followed + /// by whitespace are collapsed into a single space. In effect, header values + /// wrapped across multiple lines are compacted back into one line, while + /// discarding the extra whitespace required by the MIME format. Additionally, + /// any quoted-printable words in the value are decoded. + /// + /// # Examples + /// ``` + /// use mailparse::parse_header; + /// let (parsed, _) = parse_header(b"Subject: =?iso-8859-1?Q?=A1Hola,_se=F1or!?=").unwrap(); + /// assert_eq!(parsed.get_key().unwrap(), "Subject"); + /// assert_eq!(parsed.get_value().unwrap(), "\u{a1}Hola, se\u{f1}or!"); + /// ``` + pub fn get_value(&self) -> Result<String, MailParseError> { + let mut result = String::new(); + let chars = decode_latin1(self.value); + let mut lines = chars.lines(); + let mut add_space = false; + while let Some(line) = lines.next().map(str::trim_start) { + if add_space { + result.push(' '); + } + add_space = true; + + let mut ix_search = 0; + loop { + match find_from(line, ix_search, "=?") { + Some(v) => { + let ix_begin = v + 2; + if !is_boundary(line, ix_begin.checked_sub(3)) { + result.push_str(&line[ix_search..ix_begin]); + ix_search = ix_begin; + continue; + } + result.push_str(&line[ix_search..ix_begin - 2]); + let mut ix_end_search = ix_begin; + loop { + match find_from(line, ix_end_search, "?=") { + Some(ix_end) => { + if !is_boundary(line, ix_end.checked_add(2)) { + ix_end_search = ix_end + 2; + continue; + } + match self.decode_word(&line[ix_begin..ix_end]) { + Some(v) => { + result.push_str(&v); + add_space = false; + } + None => result.push_str(&line[ix_begin - 2..ix_end + 2]), + }; + ix_search = ix_end; + } + None => { + result.push_str(&"=?"); + ix_search = ix_begin - 2; + } + }; + break; + } + ix_search += 2; + continue; + } + None => { + result.push_str(&line[ix_search..]); + break; + } + }; + } + } + Ok(result) + } +} + +#[derive(Debug)] +enum HeaderParseState { + Initial, + Key, + PreValue, + Value, + ValueNewline, +} + +/// Parse a single header from the raw data given. +/// This function takes raw byte data, and starts parsing it, expecting there +/// to be a MIME header key-value pair right at the beginning. It parses that +/// header and returns it, along with the index at which the next header is +/// expected to start. If you just want to parse a single header, you can ignore +/// the second component of the tuple, which is the index of the next header. +/// Error values are returned if the data could not be successfully interpreted +/// as a MIME key-value pair. +/// +/// # Examples +/// ``` +/// use mailparse::parse_header; +/// let (parsed, _) = parse_header(concat!( +/// "Subject: Hello, sir,\n", +/// " I am multiline\n", +/// "Next:Header").as_bytes()) +/// .unwrap(); +/// assert_eq!(parsed.get_key().unwrap(), "Subject"); +/// assert_eq!(parsed.get_value().unwrap(), "Hello, sir, I am multiline"); +/// ``` +pub fn parse_header(raw_data: &[u8]) -> Result<(MailHeader, usize), MailParseError> { + let mut it = raw_data.iter(); + let mut ix = 0; + let mut c = match it.next() { + None => return Err(MailParseError::Generic("Empty string provided")), + Some(v) => *v, + }; + + let mut ix_key_end = None; + let mut ix_value_start = 0; + let mut ix_value_end = 0; + + let mut state = HeaderParseState::Initial; + loop { + match state { + HeaderParseState::Initial => { + if c == b' ' { + return Err(MailParseError::Generic( + "Header cannot start with a space; it is \ + likely an overhanging line from a \ + previous header", + )); + }; + state = HeaderParseState::Key; + continue; + } + HeaderParseState::Key => { + if c == b':' { + ix_key_end = Some(ix); + state = HeaderParseState::PreValue; + } else if c == b'\n' { + return Err(MailParseError::Generic("Unexpected newline in header key")); + } + } + HeaderParseState::PreValue => { + if c != b' ' { + ix_value_start = ix; + ix_value_end = ix; + state = HeaderParseState::Value; + continue; + } + } + HeaderParseState::Value => { + if c == b'\n' { + state = HeaderParseState::ValueNewline; + } else { + ix_value_end = ix + 1; + } + } + HeaderParseState::ValueNewline => { + if c == b' ' || c == b'\t' { + state = HeaderParseState::Value; + continue; + } else { + break; + } + } + } + ix += 1; + c = match it.next() { + None => break, + Some(v) => *v, + }; + } + match ix_key_end { + Some(v) => Ok(( + MailHeader { + key: &raw_data[0..v], + value: &raw_data[ix_value_start..ix_value_end], + }, + ix, + )), + + None => Err(MailParseError::Generic( + "Unable to determine end of the header key component", + )), + } +} + +/// A trait that is implemented by the [MailHeader] slice. These functions are +/// also available on Vec<MailHeader> which is returned by the parse_headers +/// function. It provides a map-like interface to look up header values by their +/// name. +pub trait MailHeaderMap { + /// Look through the list of headers and return the value of the first one + /// that matches the provided key. It returns Ok(None) if the no matching + /// header was found. Header names are matched case-insensitively. + /// + /// # Examples + /// ``` + /// use mailparse::{parse_mail, MailHeaderMap}; + /// let headers = parse_mail(concat!( + /// "Subject: Test\n", + /// "\n", + /// "This is a test message").as_bytes()) + /// .unwrap().headers; + /// assert_eq!(headers.get_first_value("Subject").unwrap(), Some("Test".to_string())); + /// ``` + fn get_first_value(&self, key: &str) -> Result<Option<String>, MailParseError>; + + /// Look through the list of headers and return the values of all headers + /// matching the provided key. Returns an empty vector if no matching headers + /// were found. The order of the returned values is the same as the order + /// of the matching headers in the message. Header names are matched + /// case-insensitively. + /// + /// # Examples + /// ``` + /// use mailparse::{parse_mail, MailHeaderMap}; + /// let headers = parse_mail(concat!( + /// "Key: Value1\n", + /// "Key: Value2").as_bytes()) + /// .unwrap().headers; + /// assert_eq!(headers.get_all_values("Key").unwrap(), + /// vec!["Value1".to_string(), "Value2".to_string()]); + /// ``` + fn get_all_values(&self, key: &str) -> Result<Vec<String>, MailParseError>; +} + +impl<'a> MailHeaderMap for [MailHeader<'a>] { + fn get_first_value(&self, key: &str) -> Result<Option<String>, MailParseError> { + for x in self { + if x.get_key()?.eq_ignore_ascii_case(key) { + return x.get_value().map(Some); + } + } + Ok(None) + } + + fn get_all_values(&self, key: &str) -> Result<Vec<String>, MailParseError> { + let mut values: Vec<String> = Vec::new(); + for x in self { + if x.get_key()?.eq_ignore_ascii_case(key) { + values.push(x.get_value()?); + } + } + Ok(values) + } +} + +/// Parses all the headers from the raw data given. +/// This function takes raw byte data, and starts parsing it, expecting there +/// to be zero or more MIME header key-value pair right at the beginning, +/// followed by two consecutive newlines (i.e. a blank line). It parses those +/// headers and returns them in a vector. The normal vector functions can be +/// used to access the headers linearly, or the MailHeaderMap trait can be used +/// to access them in a map-like fashion. Along with this vector, the function +/// returns the index at which the message body is expected to start. If you +/// just care about the headers, you can ignore the second component of the +/// returned tuple. +/// Error values are returned if there was some sort of parsing error. +/// +/// # Examples +/// ``` +/// use mailparse::{parse_headers, MailHeaderMap}; +/// let (headers, _) = parse_headers(concat!( +/// "Subject: Test\n", +/// "From: me@myself.com\n", +/// "To: you@yourself.com").as_bytes()) +/// .unwrap(); +/// assert_eq!(headers[1].get_key().unwrap(), "From"); +/// assert_eq!(headers.get_first_value("To").unwrap(), Some("you@yourself.com".to_string())); +/// ``` +pub fn parse_headers(raw_data: &[u8]) -> Result<(Vec<MailHeader>, usize), MailParseError> { + let mut headers: Vec<MailHeader> = Vec::new(); + let mut ix = 0; + loop { + if ix >= raw_data.len() { + break; + } else if raw_data[ix] == b'\n' { + ix += 1; + break; + } else if raw_data[ix] == b'\r' { + if ix + 1 < raw_data.len() && raw_data[ix + 1] == b'\n' { + ix += 2; + break; + } else { + return Err(MailParseError::Generic( + "Headers were followed by an unexpected lone \ + CR character!", + )); + } + } + let (header, ix_next) = parse_header(&raw_data[ix..])?; + headers.push(header); + ix += ix_next; + } + Ok((headers, ix)) +} + |