diff options
Diffstat (limited to 'internals/src/grammar.rs')
-rw-r--r-- | internals/src/grammar.rs | 423 |
1 files changed, 423 insertions, 0 deletions
diff --git a/internals/src/grammar.rs b/internals/src/grammar.rs new file mode 100644 index 0000000..cc2e8f6 --- /dev/null +++ b/internals/src/grammar.rs @@ -0,0 +1,423 @@ +//! This module contains a number of helper functions for writing parsers. +//! +//! Ironically they are also needed when writing mail encoders/generators +//! e.g. for checking if a part need special encoding. +use ::MailType; + +/// ftext as defined by RFC 5322 +/// +/// which is: printable US-ASCII characters not includign `:` +/// => 0x21-0x39 / 0x3B-0x7E +/// => '!'...'9' / ';'...'~' +/// => <0x7F && != 0x3A +#[inline(always)] +pub fn is_ftext(ch: char) -> bool { + let bch = ch as u32; + bch > 32 && bch < 127 && ch != ':' +} + +///WS as defined by RFC 5234 +#[inline(always)] +pub fn is_ws(ch: char) -> bool { + // is not limited to ascii ws + //ch.is_whitespace() + //WSP = SP / HTAB + ch == ' ' || ch == '\t' +} + +/// True if `ch` is `' '` +#[inline(always)] +pub fn is_space(ch: char) -> bool { + ch == ' ' +} + +/// True if `ch` is us-ascii (i.e. <128) +#[inline(always)] +pub fn is_ascii(ch: char) -> bool { + (ch as u32) < 128 +} + +/// True if `ch` is ascii and "visible"/"printable". +/// +/// This is the case for any char in the (decimal) +/// range 33..=126 which is '!'..='~'. +#[inline(always)] +pub fn is_ascii_vchar(ch: char) -> bool { + let u32_ch = ch as u32; + 32 < u32_ch && u32_ch <= 126 +} + +/// VCHAR as defined by RFC 5243 +/// +/// Is true if it's either an us-ascii vchar or +/// an non us-ascii char and the mail type is +/// internationalized. +/// +/// This mean that this includes _non printable_ +/// characters as long as the mail is internationalized +/// and the character is non us-ascii utf-8. +#[inline(always)] +pub fn is_vchar(ch: char, mt: MailType) -> bool { + is_ascii_vchar(ch) || (mt == MailType::Internationalized && !is_ascii(ch)) +} + + +//TODO as RFCs +/// can be quoted in a quoted string (internalized) based on RFC ... and RFC ... +#[inline(always)] +pub fn is_quotable(ch: char, tp: MailType) -> bool { + is_vchar(ch, tp) || is_ws(ch) +} + +/// any whitespace (char::is_whitespace) +#[inline(always)] +pub fn is_any_whitespace(ch: char) -> bool { + ch.is_whitespace() +} + +/// ctext as defined by RFC 5322 +pub fn is_ctext(ch: char, mt: MailType) -> bool { + match ch { + '!'...'\'' | + '*'...'[' | + ']'...'~' => true, + // obs-ctext + _ => mt == MailType::Internationalized && !is_ascii( ch ) + } +} + +/// check if a char is a especial (_based on RFC 5322_) +/// +/// Note that there is _another_ especial from a different RFC. +pub fn is_special(ch: char) -> bool { + match ch { + '(' | ')' | + '<' | '>' | + '[' | ']' | + ':' | ';' | + '@' | '\\'| + ',' | '.' | + '"' => true, + _ => false + } +} + + +/// check if a char is an tspecial (based on RFC 2045) +pub fn is_tspecial(ch: char) -> bool { + match ch { + '(' | ')' | + '<' | '>' | + '@' | ',' | + ';' | ':' | + '\\'| '"' | + '/' | '[' | + ']' | '?' | + '=' => true, + _ => false + } +} + + + +/// atext as defined by RFC 5322 +#[inline(always)] +pub fn is_atext(ch: char, tp: MailType) -> bool { + is_vchar(ch, tp) && !is_special(ch) +} + +/// dtext as defined by RFC 5322 +#[inline(always)] +pub fn is_dtext(ch: char , mt: MailType) -> bool { + match ch as u32 { + 33...90 | + 94...126 => true, + _ => mt == MailType::Internationalized && !is_ascii(ch) + } +} + +/// qtext as defined by RFC 5322 +pub fn is_qtext(ch: char, mt: MailType) -> bool { + match ch { + //not ' ' [d:32] + '!' | + //not '"' [d:34] + '#'...'[' | + //not '\\' [d:92] + ']'...'~' => true, + _ => mt == MailType::Internationalized && !is_ascii(ch) + } +} + +/// Chack if it is a CTL char (based on RFC 822). +/// +/// # Note +/// the standard specifies `'\t'` as a CTL but not `' '` +/// but both `'\t'` and `' '` are LWSP-char i.e. semantically +/// space i.e. _semantically equivalent_. +#[inline(always)] +pub fn is_ctl(ch: char) -> bool { + (ch as u32) < 32 +} + +/// Check if a char is an token char (based on RFC 2045). +#[inline(always)] +pub fn is_token_char(ch: char) -> bool { + is_ascii(ch) && !is_ctl(ch) && !is_tspecial(ch) && ch != ' ' +} + + +//TODO add rfc +/// Check if a char is especial (based on RFC ...). +#[inline(always)] +pub fn is_especial(ch: char) -> bool { + match ch { + '(' | ')' | + '<' | '>' | + '@' | ',' | + ';' | ':' | + '"' | '/'| + '[' | ']' | + '?' | '.' | + '=' => true, + _ => false + } +} + +//TODO add rfc +/// Check if a string is an token (based on RFC ...). +pub fn is_token(s: &str) -> bool { + 0 < s.len() && s.chars().all(is_token_char) +} + +// +//pub fn is_dot_atom_text( text: &str, mt: MailType ) -> bool { +// use nom::IResult; +// use self::parse::recognize_dot_atom_text; +// +// let res = tuple!( text, +// call!( recognize_dot_atom_text, mt ), +// eof!() +// ); +// +// match res { +// IResult::Done(_, _) => true, +// _ => false +// } +//} + +//pub mod parse { +// use nom::IResult; +// use super::{ is_atext, MailType }; +// +// pub fn recognize_dot_atom_text( input: &str, mt: MailType ) -> IResult<&str, &str> { +// recognize!( input, tuple!( +// take_while1!( call!( is_atext, mt ) ), +// many0!( tuple!( +// char!( "." ), +// take_while1!( call!( is_atext, mt ) ) +// ) ) +// ) ) +// } +// +//} +//TODO this should be some where else I think +// (but it is used by `1. codec`, `2. components` ) +/// Grammar parts for encoded words (based on RFC 2047). +pub mod encoded_word { + use nom; + use ::MailType; + use ::error::{EncodingError, EncodingErrorKind}; + use super::{ is_especial, is_ascii_vchar }; + + /// maximal length of an encoded word + pub const MAX_ECW_LEN: usize = 75; + + /// The syntax overhead from "framing" an encoded word. + /// + /// This is the start (1x`=?`) the first and second separator (2x`?`) and the + /// end (1x`?=`) leading to 6 byte overhead. + pub const ECW_SEP_OVERHEAD: usize = 6; + + /// Represents the place at which the encoded word appears. + /// + /// Depending on the place more or less character have to be + /// encoded. + /// + /// Note: Implementations creating encoded words might use a + /// stricter context which is compatible with all places to + /// reduce code complexity. + #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)] + pub enum EncodedWordContext { + Phrase, + Text, + Comment + } + + impl EncodedWordContext { + + /// Returns a (context dependent) validator to check if a char can be represented without encoding. + fn char_validator( &self ) -> fn(char) -> bool { + use self::EncodedWordContext::*; + match *self { + Phrase => valid_char_in_ec_in_phrase, + Text => is_encoded_word_char, + Comment => valid_char_in_ec_in_comment, + } + } + } + + + /// Returns true if the given word is a encoded word. + /// + /// Note that this depends on the context the word appears in and the mail type. + /// The reason for this is that encoded words tend to be valid text even without + /// decoding them. But this means if the encoded word has some syntax error (e.g. + /// missing closing `?=`) it is no longer an encoded word but just some text which + /// happen to look similar to one. + pub fn is_encoded_word(word: &str, ctx: EncodedWordContext, mail_type: MailType) -> bool { + try_parse_encoded_word_parts(word, ctx, mail_type).is_ok() + } + + /// Tries to parse the given string as an encoded word. + pub fn try_parse_encoded_word_parts( + word: &str, + ctx: EncodedWordContext, + mail_type: MailType + ) -> Result<(&str, &str, &str), EncodingError> + { + let char_validator = ctx.char_validator(); + // Note we could get a possible speed up by making rustc generate + // a different function for each Context, inlining ALL char tests + let res = do_parse!( + word, + char!( '=' ) >> + char!( '?' ) >> + charset: take_while!( is_ew_token_char ) >> + char!( '?' ) >> + encoding: take_while!( is_ew_token_char ) >> + char!( '?' ) >> + text: take_while!( char_validator ) >> + char!( '?' ) >> + char!( '=' ) >> + eof!() >> + (charset, encoding, text) + ); + + match res { + nom::IResult::Done( rest, result ) => { + assert_eq!(rest.len(), 0, "[BUG] used nom::eof!() but rest.len() > 0"); + Ok( result ) + }, + nom::IResult::Incomplete( .. ) => { + return Err((EncodingErrorKind::Malformed, mail_type).into()); + } + nom::IResult::Error( .. ) => { + return Err((EncodingErrorKind::Malformed, mail_type).into()); + } + } + } + + /// True if the char can appear in an encoded word. + fn is_encoded_word_char(ch: char) -> bool { + is_ascii_vchar(ch) && ch != '?' + } + + /// True if the char can appear in an encoded word appearing in a comment. + fn valid_char_in_ec_in_comment(ch: char) -> bool { + is_encoded_word_char(ch) && !(ch == '(' || ch == ')' || ch == '"') + } + + /// True if the char is valid in an encode word appearing in a phrase. + fn valid_char_in_ec_in_phrase(ch: char) -> bool { + match ch { + '0'...'9' | + 'a'...'z' | + 'A'...'Z' | + '!' | '*' | + '+' | '-' | + '/' | '=' | + '_' => true, + _ => false + } + } + + /// True if the char is a encoded word token. + /// + /// Encoded word tokens are used for the charset and + /// language part of an encoded word. + fn is_ew_token_char(ch: char) -> bool { + is_ascii_vchar(ch) && !is_especial(ch) + } + +} + +//TODO shouldn't we use `bind/quoted_string`? +/// True if the given string is a quoted string. +pub fn is_quoted_string(qstr: &str, tp: MailType) -> bool { + let mut iter = qstr.chars(); + if let Some('"') = iter.next() {} else { return false } + let mut next = iter.next(); + while let Some(ch) = next { + match ch { + '\\' => { + if let Some(next_char) = iter.next() { + if !(is_vchar(next_char, tp) || is_ws(next_char)) { + return false; + } + } else { + return false; + } + }, + '"' => { + if iter.next().is_none() { + return true; + } else { + return false; + } + } + ch => { + if !is_qtext(ch, tp) { + return false + } + } + } + next = iter.next() + } + + // The only true return if we have a '"' followed by iter.next().is_none() + return false; +} + + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn _is_ascii_vchar() { + assert_eq!(false, is_ascii_vchar('\x7f')); + for bad_char in b'\0'..b' ' { + if is_ascii_vchar(bad_char as char) { + panic!("{:?} should not be a VCHAR", bad_char); + } + } + for good_char in b'!'..(b'~'+1) { + if !is_ascii_vchar(good_char as char) { + panic!("{:?} should be a VCHAR", good_char as char); + } + } + } + + #[test] + fn htap_is_ctl_space_is_not() { + assert_eq!(true, is_ctl('\t')); + assert_eq!(false, is_ctl(' ')); + } + + #[test] + fn is_toke_empty() { + assert_eq!(false, is_token("")); + } +} + |