summaryrefslogtreecommitdiffstats
path: root/internals/src/grammar.rs
diff options
context:
space:
mode:
Diffstat (limited to 'internals/src/grammar.rs')
-rw-r--r--internals/src/grammar.rs423
1 files changed, 423 insertions, 0 deletions
diff --git a/internals/src/grammar.rs b/internals/src/grammar.rs
new file mode 100644
index 0000000..cc2e8f6
--- /dev/null
+++ b/internals/src/grammar.rs
@@ -0,0 +1,423 @@
+//! This module contains a number of helper functions for writing parsers.
+//!
+//! Ironically they are also needed when writing mail encoders/generators
+//! e.g. for checking if a part need special encoding.
+use ::MailType;
+
+/// ftext as defined by RFC 5322
+///
+/// which is: printable US-ASCII characters not includign `:`
+/// => 0x21-0x39 / 0x3B-0x7E
+/// => '!'...'9' / ';'...'~'
+/// => <0x7F && != 0x3A
+#[inline(always)]
+pub fn is_ftext(ch: char) -> bool {
+ let bch = ch as u32;
+ bch > 32 && bch < 127 && ch != ':'
+}
+
+///WS as defined by RFC 5234
+#[inline(always)]
+pub fn is_ws(ch: char) -> bool {
+ // is not limited to ascii ws
+ //ch.is_whitespace()
+ //WSP = SP / HTAB
+ ch == ' ' || ch == '\t'
+}
+
+/// True if `ch` is `' '`
+#[inline(always)]
+pub fn is_space(ch: char) -> bool {
+ ch == ' '
+}
+
+/// True if `ch` is us-ascii (i.e. <128)
+#[inline(always)]
+pub fn is_ascii(ch: char) -> bool {
+ (ch as u32) < 128
+}
+
+/// True if `ch` is ascii and "visible"/"printable".
+///
+/// This is the case for any char in the (decimal)
+/// range 33..=126 which is '!'..='~'.
+#[inline(always)]
+pub fn is_ascii_vchar(ch: char) -> bool {
+ let u32_ch = ch as u32;
+ 32 < u32_ch && u32_ch <= 126
+}
+
+/// VCHAR as defined by RFC 5243
+///
+/// Is true if it's either an us-ascii vchar or
+/// an non us-ascii char and the mail type is
+/// internationalized.
+///
+/// This mean that this includes _non printable_
+/// characters as long as the mail is internationalized
+/// and the character is non us-ascii utf-8.
+#[inline(always)]
+pub fn is_vchar(ch: char, mt: MailType) -> bool {
+ is_ascii_vchar(ch) || (mt == MailType::Internationalized && !is_ascii(ch))
+}
+
+
+//TODO as RFCs
+/// can be quoted in a quoted string (internalized) based on RFC ... and RFC ...
+#[inline(always)]
+pub fn is_quotable(ch: char, tp: MailType) -> bool {
+ is_vchar(ch, tp) || is_ws(ch)
+}
+
+/// any whitespace (char::is_whitespace)
+#[inline(always)]
+pub fn is_any_whitespace(ch: char) -> bool {
+ ch.is_whitespace()
+}
+
+/// ctext as defined by RFC 5322
+pub fn is_ctext(ch: char, mt: MailType) -> bool {
+ match ch {
+ '!'...'\'' |
+ '*'...'[' |
+ ']'...'~' => true,
+ // obs-ctext
+ _ => mt == MailType::Internationalized && !is_ascii( ch )
+ }
+}
+
+/// check if a char is a especial (_based on RFC 5322_)
+///
+/// Note that there is _another_ especial from a different RFC.
+pub fn is_special(ch: char) -> bool {
+ match ch {
+ '(' | ')' |
+ '<' | '>' |
+ '[' | ']' |
+ ':' | ';' |
+ '@' | '\\'|
+ ',' | '.' |
+ '"' => true,
+ _ => false
+ }
+}
+
+
+/// check if a char is an tspecial (based on RFC 2045)
+pub fn is_tspecial(ch: char) -> bool {
+ match ch {
+ '(' | ')' |
+ '<' | '>' |
+ '@' | ',' |
+ ';' | ':' |
+ '\\'| '"' |
+ '/' | '[' |
+ ']' | '?' |
+ '=' => true,
+ _ => false
+ }
+}
+
+
+
+/// atext as defined by RFC 5322
+#[inline(always)]
+pub fn is_atext(ch: char, tp: MailType) -> bool {
+ is_vchar(ch, tp) && !is_special(ch)
+}
+
+/// dtext as defined by RFC 5322
+#[inline(always)]
+pub fn is_dtext(ch: char , mt: MailType) -> bool {
+ match ch as u32 {
+ 33...90 |
+ 94...126 => true,
+ _ => mt == MailType::Internationalized && !is_ascii(ch)
+ }
+}
+
+/// qtext as defined by RFC 5322
+pub fn is_qtext(ch: char, mt: MailType) -> bool {
+ match ch {
+ //not ' ' [d:32]
+ '!' |
+ //not '"' [d:34]
+ '#'...'[' |
+ //not '\\' [d:92]
+ ']'...'~' => true,
+ _ => mt == MailType::Internationalized && !is_ascii(ch)
+ }
+}
+
+/// Chack if it is a CTL char (based on RFC 822).
+///
+/// # Note
+/// the standard specifies `'\t'` as a CTL but not `' '`
+/// but both `'\t'` and `' '` are LWSP-char i.e. semantically
+/// space i.e. _semantically equivalent_.
+#[inline(always)]
+pub fn is_ctl(ch: char) -> bool {
+ (ch as u32) < 32
+}
+
+/// Check if a char is an token char (based on RFC 2045).
+#[inline(always)]
+pub fn is_token_char(ch: char) -> bool {
+ is_ascii(ch) && !is_ctl(ch) && !is_tspecial(ch) && ch != ' '
+}
+
+
+//TODO add rfc
+/// Check if a char is especial (based on RFC ...).
+#[inline(always)]
+pub fn is_especial(ch: char) -> bool {
+ match ch {
+ '(' | ')' |
+ '<' | '>' |
+ '@' | ',' |
+ ';' | ':' |
+ '"' | '/'|
+ '[' | ']' |
+ '?' | '.' |
+ '=' => true,
+ _ => false
+ }
+}
+
+//TODO add rfc
+/// Check if a string is an token (based on RFC ...).
+pub fn is_token(s: &str) -> bool {
+ 0 < s.len() && s.chars().all(is_token_char)
+}
+
+//
+//pub fn is_dot_atom_text( text: &str, mt: MailType ) -> bool {
+// use nom::IResult;
+// use self::parse::recognize_dot_atom_text;
+//
+// let res = tuple!( text,
+// call!( recognize_dot_atom_text, mt ),
+// eof!()
+// );
+//
+// match res {
+// IResult::Done(_, _) => true,
+// _ => false
+// }
+//}
+
+//pub mod parse {
+// use nom::IResult;
+// use super::{ is_atext, MailType };
+//
+// pub fn recognize_dot_atom_text( input: &str, mt: MailType ) -> IResult<&str, &str> {
+// recognize!( input, tuple!(
+// take_while1!( call!( is_atext, mt ) ),
+// many0!( tuple!(
+// char!( "." ),
+// take_while1!( call!( is_atext, mt ) )
+// ) )
+// ) )
+// }
+//
+//}
+//TODO this should be some where else I think
+// (but it is used by `1. codec`, `2. components` )
+/// Grammar parts for encoded words (based on RFC 2047).
+pub mod encoded_word {
+ use nom;
+ use ::MailType;
+ use ::error::{EncodingError, EncodingErrorKind};
+ use super::{ is_especial, is_ascii_vchar };
+
+ /// maximal length of an encoded word
+ pub const MAX_ECW_LEN: usize = 75;
+
+ /// The syntax overhead from "framing" an encoded word.
+ ///
+ /// This is the start (1x`=?`) the first and second separator (2x`?`) and the
+ /// end (1x`?=`) leading to 6 byte overhead.
+ pub const ECW_SEP_OVERHEAD: usize = 6;
+
+ /// Represents the place at which the encoded word appears.
+ ///
+ /// Depending on the place more or less character have to be
+ /// encoded.
+ ///
+ /// Note: Implementations creating encoded words might use a
+ /// stricter context which is compatible with all places to
+ /// reduce code complexity.
+ #[derive(Debug, Copy, Clone, Hash, Eq, PartialEq)]
+ pub enum EncodedWordContext {
+ Phrase,
+ Text,
+ Comment
+ }
+
+ impl EncodedWordContext {
+
+ /// Returns a (context dependent) validator to check if a char can be represented without encoding.
+ fn char_validator( &self ) -> fn(char) -> bool {
+ use self::EncodedWordContext::*;
+ match *self {
+ Phrase => valid_char_in_ec_in_phrase,
+ Text => is_encoded_word_char,
+ Comment => valid_char_in_ec_in_comment,
+ }
+ }
+ }
+
+
+ /// Returns true if the given word is a encoded word.
+ ///
+ /// Note that this depends on the context the word appears in and the mail type.
+ /// The reason for this is that encoded words tend to be valid text even without
+ /// decoding them. But this means if the encoded word has some syntax error (e.g.
+ /// missing closing `?=`) it is no longer an encoded word but just some text which
+ /// happen to look similar to one.
+ pub fn is_encoded_word(word: &str, ctx: EncodedWordContext, mail_type: MailType) -> bool {
+ try_parse_encoded_word_parts(word, ctx, mail_type).is_ok()
+ }
+
+ /// Tries to parse the given string as an encoded word.
+ pub fn try_parse_encoded_word_parts(
+ word: &str,
+ ctx: EncodedWordContext,
+ mail_type: MailType
+ ) -> Result<(&str, &str, &str), EncodingError>
+ {
+ let char_validator = ctx.char_validator();
+ // Note we could get a possible speed up by making rustc generate
+ // a different function for each Context, inlining ALL char tests
+ let res = do_parse!(
+ word,
+ char!( '=' ) >>
+ char!( '?' ) >>
+ charset: take_while!( is_ew_token_char ) >>
+ char!( '?' ) >>
+ encoding: take_while!( is_ew_token_char ) >>
+ char!( '?' ) >>
+ text: take_while!( char_validator ) >>
+ char!( '?' ) >>
+ char!( '=' ) >>
+ eof!() >>
+ (charset, encoding, text)
+ );
+
+ match res {
+ nom::IResult::Done( rest, result ) => {
+ assert_eq!(rest.len(), 0, "[BUG] used nom::eof!() but rest.len() > 0");
+ Ok( result )
+ },
+ nom::IResult::Incomplete( .. ) => {
+ return Err((EncodingErrorKind::Malformed, mail_type).into());
+ }
+ nom::IResult::Error( .. ) => {
+ return Err((EncodingErrorKind::Malformed, mail_type).into());
+ }
+ }
+ }
+
+ /// True if the char can appear in an encoded word.
+ fn is_encoded_word_char(ch: char) -> bool {
+ is_ascii_vchar(ch) && ch != '?'
+ }
+
+ /// True if the char can appear in an encoded word appearing in a comment.
+ fn valid_char_in_ec_in_comment(ch: char) -> bool {
+ is_encoded_word_char(ch) && !(ch == '(' || ch == ')' || ch == '"')
+ }
+
+ /// True if the char is valid in an encode word appearing in a phrase.
+ fn valid_char_in_ec_in_phrase(ch: char) -> bool {
+ match ch {
+ '0'...'9' |
+ 'a'...'z' |
+ 'A'...'Z' |
+ '!' | '*' |
+ '+' | '-' |
+ '/' | '=' |
+ '_' => true,
+ _ => false
+ }
+ }
+
+ /// True if the char is a encoded word token.
+ ///
+ /// Encoded word tokens are used for the charset and
+ /// language part of an encoded word.
+ fn is_ew_token_char(ch: char) -> bool {
+ is_ascii_vchar(ch) && !is_especial(ch)
+ }
+
+}
+
+//TODO shouldn't we use `bind/quoted_string`?
+/// True if the given string is a quoted string.
+pub fn is_quoted_string(qstr: &str, tp: MailType) -> bool {
+ let mut iter = qstr.chars();
+ if let Some('"') = iter.next() {} else { return false }
+ let mut next = iter.next();
+ while let Some(ch) = next {
+ match ch {
+ '\\' => {
+ if let Some(next_char) = iter.next() {
+ if !(is_vchar(next_char, tp) || is_ws(next_char)) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ },
+ '"' => {
+ if iter.next().is_none() {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ ch => {
+ if !is_qtext(ch, tp) {
+ return false
+ }
+ }
+ }
+ next = iter.next()
+ }
+
+ // The only true return if we have a '"' followed by iter.next().is_none()
+ return false;
+}
+
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn _is_ascii_vchar() {
+ assert_eq!(false, is_ascii_vchar('\x7f'));
+ for bad_char in b'\0'..b' ' {
+ if is_ascii_vchar(bad_char as char) {
+ panic!("{:?} should not be a VCHAR", bad_char);
+ }
+ }
+ for good_char in b'!'..(b'~'+1) {
+ if !is_ascii_vchar(good_char as char) {
+ panic!("{:?} should be a VCHAR", good_char as char);
+ }
+ }
+ }
+
+ #[test]
+ fn htap_is_ctl_space_is_not() {
+ assert_eq!(true, is_ctl('\t'));
+ assert_eq!(false, is_ctl(' '));
+ }
+
+ #[test]
+ fn is_toke_empty() {
+ assert_eq!(false, is_token(""));
+ }
+}
+