diff options
Diffstat (limited to 'rfc2822/src/lexer.rs')
-rw-r--r-- | rfc2822/src/lexer.rs | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/rfc2822/src/lexer.rs b/rfc2822/src/lexer.rs new file mode 100644 index 00000000..5d7eb049 --- /dev/null +++ b/rfc2822/src/lexer.rs @@ -0,0 +1,212 @@ +use std::fmt; + +use Error; + +pub type Spanned<Token, Loc, Error> = Result<(Loc, Token, Loc), Error>; + +// The type of the parser's input. +// +// The parser iterators over tuples consisting of the token's starting +// position, the token itself, and the token's ending position. +pub(crate) type LexerItem<Token, Loc, Error> = Spanned<Token, Loc, Error>; + +#[derive(Debug, Clone, PartialEq)] +pub enum Token<'a> { + WSP(char), + #[allow(non_camel_case_types)] + NO_WS_CTL(char), + CR, + LF, + LPAREN, + RPAREN, + LANGLE, + RANGLE, + LBRACKET, + RBRACKET, + COLON, + SEMICOLON, + AT, + BACKSLASH, + COMMA, + DOT, + DQUOTE, + // Everything else. + OTHER(&'a str), +} + +impl<'a> fmt::Display for Token<'a> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(&format!("{:?}", self)[..]) + } +} + +impl<'a> From<Token<'a>> for String { + fn from(t: Token<'a>) -> String { + use self::Token::*; + match t { + WSP(c) => c.to_string(), + NO_WS_CTL(c) => c.to_string(), + CR => '\r'.to_string(), + LF => '\n'.to_string(), + LPAREN => '('.to_string(), + RPAREN => ')'.to_string(), + LANGLE => '<'.to_string(), + RANGLE => '>'.to_string(), + LBRACKET => '['.to_string(), + RBRACKET => ']'.to_string(), + COLON => ':'.to_string(), + SEMICOLON => ';'.to_string(), + AT => '@'.to_string(), + BACKSLASH => '\\'.to_string(), + COMMA => ','.to_string(), + DOT => '.'.to_string(), + DQUOTE => '"'.to_string(), + OTHER(s) => s.to_string(), + } + } +} + +impl<'a> Token<'a> { + pub fn to_string(self) -> String { + self.into() + } +} + +pub(crate) struct Lexer<'input> { + offset: usize, + input: &'input str, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Lexer { offset: 0, input } + } +} + +// 3.2.1. Primitive Tokens + +// The symbols. The default tokenizer returns &str, but we want +// chars. So, we need to do a little dance. +// +// match { +// // All unicode white space. +// // 2.2.2. says that whitespace is only ' ' and '\t'. +// r" \t" => WSP_TOKEN, +// +// r"(?x) +// [\x01-\x08 # %d1-8 / +// \x0b # %d11 / +// \x0c # %d12 / +// \x0e-\x1f # %d14-31 / +// \x7f # %d127 +// ]" => NO_WS_CTL_TOKEN, +// +// "\r" => CR_TOKEN, +// "\n" => LF_TOKEN, +// +// // specials +// "(" => LPAREN_TOKEN, +// ")" => RPAREN_TOKEN, +// "<" => LANGLE_TOKEN, +// ">" => RANGLE_TOKEN, +// "[" => LBRACKET_TOKEN, +// "]" => RBRACKET_TOKEN, +// ":" => COLON_TOKEN, +// ";" => SEMICOLON_TOKEN, +// "@" => AT_TOKEN, +// "\\" => BACKSLASH_TOKEN, +// "," => COMMA_TOKEN, +// "." => DOT_TOKEN, +// "\"" => DQUOTE_TOKEN, +// } else { +// // Everything else. +// r"." => OTHER_TOKEN +// } + +impl<'input> Iterator for Lexer<'input> { + type Item = LexerItem<Token<'input>, usize, Error>; + + fn next(&mut self) -> Option<Self::Item> { + use self::Token::*; + + tracer!(::TRACE, "Lexer::next"); + + // Returns the length of the first character in s in bytes. + // If s is empty, returns 0. + fn char_bytes(s: &str) -> usize { + if let Some(c) = s.chars().next() { + c.len_utf8() + } else { + 0 + } + } + + let one = |input: &'input str| -> Option<Token> { + let c = input.chars().next()?; + Some(match c { + c @ ' ' | c @ '\t' => WSP(c), + + // NO-WS-CTL = %d1-8 / ; US-ASCII control characters + // %d11 / 0xB ; that do not include the + // %d12 / 0xC ; carriage return, line feed, + // %d14-31 / 0xE-1F ; and white space characters + // %d127 0x7F + c @ '\x01'..='\x08' + | c @ '\x0B'..='\x0C' + | c @ '\x0E'..='\x1F' + | c @ '\x7F' => + NO_WS_CTL(c), + + '\r' => CR, + '\n' => LF, + '(' => LPAREN, + ')' => RPAREN, + '<' => LANGLE, + '>' => RANGLE, + '[' => LBRACKET, + ']' => RBRACKET, + ':' => COLON, + ';' => SEMICOLON, + '@' => AT, + '\\' => BACKSLASH, + ',' => COMMA, + '.' => DOT, + '"' => DQUOTE, + + _ => OTHER(&input[0..c.len_utf8()]), + }) + }; + + let mut l = char_bytes(self.input); + let t = match one(self.input) { + Some(OTHER(_)) => { + loop { + if let Some(OTHER(s)) = one(&self.input[l..]) { + l = l + char_bytes(s); + } else { + break OTHER(&self.input[..l]); + } + } + }, + Some(t) => t, + None => return None, + }; + + self.input = &self.input[l..]; + + let start = self.offset; + let end = start + l; + self.offset += l; + + t!("Returning token at offset {}: '{:?}'", + start, t); + + Some(Ok((start, t, end))) + } +} + +impl<'input> From<&'input str> for Lexer<'input> { + fn from(i: &'input str) -> Lexer<'input> { + Lexer::new(i) + } +} |