use std::fmt; #[derive(Clone, PartialEq, Eq, Debug)] pub enum LexicalError { } impl fmt::Display for LexicalError { // This trait requires `fmt` with this exact signature. fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}", "{}") } } pub type Spanned = Result<(Loc, Token, Loc), LexicalError>; // The type of the parser's input. // // The parser iterators over tuples consisting of the token's starting // position, the token itself, and the token's ending position. pub(crate) type LexerItem = Spanned; /// The components of an OpenPGP Message. #[derive(Debug, Clone, PartialEq)] pub enum Token { PIPE, STAR, PLUS, QUESTION, LPAREN, RPAREN, DOT, CARET, DOLLAR, BACKSLASH, LBRACKET, RBRACKET, DASH, OTHER(char), } assert_send_and_sync!(Token); impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.write_str(&format!("{:?}", self)[..]) } } impl From for String { fn from(t: Token) -> String { use self::Token::*; match t { PIPE => '|'.to_string(), STAR => '*'.to_string(), PLUS => '+'.to_string(), QUESTION => '?'.to_string(), LPAREN => '('.to_string(), RPAREN => ')'.to_string(), DOT => '.'.to_string(), CARET => '^'.to_string(), DOLLAR => '$'.to_string(), BACKSLASH => '\\'.to_string(), LBRACKET => '['.to_string(), RBRACKET => ']'.to_string(), DASH => '-'.to_string(), OTHER(c) => c.to_string(), } } } impl Token { pub fn to_string(self) -> String { self.into() } pub fn to_char(&self) -> char { use self::Token::*; match self { PIPE => '|', STAR => '*', PLUS => '+', QUESTION => '?', LPAREN => '(', RPAREN => ')', DOT => '.', CARET => '^', DOLLAR => '$', BACKSLASH => '\\', LBRACKET => '[', RBRACKET => ']', DASH => '-', OTHER(c) => *c, } } } pub(crate) struct Lexer<'input> { offset: usize, input: &'input str, } impl<'input> Lexer<'input> { pub fn new(input: &'input str) -> Self { Lexer { offset: 0, input } } } impl<'input> Iterator for Lexer<'input> { type Item = LexerItem; fn next(&mut self) -> Option { use self::Token::*; tracer!(super::TRACE, "regex::Lexer::next"); // Returns the length of the first character in s in bytes. // If s is empty, returns 0. fn char_bytes(s: &str) -> usize { if let Some(c) = s.chars().next() { c.len_utf8() } else { 0 } } let one = |input: &'input str| -> Option { let c = input.chars().next()?; Some(match c { '|' => PIPE, '*' => STAR, '+' => PLUS, '?' => QUESTION, '(' => LPAREN, ')' => RPAREN, '.' => DOT, '^' => CARET, '$' => DOLLAR, '\\' => BACKSLASH, '[' => LBRACKET, ']' => RBRACKET, '-' => DASH, _ => OTHER(c), }) }; let l = char_bytes(self.input); let t = match one(self.input) { Some(t) => t, None => return None, }; self.input = &self.input[l..]; let start = self.offset; let end = start + l; self.offset += l; t!("Returning token at offset {}: '{:?}'", start, t); Some(Ok((start, t, end))) } } impl<'input> From<&'input str> for Lexer<'input> { fn from(i: &'input str) -> Lexer<'input> { Lexer::new(i) } } #[cfg(test)] mod tests { use super::*; #[test] fn lexer() { fn lex(s: &str, expected: &[Token]) { let tokens: Vec = Lexer::new(s) .map(|t| t.unwrap().1) .collect(); assert_eq!(&tokens[..], expected, "{}", s); } use Token::*; lex("|", &[ PIPE ]); lex("*", &[ STAR ]); lex("+", &[ PLUS ]); lex("?", &[ QUESTION ]); lex("(", &[ LPAREN ]); lex(")", &[ RPAREN ]); lex(".", &[ DOT ]); lex("^", &[ CARET ]); lex("$", &[ DOLLAR ]); lex("\\", &[ BACKSLASH ]); lex("[", &[ LBRACKET ]); lex("]", &[ RBRACKET ]); lex("-", &[ DASH ]); lex("a", &[ OTHER('a') ]); lex("aa", &[ OTHER('a'), OTHER('a') ]); lex("foo", &[ OTHER('f'), OTHER('o'), OTHER('o') ]); lex("foo\\bar", &[ OTHER('f'), OTHER('o'), OTHER('o'), BACKSLASH, OTHER('b'), OTHER('a'), OTHER('r') ]); lex("*?!", &[ STAR, QUESTION, OTHER('!') ]); // Multi-byte UTF-8. lex("ΓŸβ„πŸ’£", &[ OTHER('ß'), OTHER('ℝ'), OTHER('πŸ’£'), ]); lex("(ß|ℝ|πŸ’£", &[ LPAREN, OTHER('ß'), PIPE, OTHER('ℝ'), PIPE, OTHER('πŸ’£') ]); lex("東京", &[ OTHER('東'), OTHER('δΊ¬') ]); } }