From 3367828e487fa79d2ec0a615e6b15dd5fbee5c9a Mon Sep 17 00:00:00 2001 From: "Neal H. Walfield" Date: Fri, 8 Jan 2021 14:29:22 +0100 Subject: openpgp: Add regex support. - Fixes #188. --- openpgp/src/regex/lexer.rs | 222 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 openpgp/src/regex/lexer.rs (limited to 'openpgp/src/regex/lexer.rs') diff --git a/openpgp/src/regex/lexer.rs b/openpgp/src/regex/lexer.rs new file mode 100644 index 00000000..18300d90 --- /dev/null +++ b/openpgp/src/regex/lexer.rs @@ -0,0 +1,222 @@ +use std::fmt; + +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum LexicalError { +} + +impl fmt::Display for LexicalError { + // This trait requires `fmt` with this exact signature. + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", "{}") + } +} + +pub type Spanned + = Result<(Loc, Token, Loc), LexicalError>; + +// The type of the parser's input. +// +// The parser iterators over tuples consisting of the token's starting +// position, the token itself, and the token's ending position. +pub(crate) type LexerItem + = Spanned; + +/// The components of an OpenPGP Message. +#[derive(Debug, Clone, PartialEq)] +pub enum Token { + PIPE, + + STAR, + PLUS, + QUESTION, + + LPAREN, + RPAREN, + + DOT, + CARET, + DOLLAR, + BACKSLASH, + + LBRACKET, + RBRACKET, + DASH, + + OTHER(char), +} +assert_send_and_sync!(Token); + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(&format!("{:?}", self)[..]) + } +} + +impl From for String { + fn from(t: Token) -> String { + use self::Token::*; + match t { + PIPE => '|'.to_string(), + STAR => '*'.to_string(), + PLUS => '+'.to_string(), + QUESTION => '?'.to_string(), + LPAREN => '('.to_string(), + RPAREN => ')'.to_string(), + DOT => '.'.to_string(), + CARET => '^'.to_string(), + DOLLAR => '$'.to_string(), + BACKSLASH => '\\'.to_string(), + LBRACKET => '['.to_string(), + RBRACKET => ']'.to_string(), + DASH => '-'.to_string(), + OTHER(c) => c.to_string(), + } + } +} + +impl Token { + pub fn to_string(self) -> String { + self.into() + } + + pub fn to_char(&self) -> char { + use self::Token::*; + match self { + PIPE => '|', + STAR => '*', + PLUS => '+', + QUESTION => '?', + LPAREN => '(', + RPAREN => ')', + DOT => '.', + CARET => '^', + DOLLAR => '$', + BACKSLASH => '\\', + LBRACKET => '[', + RBRACKET => ']', + DASH => '-', + OTHER(c) => *c, + } + } +} + +pub(crate) struct Lexer<'input> { + offset: usize, + input: &'input str, +} + +impl<'input> Lexer<'input> { + pub fn new(input: &'input str) -> Self { + Lexer { offset: 0, input } + } +} + +impl<'input> Iterator for Lexer<'input> { + type Item = LexerItem; + + fn next(&mut self) -> Option { + use self::Token::*; + + tracer!(super::TRACE, "regex::Lexer::next"); + + // Returns the length of the first character in s in bytes. + // If s is empty, returns 0. + fn char_bytes(s: &str) -> usize { + if let Some(c) = s.chars().next() { + c.len_utf8() + } else { + 0 + } + } + + let one = |input: &'input str| -> Option { + let c = input.chars().next()?; + Some(match c { + '|' => PIPE, + '*' => STAR, + '+' => PLUS, + '?' => QUESTION, + '(' => LPAREN, + ')' => RPAREN, + '.' => DOT, + '^' => CARET, + '$' => DOLLAR, + '\\' => BACKSLASH, + '[' => LBRACKET, + ']' => RBRACKET, + '-' => DASH, + _ => OTHER(c), + }) + }; + + let l = char_bytes(self.input); + let t = match one(self.input) { + Some(t) => t, + None => return None, + }; + + self.input = &self.input[l..]; + + let start = self.offset; + let end = start + l; + self.offset += l; + + t!("Returning token at offset {}: '{:?}'", + start, t); + + Some(Ok((start, t, end))) + } +} + +impl<'input> From<&'input str> for Lexer<'input> { + fn from(i: &'input str) -> Lexer<'input> { + Lexer::new(i) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn lexer() { + fn lex(s: &str, expected: &[Token]) { + let tokens: Vec = Lexer::new(s) + .map(|t| t.unwrap().1) + .collect(); + + assert_eq!(&tokens[..], expected, + "{}", s); + } + + use Token::*; + lex("|", &[ PIPE ]); + lex("*", &[ STAR ]); + lex("+", &[ PLUS ]); + lex("?", &[ QUESTION ]); + lex("(", &[ LPAREN ]); + lex(")", &[ RPAREN ]); + lex(".", &[ DOT ]); + lex("^", &[ CARET ]); + lex("$", &[ DOLLAR ]); + lex("\\", &[ BACKSLASH ]); + lex("[", &[ LBRACKET ]); + lex("]", &[ RBRACKET ]); + lex("-", &[ DASH ]); + lex("a", &[ OTHER('a') ]); + lex("aa", &[ OTHER('a'), OTHER('a') ]); + lex("foo", &[ OTHER('f'), OTHER('o'), OTHER('o') ]); + + lex("foo\\bar", &[ OTHER('f'), OTHER('o'), OTHER('o'), + BACKSLASH, + OTHER('b'), OTHER('a'), OTHER('r') ]); + lex("*?!", &[ STAR, QUESTION, OTHER('!') ]); + + // Multi-byte UTF-8. + lex("ΓŸβ„πŸ’£", &[ OTHER('ß'), OTHER('ℝ'), OTHER('πŸ’£'), ]); + lex("(ß|ℝ|πŸ’£", + &[ LPAREN, OTHER('ß'), PIPE, OTHER('ℝ'), PIPE, OTHER('πŸ’£') ]); + lex("東京", &[ OTHER('東'), OTHER('δΊ¬') ]); + } +} -- cgit v1.2.3