diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2018-09-05 09:43:56 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-09-05 09:43:56 +0900 |
commit | c64972e03966671c5e3983fe018758d20b9b32e9 (patch) | |
tree | 27e6afac35e7f637943c02bce062d14a4ae53fec | |
parent | b3b2421e8aaf523b97862942891ee5a01be33a03 (diff) |
Apply unicode lowercasing. (#408)
Checks if the str is ASCII, and uses a fast track if it is the case.
If not, it uses the std's definition of a lowercase character.
Closes #406
-rw-r--r-- | src/tokenizer/lower_caser.rs | 66 |
1 files changed, 60 insertions, 6 deletions
diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index ebade39..578678a 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use std::mem; /// Token filter that lowercase terms. #[derive(Clone)] @@ -15,13 +16,22 @@ where } } -pub struct LowerCaserTokenStream<TailTokenStream> -where - TailTokenStream: TokenStream, -{ +pub struct LowerCaserTokenStream<TailTokenStream> { + buffer: String, tail: TailTokenStream, } +// writes a lowercased version of text into output. +fn to_lowercase_unicode(text: &mut String, output: &mut String) { + output.clear(); + for c in text.chars() { + // Contrary to the std, we do not take care of the sigma special case. + // This will have a normalization effect, which is ok for search. + output.extend(c.to_lowercase()); + } +} + + impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream> where TailTokenStream: TokenStream, @@ -36,7 +46,14 @@ where fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().text.make_ascii_lowercase(); + if self.token_mut().text.is_ascii() { + // fast track for ascii. 
+ self.token_mut().text.make_ascii_lowercase(); + } else { + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + + mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + } true } else { false @@ -49,6 +66,43 @@ where TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> { - LowerCaserTokenStream { tail } + LowerCaserTokenStream { + tail, + buffer: String::with_capacity(100) + } } } + +#[cfg(test)] +mod tests { + use tokenizer::Tokenizer; + use tokenizer::LowerCaser; + use tokenizer::TokenStream; + use tokenizer::SimpleTokenizer; + + #[test] + fn test_to_lower_case() { + assert_eq!(lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()]); + } + + fn lowercase_helper(text: &str) -> Vec<String> { + let mut tokens = vec![]; + let mut token_stream = SimpleTokenizer + .filter(LowerCaser) + .token_stream(text); + while token_stream.advance() { + let token_text = token_stream.token().text.clone(); + tokens.push(token_text); + } + tokens + } + + + #[test] + fn test_lowercaser() { + assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); + assert_eq!(lowercase_helper("Русский"), vec!["русский".to_string()]); + } + +}
\ No newline at end of file |