author    Paul Masurel <paul.masurel@gmail.com>    2017-12-14 18:23:35 +0900
committer Paul Masurel <paul.masurel@gmail.com>    2017-12-14 18:23:35 +0900
commit    f24e5f405ec205b99989554a7cea48cb2f4b9d07 (patch)
tree      1508a82b18ff0112c338abd271e572c16bd87dd9 /src/tokenizer
parent    2589be3984a7fe71c38b7da07475a2481cdd4d27 (diff)
NOBUG intellij misc lint
Diffstat (limited to 'src/tokenizer')
-rw-r--r--  src/tokenizer/japanese_tokenizer.rs    6
-rw-r--r--  src/tokenizer/lower_caser.rs           2
-rw-r--r--  src/tokenizer/raw_tokenizer.rs         2
-rw-r--r--  src/tokenizer/remove_long.rs           6
-rw-r--r--  src/tokenizer/simple_tokenizer.rs      2
-rw-r--r--  src/tokenizer/stemmer.rs               4
-rw-r--r--  src/tokenizer/token_stream_chain.rs    4
-rw-r--r--  src/tokenizer/tokenizer_manager.rs    10
8 files changed, 20 insertions(+), 16 deletions(-)
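
Every hunk below makes the same mechanical change: redundant `field: field` struct initializers are replaced with Rust's field init shorthand, which the IntelliJ Rust lint flags. A minimal, self-contained sketch of the shorthand; the struct and function here are illustrative only, not the actual tantivy types:

    struct Token {
        offset_from: usize,
        offset_to: usize,
    }

    fn make_token(offset_from: usize, offset_to: usize) -> Token {
        // Before the lint fix, each field repeated the local variable name:
        //     Token { offset_from: offset_from, offset_to: offset_to }
        // With the field init shorthand, a local whose name matches the
        // field only needs to be written once:
        Token { offset_from, offset_to }
    }

    fn main() {
        let token = make_token(0, 5);
        assert_eq!((token.offset_from, token.offset_to), (0, 5));
    }
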
diff --git a/src/tokenizer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs
index c9981b2..3dfb3ad 100644
--- a/src/tokenizer/japanese_tokenizer.rs
+++ b/src/tokenizer/japanese_tokenizer.rs
@@ -30,15 +30,15 @@ impl<'a> Tokenizer<'a> for JapaneseTokenizer {
offset_to = offset_from + term.len();
if term.chars().all(char::is_alphanumeric) {
tokens.push(Token {
- offset_from: offset_from,
- offset_to: offset_to,
+ offset_from,
+ offset_to,
position: pos,
text: term,
});
}
}
JapaneseTokenizerStream {
- tokens: tokens,
+ tokens,
cursor: Cursor::HasNotStarted,
}
}
diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs
index b7357ee..a15d34b 100644
--- a/src/tokenizer/lower_caser.rs
+++ b/src/tokenizer/lower_caser.rs
@@ -48,6 +48,6 @@ where
TailTokenStream: TokenStream,
{
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
- LowerCaserTokenStream { tail: tail }
+ LowerCaserTokenStream { tail }
}
}
diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs
index fe36338..039ac6a 100644
--- a/src/tokenizer/raw_tokenizer.rs
+++ b/src/tokenizer/raw_tokenizer.rs
@@ -21,7 +21,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer {
text: text.to_string(),
};
RawTokenStream {
- token: token,
+ token,
has_token: true,
}
}
diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs
index 5637906..94d6b6c 100644
--- a/src/tokenizer/remove_long.rs
+++ b/src/tokenizer/remove_long.rs
@@ -14,7 +14,7 @@ pub struct RemoveLongFilter {
impl RemoveLongFilter {
// the limit is in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter {
- RemoveLongFilter { length_limit: length_limit }
+ RemoveLongFilter { length_limit }
}
}
@@ -31,8 +31,8 @@ where
tail: TailTokenStream,
) -> RemoveLongFilterStream<TailTokenStream> {
RemoveLongFilterStream {
- token_length_limit: token_length_limit,
- tail: tail,
+ token_length_limit,
+ tail,
}
}
}
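
The comment in the hunk above notes that the RemoveLongFilter limit is measured in bytes of the UTF-8 representation, not in characters, which matters for non-ASCII terms. A tiny standalone illustration in plain Rust; the limit value of 8 is hypothetical:

    fn main() {
        let term = "日本語";                  // three characters
        assert_eq!(term.chars().count(), 3);
        assert_eq!(term.len(), 9);            // but nine bytes of UTF-8
        // A byte limit of 8 would therefore drop this token even though
        // it is only three characters long.
    }
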
diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs
index e9d93de..8850c5f 100644
--- a/src/tokenizer/simple_tokenizer.rs
+++ b/src/tokenizer/simple_tokenizer.rs
@@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for SimpleTokenizer {
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
SimpleTokenStream {
- text: text,
+ text,
chars: text.char_indices(),
token: Token::default(),
}
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 1c349e0..9a8e7d1 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -67,8 +67,8 @@ where
tail: TailTokenStream,
) -> StemmerTokenStream<TailTokenStream> {
StemmerTokenStream {
- tail: tail,
- stemmer: stemmer,
+ tail,
+ stemmer,
}
}
}
diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs
index eaeccd4..4815936 100644
--- a/src/tokenizer/token_stream_chain.rs
+++ b/src/tokenizer/token_stream_chain.rs
@@ -18,9 +18,9 @@ where
token_streams: Vec<TTokenStream>,
) -> TokenStreamChain<TTokenStream> {
TokenStreamChain {
- offsets: offsets,
+ offsets,
stream_idx: 0,
- token_streams: token_streams,
+ token_streams,
position_shift: 0,
token: Token::default(),
}
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index 24f611a..54c28f1 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -17,9 +17,13 @@ use tokenizer::Stemmer;
///
/// By default, it is populated with the following managers.
///
-/// * raw : does not process nor tokenize the text.
-/// * default : Chops the text on according to whitespace and
-/// punctuation, removes tokens that are too long, lowercases
+/// * `raw` : does not process nor tokenize the text.
+/// * `default` : Chops the text on whitespace and
+/// punctuation, removes tokens that are too long, and lowercases
+/// tokens.
+/// * `en_stem` : Like `default`, but also applies stemming on the
+/// resulting tokens. Stemming can improve the recall of your
+/// search engine.
#[derive(Clone)]
pub struct TokenizerManager {
tokenizers: Arc<RwLock<HashMap<String, Box<BoxedTokenizer>>>>,
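
The doc comment added in this last hunk describes three pre-registered pipelines: `raw` (no processing), `default` (split on whitespace and punctuation, drop over-long tokens, lowercase), and `en_stem` (`default` plus stemming). The standalone sketch below mimics what the `default` pipeline is documented to do; it is not the tantivy implementation, and the 40-byte limit is an assumed value:

    fn default_pipeline(text: &str, byte_limit: usize) -> Vec<String> {
        text.split(|c: char| !c.is_alphanumeric())     // chop on whitespace and punctuation
            .filter(|token| !token.is_empty())
            .filter(|token| token.len() <= byte_limit) // remove tokens that are too long
            .map(str::to_lowercase)                    // lowercase what remains
            .collect()
    }

    fn main() {
        let tokens = default_pipeline("Hello, Tokenizer-World!", 40);
        assert_eq!(tokens, vec!["hello", "tokenizer", "world"]);
    }

`en_stem` would add one more step, mapping each token through an English stemmer, which is why it can improve recall: "search", "searched", and "searching" all collapse to the same indexed term.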