author    Paul Masurel <paul.masurel@gmail.com>    2019-04-24 20:59:48 +0900
committer GitHub <noreply@github.com>              2019-04-24 20:59:48 +0900
commit    96a4f503ecd8f138862bf0a22948a600faf2b120 (patch)
tree      86d3b555f43f9605af17b2ced7965bf0cce116b9
parent    9df288b0c922ef455e4120336b7f7eee7dcc939c (diff)
Closes #526 (#535)
-rw-r--r--    CHANGELOG.md                       2
-rw-r--r--    src/postings/mod.rs               52
-rw-r--r--    src/postings/postings_writer.rs    9
-rw-r--r--    src/tokenizer/mod.rs               9
4 files changed, 66 insertions, 6 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab7f8cc..068b291 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ Minor
 - Small simplification of the code.
 Calling .freq() or .doc() when .advance() has never been called
 on segment postings should panic from now on.
-
+- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
 
 Tantivy 0.9.0
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 3e1c034..edc9d6c 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -55,13 +55,15 @@ pub mod tests {
     use fieldnorm::FieldNormReader;
     use indexer::operation::AddOperation;
     use indexer::SegmentWriter;
+    use merge_policy::NoMergePolicy;
     use query::Scorer;
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
-    use schema::Field;
-    use schema::IndexRecordOption;
     use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
+    use schema::{Field, TextOptions};
+    use schema::{IndexRecordOption, TextFieldIndexing};
     use std::iter;
+    use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
     use DocId;
     use Score;
@@ -161,6 +163,52 @@ pub mod tests {
     }
 
     #[test]
+    pub fn test_drop_token_that_are_too_long() {
+        let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
+        let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
+        exceeding_token_text.push_str(" hello");
+        let mut schema_builder = Schema::builder();
+        let text_options = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+                .set_tokenizer("simple_no_truncation"),
+        );
+        let text_field = schema_builder.add_text_field("text", text_options);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema.clone());
+        index
+            .tokenizers()
+            .register("simple_no_truncation", SimpleTokenizer);
+        let reader = index.reader().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+        {
+            index_writer.add_document(doc!(text_field=>exceeding_token_text));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(0u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes, b"hello");
+        }
+        {
+            index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
+            index_writer.commit().unwrap();
+            reader.reload().unwrap();
+            let searcher = reader.searcher();
+            let segment_reader = searcher.segment_reader(1u32);
+            let inverted_index = segment_reader.inverted_index(text_field);
+            assert_eq!(inverted_index.terms().num_terms(), 1);
+            let mut bytes = vec![];
+            assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+            assert_eq!(&bytes[..], ok_token_text.as_bytes());
+        }
+    }
+
+    #[test]
     pub fn test_position_and_fieldnorm1() {
         let mut positions = Vec::new();
         let mut schema_builder = Schema::builder();
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 61bcc35..d263258 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -12,8 +12,8 @@
 use std::io;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use termdict::TermOrdinal;
-use tokenizer::Token;
 use tokenizer::TokenStream;
+use tokenizer::{Token, MAX_TOKEN_LEN};
 use DocId;
 use Result;
@@ -210,8 +210,11 @@ pub trait PostingsWriter {
     ) -> u32 {
         let mut term = Term::for_field(field);
         let mut sink = |token: &Token| {
-            term.set_text(token.text.as_str());
-            self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            // We skip tokens whose length (in bytes) exceeds MAX_TOKEN_LEN.
+            if token.text.len() <= MAX_TOKEN_LEN {
+                term.set_text(token.text.as_str());
+                self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            }
         };
         token_stream.process(&mut sink)
     }
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index e07116f..f9b72a6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -97,6 +97,8 @@
 //! If you built your schema programmatically, a complete example
 //! could look like this for instance.
 //!
+//! Note that tokens with a len greater than or equal to
+//! [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html) are dropped silently.
 //! # Example
 //!
 //! ```
@@ -157,6 +159,13 @@ pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer_manager::TokenizerManager;
 
+/// Maximum authorized len (in bytes) for a token.
+///
+/// Tokenizers are in charge of not emitting tokens larger than this value.
+/// Currently, if a faulty tokenizer implementation emits tokens with a length larger than
+/// `2^16 - 1 - 4`, the token will simply be ignored downstream.
+pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
+
 #[cfg(test)]
 pub mod tests {
     use super::{
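
Usage note (not part of the patch): after this change, an over-long token is silently skipped in the postings writer rather than causing a panic. Applications that want an explicit, configurable cap can instead enforce it inside the tokenizer chain with tantivy's RemoveLongFilter, so no token ever reaches the MAX_TOKEN_LEN cutoff. The sketch below shows one way to do that; the analyzer name "no_long_tokens", the 40-byte limit, and the field contents are illustrative assumptions, not values from this commit.

    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
    use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, Tokenizer};
    use tantivy::Index;

    fn main() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        // Point the field at a custom analyzer; the name is an illustrative choice.
        let text_options = TextOptions::default().set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("no_long_tokens")
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        );
        let text_field = schema_builder.add_text_field("text", text_options);
        let index = Index::create_in_ram(schema_builder.build());
        // RemoveLongFilter removes tokens longer than the given limit (40 bytes here,
        // an assumed value), so nothing over-long reaches the postings writer.
        index.tokenizers().register(
            "no_long_tokens",
            SimpleTokenizer.filter(RemoveLongFilter::limit(40)),
        );
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
        let mut document = tantivy::Document::default();
        document.add_text(text_field, "short tokens only");
        index_writer.add_document(document);
        index_writer.commit()?;
        Ok(())
    }

Filtering in the tokenizer makes the length policy visible in the schema setup, whereas the MAX_TOKEN_LEN check in this patch is a last-resort safety net for tokenizers that do not truncate (such as the bare SimpleTokenizer registered in the test above).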