author    Paul Masurel <paul.masurel@gmail.com>    2019-04-24 20:59:48 +0900
committer GitHub <noreply@github.com>              2019-04-24 20:59:48 +0900
commit    96a4f503ecd8f138862bf0a22948a600faf2b120 (patch)
tree      86d3b555f43f9605af17b2ced7965bf0cce116b9
parent    9df288b0c922ef455e4120336b7f7eee7dcc939c (diff)
Closes #526 (#535)
-rw-r--r--  CHANGELOG.md                      2
-rw-r--r--  src/postings/mod.rs              52
-rw-r--r--  src/postings/postings_writer.rs   9
-rw-r--r--  src/tokenizer/mod.rs              9
4 files changed, 66 insertions, 6 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab7f8cc..068b291 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,7 @@ Minor
- Small simplification of the code.
  Calling .freq() or .doc() when .advance() has never been called
  on segment postings should panic from now on.
-
+- Tokens exceeding `u16::max_value() - 4` bytes are discarded silently instead of panicking.
Tantivy 0.9.0
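For comparison, the usual way to stay clear of this limit is to drop overly long tokens at analysis time rather than rely on the silent guard in the postings writer. Below is a minimal sketch, not part of this commit, assuming the `RemoveLongFilter` and `LowerCaser` filters shipped in tantivy's tokenizer module; the analyzer name `short_tokens_only` and the 40-byte cutoff are illustrative choices.

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, Tokenizer};
use tantivy::Index;

fn main() {
    // A text field wired to a custom analyzer name.
    let mut schema_builder = Schema::builder();
    let text_options = TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("short_tokens_only")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
    );
    schema_builder.add_text_field("text", text_options);
    let index = Index::create_in_ram(schema_builder.build());

    // Drop anything longer than 40 bytes during analysis, far below the
    // u16::max_value() - 4 hard limit, so nothing is discarded silently later.
    index.tokenizers().register(
        "short_tokens_only",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser),
    );
}
```

Filtering in the analyzer chain makes the cutoff explicit in the index setup, whereas the guard introduced by this commit only acts as a safety net for tokenizers that misbehave.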
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 3e1c034..edc9d6c 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -55,13 +55,15 @@ pub mod tests {
use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation;
use indexer::SegmentWriter;
+ use merge_policy::NoMergePolicy;
use query::Scorer;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
- use schema::Field;
- use schema::IndexRecordOption;
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
+ use schema::{Field, TextOptions};
+ use schema::{IndexRecordOption, TextFieldIndexing};
use std::iter;
+ use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use DocId;
use Score;
@@ -161,6 +163,52 @@ pub mod tests {
}
#[test]
+ pub fn test_drop_token_that_are_too_long() {
+ let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
+ let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
+ exceeding_token_text.push_str(" hello");
+ let mut schema_builder = Schema::builder();
+ let text_options = TextOptions::default().set_indexing_options(
+ TextFieldIndexing::default()
+ .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+ .set_tokenizer("simple_no_truncation"),
+ );
+ let text_field = schema_builder.add_text_field("text", text_options);
+ let schema = schema_builder.build();
+ let index = Index::create_in_ram(schema.clone());
+ index
+ .tokenizers()
+ .register("simple_no_truncation", SimpleTokenizer);
+ let reader = index.reader().unwrap();
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.set_merge_policy(Box::new(NoMergePolicy));
+ {
+ index_writer.add_document(doc!(text_field=>exceeding_token_text));
+ index_writer.commit().unwrap();
+ reader.reload().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(0u32);
+ let inverted_index = segment_reader.inverted_index(text_field);
+ assert_eq!(inverted_index.terms().num_terms(), 1);
+ let mut bytes = vec![];
+ assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+ assert_eq!(&bytes, b"hello");
+ }
+ {
+ index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
+ index_writer.commit().unwrap();
+ reader.reload().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(1u32);
+ let inverted_index = segment_reader.inverted_index(text_field);
+ assert_eq!(inverted_index.terms().num_terms(), 1);
+ let mut bytes = vec![];
+ assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
+ assert_eq!(&bytes[..], ok_token_text.as_bytes());
+ }
+ }
+
+ #[test]
pub fn test_position_and_fieldnorm1() {
let mut positions = Vec::new();
let mut schema_builder = Schema::builder();
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 61bcc35..d263258 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -12,8 +12,8 @@ use std::io;
use std::marker::PhantomData;
use std::ops::DerefMut;
use termdict::TermOrdinal;
-use tokenizer::Token;
use tokenizer::TokenStream;
+use tokenizer::{Token, MAX_TOKEN_LEN};
use DocId;
use Result;
@@ -210,8 +210,11 @@ pub trait PostingsWriter {
) -> u32 {
let mut term = Term::for_field(field);
let mut sink = |token: &Token| {
- term.set_text(token.text.as_str());
- self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+ // We skip tokens whose length (in bytes) exceeds MAX_TOKEN_LEN (u16::max_value() - 4).
+ if token.text.len() <= MAX_TOKEN_LEN {
+ term.set_text(token.text.as_str());
+ self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+ }
};
token_stream.process(&mut sink)
}
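Outside the indexing pipeline, the guard added above boils down to a byte-length predicate. The following self-contained sketch mirrors that check; the constant is repeated locally for illustration rather than imported from tantivy.

```rust
/// Mirrors the guard in the sink closure above: a token is indexable only if
/// its UTF-8 byte length fits within the 2^16 - 1 - 4 budget.
const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;

fn is_indexable(token_text: &str) -> bool {
    // `len()` counts bytes, not chars, which is exactly what the postings writer compares.
    token_text.len() <= MAX_TOKEN_LEN
}

fn main() {
    assert!(is_indexable("hello"));
    // A token of exactly MAX_TOKEN_LEN bytes is still accepted...
    assert!(is_indexable(&"A".repeat(MAX_TOKEN_LEN)));
    // ...one byte more and it is silently skipped.
    assert!(!is_indexable(&"A".repeat(MAX_TOKEN_LEN + 1)));
}
```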
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index e07116f..f9b72a6 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -97,6 +97,8 @@
//! If you built your schema programmatically, a complete example
//! could look like this, for instance.
//!
+//! Note that tokens with a length (in bytes) greater than [`MAX_TOKEN_LEN`](./constant.MAX_TOKEN_LEN.html) are dropped silently.
+//!
//! # Example
//!
//! ```
@@ -157,6 +159,13 @@ pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
pub use self::tokenizer_manager::TokenizerManager;
+/// Maximum authorized length (in bytes) for a token.
+///
+/// Tokenizers are in charge of not emitting tokens larger than this value.
+/// Currently, if a faulty tokenizer implementation emits tokens with a length (in bytes) larger
+/// than `2^16 - 1 - 4`, the token will simply be ignored downstream.
+pub const MAX_TOKEN_LEN: usize = u16::max_value() as usize - 4;
+
#[cfg(test)]
pub mod tests {
use super::{
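Because the limit counts UTF-8 bytes rather than characters, tokens containing multi-byte characters reach it sooner than their character count suggests. A small standalone illustration of that distinction (not tied to the tantivy API):

```rust
fn main() {
    let token = "café";
    // `chars().count()` counts Unicode scalar values...
    assert_eq!(token.chars().count(), 4);
    // ...while `len()` counts UTF-8 bytes ('é' takes two), which is what
    // gets compared against MAX_TOKEN_LEN in the postings writer.
    assert_eq!(token.len(), 5);
}
```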