diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2018-09-11 09:45:27 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2018-09-11 09:45:27 +0900 |
commit | 63868733a38fa57fccf3d2e6e52ae1c5462a01ba (patch) | |
tree | 1f91f520ae4c84ec92c7893d630607cf2dbe2de6 /src | |
parent | 644d8a3a10bd6ad292360562e6c6a302bb709f78 (diff) |
Added SnippetGenerator
Diffstat (limited to 'src')
-rw-r--r-- | src/common/mod.rs | 21 | ||||
-rw-r--r-- | src/core/index.rs | 25 | ||||
-rw-r--r-- | src/core/searcher.rs | 1 | ||||
-rw-r--r-- | src/indexer/merger.rs | 10 | ||||
-rwxr-xr-x | src/lib.rs | 6 | ||||
-rw-r--r-- | src/query/query.rs | 2 | ||||
-rw-r--r-- | src/schema/schema.rs | 4 | ||||
-rw-r--r-- | src/schema/value.rs | 6 | ||||
-rw-r--r-- | src/snippet/mod.rs | 67 | ||||
-rw-r--r-- | src/store/mod.rs | 2 | ||||
-rw-r--r-- | src/tokenizer/mod.rs | 4 | ||||
-rw-r--r-- | src/tokenizer/tokenizer.rs | 2 | ||||
-rw-r--r-- | src/tokenizer/tokenizer_manager.rs | 2 |
13 files changed, 112 insertions, 40 deletions
diff --git a/src/common/mod.rs b/src/common/mod.rs index 778f047..2942438 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -68,17 +68,6 @@ pub trait HasLen { } } - -pub fn is_stricly_sorted<T: Ord>(els: &[T]) -> bool { - if els.is_empty() { - true - } else { - els.iter() - .zip(els[1..].iter()) - .all(|(left, right)| left < right) - } -} - const HIGHEST_BIT: u64 = 1 << 63; /// Maps a `i64` to `u64` @@ -116,20 +105,12 @@ pub fn u64_to_i64(val: u64) -> i64 { pub(crate) mod test { pub use super::serialize::test::fixed_size_test; - use super::{compute_num_bits, i64_to_u64, u64_to_i64, is_stricly_sorted}; + use super::{compute_num_bits, i64_to_u64, u64_to_i64}; fn test_i64_converter_helper(val: i64) { assert_eq!(u64_to_i64(i64_to_u64(val)), val); } - - #[test] - fn test_is_strictly_sorted() { - assert!(is_stricly_sorted::<u32>(&[])); - assert!(is_stricly_sorted(&[1])); - assert!(is_stricly_sorted(&[1, 2, 3])); - assert!(!is_stricly_sorted(&[1, 3, 2])); - } #[test] fn test_i64_converter() { assert_eq!(i64_to_u64(i64::min_value()), u64::min_value()); diff --git a/src/core/index.rs b/src/core/index.rs index f0df65b..da17449 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -115,6 +115,8 @@ impl Index { &self.tokenizers } + + /// Helper to access the tokenizer associated to a specific field. pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> { let field_entry = self.schema.get_field_entry(field); let field_type = field_entry.field_type(); @@ -325,3 +327,26 @@ impl Clone for Index { } } } + + +#[cfg(test)] +mod tests { + use Index; + use schema::{SchemaBuilder, TEXT, INT_INDEXED}; + + #[test] + fn test_indexer_for_field() { + let mut schema_builder = SchemaBuilder::default(); + let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); + let body_field = schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + assert!(index.tokenizer_for_field(body_field).is_ok()); + assert_eq!( + format!("{:?}", index.tokenizer_for_field(num_likes_field).err()), + "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))" + ); + } + + +}
\ No newline at end of file diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 9de6c85..f17df04 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -33,6 +33,7 @@ impl Searcher { } } + /// Returns the `Index` associated to the `Searcher` pub fn index(&self) -> &Index { &self.index } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index e79551a..5d2e17c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -770,23 +770,23 @@ mod tests { } { let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 1)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { let doc = searcher.doc(&DocAddress(0, 2)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { let doc = searcher.doc(&DocAddress(0, 3)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { let doc = searcher.doc(&DocAddress(0, 4)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g"); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { let get_fast_vals = |terms: Vec<Term>| { @@ -899,11 +899,11 @@ mod tests { assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); - assert_eq!(values[0].text(), "tantivy"); - assert_eq!(values[1].text(), "some other value"); + assert_eq!(values[0].text(), Some("tantivy")); + assert_eq!(values[1].text(), Some("some other value")); let values = document.get_all(other_text_field); assert_eq!(values.len(), 1); - assert_eq!(values[0].text(), "short"); + assert_eq!(values[0].text(), Some("short")); } #[test] diff --git a/src/query/query.rs b/src/query/query.rs index 9bf139b..6abbf35 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -60,6 +60,8 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + /// Extract all of the terms associated to the query and insert them in the + /// term set given in arguments. fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {} /// Search works as follows : diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 6d4f6c9..d000ab9 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -443,8 +443,8 @@ mod tests { }"#, ) .unwrap(); - assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); - assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); + assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); + assert_eq!(doc.get_first(author_field).unwrap().text(), Some("fulmicoton")); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } diff --git a/src/schema/value.rs b/src/schema/value.rs index f5ce151..64b0dc7 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -74,10 +74,10 @@ impl Value { /// /// # Panics /// If the value is not of type `Str` - pub fn text(&self) -> &str { + pub fn text(&self) -> Option<&str> { match *self { - Value::Str(ref text) => text, - _ => panic!("This is not a text field."), + Value::Str(ref text) => Some(text), + _ => None, } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index 9842cdd..6703d64 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -7,6 +7,9 @@ use Searcher; use schema::Field; use std::collections::BTreeSet; use tokenizer::BoxedTokenizer; +use Document; + +const DEFAULT_MAX_NUM_CHARS: usize = 150; #[derive(Debug)] pub struct HighlightSection { @@ -189,16 +192,58 @@ fn select_best_fragment_combination<'a>( } } - -const DEFAULT_MAX_NUM_CHARS: usize = 150; - +/// `SnippetGenerator` +/// +/// # Example +/// +/// ```rust +/// # #[macro_use] +/// # extern crate tantivy; +/// # use tantivy::Index; +/// # use tantivy::schema::{SchemaBuilder, TEXT}; +/// # use tantivy::query::QueryParser; +/// use tantivy::SnippetGenerator; +/// +/// # fn main() -> tantivy::Result<()> { +/// # let mut schema_builder = SchemaBuilder::default(); +/// # let text_field = schema_builder.add_text_field("text", TEXT); +/// # let schema = schema_builder.build(); +/// # let index = Index::create_in_ram(schema); +/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; +/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, +/// # Je ne me sentis plus guidé par les haleurs : +/// # Des Peaux-Rouges criards les avaient pris pour cibles, +/// # Les ayant cloués nus aux poteaux de couleurs. +/// # +/// # J'étais insoucieux de tous les équipages, +/// # Porteur de blés flamands ou de cotons anglais. +/// # Quand avec mes haleurs ont fini ces tapages, +/// # Les Fleuves m'ont laissé descendre où je voulais. +/// # "#); +/// # index_writer.add_document(doc.clone()); +/// # index_writer.commit()?; +/// # let query_parser = QueryParser::for_index(&index, vec![text_field]); +/// // ... +/// let query = query_parser.parse_query("haleurs flamands").unwrap(); +/// # index.load_searchers()?; +/// # let searcher = index.searcher(); +/// let mut snippet_generator = SnippetGenerator::new(&*searcher, &*query, text_field)?; +/// snippet_generator.set_max_num_chars(100); +/// let snippet = snippet_generator.snippet_from_doc(&doc); +/// let snippet_html: String = snippet.to_html(); +/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les <b>haleurs</b> :\n Des"); +/// # Ok(()) +/// # } +/// ``` pub struct SnippetGenerator { terms_text: BTreeMap<String, f32>, tokenizer: Box<BoxedTokenizer>, + field: Field, max_num_chars: usize } impl SnippetGenerator { + /// Creates a new snippet generator pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result<SnippetGenerator> { @@ -212,14 +257,30 @@ impl SnippetGenerator { Ok(SnippetGenerator { terms_text, tokenizer, + field, max_num_chars: DEFAULT_MAX_NUM_CHARS }) } + /// Sets a maximum number of chars. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; } + /// Generates a snippet for the given `Document`. + /// + /// This method extract the text associated to the `SnippetGenerator`'s field + /// and computes a snippet. + pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { + let text: String = doc.get_all(self.field) + .into_iter() + .flat_map(|val| val.text()) + .collect::<Vec<&str>>() + .join(" "); + self.snippet(&text) + } + + /// Generates a snippet for the given text. pub fn snippet(&self, text: &str) -> Snippet { let fragment_candidates = search_fragments(&*self.tokenizer, &text, diff --git a/src/store/mod.rs b/src/store/mod.rs index 5d71563..7bce908 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -109,7 +109,7 @@ pub mod tests { let store = StoreReader::from_source(store_source); for i in 0..1_000 { assert_eq!( - *store.get(i).unwrap().get_first(field_title).unwrap().text(), + *store.get(i).unwrap().get_first(field_title).unwrap().text().unwrap(), format!("Doc {}", i) ); } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index d4a735b..e8bb352 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -153,7 +153,9 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; -pub use self::tokenizer::{BoxedTokenizer, box_tokenizer}; +pub use self::tokenizer::BoxedTokenizer; +pub(crate) use self::tokenizer::box_tokenizer; + pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index e806b70..fcdf8f2 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -130,7 +130,7 @@ where } } -pub fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer> +pub(crate) fn box_tokenizer<A>(a: A) -> Box<BoxedTokenizer> where A: 'static + Send + Sync + for<'a> Tokenizer<'a>, { diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af..447dea3 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use tokenizer::tokenizer::box_tokenizer; +use tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; |