author    | Paul Masurel <paul.masurel@gmail.com> | 2018-05-07 19:50:35 -0700
committer | Paul Masurel <paul.masurel@gmail.com> | 2018-05-07 19:50:35 -0700
commit    | 9a0b7f9855f34eb772358e0cc1d00b4fa937c781 (patch)
tree      | bb3c31aa5a7e39dc6a81d6aaf987e7572563d731
parent    | 8e343b1ca3915b54134214c60b45d47effce60d6 (diff)
Rustfmt
-rw-r--r-- | examples/custom_tokenizer.rs        | 378
-rw-r--r-- | src/collector/mod.rs                |   2
-rw-r--r-- | src/compression/mod.rs              |   6
-rw-r--r-- | src/core/segment_reader.rs          |   4
-rw-r--r-- | src/datastruct/stacker/hashmap.rs   |   8
-rw-r--r-- | src/fastfield/bytes/mod.rs          |   2
-rw-r--r-- | src/fastfield/bytes/reader.rs       |   7
-rw-r--r-- | src/fastfield/bytes/writer.rs       |  13
-rw-r--r-- | src/fastfield/mod.rs                |   2
-rw-r--r-- | src/fastfield/multivalued/writer.rs |   2
-rw-r--r-- | src/fastfield/serializer.rs         |   5
-rw-r--r-- | src/fastfield/writer.rs             |   9
-rw-r--r-- | src/fieldnorm/writer.rs             |   3
-rw-r--r-- | src/indexer/merger.rs               | 286
-rw-r--r-- | src/postings/postings_writer.rs     |  20
-rw-r--r-- | src/postings/serializer.rs          |  11
-rw-r--r-- | src/query/all_query.rs              |  25
-rw-r--r-- | src/schema/field_entry.rs           |   2
-rw-r--r-- | src/schema/field_type.rs            |  25
-rw-r--r-- | src/schema/term.rs                  |   2
-rw-r--r-- | src/termdict/merger.rs              |  19
-rw-r--r-- | src/termdict/mod.rs                 |   1
-rw-r--r-- | src/termdict/term_info_store.rs     |   2
-rw-r--r-- | src/tokenizer/ngram_tokenizer.rs    | 176
-rw-r--r-- | src/tokenizer/tokenizer.rs          |  24

25 files changed, 514 insertions(+), 520 deletions(-)
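The diff below is a mechanical rustfmt pass: imports are sorted, signatures that fit on one line are collapsed, trailing commas are added to multi-line argument lists, stray blank lines are dropped, and missing final newlines are fixed. As a tiny self-contained illustration of the style rustfmt converges on (hypothetical code, not taken from this commit):

```rust
// Hypothetical before/after mirroring the rewrites in the hunks below.
//
// Before:
//     pub fn serialize(
//         &self,
//         serializer: &mut FastFieldSerializer
//     ) -> io::Result<()> { ... }
//
// After rustfmt, a signature that fits within the line width is collapsed:
//     pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> { ... }

fn add(a: i32, b: i32) -> i32 {
    a + b
}

fn main() {
    // Argument lists that stay multi-line get a trailing comma after the
    // last argument, matching e.g. the `panic!(...)` rewrite in bytes/writer.rs.
    let sum = add(
        40,
        2,
    );
    println!("{}", sum);
}
```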
```diff
diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs
index 4f498df..9ddd17e 100644
--- a/examples/custom_tokenizer.rs
+++ b/examples/custom_tokenizer.rs
@@ -13,99 +13,99 @@ use tantivy::Index;
 use tempdir::TempDir;
 
 fn main() {
-    // Let's create a temporary directory for the
-    // sake of this example
-    if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
-        run_example(dir.path()).unwrap();
-        dir.close().unwrap();
-    }
+    // Let's create a temporary directory for the
+    // sake of this example
+    if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
+        run_example(dir.path()).unwrap();
+        dir.close().unwrap();
+    }
 }
 
 fn run_example(index_path: &Path) -> tantivy::Result<()> {
-    // # Defining the schema
-    //
-    // The Tantivy index requires a very strict schema.
-    // The schema declares which fields are in the index,
-    // and for each field, its type and "the way it should
-    // be indexed".
-
-    // first we need to define a schema ...
-    let mut schema_builder = SchemaBuilder::default();
-
-    // Our first field is title.
-    // In this example we want to use NGram searching
-    // we will set that to 3 characters, so any three
-    // char in the title should be findable.
-    let text_field_indexing = TextFieldIndexing::default()
-        .set_tokenizer("ngram3")
-        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
-    let text_options = TextOptions::default()
-        .set_indexing_options(text_field_indexing)
-        .set_stored();
-    schema_builder.add_text_field("title", text_options);
-
-    // Our second field is body.
-    // We want full-text search for it, but we do not
-    // need to be able to be able to retrieve it
-    // for our application.
-    //
-    // We can make our index lighter and
-    // by omitting `STORED` flag.
-    schema_builder.add_text_field("body", TEXT);
-
-    let schema = schema_builder.build();
-
-    // # Indexing documents
-    //
-    // Let's create a brand new index.
-    //
-    // This will actually just save a meta.json
-    // with our schema in the directory.
-    let index = Index::create(index_path, schema.clone())?;
-
-    // here we are registering our custome tokenizer
-    // this will store tokens of 3 characters each
-    index
-        .tokenizers()
-        .register("ngram3", NgramTokenizer::new(3, 3, false));
-
-    // To insert document we need an index writer.
-    // There must be only one writer at a time.
-    // This single `IndexWriter` is already
-    // multithreaded.
-    //
-    // Here we use a buffer of 50MB per thread. Using a bigger
-    // heap for the indexer can increase its throughput.
-    let mut index_writer = index.writer(50_000_000)?;
-
-    // Let's index our documents!
-    // We first need a handle on the title and the body field.
-
-    // ### Create a document "manually".
-    //
-    // We can create a document manually, by setting the fields
-    // one by one in a Document object.
-    let title = schema.get_field("title").unwrap();
-    let body = schema.get_field("body").unwrap();
-
-    let mut old_man_doc = Document::default();
-    old_man_doc.add_text(title, "The Old Man and the Sea");
-    old_man_doc.add_text(
-        body,
-        "He was an old man who fished alone in a skiff in the Gulf Stream and \
-         he had gone eighty-four days now without taking a fish.",
-    );
-
-    // ... and add it to the `IndexWriter`.
-    index_writer.add_document(old_man_doc);
-
-    // ### Create a document directly from json.
-    //
-    // Alternatively, we can use our schema to parse a
-    // document object directly from json.
-    // The document is a string, but we use the `json` macro
-    // from `serde_json` for the convenience of multi-line support.
-    let json = json!({
+    // # Defining the schema
+    //
+    // The Tantivy index requires a very strict schema.
+    // The schema declares which fields are in the index,
+    // and for each field, its type and "the way it should
+    // be indexed".
+
+    // first we need to define a schema ...
+    let mut schema_builder = SchemaBuilder::default();
+
+    // Our first field is title.
+    // In this example we want to use NGram searching
+    // we will set that to 3 characters, so any three
+    // char in the title should be findable.
+    let text_field_indexing = TextFieldIndexing::default()
+        .set_tokenizer("ngram3")
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let text_options = TextOptions::default()
+        .set_indexing_options(text_field_indexing)
+        .set_stored();
+    schema_builder.add_text_field("title", text_options);
+
+    // Our second field is body.
+    // We want full-text search for it, but we do not
+    // need to be able to be able to retrieve it
+    // for our application.
+    //
+    // We can make our index lighter and
+    // by omitting `STORED` flag.
+    schema_builder.add_text_field("body", TEXT);
+
+    let schema = schema_builder.build();
+
+    // # Indexing documents
+    //
+    // Let's create a brand new index.
+    //
+    // This will actually just save a meta.json
+    // with our schema in the directory.
+    let index = Index::create(index_path, schema.clone())?;
+
+    // here we are registering our custome tokenizer
+    // this will store tokens of 3 characters each
+    index
+        .tokenizers()
+        .register("ngram3", NgramTokenizer::new(3, 3, false));
+
+    // To insert document we need an index writer.
+    // There must be only one writer at a time.
+    // This single `IndexWriter` is already
+    // multithreaded.
+    //
+    // Here we use a buffer of 50MB per thread. Using a bigger
+    // heap for the indexer can increase its throughput.
+    let mut index_writer = index.writer(50_000_000)?;
+
+    // Let's index our documents!
+    // We first need a handle on the title and the body field.
+
+    // ### Create a document "manually".
+    //
+    // We can create a document manually, by setting the fields
+    // one by one in a Document object.
+    let title = schema.get_field("title").unwrap();
+    let body = schema.get_field("body").unwrap();
+
+    let mut old_man_doc = Document::default();
+    old_man_doc.add_text(title, "The Old Man and the Sea");
+    old_man_doc.add_text(
+        body,
+        "He was an old man who fished alone in a skiff in the Gulf Stream and \
+         he had gone eighty-four days now without taking a fish.",
+    );
+
+    // ... and add it to the `IndexWriter`.
+    index_writer.add_document(old_man_doc);
+
+    // ### Create a document directly from json.
+    //
+    // Alternatively, we can use our schema to parse a
+    // document object directly from json.
+    // The document is a string, but we use the `json` macro
+    // from `serde_json` for the convenience of multi-line support.
+    let json = json!({
 "title": "Of Mice and Men",
 "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
         bank and runs deep and green. The water is warm too, for it has slipped twinkling \
@@ -116,111 +116,111 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
 });
-    let mice_and_men_doc = schema.parse_document(&json.to_string())?;
+    let mice_and_men_doc = schema.parse_document(&json.to_string())?;
 
-    index_writer.add_document(mice_and_men_doc);
+    index_writer.add_document(mice_and_men_doc);
 
-    // Multi-valued field are allowed, they are
-    // expressed in JSON by an array.
-    // The following document has two titles.
-    let json = json!({
+    // Multi-valued field are allowed, they are
+    // expressed in JSON by an array.
+    // The following document has two titles.
+    let json = json!({
 "title": ["Frankenstein", "The Modern Prometheus"],
 "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
         enterprise which you have regarded with such evil forebodings. I arrived here \
         yesterday, and my first task is to assure my dear sister of my welfare and \
         increasing confidence in the success of my undertaking."
 });
-    let frankenstein_doc = schema.parse_document(&json.to_string())?;
-
-    index_writer.add_document(frankenstein_doc);
-
-    // This is an example, so we will only index 3 documents
-    // here. You can check out tantivy's tutorial to index
-    // the English wikipedia. Tantivy's indexing is rather fast.
-    // Indexing 5 million articles of the English wikipedia takes
-    // around 4 minutes on my computer!
-
-    // ### Committing
-    //
-    // At this point our documents are not searchable.
-    //
-    //
-    // We need to call .commit() explicitly to force the
-    // index_writer to finish processing the documents in the queue,
-    // flush the current index to the disk, and advertise
-    // the existence of new documents.
-    //
-    // This call is blocking.
-    index_writer.commit()?;
-
-    // If `.commit()` returns correctly, then all of the
-    // documents that have been added are guaranteed to be
-    // persistently indexed.
-    //
-    // In the scenario of a crash or a power failure,
-    // tantivy behaves as if has rolled back to its last
-    // commit.
-
-    // # Searching
-    //
-    // Let's search our index. Start by reloading
-    // searchers in the index. This should be done
-    // after every commit().
-    index.load_searchers()?;
-
-    // Afterwards create one (or more) searchers.
-    //
-    // You should create a searcher
-    // every time you start a "search query".
-    let searcher = index.searcher();
-
-    // The query parser can interpret human queries.
-    // Here, if the user does not specify which
-    // field they want to search, tantivy will search
-    // in both title and body.
-    let query_parser = QueryParser::for_index(&index, vec![title, body]);
-
-    // here we want to get a hit on the 'ken' in Frankenstein
-    let query = query_parser.parse_query("ken")?;
-
-    // A query defines a set of documents, as
-    // well as the way they should be scored.
-    //
-    // A query created by the query parser is scored according
-    // to a metric called Tf-Idf, and will consider
-    // any document matching at least one of our terms.
-
-    // ### Collectors
-    //
-    // We are not interested in all of the documents but
-    // only in the top 10. Keeping track of our top 10 best documents
-    // is the role of the TopCollector.
-    let mut top_collector = TopCollector::with_limit(10);
-
-    // We can now perform our query.
-    searcher.search(&*query, &mut top_collector)?;
-
-    // Our top collector now contains the 10
-    // most relevant doc ids...
-    let doc_addresses = top_collector.docs();
-
-    // The actual documents still need to be
-    // retrieved from Tantivy's store.
-    //
-    // Since the body field was not configured as stored,
-    // the document returned will only contain
-    // a title.
-
-    for doc_address in doc_addresses {
-        let retrieved_doc = searcher.doc(&doc_address)?;
-        println!("{}", schema.to_json(&retrieved_doc));
-    }
-
-    // Wait for indexing and merging threads to shut down.
-    // Usually this isn't needed, but in `main` we try to
-    // delete the temporary directory and that fails on
-    // Windows if the files are still open.
-    index_writer.wait_merging_threads()?;
-
-    Ok(())
+    let frankenstein_doc = schema.parse_document(&json.to_string())?;
+
+    index_writer.add_document(frankenstein_doc);
+
+    // This is an example, so we will only index 3 documents
+    // here. You can check out tantivy's tutorial to index
+    // the English wikipedia. Tantivy's indexing is rather fast.
+    // Indexing 5 million articles of the English wikipedia takes
+    // around 4 minutes on my computer!
+
+    // ### Committing
+    //
+    // At this point our documents are not searchable.
+    //
+    //
+    // We need to call .commit() explicitly to force the
+    // index_writer to finish processing the documents in the queue,
+    // flush the current index to the disk, and advertise
+    // the existence of new documents.
+    //
+    // This call is blocking.
+    index_writer.commit()?;
+
+    // If `.commit()` returns correctly, then all of the
+    // documents that have been added are guaranteed to be
+    // persistently indexed.
+    //
+    // In the scenario of a crash or a power failure,
+    // tantivy behaves as if has rolled back to its last
+    // commit.
+
+    // # Searching
+    //
+    // Let's search our index. Start by reloading
+    // searchers in the index. This should be done
+    // after every commit().
+    index.load_searchers()?;
+
+    // Afterwards create one (or more) searchers.
+    //
+    // You should create a searcher
+    // every time you start a "search query".
+    let searcher = index.searcher();
+
+    // The query parser can interpret human queries.
+    // Here, if the user does not specify which
+    // field they want to search, tantivy will search
+    // in both title and body.
+    let query_parser = QueryParser::for_index(&index, vec![title, body]);
+
+    // here we want to get a hit on the 'ken' in Frankenstein
+    let query = query_parser.parse_query("ken")?;
+
+    // A query defines a set of documents, as
+    // well as the way they should be scored.
+    //
+    // A query created by the query parser is scored according
+    // to a metric called Tf-Idf, and will consider
+    // any document matching at least one of our terms.
+
+    // ### Collectors
+    //
+    // We are not interested in all of the documents but
+    // only in the top 10. Keeping track of our top 10 best documents
+    // is the role of the TopCollector.
+    let mut top_collector = TopCollector::with_limit(10);
+
+    // We can now perform our query.
+    searcher.search(&*query, &mut top_collector)?;
+
+    // Our top collector now contains the 10
+    // most relevant doc ids...
+    let doc_addresses = top_collector.docs();
+
+    // The actual documents still need to be
+    // retrieved from Tantivy's store.
+    //
+    // Since the body field was not configured as stored,
+    // the document returned will only contain
+    // a title.
+
+    for doc_address in doc_addresses {
+        let retrieved_doc = searcher.doc(&doc_address)?;
+        println!("{}", schema.to_json(&retrieved_doc));
+    }
+
+    // Wait for indexing and merging threads to shut down.
+    // Usually this isn't needed, but in `main` we try to
+    // delete the temporary directory and that fails on
+    // Windows if the files are still open.
+    index_writer.wait_merging_threads()?;
+
+    Ok(())
 }
diff --git a/src/collector/mod.rs b/src/collector/mod.rs
index 9c0d3ac..17ca931 100644
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -90,12 +90,12 @@ pub mod tests {
 
     use super::*;
     use core::SegmentReader;
+    use fastfield::BytesFastFieldReader;
    use fastfield::FastFieldReader;
     use schema::Field;
     use DocId;
     use Score;
     use SegmentLocalId;
-    use fastfield::BytesFastFieldReader;
 
     /// Stores all of the doc ids.
     /// This collector is only used for tests.
diff --git a/src/compression/mod.rs b/src/compression/mod.rs
index 5261fea..0e6a189 100644
--- a/src/compression/mod.rs
+++ b/src/compression/mod.rs
@@ -274,10 +274,10 @@ pub mod tests {
 mod bench {
 
     use super::*;
-    use test::Bencher;
-    use rand::XorShiftRng;
-    use rand::SeedableRng;
     use rand::Rng;
+    use rand::SeedableRng;
+    use rand::XorShiftRng;
+    use test::Bencher;
 
     fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
         let seed: &[u32; 4] = &[1, 2, 3, seed_val];
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 6c9d331..5482c8a 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -122,7 +122,7 @@ impl SegmentReader {
     pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
         &self,
         field: Field,
-        idx: usize
+        idx: usize,
     ) -> fastfield::Result<FastFieldReader<Item>> {
         if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
             Ok(FastFieldReader::open(ff_source))
@@ -153,7 +153,7 @@ impl SegmentReader {
     pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
         let field_entry = self.schema.get_field_entry(field);
         match field_entry.field_type() {
-            &FieldType::Bytes => {},
+            &FieldType::Bytes => {}
             _ => return Err(FastFieldNotAvailableError::new(field_entry)),
         }
         let idx_reader = self.fast_fields_composite
diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs
index 36c0b13..f872f31 100644
--- a/src/datastruct/stacker/hashmap.rs
+++ b/src/datastruct/stacker/hashmap.rs
@@ -4,7 +4,6 @@ use std::iter;
 use std::mem;
 use std::slice;
 
-
 mod murmurhash2 {
 
     const SEED: u32 = 3_242_157_231u32;
@@ -12,7 +11,6 @@ mod murmurhash2 {
 
     #[inline(always)]
     pub fn murmurhash2(key: &[u8]) -> u32 {
-
         let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
         let len = key.len() as u32;
         let mut h: u32 = SEED ^ len;
@@ -54,7 +52,6 @@ mod murmurhash2 {
     }
 }
 
-
 /// Split the thread memory budget into
 /// - the heap size
 /// - the hash table "table" itself.
@@ -217,14 +214,14 @@ impl<'a> TermHashMap<'a> {
     }
 }
 
-#[cfg(all(test, feature="unstable"))]
+#[cfg(all(test, feature = "unstable"))]
 mod bench {
     use super::murmurhash2::murmurhash2;
     use test::Bencher;
 
     #[bench]
     fn bench_murmurhash2(b: &mut Bencher) {
-        let keys: [&'static str; 3]= ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
+        let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
         b.iter(|| {
             let mut s = 0;
             for &key in &keys {
@@ -324,7 +321,6 @@ mod tests {
         assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
     }
 
-
     #[test]
     fn test_murmur_collisions() {
         let mut set: HashSet<u32> = HashSet::default();
diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs
index add4e6b..48c021f 100644
--- a/src/fastfield/bytes/mod.rs
+++ b/src/fastfield/bytes/mod.rs
@@ -35,4 +35,4 @@ mod tests {
         let long = vec![0u8; 1000];
         assert_eq!(bytes_reader.get_val(4), long.as_slice());
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastfield/bytes/reader.rs b/src/fastfield/bytes/reader.rs
index 8db65e7..9e4c879 100644
--- a/src/fastfield/bytes/reader.rs
+++ b/src/fastfield/bytes/reader.rs
@@ -25,10 +25,7 @@ impl BytesFastFieldReader {
         values_source: ReadOnlySource,
     ) -> BytesFastFieldReader {
         let values = OwningRef::new(values_source).map(|source| &source[..]);
-        BytesFastFieldReader {
-            idx_reader,
-            values,
-        }
+        BytesFastFieldReader { idx_reader, values }
     }
 
     /// Returns the bytes associated to the given `doc`
@@ -37,4 +34,4 @@ impl BytesFastFieldReader {
         let stop = self.idx_reader.get(doc + 1) as usize;
         &self.values[start..stop]
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs
index 16aa330..568a542 100644
--- a/src/fastfield/bytes/writer.rs
+++ b/src/fastfield/bytes/writer.rs
@@ -1,4 +1,3 @@
-
 use std::io;
 
 use fastfield::serializer::FastFieldSerializer;
@@ -55,7 +54,10 @@ impl BytesFastFieldWriter {
         if let &Value::Bytes(ref bytes) = field_value.value() {
             self.vals.extend_from_slice(bytes);
         } else {
-            panic!("Bytes field contained non-Bytes Value!. Field {:?} = {:?}", self.field, field_value);
+            panic!(
+                "Bytes field contained non-Bytes Value!. Field {:?} = {:?}",
+                self.field, field_value
+            );
         }
     }
 }
@@ -73,10 +75,7 @@ impl BytesFastFieldWriter {
     }
 
     /// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
-    pub fn serialize(
-        &self,
-        serializer: &mut FastFieldSerializer
-    ) -> io::Result<()> {
+    pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
         {
             // writing the offset index
             let mut doc_index_serializer =
@@ -94,4 +93,4 @@
         }
         Ok(())
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
index ba14c52..265d325 100644
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -23,6 +23,7 @@ values stored.
 Read access performance is comparable to that of an array lookup.
 */
 
+pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
 pub use self::delete::write_delete_bitset;
 pub use self::delete::DeleteBitSet;
 pub use self::error::{FastFieldNotAvailableError, Result};
@@ -31,7 +32,6 @@
 pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
 pub use self::reader::FastFieldReader;
 pub use self::serializer::FastFieldSerializer;
 pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
-pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
 use common;
 use schema::Cardinality;
 use schema::FieldType;
diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs
index 9991980..b872fa1 100644
--- a/src/fastfield/multivalued/writer.rs
+++ b/src/fastfield/multivalued/writer.rs
@@ -3,10 +3,10 @@ use fastfield::value_to_u64;
 use fastfield::FastFieldSerializer;
 use itertools::Itertools;
 use postings::UnorderedTermId;
-use termdict::TermOrdinal;
 use schema::{Document, Field};
 use std::collections::HashMap;
 use std::io;
+use termdict::TermOrdinal;
 use DocId;
 
 /// Writer for multi-valued (as in, more than one value per document)
diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs
index 287fd76..ef781c2 100644
--- a/src/fastfield/serializer.rs
+++ b/src/fastfield/serializer.rs
@@ -65,7 +65,7 @@ impl FastFieldSerializer {
     pub fn new_bytes_fast_field_with_idx(
         &mut self,
         field: Field,
-        idx: usize
+        idx: usize,
     ) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
         let field_write = self.composite_write.for_field_with_idx(field, idx);
         FastBytesFieldSerializer::open(field_write)
@@ -87,7 +87,6 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
 }
 
 impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
-
     /// Creates a new fast field serializer.
     ///
     /// The serializer in fact encode the values by bitpacking
@@ -144,4 +143,4 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
     pub fn flush(&mut self) -> io::Result<()> {
         self.write.flush()
     }
-}
\ No newline at end of file
+}
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index c7aa081..1c3d11e 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -1,12 +1,12 @@
+use super::multivalued::MultiValueIntFastFieldWriter;
 use common;
 use common::BinarySerializable;
 use common::VInt;
 use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
 use postings::UnorderedTermId;
-use schema::{FieldType, Cardinality, Document, Field, Schema};
+use schema::{Cardinality, Document, Field, FieldType, Schema};
 use std::collections::HashMap;
 use std::io;
-use super::multivalued::MultiValueIntFastFieldWriter;
 use termdict::TermOrdinal;
 
 /// The fastfieldswriter regroup all of the fast field writers.
@@ -89,10 +89,7 @@ impl FastFieldsWriter {
     ///
     /// Returns None if the field does not exist, or is not
     /// configured as a bytes fastfield in the schema.
-    pub fn get_bytes_writer(
-        &mut self,
-        field: Field,
-    ) -> Option<&mut BytesFastFieldWriter> {
+    pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
         // TODO optimize
         self.bytes_value_writers
             .iter_mut()
diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs
index e0b21fe..c36ea99 100644
--- a/src/fieldnorm/writer.rs
+++ b/src/fieldnorm/writer.rs
@@ -17,8 +17,7 @@ impl FieldNormsWriter {
             .fields()
             .iter()
             .enumerate()
-            .filter(|&(_, field_entry)|
-                field_entry.is_indexed())
+            .filter(|&(_, field_entry)| field_entry.is_indexed())
             .map(|(field, _)| Field(field as u32))
             .collect::<Vec<Field>>()
     }
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index 5d58e50..0e59434 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -6,6 +6,7 @@ use error::Result;
 use fastfield::DeleteBitSet;
 use fastfield::FastFieldReader;
 use fastfield::FastFieldSerializer;
+use fastfield::MultiValueIntFastFieldReader;
 use fieldnorm::FieldNormReader;
 use fieldnorm::FieldNormsSerializer;
 use fieldnorm::FieldNormsWriter;
@@ -13,16 +14,15 @@ use indexer::SegmentSerializer;
 use itertools::Itertools;
 use postings::InvertedIndexSerializer;
 use postings::Postings;
+use schema::Cardinality;
+use schema::FieldType;
 use schema::{Field, Schema};
+use std::cmp;
+use std::collections::HashMap;
 use store::StoreWriter;
 use termdict::TermMerger;
-use DocId;
-use schema::FieldType;
 use termdict::TermOrdinal;
-use schema::Cardinality;
-use std::collections::HashMap;
-use fastfield::MultiValueIntFastFieldReader;
-use std::cmp;
+use DocId;
 
 fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
     let mut total_tokens = 0u64;
@@ -53,8 +53,6 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
         .sum::<u64>()
 }
 
-
-
 pub struct IndexMerger {
     schema: Schema,
     readers: Vec<SegmentReader>,
@@ -89,7 +87,7 @@ fn compute_min_max_val(
 }
 
 struct TermOrdinalMapping {
-    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>
+    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>,
 }
 
 impl TermOrdinalMapping {
@@ -98,14 +96,11 @@ impl TermOrdinalMapping {
             per_segment_new_term_ordinals: max_term_ords
                 .into_iter()
                 .map(|max_term_ord| vec![TermOrdinal::default(); max_term_ord as usize])
-                .collect()
+                .collect(),
         }
     }
 
-    fn register_from_to(&mut self,
-                        segment_ord: usize,
-                        from_ord: TermOrdinal,
-                        to_ord: TermOrdinal) {
+    fn register_from_to(&mut self, segment_ord: usize, from_ord: TermOrdinal, to_ord: TermOrdinal) {
         self.per_segment_new_term_ordinals[segment_ord][from_ord as usize] = to_ord;
     }
 
@@ -116,9 +111,7 @@ fn max_term_ord(&self) -> TermOrdinal {
         self.per_segment_new_term_ordinals
             .iter()
-            .flat_map(|term_ordinals| {
-                term_ordinals.iter().cloned().max()
-            })
+            .flat_map(|term_ordinals| term_ordinals.iter().cloned().max())
             .max()
             .unwrap_or(TermOrdinal::default())
     }
@@ -185,9 +178,11 @@ impl IndexMerger {
         Ok(())
     }
 
-    fn write_fast_fields(&self,
-                         fast_field_serializer: &mut FastFieldSerializer,
-                         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>) -> Result<()> {
+    fn write_fast_fields(
+        &self,
+        fast_field_serializer: &mut FastFieldSerializer,
+        mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
+    ) -> Result<()> {
         for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
             let field = Field(field_id as u32);
             let field_type = field_entry.field_type();
@@ -198,17 +193,15 @@ impl IndexMerger {
                         .expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
                          `term_ordinal_mapping`.");
                     self.write_hierarchical_facet_field(
-                        field,
-                        term_ordinal_mapping,
-                        fast_field_serializer)?;
+                        field,
+                        term_ordinal_mapping,
+                        fast_field_serializer,
+                    )?;
                 }
                 FieldType::U64(ref options) | FieldType::I64(ref options) => {
                     match options.get_fastfield_cardinality() {
                         Some(Cardinality::SingleValue) => {
-                            self.write_single_fast_field(
-                                field,
-                                fast_field_serializer
-                            )?;
+                            self.write_single_fast_field(field, fast_field_serializer)?;
                         }
                         Some(Cardinality::MultiValues) => {
                             self.write_multi_fast_field(field, fast_field_serializer)?;
@@ -229,34 +222,25 @@
         Ok(())
     }
```
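A note on the example file reformatted above: it registers `NgramTokenizer::new(3, 3, false)` under the name "ngram3" and then queries for "ken", which hits "Frankenstein" because every 3-character window of the title gets indexed as its own term. A minimal sketch of that behavior, assuming the tokenizer API of the tantivy version this commit targets (treat it as illustrative rather than exact):

```rust
extern crate tantivy;

use tantivy::tokenizer::{NgramTokenizer, TokenStream, Tokenizer};

fn main() {
    // min = max = 3 and prefix_only = false: emit every 3-char window.
    let tokenizer = NgramTokenizer::new(3, 3, false);
    let mut stream = tokenizer.token_stream("Frankenstein");
    while stream.advance() {
        // Expected tokens: Fra, ran, ank, nke, ken, ens, nst, ste, tei, ein.
        // "ken" is among them, which is why the query in the example matches.
        println!("{}", stream.token().text);
    }
}
```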