author | Paul Masurel <paul.masurel@gmail.com> | 2019-07-17 08:32:29 +0900
---|---|---
committer | GitHub <noreply@github.com> | 2019-07-17 08:32:29 +0900
commit | 5095e6b0100bbab06daae43a0212b577d769c36c |
tree | 3138f03dec9a007fce7f717625bcb850fa02a322 |
parent | 1aebc87ee34fdb20ed48ca8bbece3c45ea47d7ec |
Introduce a small refactoring of the segment writer. (#596)
-rw-r--r-- | src/indexer/index_writer.rs | 34
-rw-r--r-- | src/indexer/segment_writer.rs | 39
-rw-r--r-- | src/postings/mod.rs | 2
3 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
index bb16641..24e4c23 100644
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -18,7 +18,6 @@ use crate::indexer::stamper::Stamper;
 use crate::indexer::MergePolicy;
 use crate::indexer::SegmentEntry;
 use crate::indexer::SegmentWriter;
-use crate::postings::compute_table_size;
 use crate::schema::Document;
 use crate::schema::IndexRecordOption;
 use crate::schema::Term;
@@ -48,27 +47,6 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
 type OperationSender = channel::Sender<Vec<AddOperation>>;
 type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
 
-/// Split the thread memory budget into
-/// - the heap size
-/// - the hash table "table" itself.
-///
-/// Returns (the heap size in bytes, the hash table size in number of bits)
-fn initial_table_size(per_thread_memory_budget: usize) -> usize {
-    assert!(per_thread_memory_budget > 1_000);
-    let table_size_limit: usize = per_thread_memory_budget / 3;
-    if let Some(limit) = (1..)
-        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
-        .last()
-    {
-        limit.min(19) // we cap it at 2^19 = 512K.
-    } else {
-        unreachable!(
-            "Per thread memory is too small: {}",
-            per_thread_memory_budget
-        );
-    }
-}
-
 /// `IndexWriter` is the user entry-point to add document to an index.
 ///
 /// It manages a small number of indexing thread, as well as a shared
@@ -274,8 +252,7 @@ fn index_documents(
 ) -> Result<bool> {
     let schema = segment.schema();
     let segment_id = segment.id();
-    let table_size = initial_table_size(memory_budget);
-    let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
+    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
     for documents in document_iterator {
         for doc in documents {
             segment_writer.add_document(doc, &schema)?;
@@ -772,7 +749,6 @@ impl IndexWriter {
 mod tests {
 
     use super::super::operation::UserOperation;
-    use super::initial_table_size;
     use crate::collector::TopDocs;
     use crate::directory::error::LockError;
     use crate::error::*;
@@ -1064,14 +1040,6 @@ mod tests {
         assert_eq!(num_docs_containing("b"), 100);
     }
 
-    #[test]
-    fn test_hashmap_size() {
-        assert_eq!(initial_table_size(100_000), 11);
-        assert_eq!(initial_table_size(1_000_000), 14);
-        assert_eq!(initial_table_size(10_000_000), 17);
-        assert_eq!(initial_table_size(1_000_000_000), 19);
-    }
-
     #[cfg(not(feature = "no_fail"))]
     #[test]
     fn test_write_commit_fails() {
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index b8fc00f..2a7065f 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -4,6 +4,7 @@ use crate::core::SerializableSegment;
 use crate::fastfield::FastFieldsWriter;
 use crate::fieldnorm::FieldNormsWriter;
 use crate::indexer::segment_serializer::SegmentSerializer;
+use crate::postings::compute_table_size;
 use crate::postings::MultiFieldPostingsWriter;
 use crate::schema::FieldEntry;
 use crate::schema::FieldType;
@@ -16,9 +17,26 @@ use crate::tokenizer::{TokenStream, Tokenizer};
 use crate::DocId;
 use crate::Opstamp;
 use crate::Result;
+use crate::TantivyError;
 use std::io;
 use std::str;
 
+/// Computes the initial size of the hash table.
+///
+/// Returns a number of bit `b`, such that the recommended initial table size is 2^b.
+fn initial_table_size(per_thread_memory_budget: usize) -> Result<usize> {
+    let table_memory_upper_bound = per_thread_memory_budget / 3;
+    if let Some(limit) = (10..)
+        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
+        .last()
+    {
+        Ok(limit.min(19)) // we cap it at 2^19 = 512K.
+    } else {
+        Err(TantivyError::InvalidArgument(
+            format!("per thread memory budget (={}) is too small. Raise the memory budget or lower the number of threads.", per_thread_memory_budget)))
+    }
+}
+
 /// A `SegmentWriter` is in charge of creating segment index from a
 /// set of documents.
 ///
@@ -45,12 +63,15 @@ impl SegmentWriter {
     /// - segment: The segment being written
     /// - schema
     pub fn for_segment(
-        table_bits: usize,
+        memory_budget: usize,
         mut segment: Segment,
         schema: &Schema,
     ) -> Result<SegmentWriter> {
+        // We shoot for using at most one third of the memory budget in the hash table.
+        // It's a lot, but
+        let table_num_bits = initial_table_size(memory_budget)?;
         let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
-        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
+        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
 
         let tokenizers = schema
             .fields()
@@ -254,3 +275,17 @@ impl SerializableSegment for SegmentWriter {
         Ok(max_doc)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::initial_table_size;
+
+    #[test]
+    fn test_hashmap_size() {
+        assert_eq!(initial_table_size(100_000).unwrap(), 11);
+        assert_eq!(initial_table_size(1_000_000).unwrap(), 14);
+        assert_eq!(initial_table_size(10_000_000).unwrap(), 17);
+        assert_eq!(initial_table_size(1_000_000_000).unwrap(), 19);
+    }
+
+}
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index b8b5110..dcbf94a 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -220,7 +220,7 @@ pub mod tests {
 
         {
             let mut segment_writer =
-                SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
+                SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
             {
                 let mut doc = Document::default();
                 // checking that position works if the field has two values
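For readers who want to sanity-check the sizing rule above, here is a self-contained sketch, not tantivy's actual code. The `compute_table_size` stand-in is hypothetical: it assumes each hash-table bucket costs 16 bytes, so a table with 2^b buckets occupies 16·2^b bytes. That assumption was chosen because it reproduces exactly the values asserted by the commit's new `test_hashmap_size`; the real function lives in `src/postings/mod.rs` and may count differently. A plain `String` error also stands in for `TantivyError::InvalidArgument` so the snippet compiles on its own.

```rust
// Sketch of the table-sizing rule from this commit.
// ASSUMPTION: compute_table_size below is a hypothetical stand-in that
// charges a flat 16 bytes per bucket; it is not tantivy's implementation.
const BYTES_PER_BUCKET: usize = 16; // assumed bucket cost

fn compute_table_size(num_bits: usize) -> usize {
    // A table with 2^num_bits buckets, each costing BYTES_PER_BUCKET bytes.
    BYTES_PER_BUCKET << num_bits
}

// Mirrors the function added to segment_writer.rs, with a String error
// standing in for TantivyError::InvalidArgument.
fn initial_table_size(per_thread_memory_budget: usize) -> Result<usize, String> {
    // The hash table may use at most one third of the per-thread budget.
    let table_memory_upper_bound = per_thread_memory_budget / 3;
    if let Some(limit) = (10..)
        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
        .last()
    {
        Ok(limit.min(19)) // capped at 2^19 buckets, as in the commit
    } else {
        Err(format!(
            "per thread memory budget (={}) is too small",
            per_thread_memory_budget
        ))
    }
}

fn main() {
    // The budgets asserted by the commit's new test_hashmap_size test.
    assert_eq!(initial_table_size(100_000), Ok(11));
    assert_eq!(initial_table_size(1_000_000), Ok(14));
    assert_eq!(initial_table_size(10_000_000), Ok(17));
    assert_eq!(initial_table_size(1_000_000_000), Ok(19));
    // Under the 16-bytes-per-bucket assumption, budgets of 3 * 16 KiB or
    // less cannot fit even a 2^10-bucket table in budget / 3 bytes, so the
    // search starting at 10 bits finds nothing and an error comes back.
    assert!(initial_table_size(30_000).is_err());
    println!("all sizing checks passed");
}
```

Two behavioral points of the refactor are visible here: `for_segment` now takes the raw memory budget and derives the table size itself, so callers (see the `postings/mod.rs` hunk) pass bytes rather than a bit count; and because the search now starts at 2^10 buckets and returns a `Result`, a budget too small to fit any table surfaces as a reportable error at the `?` in `for_segment` instead of the old `unreachable!` panic path.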