author     Paul Masurel <paul.masurel@gmail.com>   2019-07-17 08:32:29 +0900
committer  GitHub <noreply@github.com>             2019-07-17 08:32:29 +0900
commit     5095e6b0100bbab06daae43a0212b577d769c36c
tree       3138f03dec9a007fce7f717625bcb850fa02a322
parent     1aebc87ee34fdb20ed48ca8bbece3c45ea47d7ec
Introduce a small refactoring of the segment writer. (#596)
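
This moves the conversion from a per-thread memory budget to a hash table size out of index_writer.rs and into segment_writer.rs: SegmentWriter::for_segment now takes the memory budget in bytes rather than a pre-computed number of table bits, derives the table size itself, and reports a budget that is too small as a TantivyError::InvalidArgument instead of hitting the former unreachable!. The initial_table_size helper and its test_hashmap_size test move along with it.
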
-rw-r--r--  src/indexer/index_writer.rs    34
-rw-r--r--  src/indexer/segment_writer.rs  39
-rw-r--r--  src/postings/mod.rs             2
3 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
index bb16641..24e4c23 100644
--- a/src/indexer/index_writer.rs
+++ b/src/indexer/index_writer.rs
@@ -18,7 +18,6 @@ use crate::indexer::stamper::Stamper;
use crate::indexer::MergePolicy;
use crate::indexer::SegmentEntry;
use crate::indexer::SegmentWriter;
-use crate::postings::compute_table_size;
use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
@@ -48,27 +47,6 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type OperationSender = channel::Sender<Vec<AddOperation>>;
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
-/// Split the thread memory budget into
-/// - the heap size
-/// - the hash table "table" itself.
-///
-/// Returns (the heap size in bytes, the hash table size in number of bits)
-fn initial_table_size(per_thread_memory_budget: usize) -> usize {
-    assert!(per_thread_memory_budget > 1_000);
-    let table_size_limit: usize = per_thread_memory_budget / 3;
-    if let Some(limit) = (1..)
-        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
-        .last()
-    {
-        limit.min(19) // we cap it at 2^19 = 512K.
-    } else {
-        unreachable!(
-            "Per thread memory is too small: {}",
-            per_thread_memory_budget
-        );
-    }
-}
-
/// `IndexWriter` is the user entry-point to add document to an index.
///
/// It manages a small number of indexing thread, as well as a shared
@@ -274,8 +252,7 @@ fn index_documents(
) -> Result<bool> {
    let schema = segment.schema();
    let segment_id = segment.id();
-    let table_size = initial_table_size(memory_budget);
-    let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
+    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
    for documents in document_iterator {
        for doc in documents {
            segment_writer.add_document(doc, &schema)?;
@@ -772,7 +749,6 @@ impl IndexWriter {
mod tests {
    use super::super::operation::UserOperation;
-    use super::initial_table_size;
    use crate::collector::TopDocs;
    use crate::directory::error::LockError;
    use crate::error::*;
@@ -1064,14 +1040,6 @@ mod tests {
        assert_eq!(num_docs_containing("b"), 100);
    }
-    #[test]
-    fn test_hashmap_size() {
-        assert_eq!(initial_table_size(100_000), 11);
-        assert_eq!(initial_table_size(1_000_000), 14);
-        assert_eq!(initial_table_size(10_000_000), 17);
-        assert_eq!(initial_table_size(1_000_000_000), 19);
-    }
-
    #[cfg(not(feature = "no_fail"))]
    #[test]
    fn test_write_commit_fails() {
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index b8fc00f..2a7065f 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -4,6 +4,7 @@ use crate::core::SerializableSegment;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::FieldNormsWriter;
use crate::indexer::segment_serializer::SegmentSerializer;
+use crate::postings::compute_table_size;
use crate::postings::MultiFieldPostingsWriter;
use crate::schema::FieldEntry;
use crate::schema::FieldType;
@@ -16,9 +17,26 @@ use crate::tokenizer::{TokenStream, Tokenizer};
use crate::DocId;
use crate::Opstamp;
use crate::Result;
+use crate::TantivyError;
use std::io;
use std::str;
+/// Computes the initial size of the hash table.
+///
+/// Returns a number of bits `b` such that the recommended initial table size is 2^b.
+fn initial_table_size(per_thread_memory_budget: usize) -> Result<usize> {
+    let table_memory_upper_bound = per_thread_memory_budget / 3;
+    if let Some(limit) = (10..)
+        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
+        .last()
+    {
+        Ok(limit.min(19)) // we cap it at 2^19 = 512K.
+    } else {
+        Err(TantivyError::InvalidArgument(
+            format!("per thread memory budget (={}) is too small. Raise the memory budget or lower the number of threads.", per_thread_memory_budget)))
+    }
+}
+
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -45,12 +63,15 @@ impl SegmentWriter {
    /// - segment: The segment being written
    /// - schema
    pub fn for_segment(
-        table_bits: usize,
+        memory_budget: usize,
        mut segment: Segment,
        schema: &Schema,
    ) -> Result<SegmentWriter> {
+        // We shoot for using at most one third of the memory budget in the hash table.
+        // It's a lot, but
+        let table_num_bits = initial_table_size(memory_budget)?;
        let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
-        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
+        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
        let tokenizers =
            schema
                .fields()
@@ -254,3 +275,17 @@ impl SerializableSegment for SegmentWriter {
        Ok(max_doc)
    }
}
+
+#[cfg(test)]
+mod tests {
+    use super::initial_table_size;
+
+    #[test]
+    fn test_hashmap_size() {
+        assert_eq!(initial_table_size(100_000).unwrap(), 11);
+        assert_eq!(initial_table_size(1_000_000).unwrap(), 14);
+        assert_eq!(initial_table_size(10_000_000).unwrap(), 17);
+        assert_eq!(initial_table_size(1_000_000_000).unwrap(), 19);
+    }
+
+}
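
The new initial_table_size picks the largest power-of-two bucket count whose table stays under a third of the per-thread memory budget, starting at 2^10 and capping at 2^19. Below is a self-contained sketch of that selection logic; compute_table_size is private to src/postings and is approximated here by an assumed cost of 16 bytes per bucket, an assumption that happens to reproduce the values asserted in test_hashmap_size.

fn compute_table_size_approx(num_bits: usize) -> usize {
    // Assumption for this sketch only: each hash-table bucket costs 16 bytes.
    (1usize << num_bits) * 16
}

fn initial_table_size_sketch(per_thread_memory_budget: usize) -> Option<usize> {
    // Spend at most a third of the per-thread budget on the table itself.
    let table_memory_upper_bound = per_thread_memory_budget / 3;
    (10..)
        .take_while(|num_bits: &usize| compute_table_size_approx(*num_bits) < table_memory_upper_bound)
        .last()
        .map(|num_bits| num_bits.min(19)) // cap at 2^19 = 512K buckets
}

fn main() {
    // Mirrors the values asserted by the new test_hashmap_size test above.
    assert_eq!(initial_table_size_sketch(100_000), Some(11));
    assert_eq!(initial_table_size_sketch(1_000_000), Some(14));
    assert_eq!(initial_table_size_sketch(10_000_000), Some(17));
    assert_eq!(initial_table_size_sketch(1_000_000_000), Some(19));
}

The (10..) lower bound is what makes the error branch reachable: under the same 16-byte assumption, a budget below roughly 50 KB cannot fit even the 2^10-bucket table in a third of the budget, and for_segment now surfaces that as an InvalidArgument error instead of panicking.
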
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index b8b5110..dcbf94a 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -220,7 +220,7 @@ pub mod tests {
        {
            let mut segment_writer =
-                SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
+                SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
            {
                let mut doc = Document::default();
                // checking that position works if the field has two values
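
The test in src/postings/mod.rs is adjusted to the new signature: instead of hard-coding 18 table bits (2^18 buckets), it hands for_segment a 3_000_000-byte budget, which, under the same per-bucket assumption as the sketch above, works out to a table on the order of 2^15 buckets.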