summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJason Wolfe <jasonwolfe@utexas.edu>2018-10-15 09:04:36 +0900
committerPaul Masurel <paul.masurel@gmail.com>2018-10-15 09:04:36 +0900
commit0098e3d4285967555c605de955a397fba06a6c6c (patch)
tree32793cc444645f67b4b9038c06f36038f9448fce
parent69d5e4b9b170e93a050ae2f508a4f448340bffe1 (diff)
Compute space usage of a Searcher / SegmentReader / CompositeFile (#282)
* Compute space usage of a Searcher / SegmentReader / CompositeFile * Fix typo * Add serde Serialize/Deserialize for all the SpaceUsage structs * Fix indexing * Public methods for consuming space usage information * #281: Add a space usage method that takes a SegmentComponent to support code that is unaware of particular segment components, and to make it more likely to update methods when a new component type is added. * Add support for space usage computation of positions skip index file (#281) * Add some tests for space usage computation (#281)
-rw-r--r--src/common/composite_file.rs12
-rw-r--r--src/core/searcher.rs10
-rw-r--r--src/core/segment_reader.rs16
-rw-r--r--src/fastfield/delete.rs6
-rwxr-xr-xsrc/lib.rs1
-rw-r--r--src/space_usage/mod.rs484
-rw-r--r--src/store/reader.rs6
7 files changed, 535 insertions, 0 deletions
diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs
index e7d657b..0cdfdff 100644
--- a/src/common/composite_file.rs
+++ b/src/common/composite_file.rs
@@ -4,6 +4,8 @@ use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
+use space_usage::PerFieldSpaceUsage;
+use space_usage::FieldUsage;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
@@ -166,6 +168,16 @@ impl CompositeFile {
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
}
+
+ pub fn space_usage(&self) -> PerFieldSpaceUsage {
+ let mut fields = HashMap::new();
+ for (&field_addr, &(start, end)) in self.offsets_index.iter() {
+ fields.entry(field_addr.field)
+ .or_insert_with(|| FieldUsage::empty(field_addr.field))
+ .add_field_idx(field_addr.idx, end - start);
+ }
+ PerFieldSpaceUsage::new(fields)
+ }
}
#[cfg(test)]
diff --git a/src/core/searcher.rs b/src/core/searcher.rs
index 64e5263..826bf45 100644
--- a/src/core/searcher.rs
+++ b/src/core/searcher.rs
@@ -5,6 +5,7 @@ use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
+use space_usage::SearcherSpaceUsage;
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
@@ -99,6 +100,15 @@ impl Searcher {
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
+
+ /// Summarize total space usage of this searcher.
+ pub fn space_usage(&self) -> SearcherSpaceUsage {
+ let mut space_usage = SearcherSpaceUsage::new();
+ for segment_reader in self.segment_readers.iter() {
+ space_usage.add_segment(segment_reader.space_usage());
+ }
+ space_usage
+ }
}
pub struct FieldSearcher {
diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
index 7cf395c..54b465e 100644
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -16,6 +16,7 @@ use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
+use space_usage::SegmentSpaceUsage;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
@@ -381,6 +382,21 @@ impl SegmentReader {
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
SegmentReaderAliveDocsIterator::new(&self)
}
+
+ /// Summarize total space usage of this segment.
+ pub fn space_usage(&self) -> SegmentSpaceUsage {
+ SegmentSpaceUsage::new(
+ self.num_docs(),
+ self.termdict_composite.space_usage(),
+ self.postings_composite.space_usage(),
+ self.positions_composite.space_usage(),
+ self.positions_idx_composite.space_usage(),
+ self.fast_fields_composite.space_usage(),
+ self.fieldnorms_composite.space_usage(),
+ self.store_reader.space_usage(),
+ self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0),
+ )
+ }
}
impl fmt::Debug for SegmentReader {
diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs
index 15ed658..76ff7e4 100644
--- a/src/fastfield/delete.rs
+++ b/src/fastfield/delete.rs
@@ -2,6 +2,7 @@ use bit_set::BitSet;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
+use space_usage::ByteCount;
use std::io;
use std::io::Write;
use DocId;
@@ -63,6 +64,11 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
}
+
+ /// Summarize total space usage of this bitset.
+ pub fn space_usage(&self) -> ByteCount {
+ self.data.len()
+ }
}
impl HasLen for DeleteBitSet {
diff --git a/src/lib.rs b/src/lib.rs
index 62802dc..7aa8572 100755
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -213,6 +213,7 @@ pub(crate) mod positions;
pub mod postings;
pub mod query;
pub mod schema;
+pub mod space_usage;
pub mod store;
pub mod termdict;
diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs
new file mode 100644
index 0000000..9ffd8b8
--- /dev/null
+++ b/src/space_usage/mod.rs
@@ -0,0 +1,484 @@
+/*!
+Representations for the space usage of various parts of a Tantivy index.
+
+This can be used programmatically, and will also be exposed in a human readable fashion in
+tantivy-cli.
+
+One important caveat for all of this functionality is that none of it currently takes storage-level
+details into consideration. For example, if your file system block size is 4096 bytes, we can
+under-count actual resultant space usage by up to 4095 bytes per file.
+*/
+
+use schema::Field;
+use std::collections::HashMap;
+use SegmentComponent;
+
+/// Indicates space usage in bytes
+pub type ByteCount = usize;
+
+/// Enum containing any of the possible space usage results for segment components.
+pub enum ComponentSpaceUsage {
+ /// Data is stored per field in a uniform way
+ PerField(PerFieldSpaceUsage),
+ /// Data is stored in separate pieces in the store
+ Store(StoreSpaceUsage),
+ /// Some sort of raw byte count
+ Basic(ByteCount),
+}
+
+/// Represents combined space usage of an entire searcher and its component segments.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct SearcherSpaceUsage {
+ segments: Vec<SegmentSpaceUsage>,
+ total: ByteCount,
+}
+
+impl SearcherSpaceUsage {
+ pub(crate) fn new() -> SearcherSpaceUsage {
+ SearcherSpaceUsage {
+ segments: Vec::new(),
+ total: 0,
+ }
+ }
+
+ /// Add a segment, to `self`.
+ /// Performs no deduplication or other intelligence.
+ pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) {
+ self.total += segment.total();
+ self.segments.push(segment);
+ }
+
+ /// Per segment space usage
+ pub fn segments(&self) -> &[SegmentSpaceUsage] {
+ &self.segments[..]
+ }
+
+ /// Returns total byte usage of this searcher, including all large subcomponents.
+ /// Does not account for smaller things like `meta.json`.
+ pub fn total(&self) -> ByteCount {
+ self.total
+ }
+}
+
+/// Represents combined space usage for all of the large components comprising a segment.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct SegmentSpaceUsage {
+ num_docs: u32,
+
+ termdict: PerFieldSpaceUsage,
+ postings: PerFieldSpaceUsage,
+ positions: PerFieldSpaceUsage,
+ positions_idx: PerFieldSpaceUsage,
+ fast_fields: PerFieldSpaceUsage,
+ fieldnorms: PerFieldSpaceUsage,
+
+ store: StoreSpaceUsage,
+
+ deletes: ByteCount,
+
+ total: ByteCount,
+}
+
+impl SegmentSpaceUsage {
+    pub(crate) fn new(
+        num_docs: u32,
+        termdict: PerFieldSpaceUsage,
+        postings: PerFieldSpaceUsage,
+        positions: PerFieldSpaceUsage,
+        positions_idx: PerFieldSpaceUsage,
+        fast_fields: PerFieldSpaceUsage,
+        fieldnorms: PerFieldSpaceUsage,
+        store: StoreSpaceUsage,
+        deletes: ByteCount,
+    ) -> SegmentSpaceUsage {
+        let total = termdict.total()
+            + postings.total()
+            + positions.total()
+            + positions_idx.total()
+            + fast_fields.total()
+            + fieldnorms.total()
+            + store.total() + deletes;
+        SegmentSpaceUsage {
+            num_docs,
+            termdict,
+            postings,
+            positions,
+            positions_idx,
+            fast_fields,
+            fieldnorms,
+            store,
+            deletes,
+            total,
+        }
+    }
+
+ /// Space usage for the given component
+ ///
+ /// Clones the underlying data.
+ /// Use the components directly if this is somehow in performance critical code.
+ pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage {
+ use SegmentComponent::*;
+ use self::ComponentSpaceUsage::*;
+ match component {
+ POSTINGS => PerField(self.postings().clone()),
+ POSITIONS => PerField(self.positions().clone()),
+ POSITIONSSKIP => PerField(self.positions_skip_idx().clone()),
+ FASTFIELDS => PerField(self.fast_fields().clone()),
+ FIELDNORMS => PerField(self.fieldnorms().clone()),
+ TERMS => PerField(self.termdict().clone()),
+ STORE => Store(self.store().clone()),
+ DELETE => Basic(self.deletes()),
+ }
+ }
+
+ /// Num docs in segment
+ pub fn num_docs(&self) -> u32 {
+ self.num_docs
+ }
+
+ /// Space usage for term dictionary
+ pub fn termdict(&self) -> &PerFieldSpaceUsage {
+ &self.termdict
+ }
+
+ /// Space usage for postings list
+ pub fn postings(&self) -> &PerFieldSpaceUsage {
+ &self.postings
+ }
+
+ /// Space usage for positions
+ pub fn positions(&self) -> &PerFieldSpaceUsage {
+ &self.positions
+ }
+
+ /// Space usage for positions skip idx
+ pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage {
+ &self.positions_idx
+ }
+
+ /// Space usage for fast fields
+ pub fn fast_fields(&self) -> &PerFieldSpaceUsage {
+ &self.fast_fields
+ }
+
+ /// Space usage for field norms
+ pub fn fieldnorms(&self) -> &PerFieldSpaceUsage {
+ &self.fieldnorms
+ }
+
+ /// Space usage for stored documents
+ pub fn store(&self) -> &StoreSpaceUsage {
+ &self.store
+ }
+
+ /// Space usage for document deletions
+ pub fn deletes(&self) -> ByteCount {
+ self.deletes
+ }
+
+ /// Total space usage in bytes for this segment.
+ pub fn total(&self) -> ByteCount {
+ self.total
+ }
+}
+
+/// Represents space usage for the Store for this segment.
+///
+/// This is composed of two parts.
+/// `data` represents the compressed data itself.
+/// `offsets` represents a lookup to find the start of a block
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct StoreSpaceUsage {
+ data: ByteCount,
+ offsets: ByteCount,
+}
+
+impl StoreSpaceUsage {
+ pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage {
+ StoreSpaceUsage { data, offsets }
+ }
+
+ /// Space usage for the data part of the store
+ pub fn data_usage(&self) -> ByteCount {
+ self.data
+ }
+
+ /// Space usage for the offsets part of the store (doc ID -> offset)
+ pub fn offsets_usage(&self) -> ByteCount {
+ self.offsets
+ }
+
+ /// Total space usage in bytes for this Store
+ pub fn total(&self) -> ByteCount {
+ self.data + self.offsets
+ }
+}
+
+/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile.
+///
+/// A field can appear with a single index (typically 0) or with multiple indexes.
+/// Multiple indexes are used to handle variable length things, where each index addresses a separately stored piece of the field's data.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct PerFieldSpaceUsage {
+ fields: HashMap<Field, FieldUsage>,
+ total: ByteCount
+}
+
+impl PerFieldSpaceUsage {
+ pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage {
+ let total = fields.values().map(|x| x.total()).sum();
+ PerFieldSpaceUsage { fields, total }
+ }
+
+ /// Per field space usage
+ pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> {
+ self.fields.iter()
+ }
+
+ /// Bytes used by the represented file
+ pub fn total(&self) -> ByteCount {
+ self.total
+ }
+}
+
+/// Represents space usage of a given field, breaking it down into the (field, index) pairs that
+/// comprise it.
+///
+/// See documentation for PerFieldSpaceUsage for slightly more information.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct FieldUsage {
+ field: Field,
+ num_bytes: ByteCount,
+ /// A field can be composed of more than one piece.
+ /// These pieces are indexed by arbitrary numbers starting at zero.
+ /// `self.num_bytes` includes all of `self.sub_num_bytes`.
+ sub_num_bytes: Vec<Option<ByteCount>>,
+}
+
+impl FieldUsage {
+ pub(crate) fn empty(field: Field) -> FieldUsage {
+ FieldUsage {
+ field,
+ num_bytes: 0,
+ sub_num_bytes: Vec::new(),
+ }
+ }
+
+ pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) {
+ if self.sub_num_bytes.len() < idx + 1{
+ self.sub_num_bytes.resize(idx + 1, None);
+ }
+ assert!(self.sub_num_bytes[idx].is_none());
+ self.sub_num_bytes[idx] = Some(size);
+ self.num_bytes += size
+ }
+
+ /// Field
+ pub fn field(&self) -> Field {
+ self.field
+ }
+
+ /// Space usage for each index
+ pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] {
+ &self.sub_num_bytes[..]
+ }
+
+ /// Total bytes used for this field in this context
+ pub fn total(&self) -> ByteCount {
+ self.num_bytes
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use core::Index;
+ use schema::SchemaBuilder;
+ use schema::{FAST, INT_INDEXED, TEXT};
+ use schema::Field;
+ use space_usage::ByteCount;
+ use space_usage::PerFieldSpaceUsage;
+ use schema::STORED;
+ use Term;
+
+ #[test]
+ fn test_empty() {
+ let schema = SchemaBuilder::new().build();
+ let index = Index::create_in_ram(schema.clone());
+
+ index.load_searchers().unwrap();
+ let searcher = index.searcher();
+ let searcher_space_usage = searcher.space_usage();
+ assert_eq!(0, searcher_space_usage.total());
+ }
+
+ fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) {
+ assert!(field_space.total() >= min_size);
+ assert!(field_space.total() <= max_size);
+ assert_eq!(
+ vec![(field, field_space.total())],
+ field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>()
+ );
+ }
+
+ #[test]
+ fn test_fast_indexed() {
+ let mut schema_builder = SchemaBuilder::new();
+ let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED);
+ let schema = schema_builder.build();
+ let index = Index::create_in_ram(schema.clone());
+
+ {
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.add_document(doc!(name => 1u64));
+ index_writer.add_document(doc!(name => 2u64));
+ index_writer.add_document(doc!(name => 10u64));
+ index_writer.add_document(doc!(name => 20u64));
+ index_writer.commit().unwrap();
+ }
+
+ index.load_searchers().unwrap();
+ let searcher = index.searcher();
+ let searcher_space_usage = searcher.space_usage();
+ assert!(searcher_space_usage.total() > 0);
+ assert_eq!(1, searcher_space_usage.segments().len());
+
+ let segment = &searcher_space_usage.segments()[0];
+ assert!(segment.total() > 0);
+
+ assert_eq!(4, segment.num_docs());
+
+ expect_single_field(segment.termdict(), &name, 1, 512);
+ expect_single_field(segment.postings(), &name, 1, 512);
+ assert_eq!(0, segment.positions().total());
+ assert_eq!(0, segment.positions_skip_idx().total());
+ expect_single_field(segment.fast_fields(), &name, 1, 512);
+ expect_single_field(segment.fieldnorms(), &name, 1, 512);
+ // TODO: understand why the following fails
+// assert_eq!(0, segment.store().total());
+ assert_eq!(0, segment.deletes());
+ }
+
+ #[test]
+ fn test_text() {
+ let mut schema_builder = SchemaBuilder::new();
+ let name = schema_builder.add_text_field("name", TEXT);
+ let schema = schema_builder.build();
+ let index = Index::create_in_ram(schema.clone());
+
+ {
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.add_document(doc!(name => "hi"));
+ index_writer.add_document(doc!(name => "this is a test"));
+ index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
+ index_writer.add_document(doc!(name => "hello hi goodbye"));
+ index_writer.commit().unwrap();
+ }
+
+ index.load_searchers().unwrap();
+ let searcher = index.searcher();
+ let searcher_space_usage = searcher.space_usage();
+ assert!(searcher_space_usage.total() > 0);
+ assert_eq!(1, searcher_space_usage.segments().len());
+
+ let segment = &searcher_space_usage.segments()[0];
+ assert!(segment.total() > 0);
+
+ assert_eq!(4, segment.num_docs());
+
+ expect_single_field(segment.termdict(), &name, 1, 512);
+ expect_single_field(segment.postings(), &name, 1, 512);
+ expect_single_field(segment.positions(), &name, 1, 512);
+ expect_single_field(segment.positions_skip_idx(), &name, 1, 512);
+ assert_eq!(0, segment.fast_fields().total());
+ expect_single_field(segment.fieldnorms(), &name, 1, 512);
+ // TODO: understand why the following fails
+// assert_eq!(0, segment.store().total());
+ assert_eq!(0, segment.deletes());
+ }
+
+ #[test]
+ fn test_store() {
+ let mut schema_builder = SchemaBuilder::new();
+ let name = schema_builder.add_text_field("name", STORED);
+ let schema = schema_builder.build();
+ let index = Index::create_in_ram(schema.clone());
+
+ {
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.add_document(doc!(name => "hi"));
+ index_writer.add_document(doc!(name => "this is a test"));
+ index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test"));
+ index_writer.add_document(doc!(name => "hello hi goodbye"));
+ index_writer.commit().unwrap();
+ }
+
+ index.load_searchers().unwrap();
+ let searcher = index.searcher();
+ let searcher_space_usage = searcher.space_usage();
+ assert!(searcher_space_usage.total() > 0);
+ assert_eq!(1, searcher_space_usage.segments().len());
+
+ let segment = &searcher_space_usage.segments()[0];
+ assert!(segment.total() > 0);
+
+ assert_eq!(4, segment.num_docs());
+
+ assert_eq!(0, segment.termdict().total());
+ assert_eq!(0, segment.postings().total());
+ assert_eq!(0, segment.positions().total());
+ assert_eq!(0, segment.positions_skip_idx().total());
+ assert_eq!(0, segment.fast_fields().total());
+ assert_eq!(0, segment.fieldnorms().total());
+ assert!(segment.store().total() > 0);
+ assert!(segment.store().total() < 512);
+ assert_eq!(0, segment.deletes());
+ }
+
+ #[test]
+ fn test_deletes() {
+ let mut schema_builder = SchemaBuilder::new();
+ let name = schema_builder.add_u64_field("name", INT_INDEXED);
+ let schema = schema_builder.build();
+ let index = Index::create_in_ram(schema.clone());
+
+ {
+ let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+ index_writer.add_document(doc!(name => 1u64));
+ index_writer.add_document(doc!(name => 2u64));
+ index_writer.add_document(doc!(name => 3u64));
+ index_writer.add_document(doc!(name => 4u64));
+ index_writer.commit().unwrap();
+ }
+
+ {
+ let mut index_writer2 = index.writer(50_000_000).unwrap();
+ index_writer2.delete_term(Term::from_field_u64(name, 2u64));
+ index_writer2.delete_term(Term::from_field_u64(name, 3u64));
+
+ // ok, now we should have a deleted doc
+ index_writer2.commit().unwrap();
+ }
+
+ index.load_searchers().unwrap();
+
+ let searcher = index.searcher();
+ let searcher_space_usage = searcher.space_usage();
+ assert!(searcher_space_usage.total() > 0);
+ assert_eq!(1, searcher_space_usage.segments().len());
+
+ let segment = &searcher_space_usage.segments()[0];
+ assert!(segment.total() > 0);
+
+ assert_eq!(2, segment.num_docs());
+
+ expect_single_field(segment.termdict(), &name, 1, 512);
+ expect_single_field(segment.postings(), &name, 1, 512);
+ assert_eq!(0, segment.positions().total());
+ assert_eq!(0, segment.positions_skip_idx().total());
+ assert_eq!(0, segment.fast_fields().total());
+ expect_single_field(segment.fieldnorms(), &name, 1, 512);
+ // TODO: understand why the following fails
+// assert_eq!(0, segment.store().total());
+ assert!(segment.deletes() > 0);
+ }
+} \ No newline at end of file
diff --git a/src/store/reader.rs b/src/store/reader.rs
index 428b013..e94705b 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -6,6 +6,7 @@ use common::BinarySerializable;
use common::VInt;
use directory::ReadOnlySource;
use schema::Document;
+use space_usage::StoreSpaceUsage;
use std::cell::RefCell;
use std::io;
use std::mem::size_of;
@@ -87,6 +88,11 @@ impl StoreReader {
cursor = &cursor[..doc_length];
Ok(Document::deserialize(&mut cursor)?)
}
+
+ /// Summarize total space usage of this store reader.
+ pub fn space_usage(&self) -> StoreSpaceUsage {
+ StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len())
+ }
}
#[cfg_attr(