diff options
author | Jason Wolfe <jasonwolfe@utexas.edu> | 2018-10-15 09:04:36 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2018-10-15 09:04:36 +0900 |
commit | 0098e3d4285967555c605de955a397fba06a6c6c (patch) | |
tree | 32793cc444645f67b4b9038c06f36038f9448fce | |
parent | 69d5e4b9b170e93a050ae2f508a4f448340bffe1 (diff) |
Compute space usage of a Searcher / SegmentReader / CompositeFile (#282)
* Compute space usage of a Searcher / SegmentReader / CompositeFile
* Fix typo
* Add serde Serialize/Deserialize for all the SpaceUsage structs
* Fix indexing
* Public methods for consuming space usage information
* #281: Add a space usage method that takes a SegmentComponent to support code that is unaware of particular segment components, and to make it more likely to update methods when a new component type is added.
* Add support for space usage computation of positions skip index file (#281)
* Add some tests for space usage computation (#281)
-rw-r--r-- | src/common/composite_file.rs | 12 | ||||
-rw-r--r-- | src/core/searcher.rs | 10 | ||||
-rw-r--r-- | src/core/segment_reader.rs | 16 | ||||
-rw-r--r-- | src/fastfield/delete.rs | 6 | ||||
-rwxr-xr-x | src/lib.rs | 1 | ||||
-rw-r--r-- | src/space_usage/mod.rs | 484 | ||||
-rw-r--r-- | src/store/reader.rs | 6 |
7 files changed, 535 insertions, 0 deletions
diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index e7d657b..0cdfdff 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -4,6 +4,8 @@ use common::VInt; use directory::ReadOnlySource; use directory::WritePtr; use schema::Field; +use space_usage::PerFieldSpaceUsage; +use space_usage::FieldUsage; use std::collections::HashMap; use std::io::Write; use std::io::{self, Read}; @@ -166,6 +168,16 @@ impl CompositeFile { .get(&FileAddr { field, idx }) .map(|&(from, to)| self.data.slice(from, to)) } + + pub fn space_usage(&self) -> PerFieldSpaceUsage { + let mut fields = HashMap::new(); + for (&field_addr, &(start, end)) in self.offsets_index.iter() { + fields.entry(field_addr.field) + .or_insert_with(|| FieldUsage::empty(field_addr.field)) + .add_field_idx(field_addr.idx, end - start); + } + PerFieldSpaceUsage::new(fields) + } } #[cfg(test)] diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 64e5263..826bf45 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -5,6 +5,7 @@ use query::Query; use schema::Document; use schema::Schema; use schema::{Field, Term}; +use space_usage::SearcherSpaceUsage; use std::fmt; use std::sync::Arc; use termdict::TermMerger; @@ -99,6 +100,15 @@ impl Searcher { .collect::<Vec<_>>(); FieldSearcher::new(inv_index_readers) } + + /// Summarize total space usage of this searcher. 
+ pub fn space_usage(&self) -> SearcherSpaceUsage { + let mut space_usage = SearcherSpaceUsage::new(); + for segment_reader in self.segment_readers.iter() { + space_usage.add_segment(segment_reader.space_usage()); + } + space_usage + } } pub struct FieldSearcher { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 7cf395c..54b465e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -16,6 +16,7 @@ use schema::Document; use schema::Field; use schema::FieldType; use schema::Schema; +use space_usage::SegmentSpaceUsage; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -381,6 +382,21 @@ impl SegmentReader { pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator { SegmentReaderAliveDocsIterator::new(&self) } + + /// Summarize total space usage of this segment. + pub fn space_usage(&self) -> SegmentSpaceUsage { + SegmentSpaceUsage::new( + self.num_docs(), + self.termdict_composite.space_usage(), + self.postings_composite.space_usage(), + self.positions_composite.space_usage(), + self.positions_idx_composite.space_usage(), + self.fast_fields_composite.space_usage(), + self.fieldnorms_composite.space_usage(), + self.store_reader.space_usage(), + self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0), + ) + } } impl fmt::Debug for SegmentReader { diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 15ed658..76ff7e4 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -2,6 +2,7 @@ use bit_set::BitSet; use common::HasLen; use directory::ReadOnlySource; use directory::WritePtr; +use space_usage::ByteCount; use std::io; use std::io::Write; use DocId; @@ -63,6 +64,11 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } } + + /// Summarize total space usage of this bitset. 
+ pub fn space_usage(&self) -> ByteCount { + self.data.len() + } } impl HasLen for DeleteBitSet { @@ -213,6 +213,7 @@ pub(crate) mod positions; pub mod postings; pub mod query; pub mod schema; +pub mod space_usage; pub mod store; pub mod termdict; diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs new file mode 100644 index 0000000..9ffd8b8 --- /dev/null +++ b/src/space_usage/mod.rs @@ -0,0 +1,484 @@ +/*! +Representations for the space usage of various parts of a Tantivy index. + +This can be used programmatically, and will also be exposed in a human readable fashion in +tantivy-cli. + +One important caveat for all of this functionality is that none of it currently takes storage-level +details into consideration. For example, if your file system block size is 4096 bytes, we can +under-count actual resultant space usage by up to 4095 bytes per file. +*/ + +use schema::Field; +use std::collections::HashMap; +use SegmentComponent; + +/// Indicates space usage in bytes +pub type ByteCount = usize; + +/// Enum containing any of the possible space usage results for segment components. +pub enum ComponentSpaceUsage { + /// Data is stored per field in a uniform way + PerField(PerFieldSpaceUsage), + /// Data is stored in separate pieces in the store + Store(StoreSpaceUsage), + /// Some sort of raw byte count + Basic(ByteCount), +} + +/// Represents combined space usage of an entire searcher and its component segments. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SearcherSpaceUsage { + segments: Vec<SegmentSpaceUsage>, + total: ByteCount, +} + +impl SearcherSpaceUsage { + pub(crate) fn new() -> SearcherSpaceUsage { + SearcherSpaceUsage { + segments: Vec::new(), + total: 0, + } + } + + /// Add a segment, to `self`. + /// Performs no deduplication or other intelligence. 
+ pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) { + self.total += segment.total(); + self.segments.push(segment); + } + + /// Per segment space usage + pub fn segments(&self) -> &[SegmentSpaceUsage] { + &self.segments[..] + } + + /// Returns total byte usage of this searcher, including all large subcomponents. + /// Does not account for smaller things like `meta.json`. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents combined space usage for all of the large components comprising a segment. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SegmentSpaceUsage { + num_docs: u32, + + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + + store: StoreSpaceUsage, + + deletes: ByteCount, + + total: ByteCount, +} + +impl SegmentSpaceUsage { + pub(crate) fn new( + num_docs: u32, + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + store: StoreSpaceUsage, + deletes: ByteCount, + ) -> SegmentSpaceUsage { + // NOTE(review): `total` must include every stored component; omitting + // `positions_idx` here would make `total()` under-report whenever the + // positions skip index is non-empty, and disagree with `component()`. + let total = termdict.total() + + postings.total() + + positions.total() + + positions_idx.total() + + fast_fields.total() + + fieldnorms.total() + + store.total() + + deletes; + SegmentSpaceUsage { + num_docs, + termdict, + postings, + positions, + positions_idx, + fast_fields, + fieldnorms, + store, + deletes, + total, + } + } + + /// Space usage for the given component + /// + /// Clones the underlying data. + /// Use the components directly if this is somehow in performance critical code. 
+ pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage { + use SegmentComponent::*; + use self::ComponentSpaceUsage::*; + match component { + POSTINGS => PerField(self.postings().clone()), + POSITIONS => PerField(self.positions().clone()), + POSITIONSSKIP => PerField(self.positions_skip_idx().clone()), + FASTFIELDS => PerField(self.fast_fields().clone()), + FIELDNORMS => PerField(self.fieldnorms().clone()), + TERMS => PerField(self.termdict().clone()), + STORE => Store(self.store().clone()), + DELETE => Basic(self.deletes()), + } + } + + /// Num docs in segment + pub fn num_docs(&self) -> u32 { + self.num_docs + } + + /// Space usage for term dictionary + pub fn termdict(&self) -> &PerFieldSpaceUsage { + &self.termdict + } + + /// Space usage for postings list + pub fn postings(&self) -> &PerFieldSpaceUsage { + &self.postings + } + + /// Space usage for positions + pub fn positions(&self) -> &PerFieldSpaceUsage { + &self.positions + } + + /// Space usage for positions skip idx + pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage { + &self.positions_idx + } + + /// Space usage for fast fields + pub fn fast_fields(&self) -> &PerFieldSpaceUsage { + &self.fast_fields + } + + /// Space usage for field norms + pub fn fieldnorms(&self) -> &PerFieldSpaceUsage { + &self.fieldnorms + } + + /// Space usage for stored documents + pub fn store(&self) -> &StoreSpaceUsage { + &self.store + } + + /// Space usage for document deletions + pub fn deletes(&self) -> ByteCount { + self.deletes + } + + /// Total space usage in bytes for this segment. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage for the Store for this segment. +/// +/// This is composed of two parts. +/// `data` represents the compressed data itself. 
+/// `offsets` represents a lookup to find the start of a block +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StoreSpaceUsage { + data: ByteCount, + offsets: ByteCount, +} + +impl StoreSpaceUsage { + pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage { + StoreSpaceUsage { data, offsets } + } + + /// Space usage for the data part of the store + pub fn data_usage(&self) -> ByteCount { + self.data + } + + /// Space usage for the offsets part of the store (doc ID -> offset) + pub fn offsets_usage(&self) -> ByteCount { + self.offsets + } + + /// Total space usage in bytes for this Store + pub fn total(&self) -> ByteCount { + self.data + self.offsets + } +} + +/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile. +/// +/// A field can appear with a single index (typically 0) or with multiple indexes. +/// Multiple indexes are used to handle variable length things, where each index +/// addresses a separate piece of the field's data. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PerFieldSpaceUsage { + fields: HashMap<Field, FieldUsage>, + total: ByteCount +} + +impl PerFieldSpaceUsage { + pub(crate) fn new(fields: HashMap<Field, FieldUsage>) -> PerFieldSpaceUsage { + let total = fields.values().map(|x| x.total()).sum(); + PerFieldSpaceUsage { fields, total } + } + + /// Per field space usage + pub fn fields(&self) -> impl Iterator<Item = (&Field, &FieldUsage)> { + self.fields.iter() + } + + /// Bytes used by the represented file + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage of a given field, breaking it down into the (field, index) pairs that +/// comprise it. +/// +/// See documentation for PerFieldSpaceUsage for slightly more information. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct FieldUsage { + field: Field, + num_bytes: ByteCount, + /// A field can be composed of more than one piece. + /// These pieces are indexed by arbitrary numbers starting at zero. 
+ /// `self.num_bytes` includes all of `self.sub_num_bytes`. + sub_num_bytes: Vec<Option<ByteCount>>, +} + +impl FieldUsage { + pub(crate) fn empty(field: Field) -> FieldUsage { + FieldUsage { + field, + num_bytes: 0, + sub_num_bytes: Vec::new(), + } + } + + pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) { + if self.sub_num_bytes.len() < idx + 1{ + self.sub_num_bytes.resize(idx + 1, None); + } + assert!(self.sub_num_bytes[idx].is_none()); + self.sub_num_bytes[idx] = Some(size); + self.num_bytes += size + } + + /// Field + pub fn field(&self) -> Field { + self.field + } + + /// Space usage for each index + pub fn sub_num_bytes(&self) -> &[Option<ByteCount>] { + &self.sub_num_bytes[..] + } + + /// Total bytes used for this field in this context + pub fn total(&self) -> ByteCount { + self.num_bytes + } +} + +#[cfg(test)] +mod test { + use core::Index; + use schema::SchemaBuilder; + use schema::{FAST, INT_INDEXED, TEXT}; + use schema::Field; + use space_usage::ByteCount; + use space_usage::PerFieldSpaceUsage; + use schema::STORED; + use Term; + + #[test] + fn test_empty() { + let schema = SchemaBuilder::new().build(); + let index = Index::create_in_ram(schema.clone()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert_eq!(0, searcher_space_usage.total()); + } + + fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) { + assert!(field_space.total() >= min_size); + assert!(field_space.total() <= max_size); + assert_eq!( + vec![(field, field_space.total())], + field_space.fields().map(|(x,y)| (x, y.total())).collect::<Vec<_>>() + ); + } + + #[test] + fn test_fast_indexed() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer 
= index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 10u64)); + index_writer.add_document(doc!(name => 20u64)); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + expect_single_field(segment.fast_fields(), &name, 1, 512); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_text() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = 
&searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + expect_single_field(segment.positions(), &name, 1, 512); + expect_single_field(segment.positions_skip_idx(), &name, 1, 512); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_store() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + assert_eq!(0, segment.termdict().total()); + assert_eq!(0, segment.postings().total()); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + assert_eq!(0, segment.fieldnorms().total()); + assert!(segment.store().total() > 0); + assert!(segment.store().total() < 512); + assert_eq!(0, segment.deletes()); + } + + 
#[test] + fn test_deletes() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 3u64)); + index_writer.add_document(doc!(name => 4u64)); + index_writer.commit().unwrap(); + } + + { + let mut index_writer2 = index.writer(50_000_000).unwrap(); + index_writer2.delete_term(Term::from_field_u64(name, 2u64)); + index_writer2.delete_term(Term::from_field_u64(name, 3u64)); + + // ok, now we should have a deleted doc + index_writer2.commit().unwrap(); + } + + index.load_searchers().unwrap(); + + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(2, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert!(segment.deletes() > 0); + } +}
\ No newline at end of file diff --git a/src/store/reader.rs b/src/store/reader.rs index 428b013..e94705b 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -6,6 +6,7 @@ use common::BinarySerializable; use common::VInt; use directory::ReadOnlySource; use schema::Document; +use space_usage::StoreSpaceUsage; use std::cell::RefCell; use std::io; use std::mem::size_of; @@ -87,6 +88,11 @@ impl StoreReader { cursor = &cursor[..doc_length]; Ok(Document::deserialize(&mut cursor)?) } + + /// Summarize total space usage of this store reader. + pub fn space_usage(&self) -> StoreSpaceUsage { + StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len()) + } } #[cfg_attr( |