author     Paul Masurel <paul.masurel@gmail.com>   2017-06-02 21:03:37 +0900
committer  Paul Masurel <paul.masurel@gmail.com>   2017-06-02 21:03:37 +0900
commit     36376201877df652c1bbe236d16ab94b34af1c22 (patch)
tree       89306137201a3a876f303e8e2443c457e76f014a /src/postings
parent     4cfc9806c0dacd4cd4e7a67c5398b4a66b80af9f (diff)
parent     a94679d74d6847f827af4932b540038cf7956f6c (diff)
Merge branch 'master' of github.com:tantivy-search/tantivy
Diffstat (limited to 'src/postings')
-rw-r--r--  src/postings/docset.rs             14
-rw-r--r--  src/postings/intersection.rs      115
-rw-r--r--  src/postings/mod.rs                63
-rw-r--r--  src/postings/segment_postings.rs    4
-rw-r--r--  src/postings/serializer.rs         51
-rw-r--r--  src/postings/term_info.rs          10
-rw-r--r--  src/postings/vec_postings.rs        4
7 files changed, 163 insertions, 98 deletions
diff --git a/src/postings/docset.rs b/src/postings/docset.rs
index e28319f..ea4211a 100644
--- a/src/postings/docset.rs
+++ b/src/postings/docset.rs
@@ -65,6 +65,10 @@ pub trait DocSet {
None
}
}
+
+ /// Returns a best-effort hint of the
+ /// length of the docset.
+ fn size_hint(&self) -> usize;
}
@@ -83,6 +87,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
+
+ fn size_hint(&self) -> usize {
+ let unboxed: &TDocSet = self.borrow();
+ unboxed.size_hint()
+ }
}
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
@@ -100,4 +109,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
let unref: &TDocSet = *self;
unref.doc()
}
+
+ fn size_hint(&self) -> usize {
+ let unref: &TDocSet = *self;
+ unref.size_hint()
+ }
}
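
For orientation, here is a minimal sketch of the contract this commit adds to `DocSet` and how a caller can exploit it. The trait below is an illustrative stand-in, not tantivy's full trait, and `order_by_cost` is a hypothetical helper:

    /// Illustrative stand-in for tantivy's `DocSet`, reduced to the methods
    /// relevant here; the real trait has more members (skip_next, doc, ...).
    trait DocSet {
        /// Advances to the next document; returns false once exhausted.
        fn advance(&mut self) -> bool;
        /// Best-effort hint of how many documents the docset contains.
        fn size_hint(&self) -> usize;
    }

    /// Example use of the hint: drive the cheapest docsets first, since the
    /// smallest one bounds the size of any intersection it takes part in.
    fn order_by_cost<D: DocSet>(mut docsets: Vec<D>) -> Vec<D> {
        docsets.sort_by_key(|d| d.size_hint());
        docsets
    }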
diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs
index e4e4c23..06bc0b9 100644
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -10,12 +10,13 @@ pub struct IntersectionDocSet<TDocSet: DocSet> {
}
impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
- fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
+ fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
assert!(docsets.len() >= 2);
+ docsets.sort_by_key(|docset| docset.size_hint());
IntersectionDocSet {
docsets: docsets,
finished: false,
- doc: DocId::max_value(),
+ doc: 0u32,
}
}
}
@@ -31,37 +32,51 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
+ fn size_hint(&self) -> usize {
+ self.docsets
+ .iter()
+ .map(|docset| docset.size_hint())
+ .min()
+ .unwrap() // safe as docsets cannot be empty.
+ }
+
+ #[allow(never_loop)]
fn advance(&mut self) -> bool {
if self.finished {
return false;
}
- let num_docsets = self.docsets.len();
- let mut count_matching = 0;
- let mut doc_candidate = 0;
- let mut ord = 0;
- loop {
- let mut doc_set = &mut self.docsets[ord];
- match doc_set.skip_next(doc_candidate) {
- SkipResult::Reached => {
- count_matching += 1;
- if count_matching == num_docsets {
- self.doc = doc_candidate;
- return true;
+
+ let mut candidate_doc = self.doc;
+ let mut candidate_ord = self.docsets.len();
+
+ 'outer: loop {
+
+ for (ord, docset) in self.docsets.iter_mut().enumerate() {
+ if ord != candidate_ord {
+ // `candidate_ord` is already at the
+ // right position.
+ //
+ // Calling `skip_next` would advance this docset
+ // and miss it.
+ match docset.skip_next(candidate_doc) {
+ SkipResult::Reached => {}
+ SkipResult::OverStep => {
+ // this is not in the intersection,
+ // let's update our candidate.
+ candidate_doc = docset.doc();
+ candidate_ord = ord;
+ continue 'outer;
+ }
+ SkipResult::End => {
+ self.finished = true;
+ return false;
+ }
}
}
- SkipResult::End => {
- self.finished = true;
- return false;
- }
- SkipResult::OverStep => {
- count_matching = 1;
- doc_candidate = doc_set.doc();
- }
- }
- ord += 1;
- if ord == num_docsets {
- ord = 0;
}
+
+ self.doc = candidate_doc;
+ return true;
}
}
@@ -69,3 +84,51 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
self.doc
}
}
+
+
+#[cfg(test)]
+mod tests {
+
+ use postings::{DocSet, VecPostings, IntersectionDocSet};
+
+ #[test]
+ fn test_intersection() {
+ {
+ let left = VecPostings::from(vec![1, 3, 9]);
+ let right = VecPostings::from(vec![3, 4, 9, 18]);
+ let mut intersection = IntersectionDocSet::from(vec![left, right]);
+ assert!(intersection.advance());
+ assert_eq!(intersection.doc(), 3);
+ assert!(intersection.advance());
+ assert_eq!(intersection.doc(), 9);
+ assert!(!intersection.advance());
+ }
+ {
+ let a = VecPostings::from(vec![1, 3, 9]);
+ let b = VecPostings::from(vec![3, 4, 9, 18]);
+ let c = VecPostings::from(vec![1, 5, 9, 111]);
+ let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
+ assert!(intersection.advance());
+ assert_eq!(intersection.doc(), 9);
+ assert!(!intersection.advance());
+ }
+ }
+
+ #[test]
+ fn test_intersection_zero() {
+ let left = VecPostings::from(vec![0]);
+ let right = VecPostings::from(vec![0]);
+ let mut intersection = IntersectionDocSet::from(vec![left, right]);
+ assert!(intersection.advance());
+ assert_eq!(intersection.doc(), 0);
+ }
+
+ #[test]
+ fn test_intersection_empty() {
+ let a = VecPostings::from(vec![1, 3]);
+ let b = VecPostings::from(vec![1, 4]);
+ let c = VecPostings::from(vec![3, 9]);
+ let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
+ assert!(!intersection.advance());
+ }
+}
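
The rewritten `advance` above is a leapfrog join: every docset skips to the current candidate, and whichever one overshoots supplies the next candidate. An illustrative, self-contained version over plain sorted `u32` slices (not the commit's code, which drives `DocSet::skip_next` instead of cursors):

    /// Leapfrog intersection of sorted, duplicate-free u32 lists.
    /// Mirrors the loop structure of `IntersectionDocSet::advance`.
    fn leapfrog_intersection(lists: &[&[u32]]) -> Vec<u32> {
        assert!(lists.len() >= 2); // same precondition as IntersectionDocSet::from
        let mut cursors = vec![0usize; lists.len()];
        let mut result = Vec::new();
        let mut candidate = 0u32;
        'outer: loop {
            for (i, list) in lists.iter().enumerate() {
                // Skip this list forward until it reaches or passes the candidate.
                while cursors[i] < list.len() && list[cursors[i]] < candidate {
                    cursors[i] += 1;
                }
                match list.get(cursors[i]) {
                    None => break 'outer,              // one list exhausted: done
                    Some(&doc) if doc > candidate => { // overshoot: new candidate
                        candidate = doc;
                        continue 'outer;
                    }
                    Some(_) => {}                      // exactly on the candidate
                }
            }
            // Every list contains `candidate`, so it belongs to the intersection.
            result.push(candidate);
            candidate += 1; // move past it before looking for the next match
        }
        result
    }

With the inputs of `test_intersection` above ([1, 3, 9] and [3, 4, 9, 18]) this returns [3, 9].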
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 647a7ae..5de1f7a 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -373,29 +373,6 @@ mod tests {
}
}
- #[test]
- fn test_intersection() {
- {
- let left = VecPostings::from(vec![1, 3, 9]);
- let right = VecPostings::from(vec![3, 4, 9, 18]);
- let mut intersection = IntersectionDocSet::from(vec![left, right]);
- assert!(intersection.advance());
- assert_eq!(intersection.doc(), 3);
- assert!(intersection.advance());
- assert_eq!(intersection.doc(), 9);
- assert!(!intersection.advance());
- }
- {
- let a = VecPostings::from(vec![1, 3, 9]);
- let b = VecPostings::from(vec![3, 4, 9, 18]);
- let c = VecPostings::from(vec![1, 5, 9, 111]);
- let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
- assert!(intersection.advance());
- assert_eq!(intersection.doc(), 9);
- assert!(!intersection.advance());
- }
- }
-
lazy_static! {
static ref TERM_A: Term = {
@@ -406,6 +383,14 @@ mod tests {
let field = Field(0);
Term::from_field_text(field, "b")
};
+ static ref TERM_C: Term = {
+ let field = Field(0);
+ Term::from_field_text(field, "c")
+ };
+ static ref TERM_D: Term = {
+ let field = Field(0);
+ Term::from_field_text(field, "d")
+ };
static ref INDEX: Index = {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", STRING);
@@ -415,25 +400,23 @@ mod tests {
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
let index = Index::create_in_ram(schema);
- let mut count_a = 0;
- let mut count_b = 0;
- let posting_list_size = 100_000;
+ let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
- for _ in 0 .. {
- if count_a >= posting_list_size &&
- count_b >= posting_list_size {
- break;
- }
+ for _ in 0 .. posting_list_size {
let mut doc = Document::default();
- if count_a < posting_list_size && rng.gen_weighted_bool(15) {
- count_a += 1;
+ if rng.gen_weighted_bool(15) {
doc.add_text(text_field, "a");
}
- if count_b < posting_list_size && rng.gen_weighted_bool(10) {
- count_b += 1;
+ if rng.gen_weighted_bool(10) {
doc.add_text(text_field, "b");
}
+ if rng.gen_weighted_bool(5) {
+ doc.add_text(text_field, "c");
+ }
+ if rng.gen_weighted_bool(1) {
+ doc.add_text(text_field, "d");
+ }
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
@@ -467,8 +450,16 @@ mod tests {
let segment_postings_b = segment_reader
.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
.unwrap();
+ let segment_postings_c = segment_reader
+ .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
+ .unwrap();
+ let segment_postings_d = segment_reader
+ .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
+ .unwrap();
let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
- segment_postings_b]);
+ segment_postings_b,
+ segment_postings_c,
+ segment_postings_d]);
while intersection.advance() {}
});
}
diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index d6386d1..f429226 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -152,6 +152,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
}
}
+ fn size_hint(&self) -> usize {
+ self.len()
+ }
+
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index d5e72fa..4a3078a 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -10,12 +10,11 @@ use directory::WritePtr;
use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
use DocId;
use core::Segment;
-use std::io;
-use core::SegmentComponent;
-use std::io::Write;
+use std::io::{self, Write};
use compression::VIntEncoder;
use common::VInt;
use common::BinarySerializable;
+use common::CountingWriter;
use termdict::TermDictionaryBuilder;
@@ -52,10 +51,8 @@ use termdict::TermDictionaryBuilder;
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>,
- postings_write: WritePtr,
- positions_write: WritePtr,
- written_bytes_postings: usize,
- written_bytes_positions: usize,
+ postings_write: CountingWriter<WritePtr>,
+ positions_write: CountingWriter<WritePtr>,
last_doc_id_encoded: u32,
positions_encoder: CompositeEncoder,
block_encoder: BlockEncoder,
@@ -78,10 +75,8 @@ impl PostingsSerializer {
let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
- postings_write: postings_write,
- positions_write: positions_write,
- written_bytes_postings: 0,
- written_bytes_positions: 0,
+ postings_write: CountingWriter::wrap(postings_write),
+ positions_write: CountingWriter::wrap(positions_write),
last_doc_id_encoded: 0u32,
positions_encoder: CompositeEncoder::new(),
block_encoder: BlockEncoder::new(),
@@ -98,12 +93,10 @@ impl PostingsSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
- let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
- let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
- let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
- PostingsSerializer::new(terms_write,
- postings_write,
- positions_write,
+ use SegmentComponent::{TERMS, POSTINGS, POSITIONS};
+ PostingsSerializer::new(segment.open_write(TERMS)?,
+ segment.open_write(POSTINGS)?,
+ segment.open_write(POSITIONS)?,
segment.schema())
}
@@ -141,8 +134,8 @@ impl PostingsSerializer {
self.position_deltas.clear();
self.current_term_info = TermInfo {
doc_freq: 0,
- postings_offset: self.written_bytes_postings as u32,
- positions_offset: self.written_bytes_positions as u32,
+ postings_offset: self.postings_write.written_bytes() as u32,
+ positions_offset: self.positions_write.written_bytes() as u32,
};
self.terms_fst_builder.insert_key(term)
}
@@ -168,8 +161,7 @@ impl PostingsSerializer {
let block_encoded =
self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
- self.written_bytes_postings += block_encoded.len();
- try!(self.postings_write.write_all(block_encoded));
+ self.postings_write.write_all(block_encoded)?;
self.doc_ids.clear();
}
// ... Idem for term frequencies
@@ -177,8 +169,7 @@ impl PostingsSerializer {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
for num in block_encoded {
- self.written_bytes_postings +=
- try!(num.serialize(&mut self.postings_write));
+ num.serialize(&mut self.postings_write)?;
}
self.term_freqs.clear();
}
@@ -186,13 +177,11 @@ impl PostingsSerializer {
// On the other hand, positions are entirely buffered until the
// end of the term, at which point they are compressed and written.
if self.text_indexing_options.is_position_enabled() {
- self.written_bytes_positions +=
- try!(VInt(self.position_deltas.len() as u64)
- .serialize(&mut self.positions_write));
+ let posdelta_len = VInt(self.position_deltas.len() as u64);
+ posdelta_len.serialize(&mut self.positions_write)?;
let positions_encoded: &[u8] = self.positions_encoder
.compress_unsorted(&self.position_deltas[..]);
- try!(self.positions_write.write_all(positions_encoded));
- self.written_bytes_positions += positions_encoded.len();
+ self.positions_write.write_all(positions_encoded)?;
self.position_deltas.clear();
}
self.term_open = false;
@@ -230,15 +219,13 @@ impl PostingsSerializer {
self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
- try!(self.postings_write.write_all(block_encoded));
- self.written_bytes_postings += block_encoded.len();
+ self.postings_write.write_all(block_encoded)?;
}
if self.text_indexing_options.is_termfreq_enabled() {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
- try!(self.postings_write.write_all(block_encoded));
- self.written_bytes_postings += block_encoded.len();
+ self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.doc_ids.clear();
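
The serializer side of this commit drops the hand-maintained `written_bytes_postings` / `written_bytes_positions` counters in favour of wrapping each `WritePtr` in `common::CountingWriter`. A minimal sketch of such a wrapper, assuming only the `wrap` and `written_bytes` interface used above; the actual type in `common` may differ in details:

    use std::io::{self, Write};

    /// Minimal byte-counting writer: forwards every write to the inner
    /// writer and records how many bytes it accepted.
    struct CountingWriter<W: Write> {
        inner: W,
        written: u64,
    }

    impl<W: Write> CountingWriter<W> {
        fn wrap(inner: W) -> CountingWriter<W> {
            CountingWriter { inner, written: 0 }
        }

        /// Total number of bytes successfully written so far.
        fn written_bytes(&self) -> u64 {
            self.written
        }
    }

    impl<W: Write> Write for CountingWriter<W> {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            let n = self.inner.write(buf)?;
            self.written += n as u64;
            Ok(n)
        }

        fn flush(&mut self) -> io::Result<()> {
            self.inner.flush()
        }
    }

With the wrapper in place, the serializer no longer updates a counter at every write site; the offsets stored in `TermInfo` are simply read back from `written_bytes()` when a new term is opened.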
diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs
index fbcf9e0..d639e9a 100644
--- a/src/postings/term_info.rs
+++ b/src/postings/term_info.rs
@@ -24,11 +24,13 @@ pub struct TermInfo {
impl BinarySerializable for TermInfo {
- fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> {
- Ok(try!(self.doc_freq.serialize(writer)) + try!(self.postings_offset.serialize(writer)) +
- try!(self.positions_offset.serialize(writer)))
+ fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
+ self.doc_freq.serialize(writer)?;
+ self.postings_offset.serialize(writer)?;
+ self.positions_offset.serialize(writer)
}
- fn deserialize(reader: &mut io::Read) -> io::Result<Self> {
+
+ fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = try!(u32::deserialize(reader));
let postings_offset = try!(u32::deserialize(reader));
let positions_offset = try!(u32::deserialize(reader));
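
For reference, the shape of the `BinarySerializable` change above, side by side. These are illustrative trait definitions only (the real trait lives in `common`; `dyn` is added so the "before" form compiles on current Rust):

    use std::io;

    /// Before: trait-object arguments, and `serialize` returns the number of
    /// bytes written so callers can keep running offsets by hand.
    trait BinarySerializableOld: Sized {
        fn serialize(&self, writer: &mut dyn io::Write) -> io::Result<usize>;
        fn deserialize(reader: &mut dyn io::Read) -> io::Result<Self>;
    }

    /// After: generic over the writer/reader and no byte count returned;
    /// offset tracking is delegated to `CountingWriter` instead.
    trait BinarySerializableNew: Sized {
        fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()>;
        fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self>;
    }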
diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs
index 399307c..8c9512f 100644
--- a/src/postings/vec_postings.rs
+++ b/src/postings/vec_postings.rs
@@ -34,6 +34,10 @@ impl DocSet for VecPostings {
fn doc(&self) -> DocId {
self.doc_ids[self.cursor.0]
}
+
+ fn size_hint(&self) -> usize {
+ self.len()
+ }
}
impl HasLen for VecPostings {