diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2017-06-02 21:03:37 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2017-06-02 21:03:37 +0900 |
commit | 36376201877df652c1bbe236d16ab94b34af1c22 (patch) | |
tree | 89306137201a3a876f303e8e2443c457e76f014a /src/postings | |
parent | 4cfc9806c0dacd4cd4e7a67c5398b4a66b80af9f (diff) | |
parent | a94679d74d6847f827af4932b540038cf7956f6c (diff) |
Merge branch 'master' of github.com:tantivy-search/tantivy
Diffstat (limited to 'src/postings')
-rw-r--r-- | src/postings/docset.rs | 14 | ||||
-rw-r--r-- | src/postings/intersection.rs | 115 | ||||
-rw-r--r-- | src/postings/mod.rs | 63 | ||||
-rw-r--r-- | src/postings/segment_postings.rs | 4 | ||||
-rw-r--r-- | src/postings/serializer.rs | 51 | ||||
-rw-r--r-- | src/postings/term_info.rs | 10 | ||||
-rw-r--r-- | src/postings/vec_postings.rs | 4 |
7 files changed, 163 insertions, 98 deletions
diff --git a/src/postings/docset.rs b/src/postings/docset.rs index e28319f..ea4211a 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -65,6 +65,10 @@ pub trait DocSet { None } } + + /// Returns a best-effort hint of the + /// length of the docset. + fn size_hint(&self) -> usize; } @@ -83,6 +87,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> { let unboxed: &TDocSet = self.borrow(); unboxed.doc() } + + fn size_hint(&self) -> usize { + let unboxed: &TDocSet = self.borrow(); + unboxed.size_hint() + } } impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { @@ -100,4 +109,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet { let unref: &TDocSet = *self; unref.doc() } + + fn size_hint(&self) -> usize { + let unref: &TDocSet = *self; + unref.size_hint() + } } diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index e4e4c23..06bc0b9 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -10,12 +10,13 @@ pub struct IntersectionDocSet<TDocSet: DocSet> { } impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> { - fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> { + fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> { assert!(docsets.len() >= 2); + docsets.sort_by_key(|docset| docset.size_hint()); IntersectionDocSet { docsets: docsets, finished: false, - doc: DocId::max_value(), + doc: 0u32, } } } @@ -31,37 +32,51 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> { impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> { + fn size_hint(&self) -> usize { + self.docsets + .iter() + .map(|docset| docset.size_hint()) + .min() + .unwrap() // safe as docsets cannot be empty. + } + + #[allow(never_loop)] fn advance(&mut self) -> bool { if self.finished { return false; } - let num_docsets = self.docsets.len(); - let mut count_matching = 0; - let mut doc_candidate = 0; - let mut ord = 0; - loop { - let mut doc_set = &mut self.docsets[ord]; - match doc_set.skip_next(doc_candidate) { - SkipResult::Reached => { - count_matching += 1; - if count_matching == num_docsets { - self.doc = doc_candidate; - return true; + + let mut candidate_doc = self.doc; + let mut candidate_ord = self.docsets.len(); + + 'outer: loop { + + for (ord, docset) in self.docsets.iter_mut().enumerate() { + if ord != candidate_ord { + // `candidate_ord` is already at the + // right position. + // + // Calling `skip_next` would advance this docset + // and miss it. + match docset.skip_next(candidate_doc) { + SkipResult::Reached => {} + SkipResult::OverStep => { + // this is not in the intersection, + // let's update our candidate. + candidate_doc = docset.doc(); + candidate_ord = ord; + continue 'outer; + } + SkipResult::End => { + self.finished = true; + return false; + } } } - SkipResult::End => { - self.finished = true; - return false; - } - SkipResult::OverStep => { - count_matching = 1; - doc_candidate = doc_set.doc(); - } - } - ord += 1; - if ord == num_docsets { - ord = 0; } + + self.doc = candidate_doc; + return true; } } @@ -69,3 +84,51 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> { self.doc } } + + +#[cfg(test)] +mod tests { + + use postings::{DocSet, VecPostings, IntersectionDocSet}; + + #[test] + fn test_intersection() { + { + let left = VecPostings::from(vec![1, 3, 9]); + let right = VecPostings::from(vec![3, 4, 9, 18]); + let mut intersection = IntersectionDocSet::from(vec![left, right]); + assert!(intersection.advance()); + assert_eq!(intersection.doc(), 3); + assert!(intersection.advance()); + assert_eq!(intersection.doc(), 9); + assert!(!intersection.advance()); + } + { + let a = VecPostings::from(vec![1, 3, 9]); + let b = VecPostings::from(vec![3, 4, 9, 18]); + let c = VecPostings::from(vec![1, 5, 9, 111]); + let mut intersection = IntersectionDocSet::from(vec![a, b, c]); + assert!(intersection.advance()); + assert_eq!(intersection.doc(), 9); + assert!(!intersection.advance()); + } + } + + #[test] + fn test_intersection_zero() { + let left = VecPostings::from(vec![0]); + let right = VecPostings::from(vec![0]); + let mut intersection = IntersectionDocSet::from(vec![left, right]); + assert!(intersection.advance()); + assert_eq!(intersection.doc(), 0); + } + + #[test] + fn test_intersection_empty() { + let a = VecPostings::from(vec![1, 3]); + let b = VecPostings::from(vec![1, 4]); + let c = VecPostings::from(vec![3, 9]); + let mut intersection = IntersectionDocSet::from(vec![a, b, c]); + assert!(!intersection.advance()); + } +} diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 647a7ae..5de1f7a 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -373,29 +373,6 @@ mod tests { } } - #[test] - fn test_intersection() { - { - let left = VecPostings::from(vec![1, 3, 9]); - let right = VecPostings::from(vec![3, 4, 9, 18]); - let mut intersection = IntersectionDocSet::from(vec![left, right]); - assert!(intersection.advance()); - assert_eq!(intersection.doc(), 3); - assert!(intersection.advance()); - assert_eq!(intersection.doc(), 9); - assert!(!intersection.advance()); - } - { - let a = VecPostings::from(vec![1, 3, 9]); - let b = VecPostings::from(vec![3, 4, 9, 18]); - let c = VecPostings::from(vec![1, 5, 9, 111]); - let mut intersection = IntersectionDocSet::from(vec![a, b, c]); - assert!(intersection.advance()); - assert_eq!(intersection.doc(), 9); - assert!(!intersection.advance()); - } - } - lazy_static! { static ref TERM_A: Term = { @@ -406,6 +383,14 @@ mod tests { let field = Field(0); Term::from_field_text(field, "b") }; + static ref TERM_C: Term = { + let field = Field(0); + Term::from_field_text(field, "c") + }; + static ref TERM_D: Term = { + let field = Field(0); + Term::from_field_text(field, "d") + }; static ref INDEX: Index = { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", STRING); @@ -415,25 +400,23 @@ mod tests { let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); let index = Index::create_in_ram(schema); - let mut count_a = 0; - let mut count_b = 0; - let posting_list_size = 100_000; + let posting_list_size = 1_000_000; { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - for _ in 0 .. { - if count_a >= posting_list_size && - count_b >= posting_list_size { - break; - } + for _ in 0 .. posting_list_size { let mut doc = Document::default(); - if count_a < posting_list_size && rng.gen_weighted_bool(15) { - count_a += 1; + if rng.gen_weighted_bool(15) { doc.add_text(text_field, "a"); } - if count_b < posting_list_size && rng.gen_weighted_bool(10) { - count_b += 1; + if rng.gen_weighted_bool(10) { doc.add_text(text_field, "b"); } + if rng.gen_weighted_bool(5) { + doc.add_text(text_field, "c"); + } + if rng.gen_weighted_bool(1) { + doc.add_text(text_field, "d"); + } index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); @@ -467,8 +450,16 @@ mod tests { let segment_postings_b = segment_reader .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq) .unwrap(); + let segment_postings_c = segment_reader + .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq) + .unwrap(); + let segment_postings_d = segment_reader + .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq) + .unwrap(); let mut intersection = IntersectionDocSet::from(vec![segment_postings_a, - segment_postings_b]); + segment_postings_b, + segment_postings_c, + segment_postings_d]); while intersection.advance() {} }); } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index d6386d1..f429226 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -152,6 +152,10 @@ impl<'a> DocSet for SegmentPostings<'a> { } } + fn size_hint(&self) -> usize { + self.len() + } + #[inline] fn doc(&self) -> DocId { let docs = self.block_cursor.docs(); diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index d5e72fa..4a3078a 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -10,12 +10,11 @@ use directory::WritePtr; use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder}; use DocId; use core::Segment; -use std::io; -use core::SegmentComponent; -use std::io::Write; +use std::io::{self, Write}; use compression::VIntEncoder; use common::VInt; use common::BinarySerializable; +use common::CountingWriter; use termdict::TermDictionaryBuilder; @@ -52,10 +51,8 @@ use termdict::TermDictionaryBuilder; /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). pub struct PostingsSerializer { terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>, - postings_write: WritePtr, - positions_write: WritePtr, - written_bytes_postings: usize, - written_bytes_positions: usize, + postings_write: CountingWriter<WritePtr>, + positions_write: CountingWriter<WritePtr>, last_doc_id_encoded: u32, positions_encoder: CompositeEncoder, block_encoder: BlockEncoder, @@ -78,10 +75,8 @@ impl PostingsSerializer { let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write)); Ok(PostingsSerializer { terms_fst_builder: terms_fst_builder, - postings_write: postings_write, - positions_write: positions_write, - written_bytes_postings: 0, - written_bytes_positions: 0, + postings_write: CountingWriter::wrap(postings_write), + positions_write: CountingWriter::wrap(positions_write), last_doc_id_encoded: 0u32, positions_encoder: CompositeEncoder::new(), block_encoder: BlockEncoder::new(), @@ -98,12 +93,10 @@ impl PostingsSerializer { /// Open a new `PostingsSerializer` for the given segment pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> { - let terms_write = try!(segment.open_write(SegmentComponent::TERMS)); - let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS)); - let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS)); - PostingsSerializer::new(terms_write, - postings_write, - positions_write, + use SegmentComponent::{TERMS, POSTINGS, POSITIONS}; + PostingsSerializer::new(segment.open_write(TERMS)?, + segment.open_write(POSTINGS)?, + segment.open_write(POSITIONS)?, segment.schema()) } @@ -141,8 +134,8 @@ impl PostingsSerializer { self.position_deltas.clear(); self.current_term_info = TermInfo { doc_freq: 0, - postings_offset: self.written_bytes_postings as u32, - positions_offset: self.written_bytes_positions as u32, + postings_offset: self.postings_write.written_bytes() as u32, + positions_offset: self.positions_write.written_bytes() as u32, }; self.terms_fst_builder.insert_key(term) } @@ -168,8 +161,7 @@ impl PostingsSerializer { let block_encoded = self.block_encoder .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded); - self.written_bytes_postings += block_encoded.len(); - try!(self.postings_write.write_all(block_encoded)); + self.postings_write.write_all(block_encoded)?; self.doc_ids.clear(); } // ... Idem for term frequencies @@ -177,8 +169,7 @@ impl PostingsSerializer { let block_encoded = self.block_encoder .compress_vint_unsorted(&self.term_freqs[..]); for num in block_encoded { - self.written_bytes_postings += - try!(num.serialize(&mut self.postings_write)); + num.serialize(&mut self.postings_write)?; } self.term_freqs.clear(); } @@ -186,13 +177,11 @@ impl PostingsSerializer { // On the other hand, positions are entirely buffered until the // end of the term, at which point they are compressed and written. if self.text_indexing_options.is_position_enabled() { - self.written_bytes_positions += - try!(VInt(self.position_deltas.len() as u64) - .serialize(&mut self.positions_write)); + let posdelta_len = VInt(self.position_deltas.len() as u64); + posdelta_len.serialize(&mut self.positions_write)?; let positions_encoded: &[u8] = self.positions_encoder .compress_unsorted(&self.position_deltas[..]); - try!(self.positions_write.write_all(positions_encoded)); - self.written_bytes_positions += positions_encoded.len(); + self.positions_write.write_all(positions_encoded)?; self.position_deltas.clear(); } self.term_open = false; @@ -230,15 +219,13 @@ impl PostingsSerializer { self.block_encoder .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded); self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1]; - try!(self.postings_write.write_all(block_encoded)); - self.written_bytes_postings += block_encoded.len(); + self.postings_write.write_all(block_encoded)?; } if self.text_indexing_options.is_termfreq_enabled() { // encode the term_freqs let block_encoded: &[u8] = self.block_encoder .compress_block_unsorted(&self.term_freqs); - try!(self.postings_write.write_all(block_encoded)); - self.written_bytes_postings += block_encoded.len(); + self.postings_write.write_all(block_encoded)?; self.term_freqs.clear(); } self.doc_ids.clear(); diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index fbcf9e0..d639e9a 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -24,11 +24,13 @@ pub struct TermInfo { impl BinarySerializable for TermInfo { - fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> { - Ok(try!(self.doc_freq.serialize(writer)) + try!(self.postings_offset.serialize(writer)) + - try!(self.positions_offset.serialize(writer))) + fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> { + self.doc_freq.serialize(writer)?; + self.postings_offset.serialize(writer)?; + self.positions_offset.serialize(writer) } - fn deserialize(reader: &mut io::Read) -> io::Result<Self> { + + fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> { let doc_freq = try!(u32::deserialize(reader)); let postings_offset = try!(u32::deserialize(reader)); let positions_offset = try!(u32::deserialize(reader)); diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs index 399307c..8c9512f 100644 --- a/src/postings/vec_postings.rs +++ b/src/postings/vec_postings.rs @@ -34,6 +34,10 @@ impl DocSet for VecPostings { fn doc(&self) -> DocId { self.doc_ids[self.cursor.0] } + + fn size_hint(&self) -> usize { + self.len() + } } impl HasLen for VecPostings { |