Merge branch 'master' of github.com:tantivy-search/tantivy

author: Paul Masurel <paul.masurel@gmail.com> 2017-06-02 21:03:37 +0900
committer: Paul Masurel <paul.masurel@gmail.com> 2017-06-02 21:03:37 +0900
commit: 36376201877df652c1bbe236d16ab94b34af1c22 (patch)
tree: 89306137201a3a876f303e8e2443c457e76f014a /src/postings
parent: 4cfc9806c0dacd4cd4e7a67c5398b4a66b80af9f (diff)
parent: a94679d74d6847f827af4932b540038cf7956f6c (diff)
7 files changed, 163 insertions, 98 deletions
diff --git a/src/postings/docset.rs b/src/postings/docset.rs
index e28319f..ea4211a 100644
--- a/src/postings/docset.rs
+++ b/src/postings/docset.rs
@@ -65,6 +65,10 @@ pub trait DocSet {
             None
         }
     }
+
+    /// Returns a best-effort hint of the
+    /// length of the docset.
+    fn size_hint(&self) -> usize;
 }
 
 
@@ -83,6 +87,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
         let unboxed: &TDocSet = self.borrow();
         unboxed.doc()
     }
+
+    fn size_hint(&self) -> usize {
+        let unboxed: &TDocSet = self.borrow();
+        unboxed.size_hint()
+    }
 }
 
 impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
@@ -100,4 +109,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
         let unref: &TDocSet = *self;
         unref.doc()
     }
+
+    fn size_hint(&self) -> usize {
+        let unref: &TDocSet = *self;
+        unref.size_hint()
+    }
 }
diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs
index e4e4c23..06bc0b9 100644
--- a/src/postings/intersection.rs
+++ b/src/postings/intersection.rs
@@ -10,12 +10,13 @@ pub struct IntersectionDocSet<TDocSet: DocSet> {
 }
 
 impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
-    fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
+    fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
         assert!(docsets.len() >= 2);
+        docsets.sort_by_key(|docset| docset.size_hint());
         IntersectionDocSet {
             docsets: docsets,
             finished: false,
-            doc: DocId::max_value(),
+            doc: 0u32,
         }
     }
 }
@@ -31,37 +32,51 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
 
 
 impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
+    fn size_hint(&self) -> usize {
+        self.docsets
+            .iter()
+            .map(|docset| docset.size_hint())
+            .min()
+            .unwrap() // safe as docsets cannot be empty.
+    }
+
+    #[allow(never_loop)]
     fn advance(&mut self) -> bool {
         if self.finished {
             return false;
         }
-        let num_docsets = self.docsets.len();
-        let mut count_matching = 0;
-        let mut doc_candidate = 0;
-        let mut ord = 0;
-        loop {
-            let mut doc_set = &mut self.docsets[ord];
-            match doc_set.skip_next(doc_candidate) {
-                SkipResult::Reached => {
-                    count_matching += 1;
-                    if count_matching == num_docsets {
-                        self.doc = doc_candidate;
-                        return true;
+
+        let mut candidate_doc = self.doc;
+        let mut candidate_ord = self.docsets.len();
+
+        'outer: loop {
+
+            for (ord, docset) in self.docsets.iter_mut().enumerate() {
+                if ord != candidate_ord {
+                    // `candidate_ord` is already at the
+                    // right position.
+                    //
+                    // Calling `skip_next` would advance this docset
+                    // and miss it.
+                    match docset.skip_next(candidate_doc) {
+                        SkipResult::Reached => {}
+                        SkipResult::OverStep => {
+                            // this is not in the intersection,
+                            // let's update our candidate.
+                            candidate_doc = docset.doc();
+                            candidate_ord = ord;
+                            continue 'outer;
+                        }
+                        SkipResult::End => {
+                            self.finished = true;
+                            return false;
+                        }
                     }
                 }
-                SkipResult::End => {
-                    self.finished = true;
-                    return false;
-                }
-                SkipResult::OverStep => {
-                    count_matching = 1;
-                    doc_candidate = doc_set.doc();
-                }
-            }
-            ord += 1;
-            if ord == num_docsets {
-                ord = 0;
             }
+
+            self.doc = candidate_doc;
+            return true;
         }
     }
 
@@ -69,3 +84,51 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
         self.doc
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+
+    use postings::{DocSet, VecPostings, IntersectionDocSet};
+
+    #[test]
+    fn test_intersection() {
+        {
+            let left = VecPostings::from(vec![1, 3, 9]);
+            let right = VecPostings::from(vec![3, 4, 9, 18]);
+            let mut intersection = IntersectionDocSet::from(vec![left, right]);
+            assert!(intersection.advance());
+            assert_eq!(intersection.doc(), 3);
+            assert!(intersection.advance());
+            assert_eq!(intersection.doc(), 9);
+            assert!(!intersection.advance());
+        }
+        {
+            let a = VecPostings::from(vec![1, 3, 9]);
+            let b = VecPostings::from(vec![3, 4, 9, 18]);
+            let c = VecPostings::from(vec![1, 5, 9, 111]);
+            let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
+            assert!(intersection.advance());
+            assert_eq!(intersection.doc(), 9);
+            assert!(!intersection.advance());
+        }
+    }
+
+    #[test]
+    fn test_intersection_zero() {
+        let left = VecPostings::from(vec![0]);
+        let right = VecPostings::from(vec![0]);
+        let mut intersection = IntersectionDocSet::from(vec![left, right]);
+        assert!(intersection.advance());
+        assert_eq!(intersection.doc(), 0);
+    }
+
+    #[test]
+    fn test_intersection_empty() {
+        let a = VecPostings::from(vec![1, 3]);
+        let b = VecPostings::from(vec![1, 4]);
+        let c = VecPostings::from(vec![3, 9]);
+        let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
+        assert!(!intersection.advance());
+    }
+}
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 647a7ae..5de1f7a 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -373,29 +373,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_intersection() {
-        {
-            let left = VecPostings::from(vec![1, 3, 9]);
-            let right = VecPostings::from(vec![3, 4, 9, 18]);
-            let mut intersection = IntersectionDocSet::from(vec![left, right]);
-            assert!(intersection.advance());
-            assert_eq!(intersection.doc(), 3);
-            assert!(intersection.advance());
-            assert_eq!(intersection.doc(), 9);
-            assert!(!intersection.advance());
-        }
-        {
-            let a = VecPostings::from(vec![1, 3, 9]);
-            let b = VecPostings::from(vec![3, 4, 9, 18]);
-            let c = VecPostings::from(vec![1, 5, 9, 111]);
-            let mut intersection = IntersectionDocSet::from(vec![a, b, c]);
-            assert!(intersection.advance());
-            assert_eq!(intersection.doc(), 9);
-            assert!(!intersection.advance());
-        }
-    }
-
 
     lazy_static! {
         static ref TERM_A: Term = {
@@ -406,6 +383,14 @@ mod tests {
             let field = Field(0);
             Term::from_field_text(field, "b")
         };
+        static ref TERM_C: Term = {
+            let field = Field(0);
+            Term::from_field_text(field, "c")
+        };
+        static ref TERM_D: Term = {
+            let field = Field(0);
+            Term::from_field_text(field, "d")
+        };
         static ref INDEX: Index = {
             let mut schema_builder = SchemaBuilder::default();
             let text_field = schema_builder.add_text_field("text", STRING);
@@ -415,25 +400,23 @@ mod tests {
             let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
 
             let index = Index::create_in_ram(schema);
-            let mut count_a = 0;
-            let mut count_b = 0;
-            let posting_list_size = 100_000;
+            let posting_list_size = 1_000_000;
             {
                 let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-                for _ in 0 .. {
-                    if count_a >= posting_list_size &&
-                       count_b >= posting_list_size {
-                        break;
-                    }
+                for _ in 0 .. posting_list_size {
                     let mut doc = Document::default();
-                    if count_a < posting_list_size &&  rng.gen_weighted_bool(15) {
-                        count_a += 1;
+                    if rng.gen_weighted_bool(15) {
                         doc.add_text(text_field, "a");
                     }
-                    if count_b < posting_list_size && rng.gen_weighted_bool(10) {
-                        count_b += 1;
+                    if rng.gen_weighted_bool(10) {
                         doc.add_text(text_field, "b");
                     }
+                    if rng.gen_weighted_bool(5) {
+                        doc.add_text(text_field, "c");
+                    }
+                    if rng.gen_weighted_bool(1) {
+                        doc.add_text(text_field, "d");
+                    }
                     index_writer.add_document(doc);
                 }
                 assert!(index_writer.commit().is_ok());
@@ -467,8 +450,16 @@ mod tests {
             let segment_postings_b = segment_reader
                 .read_postings(&*TERM_B, SegmentPostingsOption::NoFreq)
                 .unwrap();
+            let segment_postings_c = segment_reader
+                .read_postings(&*TERM_C, SegmentPostingsOption::NoFreq)
+                .unwrap();
+            let segment_postings_d = segment_reader
+                .read_postings(&*TERM_D, SegmentPostingsOption::NoFreq)
+                .unwrap();
             let mut intersection = IntersectionDocSet::from(vec![segment_postings_a,
-                                                                 segment_postings_b]);
+                                                                 segment_postings_b,
+                                                                 segment_postings_c,
+                                                                 segment_postings_d]);
             while intersection.advance() {}
         });
     }
diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index d6386d1..f429226 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -152,6 +152,10 @@ impl<'a> DocSet for SegmentPostings<'a> {
         }
     }
 
+    fn size_hint(&self) -> usize {
+        self.len()
+    }
+
     #[inline]
     fn doc(&self) -> DocId {
         let docs = self.block_cursor.docs();
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index d5e72fa..4a3078a 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -10,12 +10,11 @@ use directory::WritePtr;
 use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
 use DocId;
 use core::Segment;
-use std::io;
-use core::SegmentComponent;
-use std::io::Write;
+use std::io::{self, Write};
 use compression::VIntEncoder;
 use common::VInt;
 use common::BinarySerializable;
+use common::CountingWriter;
 use termdict::TermDictionaryBuilder;
 
 
@@ -52,10 +51,8 @@ use termdict::TermDictionaryBuilder;
 /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
 pub struct PostingsSerializer {
     terms_fst_builder: TermDictionaryBuilderImpl<WritePtr, TermInfo>,
-    postings_write: WritePtr,
-    positions_write: WritePtr,
-    written_bytes_postings: usize,
-    written_bytes_positions: usize,
+    postings_write: CountingWriter<WritePtr>,
+    positions_write: CountingWriter<WritePtr>,
     last_doc_id_encoded: u32,
     positions_encoder: CompositeEncoder,
     block_encoder: BlockEncoder,
@@ -78,10 +75,8 @@ impl PostingsSerializer {
         let terms_fst_builder = try!(TermDictionaryBuilderImpl::new(terms_write));
         Ok(PostingsSerializer {
                terms_fst_builder: terms_fst_builder,
-               postings_write: postings_write,
-               positions_write: positions_write,
-               written_bytes_postings: 0,
-               written_bytes_positions: 0,
+               postings_write: CountingWriter::wrap(postings_write),
+               positions_write: CountingWriter::wrap(positions_write),
                last_doc_id_encoded: 0u32,
                positions_encoder: CompositeEncoder::new(),
                block_encoder: BlockEncoder::new(),
@@ -98,12 +93,10 @@ impl PostingsSerializer {
 
     /// Open a new `PostingsSerializer` for the given segment
     pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
-        let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
-        let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
-        let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
-        PostingsSerializer::new(terms_write,
-                                postings_write,
-                                positions_write,
+        use SegmentComponent::{TERMS, POSTINGS, POSITIONS};
+        PostingsSerializer::new(segment.open_write(TERMS)?,
+                                segment.open_write(POSTINGS)?,
+                                segment.open_write(POSITIONS)?,
                                 segment.schema())
     }
 
@@ -141,8 +134,8 @@ impl PostingsSerializer {
         self.position_deltas.clear();
         self.current_term_info = TermInfo {
             doc_freq: 0,
-            postings_offset: self.written_bytes_postings as u32,
-            positions_offset: self.written_bytes_positions as u32,
+            postings_offset: self.postings_write.written_bytes() as u32,
+            positions_offset: self.positions_write.written_bytes() as u32,
         };
         self.terms_fst_builder.insert_key(term)
     }
@@ -168,8 +161,7 @@ impl PostingsSerializer {
                     let block_encoded =
                         self.block_encoder
                             .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
-                    self.written_bytes_postings += block_encoded.len();
-                    try!(self.postings_write.write_all(block_encoded));
+                    self.postings_write.write_all(block_encoded)?;
                     self.doc_ids.clear();
                 }
                 // ... Idem for term frequencies
@@ -177,8 +169,7 @@ impl PostingsSerializer {
                     let block_encoded = self.block_encoder
                         .compress_vint_unsorted(&self.term_freqs[..]);
                     for num in block_encoded {
-                        self.written_bytes_postings +=
-                            try!(num.serialize(&mut self.postings_write));
+                        num.serialize(&mut self.postings_write)?;
                     }
                     self.term_freqs.clear();
                 }
@@ -186,13 +177,11 @@ impl PostingsSerializer {
             // On the other hand, positions are entirely buffered until the
             // end of the term, at which point they are compressed and written.
             if self.text_indexing_options.is_position_enabled() {
-                self.written_bytes_positions +=
-                    try!(VInt(self.position_deltas.len() as u64)
-                                                         .serialize(&mut self.positions_write));
+                let posdelta_len = VInt(self.position_deltas.len() as u64);
+                posdelta_len.serialize(&mut self.positions_write)?;
                 let positions_encoded: &[u8] = self.positions_encoder
                     .compress_unsorted(&self.position_deltas[..]);
-                try!(self.positions_write.write_all(positions_encoded));
-                self.written_bytes_positions += positions_encoded.len();
+                self.positions_write.write_all(positions_encoded)?;
                 self.position_deltas.clear();
             }
             self.term_open = false;
@@ -230,15 +219,13 @@ impl PostingsSerializer {
                     self.block_encoder
                         .compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
                 self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
-                try!(self.postings_write.write_all(block_encoded));
-                self.written_bytes_postings += block_encoded.len();
+                self.postings_write.write_all(block_encoded)?;
             }
             if self.text_indexing_options.is_termfreq_enabled() {
                 // encode the term_freqs
                 let block_encoded: &[u8] = self.block_encoder
                     .compress_block_unsorted(&self.term_freqs);
-                try!(self.postings_write.write_all(block_encoded));
-                self.written_bytes_postings += block_encoded.len();
+                self.postings_write.write_all(block_encoded)?;
                 self.term_freqs.clear();
             }
             self.doc_ids.clear();
diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs
index fbcf9e0..d639e9a 100644
--- a/src/postings/term_info.rs
+++ b/src/postings/term_info.rs
@@ -24,11 +24,13 @@ pub struct TermInfo {
 
 
 impl BinarySerializable for TermInfo {
-    fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> {
-        Ok(try!(self.doc_freq.serialize(writer)) + try!(self.postings_offset.serialize(writer)) +
-           try!(self.positions_offset.serialize(writer)))
+    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
+        self.doc_freq.serialize(writer)?;
+        self.postings_offset.serialize(writer)?;
+        self.positions_offset.serialize(writer)
     }
-    fn deserialize(reader: &mut io::Read) -> io::Result<Self> {
+
+    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
         let doc_freq = try!(u32::deserialize(reader));
         let postings_offset = try!(u32::deserialize(reader));
         let positions_offset = try!(u32::deserialize(reader));
diff --git a/src/postings/vec_postings.rs b/src/postings/vec_postings.rs
index 399307c..8c9512f 100644
--- a/src/postings/vec_postings.rs
+++ b/src/postings/vec_postings.rs
@@ -34,6 +34,10 @@ impl DocSet for VecPostings {
     fn doc(&self) -> DocId {
         self.doc_ids[self.cursor.0]
     }
+
+    fn size_hint(&self) -> usize {
+        self.len()
+    }
 }
 
 impl HasLen for VecPostings {
author	Paul Masurel <paul.masurel@gmail.com>	2017-06-02 21:03:37 +0900
committer	Paul Masurel <paul.masurel@gmail.com>	2017-06-02 21:03:37 +0900
commit	36376201877df652c1bbe236d16ab94b34af1c22 (patch)
tree	89306137201a3a876f303e8e2443c457e76f014a /src/postings
parent	4cfc9806c0dacd4cd4e7a67c5398b4a66b80af9f (diff)
parent	a94679d74d6847f827af4932b540038cf7956f6c (diff)