Term as a wrapper

Small changes Plastic
author: Paul Masurel <paul.masurel@gmail.com> 2017-05-25 22:54:27 +0900
committer: Paul Masurel <paul.masurel@gmail.com> 2017-05-25 23:49:54 +0900
commit: ac0b1a21eb5a23c594b0684ab4e3c9899eb82639 (patch)
tree: 0ec11ce04dfa5965fda044ed974b651bdcd0697c
parent: 6bbc789d841f783fb69a210ecb8554fb51677819 (diff)
8 files changed, 129 insertions, 99 deletions
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index fcddae9..4c19254 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -17,7 +17,7 @@ use fastfield::FastFieldSerializer;
 use fastfield::FastFieldReader;
 use store::StoreWriter;
 use std::cmp::{min, max};
-use schema;
+use schema::Term;
 use termdict::TermStreamer;
 use postings::SegmentPostingsOption;
 
@@ -218,6 +218,7 @@ impl IndexMerger {
         let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions;
 
         while merged_terms.advance() {
+
             // Create the total list of doc ids
             // by stacking the doc ids from the different segment.
             //
@@ -228,8 +229,8 @@ impl IndexMerger {
             // - Segment 2's doc ids become  [seg0.max_doc + seg1.max_doc,
             //                                seg0.max_doc + seg1.max_doc + seg2.max_doc]
             // ...
-            let term_bytes = merged_terms.key();
-            let current_field = schema::extract_field_from_term_bytes(term_bytes);
+            let term = Term::wrap(merged_terms.key());
+            let current_field = term.field();
 
             if last_field != Some(current_field) {
                 // we reached a new field.
@@ -278,7 +279,7 @@ impl IndexMerger {
 
             // We know that there is at least one document containing
             // the term, so we add it.
-            serializer.new_term(term_bytes)?;
+            serializer.new_term(term.as_ref())?;
 
             // We can now serialize this postings, by pushing each document to the
             // postings serializer.
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 65fba2f..7208fd0 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -9,7 +9,6 @@ use Result;
 use schema::{Schema, Field};
 use analyzer::StreamingIterator;
 use std::marker::PhantomData;
-use schema::extract_field_from_term_bytes;
 use std::ops::DerefMut;
 use datastruct::stacker::{HashMap, Heap};
 use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
@@ -87,7 +86,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
         let term_offsets_it = term_offsets
             .iter()
             .cloned()
-            .map(|(key, _)| extract_field_from_term_bytes(key))
+            .map(|(key, _)| Term::wrap(key).field())
             .enumerate();
 
         let mut prev_field = Field(u32::max_value());
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 7dae35b..a9da6e7 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -113,7 +113,6 @@ mod field;
 mod value;
 mod named_field_document;
 
-pub(crate) use self::term::extract_field_from_term_bytes;
 pub use self::named_field_document::NamedFieldDocument;
 pub use self::schema::{Schema, SchemaBuilder};
 pub use self::value::Value;
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 06d8ffb..f66144b 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -13,52 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
 ///
 /// It actually wraps a `Vec<u8>`.
 #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
-pub struct Term(Vec<u8>);
-
-/// Extract `field` from Term.
-pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
-    Field(BigEndian::read_u32(&term_bytes[..4]))
-}
+pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
 
 impl Term {
-    /// Returns the field.
-    pub fn field(&self) -> Field {
-        extract_field_from_term_bytes(&self.0)
-    }
-
-    /// Returns the field.
-    pub fn set_field(&mut self, field: Field) {
-        if self.0.len() < 4 {
-            self.0.resize(4, 0u8);
-        }
-        BigEndian::write_u32(&mut self.0[0..4], field.0);
-    }
-
-    /// Builds a term given a field, and a u64-value
-    ///
-    /// Assuming the term has a field id of 1, and a u64 value of 3234,
-    /// the Term will have 8 bytes.
-    ///
-    /// The first four byte are dedicated to storing the field id as a u64.
-    /// The 4 following bytes are encoding the u64 value.
-    pub fn from_field_u64(field: Field, val: u64) -> Term {
-        let mut term = Term(vec![0u8; INT_TERM_LEN]);
-        term.set_field(field);
-        term.set_u64(val);
-        term
-    }
-
-    /// Sets a u64 value in the term.
-    ///
-    /// U64 are serialized using (8-byte) BigEndian
-    /// representation.
-    /// The use of BigEndian has the benefit of preserving
-    /// the natural order of the values.
-    pub fn set_u64(&mut self, val: u64) {
-        self.0.resize(INT_TERM_LEN, 0u8);
-        BigEndian::write_u64(&mut self.0[4..], val);
-    }
-
     /// Builds a term given a field, and a u64-value
     ///
     /// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -71,6 +28,7 @@ impl Term {
         Term::from_field_u64(field, val_u64)
     }
 
+
     /// Builds a term given a field, and a string value
     ///
     /// Assuming the term has a field id of 2, and a text value of "abc",
@@ -85,6 +43,20 @@ impl Term {
         term
     }
 
+    /// Builds a term given a field, and a u64-value
+    ///
+    /// Assuming the term has a field id of 1, and a u64 value of 3234,
+    /// the Term will have 8 bytes.
+    ///
+    /// The first four byte are dedicated to storing the field id as a u64.
+    /// The 4 following bytes are encoding the u64 value.
+    pub fn from_field_u64(field: Field, val: u64) -> Term {
+        let mut term = Term(vec![0u8; INT_TERM_LEN]);
+        term.set_field(field);
+        term.set_u64(val);
+        term
+    }
+
     /// Creates a new Term with an empty buffer,
     /// but with a given capacity.
     ///
@@ -95,11 +67,35 @@ impl Term {
         Term(Vec::with_capacity(num_bytes))
     }
 
-    /// Assume the term is a u64 field.
+    /// Returns the field.
+    pub fn set_field(&mut self, field: Field) {
+        if self.0.len() < 4 {
+            self.0.resize(4, 0u8);
+        }
+        BigEndian::write_u32(&mut self.0[0..4], field.0);
+    }
+
+    /// Sets a u64 value in the term.
     ///
-    /// Panics if the term is not a u64 field.
-    pub fn get_u64(&self) -> u64 {
-        BigEndian::read_u64(&self.0[4..])
+    /// U64 are serialized using (8-byte) BigEndian
+    /// representation.
+    /// The use of BigEndian has the benefit of preserving
+    /// the natural order of the values.
+    pub fn set_u64(&mut self, val: u64) {
+        self.0.resize(INT_TERM_LEN, 0u8);
+        BigEndian::write_u64(&mut self.0[4..], val);
+    }
+
+    /// Sets a `i64` value in the term.
+    pub fn set_i64(&mut self, val: i64) {
+        self.set_u64(common::i64_to_u64(val));
+    }
+
+
+    /// Set the texts only, keeping the field untouched.
+    pub fn set_text(&mut self, text: &str) {
+        self.0.resize(4, 0u8);
+        self.0.extend(text.as_bytes());
     }
 
     /// Builds a term from its byte representation.
@@ -110,15 +106,37 @@ impl Term {
     pub(crate) fn from_bytes(data: &[u8]) -> Term {
         Term(Vec::from(data))
     }
+}
+
+impl<B> Term<B>
+    where B: AsRef<[u8]>
+{
+    /// Wraps a source of data
+    pub fn wrap(data: B) -> Term<B> {
+        Term(data)
+    }
 
-    /// Returns the serialized value of the term.
-    /// (this does not include the field.)
+    /// Returns the field.
+    pub fn field(&self) -> Field {
+        Field(BigEndian::read_u32(&self.0.as_ref()[..4]))
+    }
+
+    /// Returns the `u64` value stored in a term.
     ///
-    /// If the term is a string, its value is utf-8 encoded.
-    /// If the term is a u64, its value is encoded according
-    /// to `byteorder::LittleEndian`.
-    pub fn value(&self) -> &[u8] {
-        &self.0[4..]
+    /// # Panics
+    /// ... or returns an invalid value
+    /// if the term is not a `u64` field.
+    pub fn get_u64(&self) -> u64 {
+        BigEndian::read_u64(&self.0.as_ref()[4..])
+    }
+
+    /// Returns the `i64` value stored in a term.
+    ///
+    /// # Panics
+    /// ... or returns an invalid value
+    /// if the term is not a `i64` field.
+    pub fn get_i64(&self) -> i64 {
+        common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
     }
 
     /// Returns the text associated with the term.
@@ -128,24 +146,30 @@ impl Term {
     /// if the index is corrupted or if you try to
     /// call this method on a non-string type.
     pub fn text(&self) -> &str {
-        str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
+        str::from_utf8(self.value_bytes()).expect("Term does not contain valid utf-8.")
     }
 
-    /// Set the texts only, keeping the field untouched.
-    pub fn set_text(&mut self, text: &str) {
-        self.0.resize(4, 0u8);
-        self.0.extend(text.as_bytes());
+    /// Returns the serialized value of the term.
+    /// (this does not include the field.)
+    ///
+    /// If the term is a string, its value is utf-8 encoded.
+    /// If the term is a u64, its value is encoded according
+    /// to `byteorder::LittleEndian`.
+    pub fn value_bytes(&self) -> &[u8] {
+        &self.0.as_ref()[4..]
     }
 
     /// Returns the underlying `&[u8]`
     pub fn as_slice(&self) -> &[u8] {
-        &self.0
+        self.0.as_ref()
     }
 }
 
-impl AsRef<[u8]> for Term {
+impl<B> AsRef<[u8]> for Term<B>
+    where B: AsRef<[u8]>
+{
     fn as_ref(&self) -> &[u8] {
-        &self.0
+        self.0.as_ref()
     }
 }
 
diff --git a/src/store/reader.rs b/src/store/reader.rs
index 5a67778..05781a5 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -85,6 +85,7 @@ impl StoreReader {
     }
 }
 
+#[allow(needless_pass_by_value)]
 fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
     let data_len = data.len();
     let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs
index 77088cb..341fa06 100644
--- a/src/termdict/merger.rs
+++ b/src/termdict/merger.rs
@@ -6,7 +6,7 @@ use postings::TermInfo;
 use std::cmp::Ordering;
 use termdict::TermStreamer;
 use termdict::TermDictionary;
-use fst::Streamer;
+use schema::Term;
 
 pub struct HeapItem<'a, V>
     where V: 'a + BinarySerializable + Default
@@ -128,6 +128,15 @@ impl<'a, V> TermMerger<'a, V>
     pub fn current_kvs(&self) -> &[HeapItem<'a, V>] {
         &self.current_streamers[..]
     }
+
+    /// Iterates through terms
+    pub fn next(&mut self) -> Option<Term<&[u8]>> {
+        if self.advance() {
+            Some(Term::wrap(self.current_streamers[0].streamer.key()))
+        } else {
+            None
+        }
+    }
 }
 
 
@@ -140,18 +149,3 @@ impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> {
                             .collect())
     }
 }
-
-impl<'a, V> Streamer<'a> for TermMerger<'a, V>
-    where V: BinarySerializable + Default
-{
-    type Item = &'a [u8];
-
-    fn next(&'a mut self) -> Option<Self::Item> {
-        if self.advance() {
-            Some(self.current_streamers[0].streamer.key())
-        } else {
-            None
-        }
-
-    }
-}
diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs
index 97e2fbb..9ec3247 100644
--- a/src/termdict/mod.rs
+++ b/src/termdict/mod.rs
@@ -190,9 +190,9 @@ pub trait TermStreamer<V>: Sized {
     fn value(&self) -> &V;
 
     /// Return the next `(key, value)` pair.
-    fn next(&mut self) -> Option<(&[u8], &V)> {
+    fn next(&mut self) -> Option<(Term<&[u8]>, &V)> {
         if self.advance() {
-            Some((self.key(), self.value()))
+            Some((Term::wrap(self.key()), self.value()))
         } else {
             None
         }
@@ -261,12 +261,24 @@ mod tests {
         assert_eq!(term_dict.get("abc"), Some(34u32));
         assert_eq!(term_dict.get("abcd"), Some(346u32));
         let mut stream = term_dict.stream();
-        assert_eq!(stream.next().unwrap(), ("abc".as_bytes(), &34u32));
-        assert_eq!(stream.key(), "abc".as_bytes());
-        assert_eq!(*stream.value(), 34u32);
-        assert_eq!(stream.next().unwrap(), ("abcd".as_bytes(), &346u32));
-        assert_eq!(stream.key(), "abcd".as_bytes());
-        assert_eq!(*stream.value(), 346u32);
+        {
+            {
+                let (k, v) = stream.next().unwrap();
+                assert_eq!(k.as_ref(), "abc".as_bytes());
+                assert_eq!(v, &34u32);
+            }
+            assert_eq!(stream.key(), "abc".as_bytes());
+            assert_eq!(*stream.value(), 34u32);
+        }
+        {
+            {
+                let (k, v) = stream.next().unwrap();
+                assert_eq!(k.as_slice(), "abcd".as_bytes());
+                assert_eq!(v, &346u32);
+            }
+            assert_eq!(stream.key(), "abcd".as_bytes());
+            assert_eq!(*stream.value(), 346u32);
+        }
         assert!(!stream.advance());
     }
 
@@ -337,7 +349,7 @@ mod tests {
             let mut i = 0;
             while let Some((streamer_k, streamer_v)) = streamer.next() {
                 let &(ref key, ref v) = &ids[i];
-                assert_eq!(streamer_k, key.as_bytes());
+                assert_eq!(streamer_k.as_ref(), key.as_bytes());
                 assert_eq!(streamer_v, v);
                 i += 1;
             }
@@ -374,7 +386,7 @@ mod tests {
                 for j in 0..3 {
                     let (streamer_k, streamer_v) = streamer.next().unwrap();
                     let &(ref key, ref v) = &ids[i + j];
-                    assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
+                    assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
                     assert_eq!(streamer_v, v);
                 }
             }
@@ -390,7 +402,7 @@ mod tests {
                 for j in 0..3 {
                     let (streamer_k, streamer_v) = streamer.next().unwrap();
                     let &(ref key, ref v) = &ids[i + j + 1];
-                    assert_eq!(streamer_k, key.as_bytes());
+                    assert_eq!(streamer_k.as_ref(), key.as_bytes());
                     assert_eq!(streamer_v, v);
                 }
             }
diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs
index 7d7dd65..7e1ed1a 100644
--- a/src/termdict/streamdict/streamer.rs
+++ b/src/termdict/streamdict/streamer.rs
@@ -106,12 +106,12 @@ fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P,
     let mut prev_data: Vec<u8> = streamer.current_key.clone();
 
     while let Some((iter_key, _)) = streamer.next() {
-        if !predicate(iter_key) {
+        if !predicate(iter_key.as_ref()) {
             return (prev.as_ptr() as usize, prev_data);
         }
         prev = streamer.cursor;
         prev_data.clear();
-        prev_data.extend_from_slice(iter_key);
+        prev_data.extend_from_slice(iter_key.as_ref());
     }
     (prev.as_ptr() as usize, prev_data)
 }
author	Paul Masurel <paul.masurel@gmail.com>	2017-05-25 22:54:27 +0900
committer	Paul Masurel <paul.masurel@gmail.com>	2017-05-25 23:49:54 +0900
commit	ac0b1a21eb5a23c594b0684ab4e3c9899eb82639 (patch)
tree	0ec11ce04dfa5965fda044ed974b651bdcd0697c
parent	6bbc789d841f783fb69a210ecb8554fb51677819 (diff)