summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Masurel <paul.masurel@gmail.com>2017-05-25 22:54:27 +0900
committerPaul Masurel <paul.masurel@gmail.com>2017-05-25 23:49:54 +0900
commitac0b1a21eb5a23c594b0684ab4e3c9899eb82639 (patch)
tree0ec11ce04dfa5965fda044ed974b651bdcd0697c
parent6bbc789d841f783fb69a210ecb8554fb51677819 (diff)
Term as a wrapper
Small changes Plastic
-rw-r--r--src/indexer/merger.rs9
-rw-r--r--src/postings/postings_writer.rs3
-rw-r--r--src/schema/mod.rs1
-rw-r--r--src/schema/term.rs150
-rw-r--r--src/store/reader.rs1
-rw-r--r--src/termdict/merger.rs26
-rw-r--r--src/termdict/mod.rs34
-rw-r--r--src/termdict/streamdict/streamer.rs4
8 files changed, 129 insertions, 99 deletions
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index fcddae9..4c19254 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -17,7 +17,7 @@ use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
-use schema;
+use schema::Term;
use termdict::TermStreamer;
use postings::SegmentPostingsOption;
@@ -218,6 +218,7 @@ impl IndexMerger {
let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions;
while merged_terms.advance() {
+
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
//
@@ -228,8 +229,8 @@ impl IndexMerger {
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
- let term_bytes = merged_terms.key();
- let current_field = schema::extract_field_from_term_bytes(term_bytes);
+ let term = Term::wrap(merged_terms.key());
+ let current_field = term.field();
if last_field != Some(current_field) {
// we reached a new field.
@@ -278,7 +279,7 @@ impl IndexMerger {
// We know that there is at least one document containing
// the term, so we add it.
- serializer.new_term(term_bytes)?;
+ serializer.new_term(term.as_ref())?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 65fba2f..7208fd0 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -9,7 +9,6 @@ use Result;
use schema::{Schema, Field};
use analyzer::StreamingIterator;
use std::marker::PhantomData;
-use schema::extract_field_from_term_bytes;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
@@ -87,7 +86,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let term_offsets_it = term_offsets
.iter()
.cloned()
- .map(|(key, _)| extract_field_from_term_bytes(key))
+ .map(|(key, _)| Term::wrap(key).field())
.enumerate();
let mut prev_field = Field(u32::max_value());
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 7dae35b..a9da6e7 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -113,7 +113,6 @@ mod field;
mod value;
mod named_field_document;
-pub(crate) use self::term::extract_field_from_term_bytes;
pub use self::named_field_document::NamedFieldDocument;
pub use self::schema::{Schema, SchemaBuilder};
pub use self::value::Value;
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 06d8ffb..f66144b 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -13,52 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8;
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
-pub struct Term(Vec<u8>);
-
-/// Extract `field` from Term.
-pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
- Field(BigEndian::read_u32(&term_bytes[..4]))
-}
+pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>;
impl Term {
- /// Returns the field.
- pub fn field(&self) -> Field {
- extract_field_from_term_bytes(&self.0)
- }
-
- /// Returns the field.
- pub fn set_field(&mut self, field: Field) {
- if self.0.len() < 4 {
- self.0.resize(4, 0u8);
- }
- BigEndian::write_u32(&mut self.0[0..4], field.0);
- }
-
- /// Builds a term given a field, and a u64-value
- ///
- /// Assuming the term has a field id of 1, and a u64 value of 3234,
- /// the Term will have 8 bytes.
- ///
- /// The first four byte are dedicated to storing the field id as a u64.
- /// The 4 following bytes are encoding the u64 value.
- pub fn from_field_u64(field: Field, val: u64) -> Term {
- let mut term = Term(vec![0u8; INT_TERM_LEN]);
- term.set_field(field);
- term.set_u64(val);
- term
- }
-
- /// Sets a u64 value in the term.
- ///
- /// U64 are serialized using (8-byte) BigEndian
- /// representation.
- /// The use of BigEndian has the benefit of preserving
- /// the natural order of the values.
- pub fn set_u64(&mut self, val: u64) {
- self.0.resize(INT_TERM_LEN, 0u8);
- BigEndian::write_u64(&mut self.0[4..], val);
- }
-
/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -71,6 +28,7 @@ impl Term {
Term::from_field_u64(field, val_u64)
}
+
/// Builds a term given a field, and a string value
///
/// Assuming the term has a field id of 2, and a text value of "abc",
@@ -85,6 +43,20 @@ impl Term {
term
}
+ /// Builds a term given a field, and a u64-value
+ ///
+ /// Assuming the term has a field id of 1, and a u64 value of 3234,
+ /// the Term will have 12 bytes.
+ ///
+ /// The first four bytes are dedicated to storing the field id as a u32.
+ /// The 8 following bytes are encoding the u64 value.
+ pub fn from_field_u64(field: Field, val: u64) -> Term {
+ let mut term = Term(vec![0u8; INT_TERM_LEN]);
+ term.set_field(field);
+ term.set_u64(val);
+ term
+ }
+
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
@@ -95,11 +67,35 @@ impl Term {
Term(Vec::with_capacity(num_bytes))
}
- /// Assume the term is a u64 field.
+ /// Sets the field.
+ pub fn set_field(&mut self, field: Field) {
+ if self.0.len() < 4 {
+ self.0.resize(4, 0u8);
+ }
+ BigEndian::write_u32(&mut self.0[0..4], field.0);
+ }
+
+ /// Sets a u64 value in the term.
///
- /// Panics if the term is not a u64 field.
- pub fn get_u64(&self) -> u64 {
- BigEndian::read_u64(&self.0[4..])
+ /// U64 are serialized using (8-byte) BigEndian
+ /// representation.
+ /// The use of BigEndian has the benefit of preserving
+ /// the natural order of the values.
+ pub fn set_u64(&mut self, val: u64) {
+ self.0.resize(INT_TERM_LEN, 0u8);
+ BigEndian::write_u64(&mut self.0[4..], val);
+ }
+
+ /// Sets a `i64` value in the term.
+ pub fn set_i64(&mut self, val: i64) {
+ self.set_u64(common::i64_to_u64(val));
+ }
+
+
+ /// Set the texts only, keeping the field untouched.
+ pub fn set_text(&mut self, text: &str) {
+ self.0.resize(4, 0u8);
+ self.0.extend(text.as_bytes());
}
/// Builds a term from its byte representation.
@@ -110,15 +106,37 @@ impl Term {
pub(crate) fn from_bytes(data: &[u8]) -> Term {
Term(Vec::from(data))
}
+}
+
+impl<B> Term<B>
+ where B: AsRef<[u8]>
+{
+ /// Wraps a source of data
+ pub fn wrap(data: B) -> Term<B> {
+ Term(data)
+ }
- /// Returns the serialized value of the term.
- /// (this does not include the field.)
+ /// Returns the field.
+ pub fn field(&self) -> Field {
+ Field(BigEndian::read_u32(&self.0.as_ref()[..4]))
+ }
+
+ /// Returns the `u64` value stored in a term.
///
- /// If the term is a string, its value is utf-8 encoded.
- /// If the term is a u64, its value is encoded according
- /// to `byteorder::LittleEndian`.
- pub fn value(&self) -> &[u8] {
- &self.0[4..]
+ /// # Panics
+ /// ... or returns an invalid value
+ /// if the term is not a `u64` field.
+ pub fn get_u64(&self) -> u64 {
+ BigEndian::read_u64(&self.0.as_ref()[4..])
+ }
+
+ /// Returns the `i64` value stored in a term.
+ ///
+ /// # Panics
+ /// ... or returns an invalid value
+ /// if the term is not a `i64` field.
+ pub fn get_i64(&self) -> i64 {
+ common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
}
/// Returns the text associated with the term.
@@ -128,24 +146,30 @@ impl Term {
/// if the index is corrupted or if you try to
/// call this method on a non-string type.
pub fn text(&self) -> &str {
- str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
+ str::from_utf8(self.value_bytes()).expect("Term does not contain valid utf-8.")
}
- /// Set the texts only, keeping the field untouched.
- pub fn set_text(&mut self, text: &str) {
- self.0.resize(4, 0u8);
- self.0.extend(text.as_bytes());
+ /// Returns the serialized value of the term.
+ /// (this does not include the field.)
+ ///
+ /// If the term is a string, its value is utf-8 encoded.
+ /// If the term is a u64, its value is encoded according
+ /// to `byteorder::BigEndian`.
+ pub fn value_bytes(&self) -> &[u8] {
+ &self.0.as_ref()[4..]
}
/// Returns the underlying `&[u8]`
pub fn as_slice(&self) -> &[u8] {
- &self.0
+ self.0.as_ref()
}
}
-impl AsRef<[u8]> for Term {
+impl<B> AsRef<[u8]> for Term<B>
+ where B: AsRef<[u8]>
+{
fn as_ref(&self) -> &[u8] {
- &self.0
+ self.0.as_ref()
}
}
diff --git a/src/store/reader.rs b/src/store/reader.rs
index 5a67778..05781a5 100644
--- a/src/store/reader.rs
+++ b/src/store/reader.rs
@@ -85,6 +85,7 @@ impl StoreReader {
}
}
+#[allow(needless_pass_by_value)]
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
let data_len = data.len();
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs
index 77088cb..341fa06 100644
--- a/src/termdict/merger.rs
+++ b/src/termdict/merger.rs
@@ -6,7 +6,7 @@ use postings::TermInfo;
use std::cmp::Ordering;
use termdict::TermStreamer;
use termdict::TermDictionary;
-use fst::Streamer;
+use schema::Term;
pub struct HeapItem<'a, V>
where V: 'a + BinarySerializable + Default
@@ -128,6 +128,15 @@ impl<'a, V> TermMerger<'a, V>
pub fn current_kvs(&self) -> &[HeapItem<'a, V>] {
&self.current_streamers[..]
}
+
+ /// Iterates through terms
+ pub fn next(&mut self) -> Option<Term<&[u8]>> {
+ if self.advance() {
+ Some(Term::wrap(self.current_streamers[0].streamer.key()))
+ } else {
+ None
+ }
+ }
}
@@ -140,18 +149,3 @@ impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> {
.collect())
}
}
-
-impl<'a, V> Streamer<'a> for TermMerger<'a, V>
- where V: BinarySerializable + Default
-{
- type Item = &'a [u8];
-
- fn next(&'a mut self) -> Option<Self::Item> {
- if self.advance() {
- Some(self.current_streamers[0].streamer.key())
- } else {
- None
- }
-
- }
-}
diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs
index 97e2fbb..9ec3247 100644
--- a/src/termdict/mod.rs
+++ b/src/termdict/mod.rs
@@ -190,9 +190,9 @@ pub trait TermStreamer<V>: Sized {
fn value(&self) -> &V;
/// Return the next `(key, value)` pair.
- fn next(&mut self) -> Option<(&[u8], &V)> {
+ fn next(&mut self) -> Option<(Term<&[u8]>, &V)> {
if self.advance() {
- Some((self.key(), self.value()))
+ Some((Term::wrap(self.key()), self.value()))
} else {
None
}
@@ -261,12 +261,24 @@ mod tests {
assert_eq!(term_dict.get("abc"), Some(34u32));
assert_eq!(term_dict.get("abcd"), Some(346u32));
let mut stream = term_dict.stream();
- assert_eq!(stream.next().unwrap(), ("abc".as_bytes(), &34u32));
- assert_eq!(stream.key(), "abc".as_bytes());
- assert_eq!(*stream.value(), 34u32);
- assert_eq!(stream.next().unwrap(), ("abcd".as_bytes(), &346u32));
- assert_eq!(stream.key(), "abcd".as_bytes());
- assert_eq!(*stream.value(), 346u32);
+ {
+ {
+ let (k, v) = stream.next().unwrap();
+ assert_eq!(k.as_ref(), "abc".as_bytes());
+ assert_eq!(v, &34u32);
+ }
+ assert_eq!(stream.key(), "abc".as_bytes());
+ assert_eq!(*stream.value(), 34u32);
+ }
+ {
+ {
+ let (k, v) = stream.next().unwrap();
+ assert_eq!(k.as_slice(), "abcd".as_bytes());
+ assert_eq!(v, &346u32);
+ }
+ assert_eq!(stream.key(), "abcd".as_bytes());
+ assert_eq!(*stream.value(), 346u32);
+ }
assert!(!stream.advance());
}
@@ -337,7 +349,7 @@ mod tests {
let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() {
let &(ref key, ref v) = &ids[i];
- assert_eq!(streamer_k, key.as_bytes());
+ assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_v, v);
i += 1;
}
@@ -374,7 +386,7 @@ mod tests {
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j];
- assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
+ assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key);
assert_eq!(streamer_v, v);
}
}
@@ -390,7 +402,7 @@ mod tests {
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j + 1];
- assert_eq!(streamer_k, key.as_bytes());
+ assert_eq!(streamer_k.as_ref(), key.as_bytes());
assert_eq!(streamer_v, v);
}
}
diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs
index 7d7dd65..7e1ed1a 100644
--- a/src/termdict/streamdict/streamer.rs
+++ b/src/termdict/streamdict/streamer.rs
@@ -106,12 +106,12 @@ fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P,
let mut prev_data: Vec<u8> = streamer.current_key.clone();
while let Some((iter_key, _)) = streamer.next() {
- if !predicate(iter_key) {
+ if !predicate(iter_key.as_ref()) {
return (prev.as_ptr() as usize, prev_data);
}
prev = streamer.cursor;
prev_data.clear();
- prev_data.extend_from_slice(iter_key);
+ prev_data.extend_from_slice(iter_key.as_ref());
}
(prev.as_ptr() as usize, prev_data)
}