diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2017-05-25 22:54:27 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2017-05-25 23:49:54 +0900 |
commit | ac0b1a21eb5a23c594b0684ab4e3c9899eb82639 (patch) | |
tree | 0ec11ce04dfa5965fda044ed974b651bdcd0697c | |
parent | 6bbc789d841f783fb69a210ecb8554fb51677819 (diff) |
Term as a wrapper
Small changes
Plastic
-rw-r--r-- | src/indexer/merger.rs | 9 | ||||
-rw-r--r-- | src/postings/postings_writer.rs | 3 | ||||
-rw-r--r-- | src/schema/mod.rs | 1 | ||||
-rw-r--r-- | src/schema/term.rs | 150 | ||||
-rw-r--r-- | src/store/reader.rs | 1 | ||||
-rw-r--r-- | src/termdict/merger.rs | 26 | ||||
-rw-r--r-- | src/termdict/mod.rs | 34 | ||||
-rw-r--r-- | src/termdict/streamdict/streamer.rs | 4 |
8 files changed, 129 insertions, 99 deletions
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index fcddae9..4c19254 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -17,7 +17,7 @@ use fastfield::FastFieldSerializer; use fastfield::FastFieldReader; use store::StoreWriter; use std::cmp::{min, max}; -use schema; +use schema::Term; use termdict::TermStreamer; use postings::SegmentPostingsOption; @@ -218,6 +218,7 @@ impl IndexMerger { let mut segment_postings_option = SegmentPostingsOption::FreqAndPositions; while merged_terms.advance() { + // Create the total list of doc ids // by stacking the doc ids from the different segment. // @@ -228,8 +229,8 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, // seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... - let term_bytes = merged_terms.key(); - let current_field = schema::extract_field_from_term_bytes(term_bytes); + let term = Term::wrap(merged_terms.key()); + let current_field = term.field(); if last_field != Some(current_field) { // we reached a new field. @@ -278,7 +279,7 @@ impl IndexMerger { // We know that there is at least one document containing // the term, so we add it. - serializer.new_term(term_bytes)?; + serializer.new_term(term.as_ref())?; // We can now serialize this postings, by pushing each document to the // postings serializer. diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 65fba2f..7208fd0 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -9,7 +9,6 @@ use Result; use schema::{Schema, Field}; use analyzer::StreamingIterator; use std::marker::PhantomData; -use schema::extract_field_from_term_bytes; use std::ops::DerefMut; use datastruct::stacker::{HashMap, Heap}; use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; @@ -87,7 +86,7 @@ impl<'a> MultiFieldPostingsWriter<'a> { let term_offsets_it = term_offsets .iter() .cloned() - .map(|(key, _)| extract_field_from_term_bytes(key)) + .map(|(key, _)| Term::wrap(key).field()) .enumerate(); let mut prev_field = Field(u32::max_value()); diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 7dae35b..a9da6e7 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -113,7 +113,6 @@ mod field; mod value; mod named_field_document; -pub(crate) use self::term::extract_field_from_term_bytes; pub use self::named_field_document::NamedFieldDocument; pub use self::schema::{Schema, SchemaBuilder}; pub use self::value::Value; diff --git a/src/schema/term.rs b/src/schema/term.rs index 06d8ffb..f66144b 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -13,52 +13,9 @@ const INT_TERM_LEN: usize = 4 + 8; /// /// It actually wraps a `Vec<u8>`. #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] -pub struct Term(Vec<u8>); - -/// Extract `field` from Term. -pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field { - Field(BigEndian::read_u32(&term_bytes[..4])) -} +pub struct Term<B = Vec<u8>>(B) where B: AsRef<[u8]>; impl Term { - /// Returns the field. - pub fn field(&self) -> Field { - extract_field_from_term_bytes(&self.0) - } - - /// Returns the field. - pub fn set_field(&mut self, field: Field) { - if self.0.len() < 4 { - self.0.resize(4, 0u8); - } - BigEndian::write_u32(&mut self.0[0..4], field.0); - } - - /// Builds a term given a field, and a u64-value - /// - /// Assuming the term has a field id of 1, and a u64 value of 3234, - /// the Term will have 8 bytes. - /// - /// The first four byte are dedicated to storing the field id as a u64. - /// The 4 following bytes are encoding the u64 value. - pub fn from_field_u64(field: Field, val: u64) -> Term { - let mut term = Term(vec![0u8; INT_TERM_LEN]); - term.set_field(field); - term.set_u64(val); - term - } - - /// Sets a u64 value in the term. - /// - /// U64 are serialized using (8-byte) BigEndian - /// representation. - /// The use of BigEndian has the benefit of preserving - /// the natural order of the values. - pub fn set_u64(&mut self, val: u64) { - self.0.resize(INT_TERM_LEN, 0u8); - BigEndian::write_u64(&mut self.0[4..], val); - } - /// Builds a term given a field, and a u64-value /// /// Assuming the term has a field id of 1, and a u64 value of 3234, @@ -71,6 +28,7 @@ impl Term { Term::from_field_u64(field, val_u64) } + /// Builds a term given a field, and a string value /// /// Assuming the term has a field id of 2, and a text value of "abc", @@ -85,6 +43,20 @@ impl Term { term } + /// Builds a term given a field, and a u64-value + /// + /// Assuming the term has a field id of 1, and a u64 value of 3234, + /// the Term will have 8 bytes. + /// + /// The first four byte are dedicated to storing the field id as a u64. + /// The 4 following bytes are encoding the u64 value. + pub fn from_field_u64(field: Field, val: u64) -> Term { + let mut term = Term(vec![0u8; INT_TERM_LEN]); + term.set_field(field); + term.set_u64(val); + term + } + /// Creates a new Term with an empty buffer, /// but with a given capacity. /// @@ -95,11 +67,35 @@ impl Term { Term(Vec::with_capacity(num_bytes)) } - /// Assume the term is a u64 field. + /// Returns the field. + pub fn set_field(&mut self, field: Field) { + if self.0.len() < 4 { + self.0.resize(4, 0u8); + } + BigEndian::write_u32(&mut self.0[0..4], field.0); + } + + /// Sets a u64 value in the term. /// - /// Panics if the term is not a u64 field. - pub fn get_u64(&self) -> u64 { - BigEndian::read_u64(&self.0[4..]) + /// U64 are serialized using (8-byte) BigEndian + /// representation. + /// The use of BigEndian has the benefit of preserving + /// the natural order of the values. + pub fn set_u64(&mut self, val: u64) { + self.0.resize(INT_TERM_LEN, 0u8); + BigEndian::write_u64(&mut self.0[4..], val); + } + + /// Sets a `i64` value in the term. + pub fn set_i64(&mut self, val: i64) { + self.set_u64(common::i64_to_u64(val)); + } + + + /// Set the texts only, keeping the field untouched. + pub fn set_text(&mut self, text: &str) { + self.0.resize(4, 0u8); + self.0.extend(text.as_bytes()); } /// Builds a term from its byte representation. @@ -110,15 +106,37 @@ impl Term { pub(crate) fn from_bytes(data: &[u8]) -> Term { Term(Vec::from(data)) } +} + +impl<B> Term<B> + where B: AsRef<[u8]> +{ + /// Wraps a source of data + pub fn wrap(data: B) -> Term<B> { + Term(data) + } - /// Returns the serialized value of the term. - /// (this does not include the field.) + /// Returns the field. + pub fn field(&self) -> Field { + Field(BigEndian::read_u32(&self.0.as_ref()[..4])) + } + + /// Returns the `u64` value stored in a term. /// - /// If the term is a string, its value is utf-8 encoded. - /// If the term is a u64, its value is encoded according - /// to `byteorder::LittleEndian`. - pub fn value(&self) -> &[u8] { - &self.0[4..] + /// # Panics + /// ... or returns an invalid value + /// if the term is not a `u64` field. + pub fn get_u64(&self) -> u64 { + BigEndian::read_u64(&self.0.as_ref()[4..]) + } + + /// Returns the `i64` value stored in a term. + /// + /// # Panics + /// ... or returns an invalid value + /// if the term is not a `i64` field. + pub fn get_i64(&self) -> i64 { + common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..])) } /// Returns the text associated with the term. @@ -128,24 +146,30 @@ impl Term { /// if the index is corrupted or if you try to /// call this method on a non-string type. pub fn text(&self) -> &str { - str::from_utf8(self.value()).expect("Term does not contain valid utf-8.") + str::from_utf8(self.value_bytes()).expect("Term does not contain valid utf-8.") } - /// Set the texts only, keeping the field untouched. - pub fn set_text(&mut self, text: &str) { - self.0.resize(4, 0u8); - self.0.extend(text.as_bytes()); + /// Returns the serialized value of the term. + /// (this does not include the field.) + /// + /// If the term is a string, its value is utf-8 encoded. + /// If the term is a u64, its value is encoded according + /// to `byteorder::LittleEndian`. + pub fn value_bytes(&self) -> &[u8] { + &self.0.as_ref()[4..] } /// Returns the underlying `&[u8]` pub fn as_slice(&self) -> &[u8] { - &self.0 + self.0.as_ref() } } -impl AsRef<[u8]> for Term { +impl<B> AsRef<[u8]> for Term<B> + where B: AsRef<[u8]> +{ fn as_ref(&self) -> &[u8] { - &self.0 + self.0.as_ref() } } diff --git a/src/store/reader.rs b/src/store/reader.rs index 5a67778..05781a5 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -85,6 +85,7 @@ impl StoreReader { } } +#[allow(needless_pass_by_value)] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::<u64>() - size_of::<u32>(); diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index 77088cb..341fa06 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -6,7 +6,7 @@ use postings::TermInfo; use std::cmp::Ordering; use termdict::TermStreamer; use termdict::TermDictionary; -use fst::Streamer; +use schema::Term; pub struct HeapItem<'a, V> where V: 'a + BinarySerializable + Default @@ -128,6 +128,15 @@ impl<'a, V> TermMerger<'a, V> pub fn current_kvs(&self) -> &[HeapItem<'a, V>] { &self.current_streamers[..] } + + /// Iterates through terms + pub fn next(&mut self) -> Option<Term<&[u8]>> { + if self.advance() { + Some(Term::wrap(self.current_streamers[0].streamer.key())) + } else { + None + } + } } @@ -140,18 +149,3 @@ impl<'a> From<&'a [SegmentReader]> for TermMerger<'a, TermInfo> { .collect()) } } - -impl<'a, V> Streamer<'a> for TermMerger<'a, V> - where V: BinarySerializable + Default -{ - type Item = &'a [u8]; - - fn next(&'a mut self) -> Option<Self::Item> { - if self.advance() { - Some(self.current_streamers[0].streamer.key()) - } else { - None - } - - } -} diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 97e2fbb..9ec3247 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -190,9 +190,9 @@ pub trait TermStreamer<V>: Sized { fn value(&self) -> &V; /// Return the next `(key, value)` pair. - fn next(&mut self) -> Option<(&[u8], &V)> { + fn next(&mut self) -> Option<(Term<&[u8]>, &V)> { if self.advance() { - Some((self.key(), self.value())) + Some((Term::wrap(self.key()), self.value())) } else { None } @@ -261,12 +261,24 @@ mod tests { assert_eq!(term_dict.get("abc"), Some(34u32)); assert_eq!(term_dict.get("abcd"), Some(346u32)); let mut stream = term_dict.stream(); - assert_eq!(stream.next().unwrap(), ("abc".as_bytes(), &34u32)); - assert_eq!(stream.key(), "abc".as_bytes()); - assert_eq!(*stream.value(), 34u32); - assert_eq!(stream.next().unwrap(), ("abcd".as_bytes(), &346u32)); - assert_eq!(stream.key(), "abcd".as_bytes()); - assert_eq!(*stream.value(), 346u32); + { + { + let (k, v) = stream.next().unwrap(); + assert_eq!(k.as_ref(), "abc".as_bytes()); + assert_eq!(v, &34u32); + } + assert_eq!(stream.key(), "abc".as_bytes()); + assert_eq!(*stream.value(), 34u32); + } + { + { + let (k, v) = stream.next().unwrap(); + assert_eq!(k.as_slice(), "abcd".as_bytes()); + assert_eq!(v, &346u32); + } + assert_eq!(stream.key(), "abcd".as_bytes()); + assert_eq!(*stream.value(), 346u32); + } assert!(!stream.advance()); } @@ -337,7 +349,7 @@ mod tests { let mut i = 0; while let Some((streamer_k, streamer_v)) = streamer.next() { let &(ref key, ref v) = &ids[i]; - assert_eq!(streamer_k, key.as_bytes()); + assert_eq!(streamer_k.as_ref(), key.as_bytes()); assert_eq!(streamer_v, v); i += 1; } @@ -374,7 +386,7 @@ mod tests { for j in 0..3 { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j]; - assert_eq!(str::from_utf8(streamer_k).unwrap(), key); + assert_eq!(str::from_utf8(streamer_k.as_ref()).unwrap(), key); assert_eq!(streamer_v, v); } } @@ -390,7 +402,7 @@ mod tests { for j in 0..3 { let (streamer_k, streamer_v) = streamer.next().unwrap(); let &(ref key, ref v) = &ids[i + j + 1]; - assert_eq!(streamer_k, key.as_bytes()); + assert_eq!(streamer_k.as_ref(), key.as_bytes()); assert_eq!(streamer_v, v); } } diff --git a/src/termdict/streamdict/streamer.rs b/src/termdict/streamdict/streamer.rs index 7d7dd65..7e1ed1a 100644 --- a/src/termdict/streamdict/streamer.rs +++ b/src/termdict/streamdict/streamer.rs @@ -106,12 +106,12 @@ fn get_offset<'a, V, P: Fn(&[u8]) -> bool>(predicate: P, let mut prev_data: Vec<u8> = streamer.current_key.clone(); while let Some((iter_key, _)) = streamer.next() { - if !predicate(iter_key) { + if !predicate(iter_key.as_ref()) { return (prev.as_ptr() as usize, prev_data); } prev = streamer.cursor; prev_data.clear(); - prev_data.extend_from_slice(iter_key); + prev_data.extend_from_slice(iter_key.as_ref()); } (prev.as_ptr() as usize, prev_data) } |