summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/common/mod.rs 14
-rw-r--r-- src/core/term_iterator.rs 4
-rw-r--r-- src/datastruct/stacker/heap.rs 3
-rw-r--r-- src/indexer/merger.rs 3
-rw-r--r-- src/postings/postings_writer.rs 5
-rw-r--r-- src/schema/term.rs 66
6 files changed, 50 insertions, 45 deletions
diff --git a/src/common/mod.rs b/src/common/mod.rs
index bc31b18..ae9f567 100644
--- a/src/common/mod.rs
+++ b/src/common/mod.rs
@@ -27,20 +27,6 @@ pub trait HasLen {
}
}
-
-/// Creates an uninitialized Vec of a given usize
-///
-/// `allocate_vec` does an unsafe call to `set_len`
-/// as other solution are extremely slow in debug mode.
-pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
- let mut v = Vec::with_capacity(capacity);
- unsafe {
- v.set_len(capacity);
- }
- v
-}
-
-
const HIGHEST_BIT: u64 = 1 << 63;
diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs
index b54377f..63bb53c 100644
--- a/src/core/term_iterator.rs
+++ b/src/core/term_iterator.rs
@@ -175,9 +175,7 @@ mod tests {
let mut term_it = searcher.terms();
let mut terms = String::new();
while let Some(term) = term_it.next() {
- unsafe {
- terms.push_str(term.text());
- }
+ terms.push_str(term.text());
}
assert_eq!(terms, "abcdef");
}
diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
index 1ae116f..c158ddd 100644
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -1,6 +1,5 @@
use std::cell::UnsafeCell;
use std::mem;
-use common::allocate_vec;
use std::ptr;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
@@ -109,7 +108,7 @@ struct InnerHeap {
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
- let buffer: Vec<u8> = allocate_vec(num_bytes);
+ let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: buffer,
buffer_len: num_bytes as u32,
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index e4307bb..71c1fa7 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -17,7 +17,6 @@ use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
-use common::allocate_vec;
pub struct IndexMerger {
schema: Schema,
@@ -33,7 +32,7 @@ struct DeltaPositionComputer {
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
- buffer: allocate_vec(512)
+ buffer: vec![0u32; 512]
}
}
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 0ec5559..3a32197 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -39,7 +39,8 @@ pub trait PostingsWriter {
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
- let mut term = Term::allocate(field, 100);
+ let mut term = unsafe { Term::with_capacity(100) };
+ term.set_field(field);
for field_value in field_values {
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
// right now num_tokens and pos are redundant, but it should
@@ -118,7 +119,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
.iter()
.collect();
term_offsets.sort_by_key(|&(k, _v)| k);
- let mut term = Term::allocate(Field(0), 100);
+ let mut term = unsafe { Term::with_capacity(100) };
for (term_bytes, (addr, recorder)) in term_offsets {
// sadly we are required to copy the data
term.set_content(term_bytes);
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 0042624..332bfe5 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -1,13 +1,14 @@
use std::fmt;
use common;
-use common::BinarySerializable;
-use common::allocate_vec;
use byteorder::{BigEndian, ByteOrder};
use super::Field;
use std::str;
+/// Size (in bytes) of the buffer of an int term: a 4-byte field id followed by an 8-byte value.
+const INT_TERM_LEN: usize = 4 + 8;
+
/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
@@ -15,13 +16,6 @@ use std::str;
pub struct Term(Vec<u8>);
impl Term {
-
- /// Pre-allocate a term buffer.
- pub fn allocate(field: Field, num_bytes: usize) -> Term {
- let mut term = Term(Vec::with_capacity(num_bytes));
- field.serialize(&mut term.0).expect("Serializing term in a Vec should never fail");
- term
- }
/// Set the content of the term.
pub fn set_content(&mut self, content: &[u8]) {
@@ -40,6 +34,14 @@ impl Term {
Field(self.field_id())
}
+ /// Sets the field, writing its id as a BigEndian u32 in the first 4 bytes (the buffer is grown if shorter).
+ pub fn set_field(&mut self, field: Field) {
+ if self.0.len() < 4 {
+ self.0.resize(4, 0u8);
+ }
+ BigEndian::write_u32(&mut self.0[0..4], field.0);
+ }
+
/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u64 value of 3234,
@@ -48,13 +50,21 @@ impl Term {
/// The first four byte are dedicated to storing the field id as a u64.
/// The 4 following bytes are encoding the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
- const U64_TERM_LEN: usize = 4 + 8;
- let mut buffer = allocate_vec(U64_TERM_LEN);
- // we want BigEndian here to have lexicographic order
- // match the natural order of `(field, val)`
- BigEndian::write_u32(&mut buffer[0..4], field.0);
- BigEndian::write_u64(&mut buffer[4..], val);
- Term(buffer)
+ let mut term = Term(vec![0u8; INT_TERM_LEN]);
+ term.set_field(field);
+ term.set_u64(val);
+ term
+ }
+
+ /// Sets a u64 value in the term.
+ ///
+ /// The value is serialized as an 8-byte BigEndian integer,
+ /// stored right after the 4-byte field id.
+ /// The use of BigEndian has the benefit of preserving
+ /// the natural order of the values.
+ pub fn set_u64(&mut self, val: u64) {
+ self.0.resize(INT_TERM_LEN, 0u8);
+ BigEndian::write_u64(&mut self.0[4..], val);
}
/// Builds a term given a field, and a u64-value
@@ -76,10 +86,22 @@ impl Term {
/// The first byte is 2, and the three following bytes are the utf-8
/// representation of "abc".
pub fn from_field_text(field: Field, text: &str) -> Term {
- let mut buffer = allocate_vec(4 + text.len());
- BigEndian::write_u32(&mut buffer[0..4], field.0);
- buffer[4..].clone_from_slice(text.as_bytes());
- Term(buffer)
+ let buffer = Vec::with_capacity(4 + text.len());
+ let mut term = Term(buffer);
+ term.set_field(field);
+ term.set_text(text);
+ term
+ }
+
+ /// Creates a new Term with an empty buffer,
+ /// but with a given capacity.
+ ///
+ /// It is declared unsafe, as the term content
+ /// is not initialized, and a call to `.field()`
+ /// would panic.
+ #[doc(hidden)]
+ pub unsafe fn with_capacity(num_bytes: usize) -> Term {
+ Term(Vec::with_capacity(num_bytes))
}
/// Assume the term is a u64 field.
@@ -113,8 +135,8 @@ impl Term {
/// If the value is not valid utf-8. This may happen
/// if the index is corrupted or if you try to
/// call this method on a non-string type.
- pub unsafe fn text(&self) -> &str {
- str::from_utf8_unchecked(self.value())
+ pub fn text(&self) -> &str {
+ str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
}
/// Set the texts only, keeping the field untouched.