summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Paul Masurel <paul.masurel@gmail.com> 2016-08-02 10:21:28 +0900
committer Paul Masurel <paul.masurel@gmail.com> 2016-08-02 10:21:28 +0900
commit b1056b6455ec09595abc9a095a058b8cdbc1d384 (patch)
tree d5c12427d6e9cdcaac0f6f18ab43ae4738a4a172
parent 8daf78351088e0e48bd414444aac2ecf10d94632 (diff)
Added bench cli
-rw-r--r-- Cargo.toml 4
-rw-r--r-- src/analyzer/mod.rs 1
-rw-r--r-- src/cli/bench.rs 92
-rw-r--r-- src/cli/merge.rs 2
-rw-r--r-- src/common/timer.rs 6
-rw-r--r-- src/directory/directory.rs 4
-rw-r--r-- src/fastfield/reader.rs 3
-rw-r--r-- src/postings/union_postings.rs 6
-rw-r--r-- src/query/multi_term_query.rs 15
-rw-r--r-- src/query/multi_term_scorer.rs 1
-rw-r--r-- src/query/query_parser.rs 40
-rw-r--r-- src/schema/field_entry.rs 2
-rw-r--r-- src/schema/schema.rs 1
13 files changed, 156 insertions, 21 deletions
diff --git a/Cargo.toml b/Cargo.toml
index 6befcdf..aa14f4c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -34,3 +34,7 @@ gcc = "0.3.24"
[[bin]]
name = "tantivy-merge"
path = "src/cli/merge.rs"
+
+[[bin]]
+name = "tantivy-bench"
+path = "src/cli/bench.rs"
diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs
index 6f87f85..f26904e 100644
--- a/src/analyzer/mod.rs
+++ b/src/analyzer/mod.rs
@@ -55,6 +55,7 @@ impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
}
}
}
+
}
pub struct SimpleTokenizer;
diff --git a/src/cli/bench.rs b/src/cli/bench.rs
new file mode 100644
index 0000000..e5b66fe
--- /dev/null
+++ b/src/cli/bench.rs
@@ -0,0 +1,92 @@
+
+
+extern crate argparse;
+extern crate tantivy;
+
+use argparse::{ArgumentParser, Store};
+use tantivy::Index;
+use tantivy::schema::{Field, Schema};
+use tantivy::query::QueryParser;
+use tantivy::query::Query;
+use std::path::Path;
+use std::io::BufReader;
+use std::io::BufRead;
+use std::io;
+use std::fs::File;
+use tantivy::collector::chain;
+use tantivy::collector::TopCollector;
+use tantivy::collector::CountCollector;
+
+
+/// Returns the `Field` handle of every schema entry whose `is_indexed()`
+/// is true, in the order the entries appear in `schema.fields()`.
+///
+/// The position of an entry inside `schema.fields()` is used as its field id.
+fn extract_search_fields(schema: &Schema) -> Vec<Field> {
+    let mut search_fields = Vec::new();
+    for (position, entry) in schema.fields().iter().enumerate() {
+        if entry.is_indexed() {
+            // The position is narrowed to `u8`, the id type wrapped by `Field`.
+            search_fields.push(Field(position as u8));
+        }
+    }
+    search_fields
+}
+
+/// Reads the file at `query_path` and returns one query per line.
+///
+/// Every line is stripped of its leading and trailing whitespace. Blank
+/// lines are kept and show up as empty strings.
+///
+/// # Errors
+///
+/// Returns the underlying `io::Error` if the file cannot be opened or if a
+/// line cannot be read (for instance because it is not valid UTF-8).
+fn read_query_file(query_path: &str) -> io::Result<Vec<String>> {
+    let query_file: File = try!(File::open(query_path));
+    let reader = BufReader::new(query_file);
+    let mut queries = Vec::new();
+    for line in reader.lines() {
+        // Hand read errors back to the caller instead of panicking on them.
+        let line = try!(line);
+        queries.push(String::from(line.trim()));
+    }
+    Ok(queries)
+}
+
+/// Runs every query listed in `query_filepath` against the index stored in
+/// `directory` and prints one tab-separated result line per execution.
+///
+/// The whole query set is executed 10 times. Each result line holds the
+/// query text, its number of terms, the number of hits counted and the value
+/// returned by `total_time()` on the timings of the search.
+///
+/// # Errors
+///
+/// Returns an `io::Error` if the index, the searcher or the query file cannot
+/// be opened, or if a search fails.
+///
+/// # Panics
+///
+/// Panics if a query cannot be parsed.
+fn run(directory: String,
+       query_filepath: String) -> io::Result<()> {
+    println!("Directory : {:?}", directory);
+    // Show the query file here; the index directory is already printed above.
+    println!("Query : {:?}", query_filepath);
+
+    let index = try!(Index::open(Path::new(&directory)));
+    let searcher = try!(index.searcher());
+    let default_search_fields: Vec<Field> = extract_search_fields(&index.schema());
+    println!("Fields {:?}", default_search_fields);
+
+    let queries = try!(read_query_file(&query_filepath));
+    println!("queries {:?}", queries);
+
+    let query_parser = QueryParser::new(index.schema(), default_search_fields);
+
+    println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs");
+    // Every query is run 10 times, one full pass over the query set at a time.
+    for _ in 0..10 {
+        for query_txt in &queries {
+            let query = query_parser.parse_query(&query_txt).unwrap();
+            let num_terms = query.num_terms();
+            let mut top_collector = TopCollector::with_limit(10);
+            let mut count_collector = CountCollector::new();
+            let timing;
+            {
+                // The chained collector mutably borrows both collectors; the
+                // inner scope ends that borrow before the count is read.
+                let mut collector = chain().add(&mut top_collector).add(&mut count_collector);
+                timing = try!(query.search(&searcher, &mut collector));
+            }
+            println!("{}\t{}\t{}\t{}",
+                     query_txt,
+                     num_terms,
+                     count_collector.count(),
+                     timing.total_time());
+        }
+    }
+    Ok(())
+}
+
+/// Command-line entry point of the `tantivy-bench` binary.
+///
+/// Parses `-i/--index` (default `"."`) and `-q/--queries` (default
+/// `"query.txt"`), then hands both values to `run`.
+///
+/// # Panics
+///
+/// Panics if `run` returns an error.
+fn main() {
+    let mut directory = String::from(".");
+    let mut query_file = String::from("query.txt");
+    {
+        // The parser mutably borrows both strings; this scope ends those
+        // borrows before the strings are moved into `run`.
+        let mut ap = ArgumentParser::new();
+        ap.set_description("Benchmark a set of queries against a tantivy index");
+        ap.refer(&mut directory)
+          .add_option(&["-i", "--index"],
+                      Store,
+                      "Path to the tantivy index directory");
+        ap.refer(&mut query_file)
+          .add_option(&["-q", "--queries"],
+                      Store,
+                      "Path to the file containing the queries, one per line");
+        ap.parse_args_or_exit();
+    }
+    run(directory, query_file).unwrap();
+}
diff --git a/src/cli/merge.rs b/src/cli/merge.rs
index c7cbe34..0ca66ff 100644
--- a/src/cli/merge.rs
+++ b/src/cli/merge.rs
@@ -11,7 +11,7 @@ fn main() {
let mut ap = ArgumentParser::new();
ap.set_description("Merge a few segments together");
ap.refer(&mut directory)
- .add_option(&["-d", "--directory"],
+ .add_option(&["-i", "--index"],
Store,
"Path to the tantivy index directory");
ap.parse_args_or_exit();
diff --git a/src/common/timer.rs b/src/common/timer.rs
index 8e5277d..70d5051 100644
--- a/src/common/timer.rs
+++ b/src/common/timer.rs
@@ -46,7 +46,11 @@ impl TimerTree {
timings: Vec::new(),
}
}
-
+
+ /// Returns the `duration` of the last entry of `timings`.
+ ///
+ /// Panics if `timings` is empty.
+ pub fn total_time(&self,) -> i64 {
+     let last_timing = self.timings.last().unwrap();
+     last_timing.duration
+ }
+
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
name: name,
diff --git a/src/directory/directory.rs b/src/directory/directory.rs
index 398ea7f..b6d90f7 100644
--- a/src/directory/directory.rs
+++ b/src/directory/directory.rs
@@ -5,6 +5,10 @@ use std::fmt;
use std::path::Path;
use directory::{ReadOnlySource, WritePtr};
+
+/// There are currently two implementations of `Directory`
+/// - [RAMDirectory](index.html)
+///
pub trait Directory: fmt::Debug + Send + Sync {
fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource>;
fn open_write(&mut self, path: &Path) -> io::Result<WritePtr>;
diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs
index 2985cf3..92d75b1 100644
--- a/src/fastfield/reader.rs
+++ b/src/fastfield/reader.rs
@@ -63,6 +63,9 @@ impl U32FastFieldReader {
}
pub fn get(&self, doc: DocId) -> u32 {
+ if self.num_in_pack == 0u32 {
+ return self.min_val;
+ }
let long_addr = self.divider.divide(doc);
let ord_within_long = doc - long_addr * self.num_in_pack;
let bit_shift = (self.num_bits as u32) * ord_within_long;
diff --git a/src/postings/union_postings.rs b/src/postings/union_postings.rs
index c1b2483..43e6659 100644
--- a/src/postings/union_postings.rs
+++ b/src/postings/union_postings.rs
@@ -36,6 +36,7 @@ impl<TPostings: Postings> UnionPostings<TPostings> {
pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, postings: Vec<TPostings>, multi_term_scorer: MultiTermScorer) -> UnionPostings<TPostings> {
let num_postings = postings.len();
+ assert_eq!(fieldnorms_reader.len(), num_postings);
let mut union_postings = UnionPostings {
fieldnorms_readers: fieldnorms_reader,
postings: postings,
@@ -44,7 +45,7 @@ impl<TPostings: Postings> UnionPostings<TPostings> {
scorer: multi_term_scorer
};
for ord in 0..num_postings {
- union_postings.enqueue(ord);
+ union_postings.enqueue(ord);
}
union_postings
}
@@ -73,7 +74,8 @@ impl<TPostings: Postings> DocSet for UnionPostings<TPostings> {
let head = self.queue.pop();
match head {
Some(HeapItem(doc, ord, tf)) => {
- let fieldnorm = self.get_field_norm(ord, doc);
+ // let fieldnorm = self.get_field_norm(ord, doc);
+ let fieldnorm: u32 = 1u32;
self.scorer.update(ord, tf, fieldnorm);
self.enqueue(ord);
self.doc = doc;
diff --git a/src/query/multi_term_query.rs b/src/query/multi_term_query.rs
index df2790d..ba43150 100644
--- a/src/query/multi_term_query.rs
+++ b/src/query/multi_term_query.rs
@@ -54,6 +54,11 @@ impl Query for MultiTermQuery {
impl MultiTermQuery {
+
+ /// Returns how many terms this query holds.
+ pub fn num_terms(&self,) -> usize {
+     let terms = &self.terms;
+     terms.len()
+ }
+
fn scorer(&self, searcher: &Searcher) -> MultiTermScorer {
let num_docs = searcher.num_docs() as f32;
let idfs: Vec<f32> = self.terms.iter()
@@ -84,16 +89,12 @@ impl MultiTermQuery {
let mut decode_timer = timer.open("decode_all");
for term in &self.terms {
let _decode_one_timer = decode_timer.open("decode_one");
- match reader.read_postings(term) {
- Some(postings) => {
+ reader.read_postings(term)
+ .map(|postings| {
let field = term.get_field();
fieldnorms_readers.push(reader.get_fieldnorms_reader(field).unwrap());
segment_postings.push(postings);
- }
- None => {
- segment_postings.push(SegmentPostings::empty());
- }
- }
+ });
}
}
UnionPostings::new(fieldnorms_readers, segment_postings, multi_term_scorer)
diff --git a/src/query/multi_term_scorer.rs b/src/query/multi_term_scorer.rs
index f9f298f..f20990f 100644
--- a/src/query/multi_term_scorer.rs
+++ b/src/query/multi_term_scorer.rs
@@ -22,6 +22,7 @@ impl MultiTermScorer {
pub fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) {
if term_freq > 0 {
self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord];
+ // self.score += term_freq as f32; // / fieldnorm as f32).sqrt() * self.idf[term_ord];
self.num_fields += 1;
}
}
diff --git a/src/query/query_parser.rs b/src/query/query_parser.rs
index e8fa9cd..dcc344a 100644
--- a/src/query/query_parser.rs
+++ b/src/query/query_parser.rs
@@ -6,6 +6,8 @@ use common::TimerTree;
use query::{Query, MultiTermQuery};
use schema::Schema;
use schema::{Term, Field};
+use analyzer::SimpleTokenizer;
+use analyzer::StreamingIterator;
#[derive(Debug)]
pub enum ParsingError {
@@ -22,6 +24,16 @@ pub enum StandardQuery {
MultiTerm(MultiTermQuery),
}
+impl StandardQuery {
+    /// Returns the number of terms of the wrapped query.
+    pub fn num_terms(&self,) -> usize {
+        match *self {
+            StandardQuery::MultiTerm(ref multi_term_query) => multi_term_query.num_terms(),
+        }
+    }
+}
+
impl Query for StandardQuery {
fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> io::Result<TimerTree> {
match *self {
@@ -33,6 +45,22 @@ impl Query for StandardQuery {
}
+/// Tokenizes `text` with a `SimpleTokenizer` and builds one `Term` of `field`
+/// per token, in the order the tokens are produced.
+fn compute_terms(field: Field, text: &str) -> Vec<Term> {
+    let tokenizer = SimpleTokenizer::new();
+    let mut token_stream = tokenizer.tokenize(text);
+    let mut terms = Vec::new();
+    // Pull tokens one at a time until the stream reports that it is exhausted.
+    while let Some(token_str) = token_stream.next() {
+        terms.push(Term::from_field_text(field, token_str));
+    }
+    terms
+}
+
+
impl QueryParser {
pub fn new(schema: Schema,
default_fields: Vec<Field>) -> QueryParser {
@@ -50,18 +78,14 @@ impl QueryParser {
let terms = self.default_fields
.iter()
.cloned()
- .map(|field| Term::from_field_text(field, &val))
+ .flat_map(|field| compute_terms(field, &val))
.collect();
Ok(terms)
},
Literal::WithField(field_name, val) => {
match self.schema.get_field(&field_name) {
- Some(field) => {
- Ok(vec!(Term::from_field_text(field, &val)))
- }
- None => {
- Err(ParsingError::FieldDoesNotExist(field_name))
- }
+ Some(field) => Ok(compute_terms(field, &val)),
+ None => Err(ParsingError::FieldDoesNotExist(field_name))
}
}
}
@@ -109,7 +133,7 @@ pub fn query_language(input: State<&str>) -> ParseResult<Vec<Literal>, &str>
let field = many1(letter());
let term_query = (field, char(':'), term_val())
.map(|(field,_, value)| Literal::WithField(field, value));
- let term_default_field = term_val().map(|w| Literal::DefaultField(w));
+ let term_default_field = term_val().map(Literal::DefaultField);
try(term_query)
.or(term_default_field)
};
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index 5f4ba7c..7f11a9d 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -23,7 +23,7 @@ impl FieldEntry {
pub fn is_indexed(&self,) -> bool {
match self {
&FieldEntry::Text(_, ref options) => options.get_indexing_options().is_indexed(),
- _ => false,
+ _ => false, // TODO handle u32 indexed
}
}
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 560696d..8c6efb1 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -76,7 +76,6 @@ impl Schema {
fields_map: HashMap::new(),
}
}
-
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
&self.fields[field.0 as usize]