diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2016-08-02 10:21:28 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2016-08-02 10:21:28 +0900 |
commit | b1056b6455ec09595abc9a095a058b8cdbc1d384 (patch) | |
tree | d5c12427d6e9cdcaac0f6f18ab43ae4738a4a172 | |
parent | 8daf78351088e0e48bd414444aac2ecf10d94632 (diff) |
Added bench cli
-rw-r--r-- | Cargo.toml | 4 | ||||
-rw-r--r-- | src/analyzer/mod.rs | 1 | ||||
-rw-r--r-- | src/cli/bench.rs | 92 | ||||
-rw-r--r-- | src/cli/merge.rs | 2 | ||||
-rw-r--r-- | src/common/timer.rs | 6 | ||||
-rw-r--r-- | src/directory/directory.rs | 4 | ||||
-rw-r--r-- | src/fastfield/reader.rs | 3 | ||||
-rw-r--r-- | src/postings/union_postings.rs | 6 | ||||
-rw-r--r-- | src/query/multi_term_query.rs | 15 | ||||
-rw-r--r-- | src/query/multi_term_scorer.rs | 1 | ||||
-rw-r--r-- | src/query/query_parser.rs | 40 | ||||
-rw-r--r-- | src/schema/field_entry.rs | 2 | ||||
-rw-r--r-- | src/schema/schema.rs | 1 |
13 files changed, 156 insertions, 21 deletions
@@ -34,3 +34,7 @@ gcc = "0.3.24" [[bin]] name = "tantivy-merge" path = "src/cli/merge.rs" + +[[bin]] +name = "tantivy-bench" +path = "src/cli/bench.rs" diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs index 6f87f85..f26904e 100644 --- a/src/analyzer/mod.rs +++ b/src/analyzer/mod.rs @@ -55,6 +55,7 @@ impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> { } } } + } pub struct SimpleTokenizer; diff --git a/src/cli/bench.rs b/src/cli/bench.rs new file mode 100644 index 0000000..e5b66fe --- /dev/null +++ b/src/cli/bench.rs @@ -0,0 +1,92 @@ + + +extern crate argparse; +extern crate tantivy; + +use argparse::{ArgumentParser, Store}; +use tantivy::Index; +use tantivy::schema::{Field, Schema}; +use tantivy::query::QueryParser; +use tantivy::query::Query; +use std::path::Path; +use std::io::BufReader; +use std::io::BufRead; +use std::io; +use std::fs::File; +use tantivy::collector::chain; +use tantivy::collector::TopCollector; +use tantivy::collector::CountCollector; + + +fn extract_search_fields(schema: &Schema) -> Vec<Field> { + schema.fields() + .iter() + .enumerate() + .filter(|&(_, field_entry)| { + field_entry.is_indexed() + }) + .map(|(field_id, _)| field_id as u8) + .map(Field) + .collect() +} + +fn read_query_file(query_path: &String) -> io::Result<Vec<String>> { + let query_file: File = try!(File::open(&query_path)); + let file = BufReader::new(&query_file); + Ok(file.lines() + .map(|l| l.unwrap()) + .map(|q| String::from(q.trim())) + .collect()) +} + +fn run(directory: String, + query_filepath: String) -> io::Result<()> { + println!("Directory : {:?}", directory); + println!("Query : {:?}", directory); + + let index = try!(Index::open(Path::new(&directory))); + let searcher = try!(index.searcher()); + let default_search_fields: Vec<Field> = extract_search_fields(&index.schema()); + println!("Fields {:?}", default_search_fields); + + let queries = try!(read_query_file(&query_filepath)); + println!("queries {:?}", queries); + + let query_parser = QueryParser::new(index.schema(), default_search_fields); + + println!("{}\t{}\t{}\t{}", "query", "num_terms", "num hits", "time in microsecs"); + for _ in 0..10 { + for query_txt in &queries { + let query = query_parser.parse_query(&query_txt).unwrap(); + let num_terms = query.num_terms(); + let mut top_collector = TopCollector::with_limit(10); + let mut count_collector = CountCollector::new(); + let timing; + { + let mut collector = chain().add(&mut top_collector).add(&mut count_collector); + timing = try!(query.search(&searcher, &mut collector)); + } + println!("{}\t{}\t{}\t{}", query_txt, num_terms, count_collector.count(), timing.total_time()); + } + } + Ok(()) +} + +fn main() { + let mut directory = String::from("."); + let mut query_file = String::from("query.txt"); + { + let mut ap = ArgumentParser::new(); + ap.set_description("Merge a few segments together"); + ap.refer(&mut directory) + .add_option(&["-i", "--index"], + Store, + "Path to the tantivy index directory"); + ap.refer(&mut query_file) + .add_option(&["-q", "--queries"], + Store, + "Path to the tantivy index directory"); + ap.parse_args_or_exit(); + } + run(directory, query_file).unwrap(); +} diff --git a/src/cli/merge.rs b/src/cli/merge.rs index c7cbe34..0ca66ff 100644 --- a/src/cli/merge.rs +++ b/src/cli/merge.rs @@ -11,7 +11,7 @@ fn main() { let mut ap = ArgumentParser::new(); ap.set_description("Merge a few segments together"); ap.refer(&mut directory) - .add_option(&["-d", "--directory"], + .add_option(&["-i", "--index"], Store, "Path to the tantivy index directory"); ap.parse_args_or_exit(); diff --git a/src/common/timer.rs b/src/common/timer.rs index 8e5277d..70d5051 100644 --- a/src/common/timer.rs +++ b/src/common/timer.rs @@ -46,7 +46,11 @@ impl TimerTree { timings: Vec::new(), } } - + + pub fn total_time(&self,) -> i64 { + self.timings.last().unwrap().duration + } + pub fn open(&mut self, name: &'static str) -> OpenTimer { OpenTimer { name: name, diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 398ea7f..b6d90f7 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -5,6 +5,10 @@ use std::fmt; use std::path::Path; use directory::{ReadOnlySource, WritePtr}; + +/// There is currently two implementations of `Directory` +/// - [RAMDirectory](index.html) +/// pub trait Directory: fmt::Debug + Send + Sync { fn open_read(&self, path: &Path) -> io::Result<ReadOnlySource>; fn open_write(&mut self, path: &Path) -> io::Result<WritePtr>; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 2985cf3..92d75b1 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -63,6 +63,9 @@ impl U32FastFieldReader { } pub fn get(&self, doc: DocId) -> u32 { + if self.num_in_pack == 0u32 { + return self.min_val; + } let long_addr = self.divider.divide(doc); let ord_within_long = doc - long_addr * self.num_in_pack; let bit_shift = (self.num_bits as u32) * ord_within_long; diff --git a/src/postings/union_postings.rs b/src/postings/union_postings.rs index c1b2483..43e6659 100644 --- a/src/postings/union_postings.rs +++ b/src/postings/union_postings.rs @@ -36,6 +36,7 @@ impl<TPostings: Postings> UnionPostings<TPostings> { pub fn new(fieldnorms_reader: Vec<U32FastFieldReader>, postings: Vec<TPostings>, multi_term_scorer: MultiTermScorer) -> UnionPostings<TPostings> { let num_postings = postings.len(); + assert_eq!(fieldnorms_reader.len(), num_postings); let mut union_postings = UnionPostings { fieldnorms_readers: fieldnorms_reader, postings: postings, @@ -44,7 +45,7 @@ impl<TPostings: Postings> UnionPostings<TPostings> { scorer: multi_term_scorer }; for ord in 0..num_postings { - union_postings.enqueue(ord); + union_postings.enqueue(ord); } union_postings } @@ -73,7 +74,8 @@ impl<TPostings: Postings> DocSet for UnionPostings<TPostings> { let head = self.queue.pop(); match head { Some(HeapItem(doc, ord, tf)) => { - let fieldnorm = self.get_field_norm(ord, doc); + // let fieldnorm = self.get_field_norm(ord, doc); + let fieldnorm: u32 = 1u32; self.scorer.update(ord, tf, fieldnorm); self.enqueue(ord); self.doc = doc; diff --git a/src/query/multi_term_query.rs b/src/query/multi_term_query.rs index df2790d..ba43150 100644 --- a/src/query/multi_term_query.rs +++ b/src/query/multi_term_query.rs @@ -54,6 +54,11 @@ impl Query for MultiTermQuery { impl MultiTermQuery { + + pub fn num_terms(&self,) -> usize { + self.terms.len() + } + fn scorer(&self, searcher: &Searcher) -> MultiTermScorer { let num_docs = searcher.num_docs() as f32; let idfs: Vec<f32> = self.terms.iter() @@ -84,16 +89,12 @@ impl MultiTermQuery { let mut decode_timer = timer.open("decode_all"); for term in &self.terms { let _decode_one_timer = decode_timer.open("decode_one"); - match reader.read_postings(term) { - Some(postings) => { + reader.read_postings(term) + .map(|postings| { let field = term.get_field(); fieldnorms_readers.push(reader.get_fieldnorms_reader(field).unwrap()); segment_postings.push(postings); - } - None => { - segment_postings.push(SegmentPostings::empty()); - } - } + }); } } UnionPostings::new(fieldnorms_readers, segment_postings, multi_term_scorer) diff --git a/src/query/multi_term_scorer.rs b/src/query/multi_term_scorer.rs index f9f298f..f20990f 100644 --- a/src/query/multi_term_scorer.rs +++ b/src/query/multi_term_scorer.rs @@ -22,6 +22,7 @@ impl MultiTermScorer { pub fn update(&mut self, term_ord: usize, term_freq: u32, fieldnorm: u32) { if term_freq > 0 { self.score += (term_freq as f32 / fieldnorm as f32).sqrt() * self.idf[term_ord]; + // self.score += term_freq as f32; // / fieldnorm as f32).sqrt() * self.idf[term_ord]; self.num_fields += 1; } } diff --git a/src/query/query_parser.rs b/src/query/query_parser.rs index e8fa9cd..dcc344a 100644 --- a/src/query/query_parser.rs +++ b/src/query/query_parser.rs @@ -6,6 +6,8 @@ use common::TimerTree; use query::{Query, MultiTermQuery}; use schema::Schema; use schema::{Term, Field}; +use analyzer::SimpleTokenizer; +use analyzer::StreamingIterator; #[derive(Debug)] pub enum ParsingError { @@ -22,6 +24,16 @@ pub enum StandardQuery { MultiTerm(MultiTermQuery), } +impl StandardQuery { + pub fn num_terms(&self,) -> usize { + match self { + &StandardQuery::MultiTerm(ref q) => { + q.num_terms() + } + } + } +} + impl Query for StandardQuery { fn search<C: Collector>(&self, searcher: &Searcher, collector: &mut C) -> io::Result<TimerTree> { match *self { @@ -33,6 +45,22 @@ impl Query for StandardQuery { } +fn compute_terms(field: Field, text: &str) -> Vec<Term> { + let tokenizer = SimpleTokenizer::new(); + let mut tokens = Vec::new(); + let mut token_it = tokenizer.tokenize(text); + loop { + match token_it.next() { + Some(token_str) => { + tokens.push(Term::from_field_text(field, token_str)); + } + None => { break; } + } + } + tokens +} + + impl QueryParser { pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser { @@ -50,18 +78,14 @@ impl QueryParser { let terms = self.default_fields .iter() .cloned() - .map(|field| Term::from_field_text(field, &val)) + .flat_map(|field| compute_terms(field, &val)) .collect(); Ok(terms) }, Literal::WithField(field_name, val) => { match self.schema.get_field(&field_name) { - Some(field) => { - Ok(vec!(Term::from_field_text(field, &val))) - } - None => { - Err(ParsingError::FieldDoesNotExist(field_name)) - } + Some(field) => Ok(compute_terms(field, &val)), + None => Err(ParsingError::FieldDoesNotExist(field_name)) } } } @@ -109,7 +133,7 @@ pub fn query_language(input: State<&str>) -> ParseResult<Vec<Literal>, &str> let field = many1(letter()); let term_query = (field, char(':'), term_val()) .map(|(field,_, value)| Literal::WithField(field, value)); - let term_default_field = term_val().map(|w| Literal::DefaultField(w)); + let term_default_field = term_val().map(Literal::DefaultField); try(term_query) .or(term_default_field) }; diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 5f4ba7c..7f11a9d 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -23,7 +23,7 @@ impl FieldEntry { pub fn is_indexed(&self,) -> bool { match self { &FieldEntry::Text(_, ref options) => options.get_indexing_options().is_indexed(), - _ => false, + _ => false, // TODO handle u32 indexed } } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 560696d..8c6efb1 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -76,7 +76,6 @@ impl Schema { fields_map: HashMap::new(), } } - pub fn get_field_entry(&self, field: Field) -> &FieldEntry { &self.fields[field.0 as usize] |