diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2018-08-30 09:23:34 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2018-08-30 09:23:34 +0900 |
commit | a12d211330657931de2c972030504762cdbb8432 (patch) | |
tree | 45a02fa812f024b22858ca48bbf334c5b5eec54d /src/query | |
parent | 18814ba0c15e72dd2db09c589e647b863dbbea51 (diff) |
Extracting terms matching query in the document
Diffstat (limited to 'src/query')
-rw-r--r-- | src/query/automaton_weight.rs | 47 | ||||
-rw-r--r-- | src/query/term_query/term_weight.rs | 22 | ||||
-rw-r--r-- | src/query/weight.rs | 35 |
3 files changed, 104 insertions, 0 deletions
diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs index b38e659..d1040eb 100644 --- a/src/query/automaton_weight.rs +++ b/src/query/automaton_weight.rs @@ -7,6 +7,11 @@ use query::{Scorer, Weight}; use schema::{Field, IndexRecordOption}; use termdict::{TermDictionary, TermStreamer}; use Result; +use query::weight::MatchingTerms; +use SkipResult; +use Term; +use DocId; +use DocSet; /// A weight struct for Fuzzy Term and Regex Queries pub struct AutomatonWeight<A> @@ -36,6 +41,48 @@ impl<A> Weight for AutomatonWeight<A> where A: Automaton, { + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let max_doc = reader.max_doc(); + let mut doc_bitset = BitSet::with_max_value(max_doc); + + let inverted_index = reader.inverted_index(self.field); + let term_dict = inverted_index.terms(); + let mut term_stream = self.automaton_stream(term_dict); + + let doc_ids = matching_terms.sorted_doc_ids(); + let mut docs_matching_current_term: Vec<DocId> = vec![]; + + let mut term_buffer: Vec<u8> = vec![]; + + while term_stream.advance() { + docs_matching_current_term.clear(); + let term_info = term_stream.value(); + let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic); + for &doc_id in &doc_ids { + match segment_postings.skip_next(doc_id) { + SkipResult::Reached => { + docs_matching_current_term.push(doc_id); + } + SkipResult::OverStep => {} + SkipResult::End => {} + } + } + if !docs_matching_current_term.is_empty() { + term_buffer.clear(); + let term_ord = term_stream.term_ord(); + inverted_index.terms().ord_to_term(term_ord, &mut term_buffer); + let term = Term::from_field_bytes(self.field, &term_buffer[..]); + for &doc_id in &docs_matching_current_term { + matching_terms.add_term(doc_id, term.clone()); + } + } + } + Ok(()) + } + fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index ba45a80..1a9075b 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -8,6 +8,8 @@ use query::Weight; use schema::IndexRecordOption; use Result; use Term; +use SkipResult; +use query::weight::MatchingTerms; pub struct TermWeight { term: Term, @@ -38,6 +40,26 @@ impl Weight for TermWeight { } } + + fn matching_terms(&self, + reader: &SegmentReader, + matching_terms: &mut MatchingTerms) -> Result<()> { + let doc_ids = matching_terms.sorted_doc_ids(); + let mut scorer = self.scorer(reader)?; + for doc_id in doc_ids { + match scorer.skip_next(doc_id) { + SkipResult::Reached => { + matching_terms.add_term(doc_id, self.term.clone()); + } + SkipResult::OverStep => {} + SkipResult::End => { + break; + } + } + } + Ok(()) + } + fn count(&self, reader: &SegmentReader) -> Result<u32> { if reader.num_deleted_docs() == 0 { let field = self.term.field(); diff --git a/src/query/weight.rs b/src/query/weight.rs index d3d8b35..51289c5 100644 --- a/src/query/weight.rs +++ b/src/query/weight.rs @@ -1,6 +1,37 @@ use super::Scorer; use core::SegmentReader; use Result; +use DocId; +use std::collections::HashSet; +use Term; +use std::collections::BTreeMap; + + +pub struct MatchingTerms { + doc_to_terms: BTreeMap<DocId, HashSet<Term>> +} + +impl MatchingTerms { + pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms { + MatchingTerms { + doc_to_terms: doc_ids + .iter() + .cloned() + .map(|doc_id| (doc_id, HashSet::default())) + .collect() + } + } + + pub fn sorted_doc_ids(&self) -> Vec<DocId> { + self.doc_to_terms.keys().cloned().collect() + } + + pub fn add_term(&mut self, doc_id: DocId, term: Term) { + if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) { + terms.insert(term); + } + } +} /// A Weight is the specialization of a Query /// for a given set of segments. @@ -11,6 +42,10 @@ pub trait Weight { /// See [`Query`](./trait.Query.html). fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>; + fn matching_terms(&self, reader: &SegmentReader, matching_terms: &mut MatchingTerms) -> Result<()> { + Ok(()) + } + /// Returns the number documents within the given `SegmentReader`. fn count(&self, reader: &SegmentReader) -> Result<u32> { Ok(self.scorer(reader)?.count()) |