summary refs log tree commit diff stats
path: root/src/query
diff options
context:
space:
mode:
author Paul Masurel <paul.masurel@gmail.com> 2018-08-30 09:23:34 +0900
committer Paul Masurel <paul.masurel@gmail.com> 2018-08-30 09:23:34 +0900
commit a12d211330657931de2c972030504762cdbb8432 (patch)
tree 45a02fa812f024b22858ca48bbf334c5b5eec54d /src/query
parent 18814ba0c15e72dd2db09c589e647b863dbbea51 (diff)
Extracting terms matching query in the document
Diffstat (limited to 'src/query')
-rw-r--r-- src/query/automaton_weight.rs 47
-rw-r--r-- src/query/term_query/term_weight.rs 22
-rw-r--r-- src/query/weight.rs 35
3 files changed, 104 insertions, 0 deletions
diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs
index b38e659..d1040eb 100644
--- a/src/query/automaton_weight.rs
+++ b/src/query/automaton_weight.rs
@@ -7,6 +7,11 @@ use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use termdict::{TermDictionary, TermStreamer};
use Result;
+use query::weight::MatchingTerms;
+use SkipResult;
+use Term;
+use DocId;
+use DocSet;
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
@@ -36,6 +41,48 @@ impl<A> Weight for AutomatonWeight<A>
where
A: Automaton,
{
+
+    /// Records, for every doc id tracked by `matching_terms`, the terms
+    /// accepted by the automaton that were found in that document.
+    ///
+    /// Each term yielded by `automaton_stream` has its postings opened with
+    /// `IndexRecordOption::Basic` and probed with `skip_next` against the
+    /// ascending doc ids returned by `sorted_doc_ids`. Every
+    /// `SkipResult::Reached` is reported through `MatchingTerms::add_term`.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let inverted_index = reader.inverted_index(self.field);
+        let term_dict = inverted_index.terms();
+        let mut term_stream = self.automaton_stream(term_dict);
+
+        let doc_ids = matching_terms.sorted_doc_ids();
+        // Both buffers are cleared and reused for each term instead of being
+        // reallocated.
+        let mut docs_matching_current_term: Vec<DocId> = vec![];
+        let mut term_buffer: Vec<u8> = vec![];
+
+        while term_stream.advance() {
+            docs_matching_current_term.clear();
+            let term_info = term_stream.value();
+            let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic);
+            // NOTE(review): if `skip_next` always advances before comparing,
+            // a posting landed on by an `OverStep` is skipped by the next call
+            // even when it equals the next target doc id -- confirm against
+            // the `DocSet::skip_next` contract.
+            for &doc_id in &doc_ids {
+                match segment_postings.skip_next(doc_id) {
+                    SkipResult::Reached => {
+                        docs_matching_current_term.push(doc_id);
+                    }
+                    SkipResult::OverStep => {}
+                    SkipResult::End => {
+                        // Postings are exhausted: the remaining (larger) doc
+                        // ids cannot match, so stop probing, as `TermWeight`
+                        // does.
+                        break;
+                    }
+                }
+            }
+            // The term bytes are only materialized when at least one tracked
+            // document contains the term.
+            if !docs_matching_current_term.is_empty() {
+                term_buffer.clear();
+                let term_ord = term_stream.term_ord();
+                inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
+                let term = Term::from_field_bytes(self.field, &term_buffer[..]);
+                for &doc_id in &docs_matching_current_term {
+                    matching_terms.add_term(doc_id, term.clone());
+                }
+            }
+        }
+        Ok(())
+    }
+
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs
index ba45a80..1a9075b 100644
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -8,6 +8,8 @@ use query::Weight;
use schema::IndexRecordOption;
use Result;
use Term;
+use SkipResult;
+use query::weight::MatchingTerms;
pub struct TermWeight {
term: Term,
@@ -38,6 +40,26 @@ impl Weight for TermWeight {
}
}
+
+    /// Adds this weight's term to every doc id tracked by `matching_terms`
+    /// that the scorer reaches exactly.
+    ///
+    /// The tracked doc ids are visited in the order returned by
+    /// `sorted_doc_ids`, and the walk stops as soon as the scorer reports
+    /// `SkipResult::End`.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let targets = matching_terms.sorted_doc_ids();
+        let mut term_scorer = self.scorer(reader)?;
+        for target in targets {
+            match term_scorer.skip_next(target) {
+                SkipResult::End => break,
+                SkipResult::Reached => matching_terms.add_term(target, self.term.clone()),
+                SkipResult::OverStep => {}
+            }
+        }
+        Ok(())
+    }
+
fn count(&self, reader: &SegmentReader) -> Result<u32> {
if reader.num_deleted_docs() == 0 {
let field = self.term.field();
diff --git a/src/query/weight.rs b/src/query/weight.rs
index d3d8b35..51289c5 100644
--- a/src/query/weight.rs
+++ b/src/query/weight.rs
@@ -1,6 +1,37 @@
use super::Scorer;
use core::SegmentReader;
use Result;
+use DocId;
+use std::collections::HashSet;
+use Term;
+use std::collections::BTreeMap;
+
+
+/// Associates each tracked doc id with the set of terms recorded for it.
+///
+/// The tracked doc ids are fixed by `from_doc_ids`; `add_term` never adds
+/// new ones.
+pub struct MatchingTerms {
+    doc_to_terms: BTreeMap<DocId, HashSet<Term>>,
+}
+
+impl MatchingTerms {
+    /// Builds a `MatchingTerms` tracking `doc_ids`, each starting with an
+    /// empty term set.
+    pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms {
+        let mut doc_to_terms = BTreeMap::new();
+        for &doc_id in doc_ids {
+            doc_to_terms.insert(doc_id, HashSet::default());
+        }
+        MatchingTerms { doc_to_terms }
+    }
+
+    /// Returns the tracked doc ids in ascending order (the key order of the
+    /// underlying `BTreeMap`).
+    pub fn sorted_doc_ids(&self) -> Vec<DocId> {
+        self.doc_to_terms.keys().map(|&doc_id| doc_id).collect()
+    }
+
+    /// Records `term` for `doc_id`.
+    ///
+    /// The call is a no-op when `doc_id` is not one of the tracked doc ids.
+    pub fn add_term(&mut self, doc_id: DocId, term: Term) {
+        match self.doc_to_terms.get_mut(&doc_id) {
+            Some(terms) => {
+                terms.insert(term);
+            }
+            None => {}
+        }
+    }
+}
/// A Weight is the specialization of a Query
/// for a given set of segments.
@@ -11,6 +42,10 @@ pub trait Weight {
/// See [`Query`](./trait.Query.html).
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
+    /// Records into `matching_terms` the terms of this weight found in the
+    /// tracked documents of the given segment.
+    ///
+    /// The default implementation records nothing and returns `Ok(())`;
+    /// the parameters are underscore-prefixed because it does not use them.
+    fn matching_terms(&self, _reader: &SegmentReader, _matching_terms: &mut MatchingTerms) -> Result<()> {
+        Ok(())
+    }
+
/// Returns the number documents within the given `SegmentReader`.
fn count(&self, reader: &SegmentReader) -> Result<u32> {
Ok(self.scorer(reader)?.count())