Extracting terms matching query in the document

author: Paul Masurel <paul.masurel@gmail.com> 2018-08-30 09:23:34 +0900
committer: Paul Masurel <paul.masurel@gmail.com> 2018-08-30 09:23:34 +0900
commit: a12d211330657931de2c972030504762cdbb8432 (patch)
tree: 45a02fa812f024b22858ca48bbf334c5b5eec54d /src/query
parent: 18814ba0c15e72dd2db09c589e647b863dbbea51 (diff)
3 files changed, 104 insertions, 0 deletions
diff --git a/src/query/automaton_weight.rs b/src/query/automaton_weight.rs
index b38e659..d1040eb 100644
--- a/src/query/automaton_weight.rs
+++ b/src/query/automaton_weight.rs
@@ -7,6 +7,11 @@ use query::{Scorer, Weight};
 use schema::{Field, IndexRecordOption};
 use termdict::{TermDictionary, TermStreamer};
 use Result;
+use query::weight::MatchingTerms;
+use SkipResult;
+use Term;
+use DocId;
+use DocSet;
 
 /// A weight struct for Fuzzy Term and Regex Queries
 pub struct AutomatonWeight<A>
@@ -36,6 +41,48 @@ impl<A> Weight for AutomatonWeight<A>
 where
     A: Automaton,
 {
+
+    /// For every term accepted by the automaton, adds that term to each doc id
+    /// tracked by `matching_terms` that the term's postings reach via `skip_next`.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let inverted_index = reader.inverted_index(self.field);
+        let term_dict = inverted_index.terms();
+        let mut term_stream = self.automaton_stream(term_dict);
+
+        // Doc ids in ascending order, probed against each term's postings.
+        let doc_ids = matching_terms.sorted_doc_ids();
+        // Scratch buffers, cleared and reused for every term.
+        let mut docs_matching_current_term: Vec<DocId> = vec![];
+        let mut term_buffer: Vec<u8> = vec![];
+
+        while term_stream.advance() {
+            docs_matching_current_term.clear();
+            let term_info = term_stream.value();
+            let mut segment_postings = inverted_index.read_postings_from_terminfo(term_info, IndexRecordOption::Basic);
+            for &doc_id in &doc_ids {
+                match segment_postings.skip_next(doc_id) {
+                    SkipResult::Reached => docs_matching_current_term.push(doc_id),
+                    // NOTE(review): if `skip_next` advances before comparing, the doc it overstepped to is skipped by the next call - confirm.
+                    SkipResult::OverStep => {}
+                    // Postings exhausted: no later (larger) doc id can match this term.
+                    SkipResult::End => break,
+                }
+            }
+            if !docs_matching_current_term.is_empty() {
+                term_buffer.clear();
+                let term_ord = term_stream.term_ord();
+                inverted_index.terms().ord_to_term(term_ord, &mut term_buffer);
+                let term = Term::from_field_bytes(self.field, &term_buffer[..]);
+                for &doc_id in &docs_matching_current_term {
+                    matching_terms.add_term(doc_id, term.clone());
+                }
+            }
+        }
+        Ok(())
+    }
+
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let max_doc = reader.max_doc();
         let mut doc_bitset = BitSet::with_max_value(max_doc);
diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs
index ba45a80..1a9075b 100644
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -8,6 +8,8 @@ use query::Weight;
 use schema::IndexRecordOption;
 use Result;
 use Term;
+use SkipResult;
+use query::weight::MatchingTerms;
 
 pub struct TermWeight {
     term: Term,
@@ -38,6 +40,26 @@ impl Weight for TermWeight {
         }
     }
 
+
+    /// Adds `self.term` to each doc id tracked by `matching_terms` that this
+    /// segment's scorer reaches; stops once the scorer reports `End`.
+    fn matching_terms(&self,
+                      reader: &SegmentReader,
+                      matching_terms: &mut MatchingTerms) -> Result<()> {
+        let targets = matching_terms.sorted_doc_ids();
+        let mut scorer = self.scorer(reader)?;
+        for target in targets {
+            let outcome = scorer.skip_next(target);
+            if let SkipResult::End = outcome {
+                break;
+            }
+            if let SkipResult::Reached = outcome {
+                matching_terms.add_term(target, self.term.clone());
+            }
+        }
+        Ok(())
+    }
+
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
         if reader.num_deleted_docs() == 0 {
             let field = self.term.field();
diff --git a/src/query/weight.rs b/src/query/weight.rs
index d3d8b35..51289c5 100644
--- a/src/query/weight.rs
+++ b/src/query/weight.rs
@@ -1,6 +1,37 @@
 use super::Scorer;
 use core::SegmentReader;
 use Result;
+use DocId;
+use std::collections::HashSet;
+use Term;
+use std::collections::BTreeMap;
+
+
+/// Terms matched per document, for a set of doc ids fixed at construction.
+pub struct MatchingTerms {
+    doc_to_terms: BTreeMap<DocId, HashSet<Term>>
+}
+
+impl MatchingTerms {
+    /// Creates an instance tracking each of `doc_ids` with an empty term set.
+    pub fn from_doc_ids(doc_ids: &[DocId]) -> MatchingTerms {
+        let doc_to_terms = doc_ids.iter().map(|&doc_id| (doc_id, HashSet::default())).collect();
+        MatchingTerms { doc_to_terms }
+    }
+
+    /// Returns the tracked doc ids in ascending order (the map's key order).
+    pub fn sorted_doc_ids(&self) -> Vec<DocId> {
+        self.doc_to_terms.keys().cloned().collect()
+    }
+
+    /// Adds `term` to the term set of `doc_id`.
+    /// Doc ids that are not already tracked are silently ignored.
+    pub fn add_term(&mut self, doc_id: DocId, term: Term) {
+        if let Some(terms) = self.doc_to_terms.get_mut(&doc_id) {
+            terms.insert(term);
+        }
+    }
+}
 
 /// A Weight is the specialization of a Query
 /// for a given set of segments.
@@ -11,6 +42,10 @@ pub trait Weight {
     /// See [`Query`](./trait.Query.html).
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>>;
 
+    /// Records the terms this weight matched for the docs tracked by `matching_terms`.
+    /// The default implementation records nothing and returns `Ok(())`.
+    fn matching_terms(&self, _reader: &SegmentReader, _matching_terms: &mut MatchingTerms) -> Result<()> { Ok(()) }
+
     /// Returns the number documents within the given `SegmentReader`.
     fn count(&self, reader: &SegmentReader) -> Result<u32> {
         Ok(self.scorer(reader)?.count())
author	Paul Masurel <paul.masurel@gmail.com>	2018-08-30 09:23:34 +0900
committer	Paul Masurel <paul.masurel@gmail.com>	2018-08-30 09:23:34 +0900
commit	a12d211330657931de2c972030504762cdbb8432 (patch)
tree	45a02fa812f024b22858ca48bbf334c5b5eec54d /src/query
parent	18814ba0c15e72dd2db09c589e647b863dbbea51 (diff)