summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaul Masurel <paul.masurel@gmail.com>2019-07-07 17:12:31 +0900
committerGitHub <noreply@github.com>2019-07-07 17:12:31 +0900
commit3e368d92cb1ad9213ab1fede526d19f67f7f1e06 (patch)
tree58463d2a3f7530d2798079807735fd35d23d243f
parent0bc2c64a538b3a00956cfb5e2231fc1efaab656b (diff)
Issue/479 (#578)
* Sort by field relying on tweaked score * Sort by u64/i64 get independent methods.
-rw-r--r--CHANGELOG.md2
-rw-r--r--src/collector/custom_score_top_collector.rs126
-rw-r--r--src/collector/mod.rs9
-rw-r--r--src/collector/top_collector.rs9
-rw-r--r--src/collector/top_field_collector.rs272
-rw-r--r--src/collector/top_score_collector.rs416
-rw-r--r--src/collector/tweak_score_top_collector.rs129
-rw-r--r--src/core/searcher.rs2
-rw-r--r--src/error.rs5
-rw-r--r--src/fastfield/error.rs2
-rw-r--r--src/fastfield/mod.rs2
11 files changed, 673 insertions, 301 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2337131..0830c28 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,8 @@ Tantivy 0.10.0
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
+- Added an API to easily tweak or entirely replace the
+ default score. See `TopDocs::tweak_score` and `TopDocs::custom_score` (@pmasurel)
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
- Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs
new file mode 100644
index 0000000..c6f3b6b
--- /dev/null
+++ b/src/collector/custom_score_top_collector.rs
@@ -0,0 +1,126 @@
+use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
+use crate::collector::{Collector, SegmentCollector};
+use crate::Result;
+use crate::{DocAddress, DocId, Score, SegmentReader};
+
+pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
+ custom_scorer: TCustomScorer,
+ collector: TopCollector<TScore>,
+}
+
+impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
+where
+ TScore: Clone + PartialOrd,
+{
+ pub fn new(
+ custom_scorer: TCustomScorer,
+ limit: usize,
+ ) -> CustomScoreTopCollector<TCustomScorer, TScore> {
+ CustomScoreTopCollector {
+ custom_scorer,
+ collector: TopCollector::with_limit(limit),
+ }
+ }
+}
+
+/// A custom segment scorer makes it possible to define any kind of score
+/// for a given document belonging to a specific segment.
+///
+/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
+pub trait CustomSegmentScorer<TScore>: 'static {
+ /// Computes the score of a specific `doc`.
+ fn score(&self, doc: DocId) -> TScore;
+}
+
+/// `CustomScorer` makes it possible to define any kind of score.
+///
+/// The `CustomScorer` does not do much of the computation itself.
+/// Instead, it helps constructing `Self::Child` instances that will compute
+/// the score at a segment scale.
+pub trait CustomScorer<TScore>: Sync {
+ /// Type of the associated [`CustomSegmentScorer`](./trait.CustomSegmentScorer.html).
+ type Child: CustomSegmentScorer<TScore>;
+ /// Builds a child scorer for a specific segment. The child scorer is associated to
+ /// a specific segment.
+ fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
+}
+
+impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
+where
+ TCustomScorer: CustomScorer<TScore>,
+ TScore: 'static + PartialOrd + Clone + Send + Sync,
+{
+ type Fruit = Vec<(TScore, DocAddress)>;
+
+ type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>;
+
+ fn for_segment(
+ &self,
+ segment_local_id: u32,
+ segment_reader: &SegmentReader,
+ ) -> Result<Self::Child> {
+ let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?;
+ let segment_collector = self
+ .collector
+ .for_segment(segment_local_id, segment_reader)?;
+ Ok(CustomScoreTopSegmentCollector {
+ segment_collector,
+ segment_scorer,
+ })
+ }
+
+ fn requires_scoring(&self) -> bool {
+ false
+ }
+
+ fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
+ self.collector.merge_fruits(segment_fruits)
+ }
+}
+
+pub struct CustomScoreTopSegmentCollector<T, TScore>
+where
+ TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
+ T: CustomSegmentScorer<TScore>,
+{
+ segment_collector: TopSegmentCollector<TScore>,
+ segment_scorer: T,
+}
+
+impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore>
+where
+ TScore: 'static + PartialOrd + Clone + Send + Sync,
+ T: 'static + CustomSegmentScorer<TScore>,
+{
+ type Fruit = Vec<(TScore, DocAddress)>;
+
+ fn collect(&mut self, doc: DocId, _score: Score) {
+ let score = self.segment_scorer.score(doc);
+ self.segment_collector.collect(doc, score);
+ }
+
+ fn harvest(self) -> Vec<(TScore, DocAddress)> {
+ self.segment_collector.harvest()
+ }
+}
+
+impl<F, TScore, T> CustomScorer<TScore> for F
+where
+ F: 'static + Send + Sync + Fn(&SegmentReader) -> T,
+ T: CustomSegmentScorer<TScore>,
+{
+ type Child = T;
+
+ fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
+ Ok((self)(segment_reader))
+ }
+}
+
+impl<F, TScore> CustomSegmentScorer<TScore> for F
+where
+ F: 'static + Sync + Send + Fn(DocId) -> TScore,
+{
+ fn score(&self, doc: DocId) -> TScore {
+ (self)(doc)
+ }
+}
diff --git a/src/collector/mod.rs b/src/collector/mod.rs
index c2ae35b..c19282e 100644
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -66,7 +66,7 @@ let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
The `Collector` trait is implemented for up to 4 collectors.
If you have more than 4 collectors, you can either group them into
-tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
+tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
# Combining several collectors dynamically
@@ -103,8 +103,11 @@ mod top_collector;
mod top_score_collector;
pub use self::top_score_collector::TopDocs;
-mod top_field_collector;
-pub use self::top_field_collector::TopDocsByField;
+mod custom_score_top_collector;
+pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
+
+mod tweak_score_top_collector;
+pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::FacetCollector;
diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs
index ea2b7ca..9a9d2dc 100644
--- a/src/collector/top_collector.rs
+++ b/src/collector/top_collector.rs
@@ -177,9 +177,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
#[cfg(test)]
mod tests {
- use super::{TopCollector, TopSegmentCollector};
+ use super::TopSegmentCollector;
use crate::DocAddress;
- use crate::Score;
#[test]
fn test_top_collector_not_at_capacity() {
@@ -215,10 +214,4 @@ mod tests {
]
);
}
-
- #[test]
- #[should_panic]
- fn test_top_0() {
- let _collector: TopCollector<Score> = TopCollector::with_limit(0);
- }
}
diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs
deleted file mode 100644
index 9a91d9e..0000000
--- a/src/collector/top_field_collector.rs
+++ /dev/null
@@ -1,272 +0,0 @@
-use super::Collector;
-use crate::collector::top_collector::TopCollector;
-use crate::collector::top_collector::TopSegmentCollector;
-use crate::collector::SegmentCollector;
-use crate::fastfield::FastFieldReader;
-use crate::fastfield::FastValue;
-use crate::schema::Field;
-use crate::DocAddress;
-use crate::Result;
-use crate::SegmentLocalId;
-use crate::SegmentReader;
-use crate::TantivyError;
-use std::marker::PhantomData;
-
-/// The Top Field Collector keeps track of the K documents
-/// sorted by a fast field in the index
-///
-/// The implementation is based on a `BinaryHeap`.
-/// The theorical complexity for collecting the top `K` out of `n` documents
-/// is `O(n log K)`.
-///
-/// ```rust
-/// #[macro_use]
-/// extern crate tantivy;
-/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
-/// # use tantivy::{Index, Result, DocAddress};
-/// # use tantivy::query::{Query, QueryParser};
-/// use tantivy::Searcher;
-/// use tantivy::collector::TopDocs;
-///
-/// # fn main() -> tantivy::Result<()> {
-/// # let mut schema_builder = Schema::builder();
-/// # let title = schema_builder.add_text_field("title", TEXT);
-/// # let rating = schema_builder.add_u64_field("rating", FAST);
-/// # let schema = schema_builder.build();
-/// # let index = Index::create_in_ram(schema);
-/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
-/// # index_writer.add_document(doc!(
-/// # title => "The Name of the Wind",
-/// # rating => 92u64,
-/// # ));
-/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
-/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
-/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
-/// # index_writer.commit()?;
-/// # let reader = index.reader()?;
-/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
-/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
-/// # assert_eq!(top_docs,
-/// # vec![(97u64, DocAddress(0u32, 1)),
-/// # (80u64, DocAddress(0u32, 3))]);
-/// # Ok(())
-/// # }
-/// #
-/// /// Searches the document matching the given query, and
-/// /// collects the top 10 documents, order by the `field`
-/// /// given in argument.
-/// ///
-/// /// `field` is required to be a FAST field.
-/// fn docs_sorted_by_rating(searcher: &Searcher,
-/// query: &Query,
-/// sort_by_field: Field)
-/// -> Result<Vec<(u64, DocAddress)>> {
-///
-/// // This is where we build our collector!
-/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
-///
-/// // ... and here is our documents. Not this is a simple vec.
-/// // The `u64` in the pair is the value of our fast field for each documents.
-/// searcher.search(query, &top_docs_by_rating)
-/// }
-/// ```
-pub struct TopDocsByField<T> {
- collector: TopCollector<T>,
- field: Field,
-}
-
-impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
- /// Creates a top field collector, with a number of documents equal to "limit".
- ///
- /// The given field name must be a fast field, otherwise the collector have an error while
- /// collecting results.
- ///
- /// This constructor is crate-private. Client are supposed to call
- /// build `TopDocsByField` object using the `TopDocs` API.
- ///
- /// e.g.:
- /// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
- ///
- /// # Panics
- /// The method panics if limit is 0
- pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
- TopDocsByField {
- collector: TopCollector::with_limit(limit),
- field,
- }
- }
-}
-
-impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
- type Fruit = Vec<(T, DocAddress)>;
-
- type Child = TopFieldSegmentCollector<T>;
-
- fn for_segment(
- &self,
- segment_local_id: SegmentLocalId,
- reader: &SegmentReader,
- ) -> Result<TopFieldSegmentCollector<T>> {
- let collector = self.collector.for_segment(segment_local_id, reader)?;
- let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
- let field_name = reader.schema().get_field_name(self.field);
- TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
- })?;
- Ok(TopFieldSegmentCollector {
- collector,
- reader,
- _type: PhantomData,
- })
- }
-
- fn requires_scoring(&self) -> bool {
- false
- }
-
- fn merge_fruits(
- &self,
- segment_fruits: Vec<Vec<(T, DocAddress)>>,
- ) -> Result<Vec<(T, DocAddress)>> {
- self.collector.merge_fruits(segment_fruits)
- }
-}
-
-pub struct TopFieldSegmentCollector<T> {
- collector: TopSegmentCollector<u64>,
- reader: FastFieldReader<u64>,
- _type: PhantomData<T>,
-}
-
-impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
- for TopFieldSegmentCollector<T>
-{
- type Fruit = Vec<(T, DocAddress)>;
-
- fn collect(&mut self, doc: u32, _score: f32) {
- let field_value = self.reader.get(doc);
- self.collector.collect(doc, field_value);
- }
-
- fn harvest(self) -> Vec<(T, DocAddress)> {
- self.collector
- .harvest()
- .into_iter()
- .map(|(val, doc_address)| (T::from_u64(val), doc_address))
- .collect()
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::TopDocsByField;
- use crate::collector::Collector;
- use crate::collector::TopDocs;
- use crate::query::Query;
- use crate::query::QueryParser;
- use crate::schema::Field;
- use crate::schema::IntOptions;
- use crate::schema::{Schema, FAST, TEXT};
- use crate::DocAddress;
- use crate::Index;
- use crate::IndexWriter;
- use crate::TantivyError;
- use matches::assert_matches;
-
- const TITLE: &str = "title";
- const SIZE: &str = "size";
-
- #[test]
- fn test_top_collector_not_at_capacity() {
- let mut schema_builder = Schema::builder();
- let title = schema_builder.add_text_field(TITLE, TEXT);
- let size = schema_builder.add_u64_field(SIZE, FAST);
- let schema = schema_builder.build();
- let (index, query) = index("beer", title, schema, |index_writer| {
- index_writer.add_document(doc!(
- title => "bottle of beer",
- size => 12u64,
- ));
- index_writer.add_document(doc!(
- title => "growler of beer",
- size => 64u64,
- ));
- index_writer.add_document(doc!(
- title => "pint of beer",
- size => 16u64,
- ));
- });
- let searcher = index.reader().unwrap().searcher();
-
- let top_collector = TopDocs::with_limit(4).order_by_field(size);
- let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
- assert_eq!(
- top_docs,
- vec![
- (64, DocAddress(0, 1)),
- (16, DocAddress(0, 2)),
- (12, DocAddress(0, 0))
- ]
- );
- }
-
- #[test]
- #[should_panic]
- fn test_field_does_not_exist() {
- let mut schema_builder = Schema::builder();
- let title = schema_builder.add_text_field(TITLE, TEXT);
- let size = schema_builder.add_u64_field(SIZE, FAST);
- let schema = schema_builder.build();
- let (index, _) = index("beer", title, schema, |index_writer| {
- index_writer.add_document(doc!(
- title => "bottle of beer",
- size => 12u64,
- ));
- });
- let searcher = index.reader().unwrap().searcher();
- let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
- let segment_reader = searcher.segment_reader(0u32);
- top_collector
- .for_segment(0, segment_reader)
- .expect("should panic");
- }
-
- #[test]
- fn test_field_not_fast_field() {
- let mut schema_builder = Schema::builder();
- let title = schema_builder.add_text_field(TITLE, TEXT);
- let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
- let schema = schema_builder.build();
- let (index, _) = index("beer", title, schema, |index_writer| {
- index_writer.add_document(doc!(
- title => "bottle of beer",
- size => 12u64,
- ));
- });
- let searcher = index.reader().unwrap().searcher();
- let segment = searcher.segment_reader(0);
- let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
- assert_matches!(
- top_collector
- .for_segment(0, segment)
- .map(|_| ())
- .unwrap_err(),
- TantivyError::SchemaError(_)
- );
- }
-
- fn index(
- query: &str,
- query_field: Field,
- schema: Schema,
- mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
- ) -> (Index, Box<dyn Query>) {
- let index = Index::create_in_ram(schema);
-
- let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
- doc_adder(&mut index_writer);
- index_writer.commit().unwrap();
- let query_parser = QueryParser::for_index(&index, vec![query_field]);
- let query = query_parser.parse_query(query).unwrap();
- (index, query)
- }
-}
diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs
index bc247d4..c9b03d0 100644
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -1,9 +1,11 @@
use super::Collector;
+use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
use crate::collector::top_collector::TopCollector;
use crate::collector::top_collector::TopSegmentCollector;
-use crate::collector::SegmentCollector;
-use crate::collector::TopDocsByField;
-use crate::fastfield::FastValue;
+use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
+use crate::collector::{
+ CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
+};
use crate::schema::Field;
use crate::DocAddress;
use crate::DocId;
@@ -77,13 +79,311 @@ impl TopDocs {
/// Set top-K to rank documents by a given fast field.
///
- /// (By default, `TopDocs` collects the top-K documents sorted by
- /// the similarity score.)
- pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
+ /// ```rust
+ /// #[macro_use]
+ /// extern crate tantivy;
+ /// # use tantivy::schema::{Schema, FAST, TEXT};
+ /// # use tantivy::{Index, Result, DocAddress};
+ /// # use tantivy::query::{Query, QueryParser};
+ /// use tantivy::Searcher;
+ /// use tantivy::collector::TopDocs;
+ /// use tantivy::schema::Field;
+ ///
+ /// # fn main() -> tantivy::Result<()> {
+ /// # let mut schema_builder = Schema::builder();
+ /// # let title = schema_builder.add_text_field("title", TEXT);
+ /// # let rating = schema_builder.add_u64_field("rating", FAST);
+ /// # let schema = schema_builder.build();
+ /// #
+ /// # let index = Index::create_in_ram(schema);
+ /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+ /// # index_writer.add_document(doc!(
+ /// # title => "The Name of the Wind",
+ /// # rating => 92u64,
+ /// # ));
+ /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
+ /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
+ /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
+ /// # index_writer.commit()?;
+ /// # let reader = index.reader()?;
+ /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
+ /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
+ /// # assert_eq!(top_docs,
+ /// # vec![(97u64, DocAddress(0u32, 1)),
+ /// # (80u64, DocAddress(0u32, 3))]);
+ /// # Ok(())
+ /// # }
+ ///
+ ///
+ /// /// Searches the document matching the given query, and
+ /// /// collects the top 10 documents, ordered by the u64-`field`
+ /// /// given in argument.
+ /// ///
+ /// /// `field` is required to be a FAST field.
+ /// fn docs_sorted_by_rating(searcher: &Searcher,
+ /// query: &Query,
+ /// sort_by_field: Field)
+ /// -> Result<Vec<(u64, DocAddress)>> {
+ ///
+ /// // This is where we build our topdocs collector
+ /// //
+ /// // Note the generics parameter that needs to match the
+ /// // type `sort_by_field`.
+ /// let top_docs_by_rating = TopDocs
+ /// ::with_limit(10)
+ /// .order_by_u64_field(sort_by_field);
+ ///
+ /// // ... and here are our documents. Note this is a simple vec.
+ /// // The `u64` in the pair is the value of our fast field for
+ /// // each document.
+ /// //
+ /// // The vec is sorted decreasingly by `sort_by_field`, and has a
+ /// // length of 10, or less if not enough documents matched the
+ /// // query.
+ /// let resulting_docs: Vec<(u64, DocAddress)> =
+ /// searcher.search(query, &top_docs_by_rating)?;
+ ///
+ /// Ok(resulting_docs)
+ /// }
+ /// ```
+ ///
+ /// # Panics
+ ///
+ /// May panic if the field requested is not a fast field.
+ ///
+ pub fn order_by_u64_field(
self,
field: Field,
- ) -> TopDocsByField<T> {
- TopDocsByField::new(field, self.0.limit())
+ ) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
+ self.custom_score(move |segment_reader: &SegmentReader| {
+ let ff_reader = segment_reader
+ .fast_fields()
+ .u64(field)
+ .expect("Field requested is not a i64/u64 fast field.");
+ move |doc: DocId| ff_reader.get(doc)
+ })
+ }
+
+ /// Ranks the documents using a custom score.
+ ///
+ /// This method offers a convenient way to tweak or replace
+ /// the documents score. As suggested by the prototype you can
+ /// manually define your own [`ScoreTweaker`](./trait.ScoreTweaker.html)
+ /// and pass it as an argument, but there is a much simpler way to
+ /// tweak your score: you can use a closure as in the following
+ /// example.
+ ///
+ /// # Example
+ ///
+ /// Typically, you will want to rely on one or more fast fields,
+ /// to alter the original relevance `Score`.
+ ///
+ /// For instance, in the following, we assume that we are implementing
+ /// an e-commerce website that has a fast field called `popularity`
+ /// that rates whether a product is typically often bought by users.
+ ///
+ /// In the following example we will tweak our ranking a bit by
+ /// boosting popular products a notch.
+ ///
+ /// In a more serious application, this tweaking could involve running a
+ /// learning-to-rank model over various features.
+ ///
+ /// ```rust
+ /// #[macro_use]
+ /// extern crate tantivy;
+ /// # use tantivy::schema::{Schema, FAST, TEXT};
+ /// # use tantivy::{Index, DocAddress, DocId, Score};
+ /// # use tantivy::query::QueryParser;
+ /// use tantivy::SegmentReader;
+ /// use tantivy::collector::TopDocs;
+ /// use tantivy::schema::Field;
+ ///
+ /// # fn create_schema() -> Schema {
+ /// # let mut schema_builder = Schema::builder();
+ /// # schema_builder.add_text_field("product_name", TEXT);
+ /// # schema_builder.add_u64_field("popularity", FAST);
+ /// # schema_builder.build()
+ /// # }
+ /// #
+ /// # fn main() -> tantivy::Result<()> {
+ /// # let schema = create_schema();
+ /// # let index = Index::create_in_ram(schema);
+ /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+ /// # let product_name = index.schema().get_field("product_name").unwrap();
+ /// #
+ /// let popularity: Field = index.schema().get_field("popularity").unwrap();
+ /// # index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
+ /// # index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
+ /// # index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
+ /// # index_writer.commit()?;
+ /// // ...
+ /// # let user_query = "diary";
+ /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
+ ///
+ /// // This is where we build our collector with our custom score.
+ /// let top_docs_by_custom_score = TopDocs
+ /// ::with_limit(10)
+ /// .tweak_score(move |segment_reader: &SegmentReader| {
+ /// // The argument is a function that returns our scoring
+ /// // function.
+ /// //
+ /// // The point of this "mother" function is to gather all
+ /// // of the segment level information we need for scoring.
+ /// // Typically, fast_fields.
+ /// //
+ /// // In our case, we will get a reader for the popularity
+ /// // fast field.
+ /// let popularity_reader =
+ /// segment_reader.fast_fields().u64(popularity).unwrap();
+ ///
+ /// // We can now define our actual scoring function
+ /// move |doc: DocId, original_score: Score| {
+ /// let popularity: u64 = popularity_reader.get(doc);
+ /// // Well.. For the sake of the example we use a simple logarithm
+ /// // function.
+ /// let popularity_boost_score = ((2u64 + popularity) as f32).log2();
+ /// popularity_boost_score * original_score
+ /// }
+ /// });
+ /// # let reader = index.reader()?;
+ /// # let searcher = reader.searcher();
+ /// // ... and here are our documents. Note this is a simple vec.
+ /// // The `Score` in the pair is our tweaked score.
+ /// let resulting_docs: Vec<(Score, DocAddress)> =
+ /// searcher.search(&*query, &top_docs_by_custom_score)?;
+ ///
+ /// # Ok(())
+ /// # }
+ /// ```
+ ///
+ /// # See also
+ /// [custom_score(...)](#method.custom_score).
+ pub fn tweak_score<TScore, TScoreSegmentTweaker, TScoreTweaker>(
+ self,
+ score_tweaker: TScoreTweaker,
+ ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
+ where
+ TScore: 'static + Send + Sync + Clone + PartialOrd,
+ TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
+ TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
+ {
+ TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
+ }
+
+ /// Ranks the documents using a custom score.
+ ///
+ /// This method offers a convenient way to use a different score.
+ ///
+ /// As suggested by the prototype you can manually define your
+ /// own [`CustomScorer`](./trait.CustomScorer.html)
+ /// and pass it as an argument, but there is a much simpler way to
+ /// tweak your score: you can use a closure as in the following
+ /// example.
+ ///
+ /// # Limitation
+ ///
+ /// This method only makes it possible to compute the score from a given
+ /// `DocId`, fastfield values for the doc and any information you could
+ /// have precomputed beforehand. It does not make it possible for instance
+ /// to compute something like TfIdf as it does not have access to the list of query
+ /// terms present in the document, nor the term frequencies for the different terms.
+ ///
+ /// It can be used if your search engine relies on a learning-to-rank model for instance,
+ /// which does not rely on the term frequencies or positions as features.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # #[macro_use]
+ /// # extern crate tantivy;
+ /// # use tantivy::schema::{Schema, FAST, TEXT};
+ /// # use tantivy::{Index, DocAddress, DocId};
+ /// # use tantivy::query::QueryParser;
+ /// use tantivy::SegmentReader;
+ /// use tantivy::collector::TopDocs;
+ /// use tantivy::schema::Field;
+ ///
+ /// # fn create_schema() -> Schema {
+ /// # let mut schema_builder = Schema::builder();
+ /// # schema_builder.add_text_field("product_name", TEXT);
+ /// # schema_builder.add_u64_field("popularity", FAST);
+ /// # schema_builder.add_u64_field("boosted", FAST);
+ /// # schema_builder.build()
+ /// # }
+ /// #
+ /// # fn main() -> tantivy::Result<()> {
+ /// # let schema = create_schema();
+ /// # let index = Index::create_in_ram(schema);
+ /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+ /// # let product_name = index.schema().get_field("product_name").unwrap();
+ /// #
+ /// let popularity: Field = index.schema().get_field("popularity").unwrap();
+ /// let boosted: Field = index.schema().get_field("boosted").unwrap();
+ /// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
+ /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
+ /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
+ /// # index_writer.commit()?;
+ /// // ...
+ /// # let user_query = "diary";
+ /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
+ ///
+ /// // This is where we build our collector with our custom score.
+ /// let top_docs_by_custom_score = TopDocs
+ /// ::with_limit(10)
+ /// .custom_score(move |segment_reader: &SegmentReader| {
+ /// // The argument is a function that returns our scoring
+ /// // function.
+ /// //
+ /// // The point of this "mother" function is to gather all
+ /// // of the segment level information we need for scoring.
+ /// // Typically, fast_fields.
+ /// //
+ /// // In our case, we will get a reader for the popularity
+ /// // fast field and a boosted field.
+ /// //
+ /// // We want to get boosted items score, and when we get
+ /// // a tie, return the item with the highest popularity.
+ /// //
+ /// // Note that this is implemented by using a `(u64, u64)`
+ /// // as a score.
+ /// let popularity_reader =
+ /// segment_reader.fast_fields().u64(popularity).unwrap();
+ /// let boosted_reader =
+ /// segment_reader.fast_fields().u64(boosted).unwrap();
+ ///
+ /// // We can now define our actual scoring function
+ /// move |doc: DocId| {
+ /// let popularity: u64 = popularity_reader.get(doc);
+ /// let boosted: u64 = boosted_reader.get(doc);
+ /// // Scores do not have to be `f32` in tantivy.
+ /// // Here we return a couple to get lexicographical order
+ /// // for free.
+ /// (boosted, popularity)
+ /// }
+ /// });
+ /// # let reader = index.reader()?;
+ /// # let searcher = reader.searcher();
+ /// // ... and here are our documents. Note this is a simple vec.
+ /// // The `(u64, u64)` in the pair is our custom score.
+ /// let resulting_docs: Vec<((u64, u64), DocAddress)> =
+ /// searcher.search(&*query, &top_docs_by_custom_score)?;
+ ///
+ /// # Ok(())
+ /// # }
+ /// ```
+ ///
+ /// # See also
+ /// [tweak_score(...)](#method.tweak_score).
+ pub fn custom_score<TScore, TCustomSegmentScorer, TCustomScorer>(
+ self,
+ custom_score: TCustomScorer,
+ ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
+ where
+ TScore: 'static + Send + Sync + Clone + PartialOrd,
+ TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
+ TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
+ {
+ CustomScoreTopCollector::new(custom_score, self.0.limit())
}
}
@@ -128,11 +428,12 @@ impl SegmentCollector for TopScoreSegmentCollector {
#[cfg(test)]
mod tests {
use super::TopDocs;
- use crate::query::QueryParser;
- use crate::schema::Schema;
- use crate::schema::TEXT;
+ use crate::collector::Collector;
+ use crate::query::{Query, QueryParser};
+ use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index;
+ use crate::IndexWriter;
use crate::Score;
fn make_index() -> Index {
@@ -200,4 +501,97 @@ mod tests {
TopDocs::with_limit(0);
}
+ const TITLE: &str = "title";
+ const SIZE: &str = "size";
+
+ #[test]
+ fn test_top_field_collector_not_at_capacity() {
+ let mut schema_builder = Schema::builder();
+ let title = schema_builder.add_text_field(TITLE, TEXT);
+ let size = schema_builder.add_u64_field(SIZE, FAST);
+ let schema = schema_builder.build();
+ let (index, query) = index("beer", title, schema, |index_writer| {
+ index_writer.add_document(doc!(
+ title => "bottle of beer",
+ size => 12u64,
+ ));
+ index_writer.add_document(doc!(
+ title => "growler of beer",
+ size => 64u64,
+ ));
+ index_writer.add_document(doc!(
+ title => "pint of beer",
+ size => 16u64,
+ ));
+ });
+ let searcher = index.reader().unwrap().searcher();
+
+ let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
+ let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
+ assert_eq!(
+ top_docs,
+ vec![
+ (64, DocAddress(0, 1)),
+ (16, DocAddress(0, 2)),
+ (12, DocAddress(0, 0))
+ ]
+ );
+ }
+
+ #[test]
+ #[should_panic]
+ fn test_field_does_not_exist() {
+ let mut schema_builder = Schema::builder();
+ let title = schema_builder.add_text_field(TITLE, TEXT);
+ let size = schema_builder.add_u64_field(SIZE, FAST);
+ let schema = schema_builder.build();
+ let (index, _) = index("beer", title, schema, |index_writer| {
+ index_writer.add_document(doc!(
+ title => "bottle of beer",
+ size => 12u64,
+ ));
+ });
+ let searcher = index.reader().unwrap().searcher();
+ let top_collector = TopDocs::with_limit(4).order_by_u64