Issue/479 (#578)

* Sort by field relying on tweaked score * Sort by u64/i64 get independent methods.
author: Paul Masurel <paul.masurel@gmail.com> 2019-07-07 17:12:31 +0900
committer: GitHub <noreply@github.com> 2019-07-07 17:12:31 +0900
commit: 3e368d92cb1ad9213ab1fede526d19f67f7f1e06 (patch)
tree: 58463d2a3f7530d2798079807735fd35d23d243f
parent: 0bc2c64a538b3a00956cfb5e2231fc1efaab656b (diff)
11 files changed, 673 insertions, 301 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2337131..0830c28 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,8 @@ Tantivy 0.10.0
 
 *Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
 
+- Added an API to easily tweak or entirely replace the 
+ default score. See `TopDocs::tweak_score` and `TopDocs::custom_score` (@pmasurel)
 - Added an ASCII folding filter (@drusellers)
 - Bugfix in `query.count` in presence of deletes (@pmasurel)
 - Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs
new file mode 100644
index 0000000..c6f3b6b
--- /dev/null
+++ b/src/collector/custom_score_top_collector.rs
@@ -0,0 +1,126 @@
+use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
+use crate::collector::{Collector, SegmentCollector};
+use crate::Result;
+use crate::{DocAddress, DocId, Score, SegmentReader};
+
+pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
+    custom_scorer: TCustomScorer,
+    collector: TopCollector<TScore>,
+}
+
+impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
+where
+    TScore: Clone + PartialOrd,
+{
+    pub fn new(
+        custom_scorer: TCustomScorer,
+        limit: usize,
+    ) -> CustomScoreTopCollector<TCustomScorer, TScore> {
+        CustomScoreTopCollector {
+            custom_scorer,
+            collector: TopCollector::with_limit(limit),
+        }
+    }
+}
+
+/// A custom segment scorer makes it possible to define any kind of score
+/// for a given document belonging to a specific segment.
+///
+/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
+pub trait CustomSegmentScorer<TScore>: 'static {
+    /// Computes the score of a specific `doc`.
+    fn score(&self, doc: DocId) -> TScore;
+}
+
+/// `CustomScorer` makes it possible to define any kind of score.
+///
+/// The `CustomScorer` itself does not do much of the computation.
+/// Instead, it helps constructing `Self::Child` instances that will compute
+/// the score at a segment scale.
+pub trait CustomScorer<TScore>: Sync {
+    /// Type of the associated [`CustomSegmentScorer`](./trait.CustomSegmentScorer.html).
+    type Child: CustomSegmentScorer<TScore>;
+    /// Builds a child scorer for a specific segment. The child scorer is associated to
+    /// a specific segment.
+    fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
+}
+
+impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
+where
+    TCustomScorer: CustomScorer<TScore>,
+    TScore: 'static + PartialOrd + Clone + Send + Sync,
+{
+    type Fruit = Vec<(TScore, DocAddress)>;
+
+    type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>;
+
+    fn for_segment(
+        &self,
+        segment_local_id: u32,
+        segment_reader: &SegmentReader,
+    ) -> Result<Self::Child> {
+        let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?;
+        let segment_collector = self
+            .collector
+            .for_segment(segment_local_id, segment_reader)?;
+        Ok(CustomScoreTopSegmentCollector {
+            segment_collector,
+            segment_scorer,
+        })
+    }
+
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+
+    fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
+        self.collector.merge_fruits(segment_fruits)
+    }
+}
+
+pub struct CustomScoreTopSegmentCollector<T, TScore>
+where
+    TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
+    T: CustomSegmentScorer<TScore>,
+{
+    segment_collector: TopSegmentCollector<TScore>,
+    segment_scorer: T,
+}
+
+impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore>
+where
+    TScore: 'static + PartialOrd + Clone + Send + Sync,
+    T: 'static + CustomSegmentScorer<TScore>,
+{
+    type Fruit = Vec<(TScore, DocAddress)>;
+
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        let score = self.segment_scorer.score(doc);
+        self.segment_collector.collect(doc, score);
+    }
+
+    fn harvest(self) -> Vec<(TScore, DocAddress)> {
+        self.segment_collector.harvest()
+    }
+}
+
+impl<F, TScore, T> CustomScorer<TScore> for F
+where
+    F: 'static + Send + Sync + Fn(&SegmentReader) -> T,
+    T: CustomSegmentScorer<TScore>,
+{
+    type Child = T;
+
+    fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
+        Ok((self)(segment_reader))
+    }
+}
+
+impl<F, TScore> CustomSegmentScorer<TScore> for F
+where
+    F: 'static + Sync + Send + Fn(DocId) -> TScore,
+{
+    fn score(&self, doc: DocId) -> TScore {
+        (self)(doc)
+    }
+}
diff --git a/src/collector/mod.rs b/src/collector/mod.rs
index c2ae35b..c19282e 100644
--- a/src/collector/mod.rs
+++ b/src/collector/mod.rs
@@ -66,7 +66,7 @@ let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
 
 The `Collector` trait is implemented for up to 4 collectors.
 If you have more than 4 collectors, you can either group them into
-tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
+tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
 
 # Combining several collectors dynamically
 
@@ -103,8 +103,11 @@ mod top_collector;
 mod top_score_collector;
 pub use self::top_score_collector::TopDocs;
 
-mod top_field_collector;
-pub use self::top_field_collector::TopDocsByField;
+mod custom_score_top_collector;
+pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
+
+mod tweak_score_top_collector;
+pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
 
 mod facet_collector;
 pub use self::facet_collector::FacetCollector;
diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs
index ea2b7ca..9a9d2dc 100644
--- a/src/collector/top_collector.rs
+++ b/src/collector/top_collector.rs
@@ -177,9 +177,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
 
 #[cfg(test)]
 mod tests {
-    use super::{TopCollector, TopSegmentCollector};
+    use super::TopSegmentCollector;
     use crate::DocAddress;
-    use crate::Score;
 
     #[test]
     fn test_top_collector_not_at_capacity() {
@@ -215,10 +214,4 @@ mod tests {
             ]
         );
     }
-
-    #[test]
-    #[should_panic]
-    fn test_top_0() {
-        let _collector: TopCollector<Score> = TopCollector::with_limit(0);
-    }
 }
diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs
deleted file mode 100644
index 9a91d9e..0000000
--- a/src/collector/top_field_collector.rs
+++ /dev/null
@@ -1,272 +0,0 @@
-use super::Collector;
-use crate::collector::top_collector::TopCollector;
-use crate::collector::top_collector::TopSegmentCollector;
-use crate::collector::SegmentCollector;
-use crate::fastfield::FastFieldReader;
-use crate::fastfield::FastValue;
-use crate::schema::Field;
-use crate::DocAddress;
-use crate::Result;
-use crate::SegmentLocalId;
-use crate::SegmentReader;
-use crate::TantivyError;
-use std::marker::PhantomData;
-
-/// The Top Field Collector keeps track of the K documents
-/// sorted by a fast field in the index
-///
-/// The implementation is based on a `BinaryHeap`.
-/// The theorical complexity for collecting the top `K` out of `n` documents
-/// is `O(n log K)`.
-///
-/// ```rust
-/// #[macro_use]
-/// extern crate tantivy;
-/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
-/// # use tantivy::{Index, Result, DocAddress};
-/// # use tantivy::query::{Query, QueryParser};
-/// use tantivy::Searcher;
-/// use tantivy::collector::TopDocs;
-///
-/// # fn main() -> tantivy::Result<()> {
-/// #   let mut schema_builder = Schema::builder();
-/// #   let title = schema_builder.add_text_field("title", TEXT);
-/// #   let rating = schema_builder.add_u64_field("rating", FAST);
-/// #   let schema = schema_builder.build();
-/// #   let index = Index::create_in_ram(schema);
-/// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
-/// #   index_writer.add_document(doc!(
-/// #       title => "The Name of the Wind",
-/// #       rating => 92u64,
-/// #   ));
-/// #   index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
-/// #   index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
-/// #   index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
-/// #   index_writer.commit()?;
-/// #   let reader = index.reader()?;
-/// #   let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
-/// #   let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
-/// #   assert_eq!(top_docs,
-/// #            vec![(97u64, DocAddress(0u32, 1)),
-/// #                 (80u64, DocAddress(0u32, 3))]);
-/// #   Ok(())
-/// # }
-/// #
-/// /// Searches the document matching the given query, and
-/// /// collects the top 10 documents, order by the `field`
-/// /// given in argument.
-/// ///
-/// /// `field` is required to be a FAST field.
-/// fn docs_sorted_by_rating(searcher: &Searcher,
-///                          query: &Query,
-///                          sort_by_field: Field)
-///     -> Result<Vec<(u64, DocAddress)>> {
-///
-///     // This is where we build our collector!
-///     let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
-///
-///     // ... and here is our documents. Not this is a simple vec.
-///     // The `u64` in the pair is the value of our fast field for each documents.
-///     searcher.search(query, &top_docs_by_rating)
-/// }
-/// ```
-pub struct TopDocsByField<T> {
-    collector: TopCollector<T>,
-    field: Field,
-}
-
-impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
-    /// Creates a top field collector, with a number of documents equal to "limit".
-    ///
-    /// The given field name must be a fast field, otherwise the collector have an error while
-    /// collecting results.
-    ///
-    /// This constructor is crate-private. Client are supposed to call
-    /// build `TopDocsByField`  object using the `TopDocs` API.
-    ///
-    /// e.g.:
-    ///   `TopDocs::with_limit(2).order_by_field(sort_by_field)`
-    ///
-    /// # Panics
-    /// The method panics if limit is 0
-    pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
-        TopDocsByField {
-            collector: TopCollector::with_limit(limit),
-            field,
-        }
-    }
-}
-
-impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
-    type Fruit = Vec<(T, DocAddress)>;
-
-    type Child = TopFieldSegmentCollector<T>;
-
-    fn for_segment(
-        &self,
-        segment_local_id: SegmentLocalId,
-        reader: &SegmentReader,
-    ) -> Result<TopFieldSegmentCollector<T>> {
-        let collector = self.collector.for_segment(segment_local_id, reader)?;
-        let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
-            let field_name = reader.schema().get_field_name(self.field);
-            TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
-        })?;
-        Ok(TopFieldSegmentCollector {
-            collector,
-            reader,
-            _type: PhantomData,
-        })
-    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
-
-    fn merge_fruits(
-        &self,
-        segment_fruits: Vec<Vec<(T, DocAddress)>>,
-    ) -> Result<Vec<(T, DocAddress)>> {
-        self.collector.merge_fruits(segment_fruits)
-    }
-}
-
-pub struct TopFieldSegmentCollector<T> {
-    collector: TopSegmentCollector<u64>,
-    reader: FastFieldReader<u64>,
-    _type: PhantomData<T>,
-}
-
-impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
-    for TopFieldSegmentCollector<T>
-{
-    type Fruit = Vec<(T, DocAddress)>;
-
-    fn collect(&mut self, doc: u32, _score: f32) {
-        let field_value = self.reader.get(doc);
-        self.collector.collect(doc, field_value);
-    }
-
-    fn harvest(self) -> Vec<(T, DocAddress)> {
-        self.collector
-            .harvest()
-            .into_iter()
-            .map(|(val, doc_address)| (T::from_u64(val), doc_address))
-            .collect()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::TopDocsByField;
-    use crate::collector::Collector;
-    use crate::collector::TopDocs;
-    use crate::query::Query;
-    use crate::query::QueryParser;
-    use crate::schema::Field;
-    use crate::schema::IntOptions;
-    use crate::schema::{Schema, FAST, TEXT};
-    use crate::DocAddress;
-    use crate::Index;
-    use crate::IndexWriter;
-    use crate::TantivyError;
-    use matches::assert_matches;
-
-    const TITLE: &str = "title";
-    const SIZE: &str = "size";
-
-    #[test]
-    fn test_top_collector_not_at_capacity() {
-        let mut schema_builder = Schema::builder();
-        let title = schema_builder.add_text_field(TITLE, TEXT);
-        let size = schema_builder.add_u64_field(SIZE, FAST);
-        let schema = schema_builder.build();
-        let (index, query) = index("beer", title, schema, |index_writer| {
-            index_writer.add_document(doc!(
-                title => "bottle of beer",
-                size => 12u64,
-            ));
-            index_writer.add_document(doc!(
-                title => "growler of beer",
-                size => 64u64,
-            ));
-            index_writer.add_document(doc!(
-                title => "pint of beer",
-                size => 16u64,
-            ));
-        });
-        let searcher = index.reader().unwrap().searcher();
-
-        let top_collector = TopDocs::with_limit(4).order_by_field(size);
-        let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
-        assert_eq!(
-            top_docs,
-            vec![
-                (64, DocAddress(0, 1)),
-                (16, DocAddress(0, 2)),
-                (12, DocAddress(0, 0))
-            ]
-        );
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_field_does_not_exist() {
-        let mut schema_builder = Schema::builder();
-        let title = schema_builder.add_text_field(TITLE, TEXT);
-        let size = schema_builder.add_u64_field(SIZE, FAST);
-        let schema = schema_builder.build();
-        let (index, _) = index("beer", title, schema, |index_writer| {
-            index_writer.add_document(doc!(
-                title => "bottle of beer",
-                size => 12u64,
-            ));
-        });
-        let searcher = index.reader().unwrap().searcher();
-        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
-        let segment_reader = searcher.segment_reader(0u32);
-        top_collector
-            .for_segment(0, segment_reader)
-            .expect("should panic");
-    }
-
-    #[test]
-    fn test_field_not_fast_field() {
-        let mut schema_builder = Schema::builder();
-        let title = schema_builder.add_text_field(TITLE, TEXT);
-        let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
-        let schema = schema_builder.build();
-        let (index, _) = index("beer", title, schema, |index_writer| {
-            index_writer.add_document(doc!(
-                title => "bottle of beer",
-                size => 12u64,
-            ));
-        });
-        let searcher = index.reader().unwrap().searcher();
-        let segment = searcher.segment_reader(0);
-        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
-        assert_matches!(
-            top_collector
-                .for_segment(0, segment)
-                .map(|_| ())
-                .unwrap_err(),
-            TantivyError::SchemaError(_)
-        );
-    }
-
-    fn index(
-        query: &str,
-        query_field: Field,
-        schema: Schema,
-        mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
-    ) -> (Index, Box<dyn Query>) {
-        let index = Index::create_in_ram(schema);
-
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        doc_adder(&mut index_writer);
-        index_writer.commit().unwrap();
-        let query_parser = QueryParser::for_index(&index, vec![query_field]);
-        let query = query_parser.parse_query(query).unwrap();
-        (index, query)
-    }
-}
diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs
index bc247d4..c9b03d0 100644
--- a/src/collector/top_score_collector.rs
+++ b/src/collector/top_score_collector.rs
@@ -1,9 +1,11 @@
 use super::Collector;
+use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
 use crate::collector::top_collector::TopCollector;
 use crate::collector::top_collector::TopSegmentCollector;
-use crate::collector::SegmentCollector;
-use crate::collector::TopDocsByField;
-use crate::fastfield::FastValue;
+use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
+use crate::collector::{
+    CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
+};
 use crate::schema::Field;
 use crate::DocAddress;
 use crate::DocId;
@@ -77,13 +79,311 @@ impl TopDocs {
 
     /// Set top-K to rank documents by a given fast field.
     ///
-    /// (By default, `TopDocs` collects the top-K documents sorted by
-    /// the similarity score.)
-    pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
+    /// ```rust
+    /// #[macro_use]
+    /// extern crate tantivy;
+    /// # use tantivy::schema::{Schema, FAST, TEXT};
+    /// # use tantivy::{Index, Result, DocAddress};
+    /// # use tantivy::query::{Query, QueryParser};
+    /// use tantivy::Searcher;
+    /// use tantivy::collector::TopDocs;
+    /// use tantivy::schema::Field;
+    ///
+    /// # fn main() -> tantivy::Result<()> {
+    /// #   let mut schema_builder = Schema::builder();
+    /// #   let title = schema_builder.add_text_field("title", TEXT);
+    /// #   let rating = schema_builder.add_u64_field("rating", FAST);
+    /// #   let schema = schema_builder.build();
+    /// #  
+    /// #   let index = Index::create_in_ram(schema);
+    /// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    /// #   index_writer.add_document(doc!(
+    /// #       title => "The Name of the Wind",
+    /// #       rating => 92u64,
+    /// #   ));
+    /// #   index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
+    /// #   index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
+    /// #   index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
+    /// #   index_writer.commit()?;
+    /// #   let reader = index.reader()?;
+    /// #   let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
+    /// #   let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
+    /// #   assert_eq!(top_docs,
+    /// #            vec![(97u64, DocAddress(0u32, 1)),
+    /// #                 (80u64, DocAddress(0u32, 3))]);
+    /// #   Ok(())
+    /// # }
+    ///
+    ///
+    /// /// Searches the document matching the given query, and
+    /// /// collects the top 10 documents, ordered by the u64 `field`
+    /// /// given in argument.
+    /// ///
+    /// /// `field` is required to be a FAST field.
+    /// fn docs_sorted_by_rating(searcher: &Searcher,
+    ///                          query: &Query,
+    ///                          sort_by_field: Field)
+    ///     -> Result<Vec<(u64, DocAddress)>> {
+    ///
+    ///     // This is where we build our topdocs collector
+    ///     //
+    ///     // Note that the method used must match the type of
+    ///     // `sort_by_field` (here, a `u64` fast field).
+    ///     let top_docs_by_rating = TopDocs
+    ///                 ::with_limit(10)
+    ///                  .order_by_u64_field(sort_by_field);
+    ///     
+    ///     // ... and here are our documents. Note this is a simple vec.
+    ///     // The `u64` in the pair is the value of our fast field for
+    ///     // each document.
+    ///     //
+    ///     // The vec is sorted decreasingly by `sort_by_field`, and has a
+    ///     // length of 10, or less if not enough documents matched the
+    ///     // query.
+    ///     let resulting_docs: Vec<(u64, DocAddress)> =
+    ///          searcher.search(query, &top_docs_by_rating)?;
+    ///     
+    ///     Ok(resulting_docs)
+    /// }
+    /// ```
+    ///
+    /// # Panics
+    ///
+    /// May panic if the field requested is not a fast field.
+    ///
+    pub fn order_by_u64_field(
         self,
         field: Field,
-    ) -> TopDocsByField<T> {
-        TopDocsByField::new(field, self.0.limit())
+    ) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
+        self.custom_score(move |segment_reader: &SegmentReader| {
+            let ff_reader = segment_reader
+                .fast_fields()
+                .u64(field)
+                .expect("Field requested is not a i64/u64 fast field.");
+            move |doc: DocId| ff_reader.get(doc)
+        })
+    }
+
+    /// Ranks the documents using a custom score.
+    ///
+    /// This method offers a convenient way to tweak or replace
+    /// the documents score. As suggested by the prototype you can
+    /// manually define your own [`ScoreTweaker`](./trait.ScoreTweaker.html)
+    /// and pass it as an argument, but there is a much simpler way to
+    /// tweak your score: you can use a closure as in the following
+    /// example.
+    ///
+    /// # Example
+    ///
+    /// Typically, you will want to rely on one or more fast fields,
+    /// to alter the original relevance `Score`.
+    ///
+    /// For instance, in the following, we assume that we are implementing
+    /// an e-commerce website that has a fast field called `popularity`
+    /// that rates whether a product is typically often bought by users.
+    ///
+    /// In the following example we will tweak our ranking a bit by
+    /// boosting popular products a notch.
+    ///  
+    /// In a more serious application, this tweaking could involve running a
+    /// learning-to-rank model over various features.
+    ///
+    /// ```rust
+    /// #[macro_use]
+    /// extern crate tantivy;
+    /// # use tantivy::schema::{Schema, FAST, TEXT};
+    /// # use tantivy::{Index, DocAddress, DocId, Score};
+    /// # use tantivy::query::QueryParser;
+    /// use tantivy::SegmentReader;
+    /// use tantivy::collector::TopDocs;
+    /// use tantivy::schema::Field;
+    ///
+    /// # fn create_schema() -> Schema {
+    /// #    let mut schema_builder = Schema::builder();
+    /// #    schema_builder.add_text_field("product_name", TEXT);
+    /// #    schema_builder.add_u64_field("popularity", FAST);
+    /// #    schema_builder.build()
+    /// # }
+    /// #
+    /// # fn main() -> tantivy::Result<()> {
+    /// #   let schema = create_schema();
+    /// #   let index = Index::create_in_ram(schema);
+    /// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    /// #   let product_name = index.schema().get_field("product_name").unwrap();
+    /// #   
+    /// let popularity: Field = index.schema().get_field("popularity").unwrap();
+    /// #   index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
+    /// #   index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
+    /// #   index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
+    /// #   index_writer.commit()?;
+    /// // ...
+    /// # let user_query = "diary";
+    /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
+    ///
+    /// // This is where we build our collector with our custom score.
+    /// let top_docs_by_custom_score = TopDocs
+    ///         ::with_limit(10)
+    ///          .tweak_score(move |segment_reader: &SegmentReader| {
+    ///             // The argument is a function that returns our scoring
+    ///             // function.
+    ///             //
+    ///             // The point of this "mother" function is to gather all
+    ///             // of the segment level information we need for scoring.
+    ///             // Typically, fast_fields.
+    ///             //
+    ///             // In our case, we will get a reader for the popularity
+    ///             // fast field.
+    ///             let popularity_reader =
+    ///                 segment_reader.fast_fields().u64(popularity).unwrap();
+    ///
+    ///             // We can now define our actual scoring function
+    ///             move |doc: DocId, original_score: Score| {
+    ///                 let popularity: u64 = popularity_reader.get(doc);
+    ///                 // Well.. For the sake of the example we use a simple logarithm
+    ///                 // function.
+    ///                 let popularity_boost_score = ((2u64 + popularity) as f32).log2();
+    ///                 popularity_boost_score * original_score
+    ///             }
+    ///           });
+    /// # let reader = index.reader()?;
+    /// # let searcher = reader.searcher();
+    /// // ... and here are our documents. Note this is a simple vec.
+    /// // The `Score` in the pair is our tweaked score.
+    /// let resulting_docs: Vec<(Score, DocAddress)> =
+    ///      searcher.search(&*query, &top_docs_by_custom_score)?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    ///
+    /// # See also
+    /// [custom_score(...)](#method.custom_score).
+    pub fn tweak_score<TScore, TScoreSegmentTweaker, TScoreTweaker>(
+        self,
+        score_tweaker: TScoreTweaker,
+    ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
+    where
+        TScore: 'static + Send + Sync + Clone + PartialOrd,
+        TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
+        TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
+    {
+        TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
+    }
+
+    /// Ranks the documents using a custom score.
+    ///
+    /// This method offers a convenient way to use a different score.
+    ///
+    /// As suggested by the prototype you can manually define your
+    /// own [`CustomScorer`](./trait.CustomScorer.html)
+    /// and pass it as an argument, but there is a much simpler way to
+    /// tweak your score: you can use a closure as in the following
+    /// example.
+    ///
+    /// # Limitation
+    ///
+    /// This method only makes it possible to compute the score from a given
+    /// `DocId`, fastfield values for the doc and any information you could
+    /// have precomputed beforehand. It does not make it possible, for instance,
+    /// to compute something like TfIdf as it does not have access to the list of query
+    /// terms present in the document, nor the term frequencies for the different terms.
+    ///
+    /// It can be used if your search engine relies on a learning-to-rank model for instance,
+    /// which does not rely on the term frequencies or positions as features.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// # #[macro_use]
+    /// # extern crate tantivy;
+    /// # use tantivy::schema::{Schema, FAST, TEXT};
+    /// # use tantivy::{Index, DocAddress, DocId};
+    /// # use tantivy::query::QueryParser;
+    /// use tantivy::SegmentReader;
+    /// use tantivy::collector::TopDocs;
+    /// use tantivy::schema::Field;
+    ///
+    /// # fn create_schema() -> Schema {
+    /// #    let mut schema_builder = Schema::builder();
+    /// #    schema_builder.add_text_field("product_name", TEXT);
+    /// #    schema_builder.add_u64_field("popularity", FAST);
+    /// #    schema_builder.add_u64_field("boosted", FAST);
+    /// #    schema_builder.build()
+    /// # }
+    /// #
+    /// # fn main() -> tantivy::Result<()> {
+    /// #   let schema = create_schema();
+    /// #   let index = Index::create_in_ram(schema);
+    /// #   let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
+    /// #   let product_name = index.schema().get_field("product_name").unwrap();
+    /// #   
+    /// let popularity: Field = index.schema().get_field("popularity").unwrap();
+    /// let boosted: Field = index.schema().get_field("boosted").unwrap();
+    /// #   index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
+    /// #   index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
+    /// #   index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
+    /// #   index_writer.commit()?;
+    /// // ...
+    /// # let user_query = "diary";
+    /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
+    ///
+    /// // This is where we build our collector with our custom score.
+    /// let top_docs_by_custom_score = TopDocs
+    ///         ::with_limit(10)
+    ///          .custom_score(move |segment_reader: &SegmentReader| {
+    ///             // The argument is a function that returns our scoring
+    ///             // function.
+    ///             //
+    ///             // The point of this "mother" function is to gather all
+    ///             // of the segment level information we need for scoring.
+    ///             // Typically, fast_fields.
+    ///             //
+    ///             // In our case, we will get a reader for the popularity
+    ///             // fast field and a boosted field.
+    ///             //
+    ///             // We want boosted items to rank first and, in case of
+    ///             // a tie, return the item with the highest popularity.
+    ///             //
+    ///             // Note that this is implemented by using a `(u64, u64)`
+    ///             // as a score.
+    ///             let popularity_reader =
+    ///                 segment_reader.fast_fields().u64(popularity).unwrap();
+    ///             let boosted_reader =
+    ///                 segment_reader.fast_fields().u64(boosted).unwrap();
+    ///    
+    ///             // We can now define our actual scoring function
+    ///             move |doc: DocId| {
+    ///                 let popularity: u64 = popularity_reader.get(doc);
+    ///                 let boosted: u64 = boosted_reader.get(doc);
+    ///                 // Scores do not have to be `f32` in tantivy.
+    ///                 // Here we return a tuple to get lexicographical order
+    ///                 // for free.
+    ///                 (boosted, popularity)
+    ///             }
+    ///           });
+    /// # let reader = index.reader()?;
+    /// # let searcher = reader.searcher();
+    /// // ... and here are our documents. Note this is a simple vec.
+    /// // The `(u64, u64)` in the pair is our custom score.
+    /// let resulting_docs: Vec<((u64, u64), DocAddress)> =
+    ///      searcher.search(&*query, &top_docs_by_custom_score)?;
+    ///
+    /// # Ok(())
+    /// # }
+    /// ```
+    ///
+    /// # See also
+    /// [tweak_score(...)](#method.tweak_score).
+    pub fn custom_score<TScore, TCustomSegmentScorer, TCustomScorer>(
+        self,
+        custom_score: TCustomScorer,
+    ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
+    where
+        TScore: 'static + Send + Sync + Clone + PartialOrd,
+        TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
+        TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
+    {
+        CustomScoreTopCollector::new(custom_score, self.0.limit())
     }
 }
 
@@ -128,11 +428,12 @@ impl SegmentCollector for TopScoreSegmentCollector {
 #[cfg(test)]
 mod tests {
     use super::TopDocs;
-    use crate::query::QueryParser;
-    use crate::schema::Schema;
-    use crate::schema::TEXT;
+    use crate::collector::Collector;
+    use crate::query::{Query, QueryParser};
+    use crate::schema::{Field, Schema, FAST, STORED, TEXT};
     use crate::DocAddress;
     use crate::Index;
+    use crate::IndexWriter;
     use crate::Score;
 
     fn make_index() -> Index {
@@ -200,4 +501,97 @@ mod tests {
         TopDocs::with_limit(0);
     }
 
+    const TITLE: &str = "title";
+    const SIZE: &str = "size";
+
+    #[test]
+    fn test_top_field_collector_not_at_capacity() {
+        let mut schema_builder = Schema::builder();
+        let title = schema_builder.add_text_field(TITLE, TEXT);
+        let size = schema_builder.add_u64_field(SIZE, FAST);
+        let schema = schema_builder.build();
+        let (index, query) = index("beer", title, schema, |index_writer| {
+            index_writer.add_document(doc!(
+                title => "bottle of beer",
+                size => 12u64,
+            ));
+            index_writer.add_document(doc!(
+                title => "growler of beer",
+                size => 64u64,
+            ));
+            index_writer.add_document(doc!(
+                title => "pint of beer",
+                size => 16u64,
+            ));
+        });
+        let searcher = index.reader().unwrap().searcher();
+
+        let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
+        let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
+        assert_eq!(
+            top_docs,
+            vec![
+                (64, DocAddress(0, 1)),
+                (16, DocAddress(0, 2)),
+                (12, DocAddress(0, 0))
+            ]
+        );
+    }
+
+    #[test]
+    #[should_panic]
+    fn test_field_does_not_exist() {
+        let mut schema_builder = Schema::builder();
+        let title = schema_builder.add_text_field(TITLE, TEXT);
+        let size = schema_builder.add_u64_field(SIZE, FAST);
+        let schema = schema_builder.build();
+        let (index, _) = index("beer", title, schema, |index_writer| {
+            index_writer.add_document(doc!(
+                title => "bottle of beer",
+                size => 12u64,
+            ));
+        });
+        let searcher = index.reader().unwrap().searcher();
+        let top_collector = TopDocs::with_limit(4).order_by_u64
author	Paul Masurel <paul.masurel@gmail.com>	2019-07-07 17:12:31 +0900
committer	GitHub <noreply@github.com>	2019-07-07 17:12:31 +0900
commit	3e368d92cb1ad9213ab1fede526d19f67f7f1e06 (patch)
tree	58463d2a3f7530d2798079807735fd35d23d243f
parent	0bc2c64a538b3a00956cfb5e2231fc1efaab656b (diff)