diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2019-07-07 17:12:31 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-07 17:12:31 +0900 |
commit | 3e368d92cb1ad9213ab1fede526d19f67f7f1e06 (patch) | |
tree | 58463d2a3f7530d2798079807735fd35d23d243f | |
parent | 0bc2c64a538b3a00956cfb5e2231fc1efaab656b (diff) |
Issue/479 (#578)
* Sort by field relying on tweaked score
* Sort by u64/i64 get independent methods.
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rw-r--r-- | src/collector/custom_score_top_collector.rs | 126 | ||||
-rw-r--r-- | src/collector/mod.rs | 9 | ||||
-rw-r--r-- | src/collector/top_collector.rs | 9 | ||||
-rw-r--r-- | src/collector/top_field_collector.rs | 272 | ||||
-rw-r--r-- | src/collector/top_score_collector.rs | 416 | ||||
-rw-r--r-- | src/collector/tweak_score_top_collector.rs | 129 | ||||
-rw-r--r-- | src/core/searcher.rs | 2 | ||||
-rw-r--r-- | src/error.rs | 5 | ||||
-rw-r--r-- | src/fastfield/error.rs | 2 | ||||
-rw-r--r-- | src/fastfield/mod.rs | 2 |
11 files changed, 673 insertions, 301 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 2337131..0830c28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ Tantivy 0.10.0 *Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.* +- Added an API to easily tweak or entirely replace the + default score. See `TopDocs::tweak_score` and `TopDocs::custom_score` (@pmasurel) - Added an ASCII folding filter (@drusellers) - Bugfix in `query.count` in presence of deletes (@pmasurel) - Added `.explain(...)` in `Query` and `Weight` to (@pmasurel) diff --git a/src/collector/custom_score_top_collector.rs b/src/collector/custom_score_top_collector.rs new file mode 100644 index 0000000..c6f3b6b --- /dev/null +++ b/src/collector/custom_score_top_collector.rs @@ -0,0 +1,126 @@ +use crate::collector::top_collector::{TopCollector, TopSegmentCollector}; +use crate::collector::{Collector, SegmentCollector}; +use crate::Result; +use crate::{DocAddress, DocId, Score, SegmentReader}; + +pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> { + custom_scorer: TCustomScorer, + collector: TopCollector<TScore>, +} + +impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore> +where + TScore: Clone + PartialOrd, +{ + pub fn new( + custom_scorer: TCustomScorer, + limit: usize, + ) -> CustomScoreTopCollector<TCustomScorer, TScore> { + CustomScoreTopCollector { + custom_scorer, + collector: TopCollector::with_limit(limit), + } + } +} + +/// A custom segment scorer makes it possible to define any kind of score +/// for a given document belonging to a specific segment. +/// +/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html). +pub trait CustomSegmentScorer<TScore>: 'static { + /// Computes the score of a specific `doc`. + fn score(&self, doc: DocId) -> TScore; +} + +/// `CustomScorer` makes it possible to define any kind of score. +/// +/// The `CustomScorer` itself does not perform much of the computation.
+/// Instead, it helps constructing `Self::Child` instances that will compute +/// the score at a segment scale. +pub trait CustomScorer<TScore>: Sync { + /// Type of the associated [`CustomSegmentScorer`](./trait.CustomSegmentScorer.html). + type Child: CustomSegmentScorer<TScore>; + /// Builds a child scorer for a specific segment. The child scorer is associated to + /// a specific segment. + fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>; +} + +impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore> +where + TCustomScorer: CustomScorer<TScore>, + TScore: 'static + PartialOrd + Clone + Send + Sync, +{ + type Fruit = Vec<(TScore, DocAddress)>; + + type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>; + + fn for_segment( + &self, + segment_local_id: u32, + segment_reader: &SegmentReader, + ) -> Result<Self::Child> { + let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?; + let segment_collector = self + .collector + .for_segment(segment_local_id, segment_reader)?; + Ok(CustomScoreTopSegmentCollector { + segment_collector, + segment_scorer, + }) + } + + fn requires_scoring(&self) -> bool { + false + } + + fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> { + self.collector.merge_fruits(segment_fruits) + } +} + +pub struct CustomScoreTopSegmentCollector<T, TScore> +where + TScore: 'static + PartialOrd + Clone + Send + Sync + Sized, + T: CustomSegmentScorer<TScore>, +{ + segment_collector: TopSegmentCollector<TScore>, + segment_scorer: T, +} + +impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore> +where + TScore: 'static + PartialOrd + Clone + Send + Sync, + T: 'static + CustomSegmentScorer<TScore>, +{ + type Fruit = Vec<(TScore, DocAddress)>; + + fn collect(&mut self, doc: DocId, _score: Score) { + let score = self.segment_scorer.score(doc); + self.segment_collector.collect(doc, score); + } + + fn 
harvest(self) -> Vec<(TScore, DocAddress)> { + self.segment_collector.harvest() + } +} + +impl<F, TScore, T> CustomScorer<TScore> for F +where + F: 'static + Send + Sync + Fn(&SegmentReader) -> T, + T: CustomSegmentScorer<TScore>, +{ + type Child = T; + + fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> { + Ok((self)(segment_reader)) + } +} + +impl<F, TScore> CustomSegmentScorer<TScore> for F +where + F: 'static + Sync + Send + Fn(DocId) -> TScore, +{ + fn score(&self, doc: DocId) -> TScore { + (self)(doc) + } +} diff --git a/src/collector/mod.rs b/src/collector/mod.rs index c2ae35b..c19282e 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -66,7 +66,7 @@ let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) = The `Collector` trait is implemented for up to 4 collectors. If you have more than 4 collectors, you can either group them into -tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s. +tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html). 
# Combining several collectors dynamically @@ -103,8 +103,11 @@ mod top_collector; mod top_score_collector; pub use self::top_score_collector::TopDocs; -mod top_field_collector; -pub use self::top_field_collector::TopDocsByField; +mod custom_score_top_collector; +pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer}; + +mod tweak_score_top_collector; +pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker}; mod facet_collector; pub use self::facet_collector::FacetCollector; diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index ea2b7ca..9a9d2dc 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -177,9 +177,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> { #[cfg(test)] mod tests { - use super::{TopCollector, TopSegmentCollector}; + use super::TopSegmentCollector; use crate::DocAddress; - use crate::Score; #[test] fn test_top_collector_not_at_capacity() { @@ -215,10 +214,4 @@ mod tests { ] ); } - - #[test] - #[should_panic] - fn test_top_0() { - let _collector: TopCollector<Score> = TopCollector::with_limit(0); - } } diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs deleted file mode 100644 index 9a91d9e..0000000 --- a/src/collector/top_field_collector.rs +++ /dev/null @@ -1,272 +0,0 @@ -use super::Collector; -use crate::collector::top_collector::TopCollector; -use crate::collector::top_collector::TopSegmentCollector; -use crate::collector::SegmentCollector; -use crate::fastfield::FastFieldReader; -use crate::fastfield::FastValue; -use crate::schema::Field; -use crate::DocAddress; -use crate::Result; -use crate::SegmentLocalId; -use crate::SegmentReader; -use crate::TantivyError; -use std::marker::PhantomData; - -/// The Top Field Collector keeps track of the K documents -/// sorted by a fast field in the index -/// -/// The implementation is based on a `BinaryHeap`. 
-/// The theorical complexity for collecting the top `K` out of `n` documents -/// is `O(n log K)`. -/// -/// ```rust -/// #[macro_use] -/// extern crate tantivy; -/// # use tantivy::schema::{Schema, Field, FAST, TEXT}; -/// # use tantivy::{Index, Result, DocAddress}; -/// # use tantivy::query::{Query, QueryParser}; -/// use tantivy::Searcher; -/// use tantivy::collector::TopDocs; -/// -/// # fn main() -> tantivy::Result<()> { -/// # let mut schema_builder = Schema::builder(); -/// # let title = schema_builder.add_text_field("title", TEXT); -/// # let rating = schema_builder.add_u64_field("rating", FAST); -/// # let schema = schema_builder.build(); -/// # let index = Index::create_in_ram(schema); -/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; -/// # index_writer.add_document(doc!( -/// # title => "The Name of the Wind", -/// # rating => 92u64, -/// # )); -/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64)); -/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64)); -/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64)); -/// # index_writer.commit()?; -/// # let reader = index.reader()?; -/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?; -/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?; -/// # assert_eq!(top_docs, -/// # vec![(97u64, DocAddress(0u32, 1)), -/// # (80u64, DocAddress(0u32, 3))]); -/// # Ok(()) -/// # } -/// # -/// /// Searches the document matching the given query, and -/// /// collects the top 10 documents, order by the `field` -/// /// given in argument. -/// /// -/// /// `field` is required to be a FAST field. -/// fn docs_sorted_by_rating(searcher: &Searcher, -/// query: &Query, -/// sort_by_field: Field) -/// -> Result<Vec<(u64, DocAddress)>> { -/// -/// // This is where we build our collector! 
-/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field); -/// -/// // ... and here is our documents. Not this is a simple vec. -/// // The `u64` in the pair is the value of our fast field for each documents. -/// searcher.search(query, &top_docs_by_rating) -/// } -/// ``` -pub struct TopDocsByField<T> { - collector: TopCollector<T>, - field: Field, -} - -impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> { - /// Creates a top field collector, with a number of documents equal to "limit". - /// - /// The given field name must be a fast field, otherwise the collector have an error while - /// collecting results. - /// - /// This constructor is crate-private. Client are supposed to call - /// build `TopDocsByField` object using the `TopDocs` API. - /// - /// e.g.: - /// `TopDocs::with_limit(2).order_by_field(sort_by_field)` - /// - /// # Panics - /// The method panics if limit is 0 - pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> { - TopDocsByField { - collector: TopCollector::with_limit(limit), - field, - } - } -} - -impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> { - type Fruit = Vec<(T, DocAddress)>; - - type Child = TopFieldSegmentCollector<T>; - - fn for_segment( - &self, - segment_local_id: SegmentLocalId, - reader: &SegmentReader, - ) -> Result<TopFieldSegmentCollector<T>> { - let collector = self.collector.for_segment(segment_local_id, reader)?; - let reader = reader.fast_fields().u64(self.field).ok_or_else(|| { - let field_name = reader.schema().get_field_name(self.field); - TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name)) - })?; - Ok(TopFieldSegmentCollector { - collector, - reader, - _type: PhantomData, - }) - } - - fn requires_scoring(&self) -> bool { - false - } - - fn merge_fruits( - &self, - segment_fruits: Vec<Vec<(T, DocAddress)>>, - ) -> Result<Vec<(T, DocAddress)>> { - self.collector.merge_fruits(segment_fruits) - } -} 
- -pub struct TopFieldSegmentCollector<T> { - collector: TopSegmentCollector<u64>, - reader: FastFieldReader<u64>, - _type: PhantomData<T>, -} - -impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector - for TopFieldSegmentCollector<T> -{ - type Fruit = Vec<(T, DocAddress)>; - - fn collect(&mut self, doc: u32, _score: f32) { - let field_value = self.reader.get(doc); - self.collector.collect(doc, field_value); - } - - fn harvest(self) -> Vec<(T, DocAddress)> { - self.collector - .harvest() - .into_iter() - .map(|(val, doc_address)| (T::from_u64(val), doc_address)) - .collect() - } -} - -#[cfg(test)] -mod tests { - use super::TopDocsByField; - use crate::collector::Collector; - use crate::collector::TopDocs; - use crate::query::Query; - use crate::query::QueryParser; - use crate::schema::Field; - use crate::schema::IntOptions; - use crate::schema::{Schema, FAST, TEXT}; - use crate::DocAddress; - use crate::Index; - use crate::IndexWriter; - use crate::TantivyError; - use matches::assert_matches; - - const TITLE: &str = "title"; - const SIZE: &str = "size"; - - #[test] - fn test_top_collector_not_at_capacity() { - let mut schema_builder = Schema::builder(); - let title = schema_builder.add_text_field(TITLE, TEXT); - let size = schema_builder.add_u64_field(SIZE, FAST); - let schema = schema_builder.build(); - let (index, query) = index("beer", title, schema, |index_writer| { - index_writer.add_document(doc!( - title => "bottle of beer", - size => 12u64, - )); - index_writer.add_document(doc!( - title => "growler of beer", - size => 64u64, - )); - index_writer.add_document(doc!( - title => "pint of beer", - size => 16u64, - )); - }); - let searcher = index.reader().unwrap().searcher(); - - let top_collector = TopDocs::with_limit(4).order_by_field(size); - let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap(); - assert_eq!( - top_docs, - vec![ - (64, DocAddress(0, 1)), - (16, DocAddress(0, 2)), - (12, DocAddress(0, 0)) - 
] - ); - } - - #[test] - #[should_panic] - fn test_field_does_not_exist() { - let mut schema_builder = Schema::builder(); - let title = schema_builder.add_text_field(TITLE, TEXT); - let size = schema_builder.add_u64_field(SIZE, FAST); - let schema = schema_builder.build(); - let (index, _) = index("beer", title, schema, |index_writer| { - index_writer.add_document(doc!( - title => "bottle of beer", - size => 12u64, - )); - }); - let searcher = index.reader().unwrap().searcher(); - let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2)); - let segment_reader = searcher.segment_reader(0u32); - top_collector - .for_segment(0, segment_reader) - .expect("should panic"); - } - - #[test] - fn test_field_not_fast_field() { - let mut schema_builder = Schema::builder(); - let title = schema_builder.add_text_field(TITLE, TEXT); - let size = schema_builder.add_u64_field(SIZE, IntOptions::default()); - let schema = schema_builder.build(); - let (index, _) = index("beer", title, schema, |index_writer| { - index_writer.add_document(doc!( - title => "bottle of beer", - size => 12u64, - )); - }); - let searcher = index.reader().unwrap().searcher(); - let segment = searcher.segment_reader(0); - let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size); - assert_matches!( - top_collector - .for_segment(0, segment) - .map(|_| ()) - .unwrap_err(), - TantivyError::SchemaError(_) - ); - } - - fn index( - query: &str, - query_field: Field, - schema: Schema, - mut doc_adder: impl FnMut(&mut IndexWriter) -> (), - ) -> (Index, Box<dyn Query>) { - let index = Index::create_in_ram(schema); - - let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); - doc_adder(&mut index_writer); - index_writer.commit().unwrap(); - let query_parser = QueryParser::for_index(&index, vec![query_field]); - let query = query_parser.parse_query(query).unwrap(); - (index, query) - } -} diff --git a/src/collector/top_score_collector.rs 
b/src/collector/top_score_collector.rs index bc247d4..c9b03d0 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -1,9 +1,11 @@ use super::Collector; +use crate::collector::custom_score_top_collector::CustomScoreTopCollector; use crate::collector::top_collector::TopCollector; use crate::collector::top_collector::TopSegmentCollector; -use crate::collector::SegmentCollector; -use crate::collector::TopDocsByField; -use crate::fastfield::FastValue; +use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector; +use crate::collector::{ + CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector, +}; use crate::schema::Field; use crate::DocAddress; use crate::DocId; @@ -77,13 +79,311 @@ impl TopDocs { /// Set top-K to rank documents by a given fast field. /// - /// (By default, `TopDocs` collects the top-K documents sorted by - /// the similarity score.) - pub fn order_by_field<T: PartialOrd + FastValue + Clone>( + /// ```rust + /// #[macro_use] + /// extern crate tantivy; + /// # use tantivy::schema::{Schema, FAST, TEXT}; + /// # use tantivy::{Index, Result, DocAddress}; + /// # use tantivy::query::{Query, QueryParser}; + /// use tantivy::Searcher; + /// use tantivy::collector::TopDocs; + /// use tantivy::schema::Field; + /// + /// # fn main() -> tantivy::Result<()> { + /// # let mut schema_builder = Schema::builder(); + /// # let title = schema_builder.add_text_field("title", TEXT); + /// # let rating = schema_builder.add_u64_field("rating", FAST); + /// # let schema = schema_builder.build(); + /// # + /// # let index = Index::create_in_ram(schema); + /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; + /// # index_writer.add_document(doc!( + /// # title => "The Name of the Wind", + /// # rating => 92u64, + /// # )); + /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64)); + /// # index_writer.add_document(doc!(title => "A Dairy 
Cow", rating => 63u64)); + /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64)); + /// # index_writer.commit()?; + /// # let reader = index.reader()?; + /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?; + /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?; + /// # assert_eq!(top_docs, + /// # vec![(97u64, DocAddress(0u32, 1)), + /// # (80u64, DocAddress(0u32, 3))]); + /// # Ok(()) + /// # } + /// + /// + /// /// Searches the document matching the given query, and + /// /// collects the top 10 documents, order by the u64-`field` + /// /// given in argument. + /// /// + /// /// `field` is required to be a FAST field. + /// fn docs_sorted_by_rating(searcher: &Searcher, + /// query: &Query, + /// sort_by_field: Field) + /// -> Result<Vec<(u64, DocAddress)>> { + /// + /// // This is where we build our topdocs collector + /// // + /// // Note the generics parameter that needs to match the + /// // type `sort_by_field`. + /// let top_docs_by_rating = TopDocs + /// ::with_limit(10) + /// .order_by_u64_field(sort_by_field); + /// + /// // ... and here are our documents. Note this is a simple vec. + /// // The `u64` in the pair is the value of our fast field for + /// // each documents. + /// // + /// // The vec is sorted decreasingly by `sort_by_field`, and has a + /// // length of 10, or less if not enough documents matched the + /// // query. + /// let resulting_docs: Vec<(u64, DocAddress)> = + /// searcher.search(query, &top_docs_by_rating)?; + /// + /// Ok(resulting_docs) + /// } + /// ``` + /// + /// # Panics + /// + /// May panic if the field requested is not a fast field. 
+ /// + pub fn order_by_u64_field( self, field: Field, - ) -> TopDocsByField<T> { - TopDocsByField::new(field, self.0.limit()) + ) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> { + self.custom_score(move |segment_reader: &SegmentReader| { + let ff_reader = segment_reader + .fast_fields() + .u64(field) + .expect("Field requested is not a i64/u64 fast field."); + move |doc: DocId| ff_reader.get(doc) + }) + } + + /// Ranks the documents using a custom score. + /// + /// This method offers a convenient way to tweak or replace + /// the documents' score. As suggested by the prototype you can + /// manually define your own [`ScoreTweaker`](./trait.ScoreTweaker.html) + /// and pass it as an argument, but there is a much simpler way to + /// tweak your score: you can use a closure as in the following + /// example. + /// + /// # Example + /// + /// Typically, you will want to rely on one or more fast fields, + /// to alter the original relevance `Score`. + /// + /// For instance, in the following, we assume that we are implementing + /// an e-commerce website that has a fast field called `popularity` + /// that rates whether a product is typically often bought by users. + /// + /// In the following example we will tweak our ranking a bit by + /// boosting popular products a notch.
+ /// + /// In a more serious application, this tweaking could involve running a + /// learning-to-rank model over various features. + /// + /// ```rust + /// #[macro_use] + /// extern crate tantivy; + /// # use tantivy::schema::{Schema, FAST, TEXT}; + /// # use tantivy::{Index, DocAddress, DocId, Score}; + /// # use tantivy::query::QueryParser; + /// use tantivy::SegmentReader; + /// use tantivy::collector::TopDocs; + /// use tantivy::schema::Field; + /// + /// # fn create_schema() -> Schema { + /// # let mut schema_builder = Schema::builder(); + /// # schema_builder.add_text_field("product_name", TEXT); + /// # schema_builder.add_u64_field("popularity", FAST); + /// # schema_builder.build() + /// # } + /// # + /// # fn main() -> tantivy::Result<()> { + /// # let schema = create_schema(); + /// # let index = Index::create_in_ram(schema); + /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; + /// # let product_name = index.schema().get_field("product_name").unwrap(); + /// # + /// let popularity: Field = index.schema().get_field("popularity").unwrap(); + /// # index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64)); + /// # index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64)); + /// # index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64)); + /// # index_writer.commit()?; + /// // ... + /// # let user_query = "diary"; + /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?; + /// + /// // This is where we build our collector with our custom score. + /// let top_docs_by_custom_score = TopDocs + /// ::with_limit(10) + /// .tweak_score(move |segment_reader: &SegmentReader| { + /// // The argument is a function that returns our scoring + /// // function. + /// // + /// // The point of this "mother" function is to gather all + /// // of the segment level information we need for scoring.
+ /// // Typically, fast_fields. + /// // + /// // In our case, we will get a reader for the popularity + /// // fast field. + /// let popularity_reader = + /// segment_reader.fast_fields().u64(popularity).unwrap(); + /// + /// // We can now define our actual scoring function + /// move |doc: DocId, original_score: Score| { + /// let popularity: u64 = popularity_reader.get(doc); + /// // Well.. For the sake of the example we use a simple logarithm + /// // function. + /// let popularity_boost_score = ((2u64 + popularity) as f32).log2(); + /// popularity_boost_score * original_score + /// } + /// }); + /// # let reader = index.reader()?; + /// # let searcher = reader.searcher(); + /// // ... and here are our documents. Note this is a simple vec. + /// // The `Score` in the pair is our tweaked score. + /// let resulting_docs: Vec<(Score, DocAddress)> = + /// searcher.search(&*query, &top_docs_by_custom_score)?; + /// + /// # Ok(()) + /// # } + /// ``` + /// + /// # See also + /// [custom_score(...)](#method.custom_score). + pub fn tweak_score<TScore, TScoreSegmentTweaker, TScoreTweaker>( + self, + score_tweaker: TScoreTweaker, + ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> + where + TScore: 'static + Send + Sync + Clone + PartialOrd, + TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static, + TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>, + { + TweakedScoreTopCollector::new(score_tweaker, self.0.limit()) + } + + /// Ranks the documents using a custom score. + /// + /// This method offers a convenient way to use a different score. + /// + /// As suggested by the prototype you can manually define your + /// own [`CustomScorer`](./trait.CustomScorer.html) + /// and pass it as an argument, but there is a much simpler way to + /// tweak your score: you can use a closure as in the following + /// example. 
+ /// + /// # Limitation + /// + /// This method only makes it possible to compute the score from a given + /// `DocId`, fastfield values for the doc and any information you could + /// have precomputed beforehand. It does not make it possible for instance + /// to compute something like TfIdf as it does not have access to the list of query + /// terms present in the document, nor the term frequencies for the different terms. + /// + /// It can be used if your search engine relies on a learning-to-rank model for instance, + /// which does not rely on the term frequencies or positions as features. + /// + /// # Example + /// + /// ```rust + /// # #[macro_use] + /// # extern crate tantivy; + /// # use tantivy::schema::{Schema, FAST, TEXT}; + /// # use tantivy::{Index, DocAddress, DocId}; + /// # use tantivy::query::QueryParser; + /// use tantivy::SegmentReader; + /// use tantivy::collector::TopDocs; + /// use tantivy::schema::Field; + /// + /// # fn create_schema() -> Schema { + /// # let mut schema_builder = Schema::builder(); + /// # schema_builder.add_text_field("product_name", TEXT); + /// # schema_builder.add_u64_field("popularity", FAST); + /// # schema_builder.add_u64_field("boosted", FAST); + /// # schema_builder.build() + /// # } + /// # + /// # fn main() -> tantivy::Result<()> { + /// # let schema = create_schema(); + /// # let index = Index::create_in_ram(schema); + /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; + /// # let product_name = index.schema().get_field("product_name").unwrap(); + /// # + /// let popularity: Field = index.schema().get_field("popularity").unwrap(); + /// let boosted: Field = index.schema().get_field("boosted").unwrap(); + /// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64)); + /// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64)); + /// # index_writer.add_document(doc!(boosted=>0u64, 
product_name => "The Diary of a Young Girl", popularity => 15u64)); + /// # index_writer.commit()?; + /// // ... + /// # let user_query = "diary"; + /// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?; + /// + /// // This is where we build our collector with our custom score. + /// let top_docs_by_custom_score = TopDocs + /// ::with_limit(10) + /// .custom_score(move |segment_reader: &SegmentReader| { + /// // The argument is a function that returns our scoring + /// // function. + /// // + /// // The point of this "mother" function is to gather all + /// // of the segment level information we need for scoring. + /// // Typically, fast_fields. + /// // + /// // In our case, we will get a reader for the popularity + /// // fast field and a boosted field. + /// // + /// // We want boosted items to rank first, and when we get + /// // a tie, return the item with the highest popularity. + /// // + /// // Note that this is implemented by using a `(u64, u64)` + /// // as a score. + /// let popularity_reader = + /// segment_reader.fast_fields().u64(popularity).unwrap(); + /// let boosted_reader = + /// segment_reader.fast_fields().u64(boosted).unwrap(); + /// + /// // We can now define our actual scoring function + /// move |doc: DocId| { + /// let popularity: u64 = popularity_reader.get(doc); + /// let boosted: u64 = boosted_reader.get(doc); + /// // Scores do not have to be `f32` in tantivy. + /// // Here we return a pair to get lexicographical order + /// // for free. + /// (boosted, popularity) + /// } + /// }); + /// # let reader = index.reader()?; + /// # let searcher = reader.searcher(); + /// // ... and here are our documents. Note this is a simple vec. + /// // The `(u64, u64)` in the pair is our custom score.
+ /// let resulting_docs: Vec<((u64, u64), DocAddress)> = + /// searcher.search(&*query, &top_docs_by_custom_score)?; + /// + /// # Ok(()) + /// # } + /// ``` + /// + /// # See also + /// [tweak_score(...)](#method.tweak_score). + pub fn custom_score<TScore, TCustomSegmentScorer, TCustomScorer>( + self, + custom_score: TCustomScorer, + ) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>> + where + TScore: 'static + Send + Sync + Clone + PartialOrd, + TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static, + TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>, + { + CustomScoreTopCollector::new(custom_score, self.0.limit()) } } @@ -128,11 +428,12 @@ impl SegmentCollector for TopScoreSegmentCollector { #[cfg(test)] mod tests { use super::TopDocs; - use crate::query::QueryParser; - use crate::schema::Schema; - use crate::schema::TEXT; + use crate::collector::Collector; + use crate::query::{Query, QueryParser}; + use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::DocAddress; use crate::Index; + use crate::IndexWriter; use crate::Score; fn make_index() -> Index { @@ -200,4 +501,97 @@ mod tests { TopDocs::with_limit(0); } + const TITLE: &str = "title"; + const SIZE: &str = "size"; + + #[test] + fn test_top_field_collector_not_at_capacity() { + let mut schema_builder = Schema::builder(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, query) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + index_writer.add_document(doc!( + title => "growler of beer", + size => 64u64, + )); + index_writer.add_document(doc!( + title => "pint of beer", + size => 16u64, + )); + }); + let searcher = index.reader().unwrap().searcher(); + + let top_collector = TopDocs::with_limit(4).order_by_u64_field(size); + let top_docs: Vec<(u64, DocAddress)> = 
searcher.search(&query, &top_collector).unwrap(); + assert_eq!( + top_docs, + vec![ + (64, DocAddress(0, 1)), + (16, DocAddress(0, 2)), + (12, DocAddress(0, 0)) + ] + ); + } + + #[test] + #[should_panic] + fn test_field_does_not_exist() { + let mut schema_builder = Schema::builder(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, _) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + }); + let searcher = index.reader().unwrap().searcher(); + let top_collector = TopDocs::with_limit(4).order_by_u64 |