diff options
author | Paul Masurel <paul.masurel@gmail.com> | 2018-03-28 08:28:49 +0900 |
---|---|---|
committer | Paul Masurel <paul.masurel@gmail.com> | 2018-03-28 08:28:49 +0900 |
commit | 8006f1df11c71ef1262ade9a682f0a850353b910 (patch) | |
tree | 9104ef723eda52858a3689e7d349c12cfcb8fb33 /src/fieldnorm | |
parent | ffa03bad71bbbf2368cc852304fc06ecaea7015a (diff) |
Added comments
Diffstat (limited to 'src/fieldnorm')
-rw-r--r-- | src/fieldnorm/mod.rs | 21 | ||||
-rw-r--r-- | src/fieldnorm/reader.rs | 32 |
2 files changed, 52 insertions, 1 deletions
diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index c1a28e0..45fa921 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -1,3 +1,21 @@ +//! The fieldnorm represents the length associated to +//! a given Field of a given document. +//! +//! This metric is important to compute the score of a +//! document: a document having a query word in one of its short fields +//! (e.g. title) is likely to be more relevant than in one of its longer fields +//! (e.g. body). +//! +//! It encodes `fieldnorm` on one byte with some precision loss, +//! using the exact same scheme as Lucene. Each value is placed on a log-scale +//! that takes values from `0` to `255`. +//! +//! A value on this scale is identified by a `fieldnorm_id`. +//! Apart from compression, this scale also makes it possible to +//! precompute computationally expensive functions of the fieldnorm +//! in a very short array. +//! +//! This trick is used by the [BM25 similarity](). mod code; mod serializer; mod writer; @@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader; pub use self::writer::FieldNormsWriter; pub use self::serializer::FieldNormsSerializer; -use self::code::{fieldnorm_to_id, id_to_fieldnorm};
\ No newline at end of file +use self::code::{fieldnorm_to_id, id_to_fieldnorm}; + diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index a097dd2..982eb1f 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id}; use directory::ReadOnlySource; use DocId; + +/// Reads the fieldnorm associated to a document. +/// The fieldnorm represents the length associated to +/// a given Field of a given document. +/// +/// This metric is important to compute the score of a +/// document: a document having a query word in one of its short fields +/// (e.g. title) is likely to be more relevant than in one of its longer fields +/// (e.g. body). +/// +/// tantivy encodes `fieldnorm` on one byte with some precision loss, +/// using the same scheme as Lucene. Each value is placed on a log-scale +/// that takes values from `0` to `255`. +/// +/// A value on this scale is identified by a `fieldnorm_id`. +/// Apart from compression, this scale also makes it possible to +/// precompute computationally expensive functions of the fieldnorm +/// in a very short array. pub struct FieldNormReader { data: ReadOnlySource } impl FieldNormReader { + /// Opens a field norm reader given its data source. pub fn open(data: ReadOnlySource) -> Self { FieldNormReader { data } } + /// Returns the `fieldnorm` associated to a doc id. + /// The fieldnorm is a value approximating the number + /// of tokens in a given field of the `doc_id`. + /// + /// It is imprecise, and always lower than the actual + /// number of tokens. + /// + /// The fieldnorm is effectively decoded from the + /// `fieldnorm_id` by doing a simple table lookup. pub fn fieldnorm(&self, doc_id: DocId) -> u32 { let fieldnorm_id = self.fieldnorm_id(doc_id); id_to_fieldnorm(fieldnorm_id) } + /// Returns the `fieldnorm_id` associated to a document. 
#[inline(always)] pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 { let fielnorms_data = self.data.as_slice(); fielnorms_data[doc_id as usize] } + /// Converts a `fieldnorm_id` into a fieldnorm. #[inline(always)] pub fn id_to_fieldnorm(id: u8) -> u32 { id_to_fieldnorm(id) } + /// Converts a `fieldnorm` into a `fieldnorm_id`. + /// (This function is not injective). #[inline(always)] pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 { fieldnorm_to_id(fieldnorm) |