summary refs log tree commit diff stats
path: root/src/fieldnorm
diff options
context:
space:
mode:
author Paul Masurel <paul.masurel@gmail.com> 2018-03-28 08:28:49 +0900
committer Paul Masurel <paul.masurel@gmail.com> 2018-03-28 08:28:49 +0900
commit 8006f1df11c71ef1262ade9a682f0a850353b910 (patch)
tree 9104ef723eda52858a3689e7d349c12cfcb8fb33 /src/fieldnorm
parent ffa03bad71bbbf2368cc852304fc06ecaea7015a (diff)
Added comments
Diffstat (limited to 'src/fieldnorm')
-rw-r--r-- src/fieldnorm/mod.rs 21
-rw-r--r-- src/fieldnorm/reader.rs 32
2 files changed, 52 insertions, 1 deletions
diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs
index c1a28e0..45fa921 100644
--- a/src/fieldnorm/mod.rs
+++ b/src/fieldnorm/mod.rs
@@ -1,3 +1,21 @@
+//! The fieldnorm represents the length associated to
+//! a given Field of a given document.
+//!
+//! This metric is important to compute the score of a
+//! document: a document having a query word in one of its short fields
+//! (e.g. title) is likely to be more relevant than in one of its longer fields
+//! (e.g. body).
+//!
+//! It encodes `fieldnorm` on one byte with some precision loss,
+//! using the exact same scheme as Lucene. Each value is placed on a log-scale
+//! that takes values from `0` to `255`.
+//!
+//! A value on this scale is identified by a `fieldnorm_id`.
+//! Apart from compression, this scale also makes it possible to
+//! precompute computationally expensive functions of the fieldnorm
+//! in a very short array.
+//!
+//! This trick is used by the BM25 similarity.
mod code;
mod serializer;
mod writer;
@@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader;
pub use self::writer::FieldNormsWriter;
pub use self::serializer::FieldNormsSerializer;
-use self::code::{fieldnorm_to_id, id_to_fieldnorm}; \ No newline at end of file
+use self::code::{fieldnorm_to_id, id_to_fieldnorm};
+
diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs
index a097dd2..982eb1f 100644
--- a/src/fieldnorm/reader.rs
+++ b/src/fieldnorm/reader.rs
@@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id};
use directory::ReadOnlySource;
use DocId;
+
+/// Reads the fieldnorm associated to a document.
+/// The fieldnorm represents the length associated to
+/// a given Field of a given document.
+///
+/// This metric is important to compute the score of a
+/// document: a document having a query word in one of its short fields
+/// (e.g. title) is likely to be more relevant than in one of its longer fields
+/// (e.g. body).
+///
+/// tantivy encodes `fieldnorm` on one byte with some precision loss,
+/// using the same scheme as Lucene. Each value is placed on a log-scale
+/// that takes values from `0` to `255`.
+///
+/// A value on this scale is identified by a `fieldnorm_id`.
+/// Apart from compression, this scale also makes it possible to
+/// precompute computationally expensive functions of the fieldnorm
+/// in a very short array.
pub struct FieldNormReader {
data: ReadOnlySource
}
impl FieldNormReader {
+ /// Opens a field norm reader given its data source.
pub fn open(data: ReadOnlySource) -> Self {
FieldNormReader {
data
}
}
+ /// Returns the `fieldnorm` associated to a doc id.
+ /// The fieldnorm is a value approximating the number
+ /// of tokens in a given field of the `doc_id`.
+ ///
+ /// It is imprecise, and never greater than the actual
+ /// number of tokens.
+ ///
+ /// The fieldnorm is effectively decoded from the
+ /// `fieldnorm_id` by doing a simple table lookup.
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
let fieldnorm_id = self.fieldnorm_id(doc_id);
id_to_fieldnorm(fieldnorm_id)
}
+ /// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
let fielnorms_data = self.data.as_slice();
fielnorms_data[doc_id as usize]
}
+ /// Converts a `fieldnorm_id` into a fieldnorm.
#[inline(always)]
pub fn id_to_fieldnorm(id: u8) -> u32 {
id_to_fieldnorm(id)
}
+ /// Converts a `fieldnorm` into a `fieldnorm_id`.
+ /// (This function is not injective).
#[inline(always)]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
fieldnorm_to_id(fieldnorm)