summaryrefslogtreecommitdiffstats
path: root/examples/iterating_docs_and_positions.rs
blob: 0be84ec0596b70cd48057caf04f7cc64f7aa9ecb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// # Iterating docs and positions.
//
// At its core, tantivy relies on a data structure
// called an inverted index.
//
// This example shows how to manually iterate through
// the list of documents containing a term, getting
// its term frequency, and accessing its positions.

// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocId, DocSet, Index, Postings};

// Builds a small in-RAM index of three titles, then walks the posting list of
// the term `title:the` twice: once document by document (reading term
// frequencies and positions), and once block by block (reading doc ids only).
//
// Returns an error if creating the writer, committing, or opening the reader fails.
fn main() -> tantivy::Result<()> {
    // We first create a schema for the sake of the
    // example. Check the `basic_search` example for more information.
    let mut schema_builder = Schema::builder();

    // For this example, we need to make sure to index positions for our title
    // field. `TEXT` precisely does this.
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    // The schema is not needed afterwards, so it is moved into the index
    // rather than cloned.
    let index = Index::create_in_ram(schema);

    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.add_document(doc!(title => "The modern Promotheus"));
    index_writer.commit()?;

    let reader = index.reader()?;

    let searcher = reader.searcher();

    // A `Term` is a text token associated with a field.
    // We will go through all docs containing the term `title:the`.
    //
    // The term does not depend on the segment, so it is built once here and
    // shared by both iteration styles below.
    let term_the = Term::from_field_text(title, "the");

    // A tantivy index is actually a collection of segments.
    // Similarly, a searcher just wraps a list of `segment_reader`s.
    //
    // (Because we indexed a very small number of documents over one thread
    // there is actually only one segment here, but let's iterate through the list
    // anyway)
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structures.
        // Inverted index stands for the combination of
        // - the term dictionary
        // - the inverted lists associated with each term and their positions
        let inverted_index = segment_reader.inverted_index(title);

        // This segment postings object is like a cursor over the documents matching the term.
        // The `IndexRecordOption` argument tells tantivy we will be interested in both term
        // frequencies and positions.
        //
        // If you don't need all this information, you may get better performance by
        // decompressing less information.
        if let Some(mut segment_postings) =
            inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
        {
            // This buffer is reused across documents to receive the positions.
            let mut positions: Vec<u32> = Vec::with_capacity(100);
            while segment_postings.advance() {
                // The id of the document the cursor currently points at.
                // Do not try to access this before calling `advance` once.
                let doc_id: DocId = segment_postings.doc();

                // The posting list MAY contain deleted documents as well; skip them.
                if segment_reader.is_deleted(doc_id) {
                    continue;
                }

                // The number of times the term appears in the document.
                let term_freq: u32 = segment_postings.term_freq();
                // Accessing positions is slightly expensive and lazy, do not request
                // them if you don't need them for some documents.
                segment_postings.positions(&mut positions);

                // By definition we should have `term_freq` positions.
                assert_eq!(positions.len(), term_freq as usize);

                // This prints:
                // ```
                // Doc 0: TermFreq 2: [0, 4]
                // Doc 2: TermFreq 1: [0]
                // ```
                println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
            }
        }
    }

    // Some other powerful operations (especially `.skip_to`) may be useful to consume these
    // posting lists rapidly.
    // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
    // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait

    // Also, for some VERY specific high performance use case like an OLAP analysis of logs,
    // you can get better performance by accessing directly the blocks of doc ids.
    for segment_reader in searcher.segment_readers() {
        // Same per-segment inverted index as in the first loop.
        let inverted_index = segment_reader.inverted_index(title);

        // This time we read the posting list one block of doc ids at a time.
        // We only ask for `IndexRecordOption::Basic`, since nothing but the doc ids
        // is read below.
        if let Some(mut block_segment_postings) =
            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
        {
            while block_segment_postings.advance() {
                // Once again these docs MAY contain deleted documents as well.
                let docs = block_segment_postings.docs();
                // Prints `Docs [0, 2].`
                println!("Docs {:?}", docs);
            }
        }
    }

    Ok(())
}