examples/deleting_updating_documents.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

// # Deleting and Updating (?) documents
//
// This example explains how to delete and update documents.
// Strictly speaking, there is no such thing as an update in tantivy.
//
// To update a document, you need to delete the old version and then
// reinsert its new version.
//
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::IndexReader;

// Helper that looks up a single document by its ISBN term.
//
// Returns `Ok(Some(doc))` for the first hit, `Ok(None)` when nothing
// matches, and forwards any error raised by the search or the doc fetch.
// We use it below to check our work.
fn extract_doc_given_isbn(
    reader: &IndexReader,
    isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
    let searcher = reader.searcher();

    // A term query is the simplest query there is: it matches every
    // document containing one specific term.
    //
    // `IndexRecordOption::Basic` tells tantivy we don't care about decoding
    // positions or term frequencies.
    let query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);

    // One hit is enough: we only want to know whether the id exists.
    let hits = searcher.search(&query, &TopDocs::with_limit(1))?;
    let best_hit = hits.first().map(|(_score, address)| *address);

    match best_hit {
        Some(address) => Ok(Some(searcher.doc(address)?)),
        // no doc matching this ID.
        None => Ok(None),
    }
}

// Walks through the "update = delete + insert" workflow:
// build an in-RAM index keyed by ISBN, index a few books (one with a typo
// in its title), then replace the faulty document and check the result.
fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // Check out the *basic_search* example if this makes
    // little sense to you.
    let mut schema_builder = Schema::builder();

    // Tantivy does not really have a notion of primary id.
    // This may change in the future.
    //
    // Still, we can create an `isbn` field and use it as an id. This
    // field can be `u64` or a `text`, depending on your use case.
    // It just needs to be indexed.
    //
    // If it is `text`, let's make sure to keep it `raw` and let's avoid
    // running any text processing on it.
    // This is done by associating this field to the tokenizer named `raw`.
    // Rather than building our [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually,
    // we use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
    // and untokenized.
    //
    // Because we also want to be able to see this `id` in our returned documents,
    // we also mark the field as stored.
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    // The schema is cloned because we still need it below to render
    // the retrieved documents as JSON.
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;

    // Let's add a couple of documents, for the sake of the example.
    index_writer.add_document(doc!(
        isbn => "978-0099908401",
        title => "The Old Man and the Sea"
    ));
    index_writer.add_document(doc!(
        isbn => "978-0140177398",
        title => "Of Mice and Men",
    ));
    index_writer.add_document(doc!(
        title => "Frankentein", //< Oops there is a typo here.
        isbn => "978-9176370711",
    ));
    index_writer.commit()?;
    let reader = index.reader()?;

    let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");

    // Oops, our frankenstein doc seems misspelled.
    let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_doc_misspelled),
        r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
    );

    // # Update = Delete + Insert
    //
    // Here we will want to fix the typo in the `Frankenstein` book.
    //
    // Tantivy does not handle updates directly, we need to delete
    // and reinsert the document.
    //
    // This can be complicated as it means you need to have access
    // to the entire document. It is good practice to integrate tantivy
    // with a key value store for this reason.
    //
    // To remove one of the documents, we just call `delete_term`
    // on its id.
    //
    // Note that `tantivy` does nothing to enforce the idea that
    // there is only one document associated to this id.
    //
    // Also you might have noticed that we apply the delete before
    // having committed. This does not really matter...
    index_writer.delete_term(frankenstein_isbn.clone());

    // We now need to reinsert our document without the typo.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        isbn => "978-9176370711",
    ));

    // You are guaranteed that your clients will only observe your index in
    // the state it was in after a commit.
    // In this example, your search engine will at no point be missing the *Frankenstein* document.
    // Everything happened as if the document was updated.
    index_writer.commit()?;
    // We reload our searcher to make our change available to clients.
    reader.reload()?;

    // No more typo!
    let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_new_doc),
        r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
    );

    Ok(())
}