#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] #![cfg_attr(feature = "cargo-clippy", allow(module_inception))] #![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![feature(box_syntax)] #![feature(optin_builtin_traits)] #![feature(conservative_impl_trait)] #![feature(integer_atomics)] #![cfg_attr(test, feature(test))] #![cfg_attr(test, feature(step_by))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![allow(unknown_lints)] #![warn(missing_docs)] //! # `tantivy` //! //! Tantivy is a search engine library. //! Think `Lucene`, but in Rust. //! //! A good place for you to get started is to check out //! the example code ( //! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / //! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs)) #[macro_use] extern crate lazy_static; #[macro_use] extern crate serde_derive; #[macro_use] extern crate log; #[macro_use] extern crate error_chain; #[macro_use] extern crate version; extern crate fst; extern crate byteorder; extern crate memmap; extern crate regex; extern crate tempfile; extern crate atomicwrites; extern crate tempdir; extern crate serde; extern crate bincode; extern crate serde_json; extern crate time; extern crate lz4; extern crate uuid; extern crate num_cpus; extern crate combine; extern crate itertools; extern crate chan; extern crate crossbeam; extern crate bit_set; extern crate futures; extern crate futures_cpupool; extern crate owning_ref; extern crate stable_deref_trait; #[cfg(test)] extern crate env_logger; #[cfg(feature="simdcompression")] extern crate libc; #[cfg(windows)] extern crate winapi; #[cfg(test)] extern crate test; #[cfg(test)] extern crate rand; #[cfg(test)] mod functional_test; #[macro_use] mod macros; pub use error::{Error, ErrorKind, ResultExt}; /// Tantivy result. pub type Result = std::result::Result; mod core; mod compression; mod indexer; mod common; mod error; mod analyzer; mod datastruct; pub mod termdict; /// Row-oriented, slow, compressed storage of documents pub mod store; /// Query module pub mod query; pub mod directory; /// Collector module pub mod collector; /// Postings module (also called inverted index) pub mod postings; /// Schema pub mod schema; pub mod fastfield; pub use directory::Directory; pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher}; pub use indexer::IndexWriter; pub use schema::{Term, Document}; pub use core::SegmentReader; pub use self::common::TimerTree; pub use postings::DocSet; pub use postings::Postings; pub use core::SegmentComponent; pub use postings::SegmentPostingsOption; pub use common::{i64_to_u64, u64_to_i64}; /// Expose the current version of tantivy, as well /// whether it was compiled with the simd compression. pub fn version() -> &'static str { if cfg!(feature = "simdcompression") { concat!(version!(), "-simd") } else { concat!(version!(), "-nosimd") } } /// Defines tantivy's merging strategy pub mod merge_policy { pub use indexer::MergePolicy; pub use indexer::LogMergePolicy; pub use indexer::NoMergePolicy; pub use indexer::DefaultMergePolicy; } /// u32 identifying a document within a segment. /// Documents have their doc id assigned incrementally, /// as they are added in the segment. pub type DocId = u32; /// f32 the score of a document. pub type Score = f32; /// A segment local id identifies a segment. /// It only makes sense for a given searcher. pub type SegmentLocalId = u32; impl DocAddress { /// Return the segment ordinal. /// The segment ordinal is an id identifying the segment /// hosting the document. It is only meaningful, in the context /// of a searcher. pub fn segment_ord(&self) -> SegmentLocalId { self.0 } /// Return the segment local `DocId` pub fn doc(&self) -> DocId { self.1 } } /// `DocAddress` contains all the necessary information /// to identify a document given a `Searcher` object. /// /// It consists in an id identifying its segment, and /// its segment-local `DocId`. /// /// The id used for the segment is actually an ordinal /// in the list of segment hold by a `Searcher`. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct DocAddress(pub SegmentLocalId, pub DocId); #[cfg(test)] mod tests { use collector::tests::TestCollector; use Index; use core::SegmentReader; use query::BooleanQuery; use postings::SegmentPostingsOption; use schema::*; use DocSet; use IndexWriter; use postings::SegmentPostingsOption::FreqAndPositions; use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader}; use Postings; use rand::{XorShiftRng, Rng, SeedableRng}; fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { let seed: &[u32; 4] = &[1, 2, 3, seed_val]; let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); (0..u32::max_value()) .filter(|_| rng.next_f32() < ratio) .take(n) .collect() } pub fn generate_array(n: usize, ratio: f32) -> Vec { generate_array_with_seed(n, ratio, 4) } fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec { let seed: &[u32; 4] = &[1, 2, 3, seed_val]; let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); (0..n).filter(|_| rng.next_f32() < ratio).collect() } pub fn sample(n: u32, ratio: f32) -> Vec { sample_with_seed(n, ratio, 4) } #[test] fn test_indexing() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_from_tempdir(schema).unwrap(); { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af b"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } } #[test] fn test_docfreq() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { index_writer.add_document(doc!(text_field=>"a b c")); index_writer.commit().unwrap(); } { { let doc = doc!(text_field=>"a"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a a"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } { let doc = doc!(text_field=>"c"); index_writer.add_document(doc); index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let term_a = Term::from_field_text(text_field, "a"); assert_eq!(searcher.doc_freq(&term_a), 3); let term_b = Term::from_field_text(text_field, "b"); assert_eq!(searcher.doc_freq(&term_b), 1); let term_c = Term::from_field_text(text_field, "c"); assert_eq!(searcher.doc_freq(&term_c), 2); let term_d = Term::from_field_text(text_field, "d"); assert_eq!(searcher.doc_freq(&term_d), 0); } } #[test] fn test_fieldnorm() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let index = Index::create_in_ram(schema_builder.build()); { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); } { let doc = doc!(); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader: &SegmentReader = searcher.segment_reader(0); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); assert_eq!(fieldnorms_reader.get(0), 3); assert_eq!(fieldnorms_reader.get(1), 0); assert_eq!(fieldnorms_reader.get(2), 2); } } #[test] fn test_delete_postings1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let term_abcd = Term::from_field_text(text_field, "abcd"); let term_a = Term::from_field_text(text_field, "a"); let term_b = Term::from_field_text(text_field, "b"); let term_c = Term::from_field_text(text_field, "c"); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { // 0 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } { // 1 let doc = doc!(text_field=>" a c"); index_writer.add_document(doc); } { // 2 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc); } { // 3 let doc = doc!(text_field=>" b d"); index_writer.add_document(doc); } { index_writer.delete_term(Term::from_field_text(text_field, "c")); } { index_writer.delete_term(Term::from_field_text(text_field, "a")); } { // 4 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc); } { // 5 let doc = doc!(text_field=>" a"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); } } { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { // 0 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } { // 1 index_writer.delete_term(Term::from_field_text(text_field, "c")); } index_writer.rollback().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 5); assert!(!postings.advance()); } { let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); } } { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"a b"); index_writer.add_document(doc); } { index_writer.delete_term(Term::from_field_text(text_field, "c")); } index_writer = index_writer.rollback().unwrap(); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); { let mut postings = reader.read_postings(&term_a, FreqAndPositions).unwrap(); assert!(!postings.advance()); } { let mut postings = reader.read_postings(&term_b, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 3); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); } { let mut postings = reader.read_postings(&term_c, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 4); assert!(!postings.advance()); } } } #[test] fn test_indexed_u64() { let mut schema_builder = SchemaBuilder::default(); let field = schema_builder.add_u64_field("value", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); index_writer.add_document(doc!(field=>1u64)); index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); let term = Term::from_field_u64(field, 1u64); let mut postings = searcher .segment_reader(0) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert!(!postings.advance()); } #[test] fn test_indexed_i64() { let mut schema_builder = SchemaBuilder::default(); let value_field = schema_builder.add_i64_field("value", INT_INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let negative_val = -1i64; index_writer.add_document(doc!(value_field => negative_val)); index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); let term = Term::from_field_i64(value_field, negative_val); let mut postings = searcher .segment_reader(0) .read_postings(&term, SegmentPostingsOption::NoFreq) .unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert!(!postings.advance()); } #[test] fn test_delete_postings2() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); // writing the segment let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); let add_document = |index_writer: &mut IndexWriter, val: &'static str| { let doc = doc!(text_field=>val); index_writer.add_document(doc); }; let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { let delterm = Term::from_field_text(text_field, val); index_writer.delete_term(delterm); }; add_document(&mut index_writer, "63"); add_document(&mut index_writer, "70"); add_document(&mut index_writer, "34"); add_document(&mut index_writer, "1"); add_document(&mut index_writer, "38"); add_document(&mut index_writer, "33"); add_document(&mut index_writer, "40"); add_document(&mut index_writer, "17"); remove_document(&mut index_writer, "38"); remove_document(&mut index_writer, "34"); index_writer.commit().unwrap(); index.load_searchers().unwrap(); let searcher = index.searcher(); assert_eq!(searcher.num_docs(), 6); } #[test] fn test_termfreq() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af af af bc bc"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); let term_abcd = Term::from_field_text(text_field, "abcd"); assert!(reader.read_postings(&term_abcd, FreqAndPositions).is_none()); let term_af = Term::from_field_text(text_field, "af"); let mut postings = reader.read_postings(&term_af, FreqAndPositions).unwrap(); assert!(postings.advance()); assert_eq!(postings.doc(), 0); assert_eq!(postings.term_freq(), 3); assert!(!postings.advance()); } } #[test] fn test_searcher_1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af af af b"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } { index.load_searchers().unwrap(); let searcher = index.searcher(); let get_doc_ids = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); let mut collector = TestCollector::default(); assert!(searcher.search(&query, &mut collector).is_ok()); collector.docs() }; { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]), vec![1, 2]); } { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]), vec![0]); } { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]), vec![0, 1, 2]); } { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "c")]), vec![1, 2]); } { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "d")]), vec![2]); } { assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b"), Term::from_field_text(text_field, "a")]), vec![0, 1, 2]); } } } #[test] fn test_searcher_2() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af b"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); index_writer.add_document(doc); } index_writer.commit().unwrap(); } index.searcher(); } #[test] fn test_doc_macro() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT); let document = doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short"); assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); assert_eq!(values[0].text(), "tantivy"); assert_eq!(values[1].text(), "some other value"); let values = document.get_all(other_text_field); assert_eq!(values.len(), 1); assert_eq!(values[0].text(), "short"); } #[test] fn test_wrong_fast_field_type() { let mut schema_builder = SchemaBuilder::default(); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let text_field = schema_builder.add_text_field("text", TEXT); let stored_int_field = schema_builder.add_u64_field("text", INT_STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); { let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64); index_writer.add_document(document); index_writer.commit().unwrap(); } index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader: &SegmentReader = searcher.segment_reader(0); { let fast_field_reader_res = segment_reader.get_fast_field_reader::(text_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = segment_reader.get_fast_field_reader::(stored_int_field); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_err()); } { let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } { let fast_field_reader_res = segment_reader.get_fast_field_reader::(fast_field_signed); assert!(fast_field_reader_res.is_ok()); let fast_field_reader = fast_field_reader_res.unwrap(); assert_eq!(fast_field_reader.get(0), 4i64) } } }