summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.travis.yml2
-rw-r--r--CHANGELOG.md13
-rw-r--r--Cargo.toml16
-rw-r--r--Makefile3
-rw-r--r--ci/script.sh5
-rw-r--r--examples/basic_search.rs40
-rw-r--r--examples/custom_collector.rs5
-rw-r--r--examples/custom_tokenizer.rs5
-rw-r--r--examples/deleting_updating_documents.rs5
-rw-r--r--examples/faceted_search.rs9
-rw-r--r--examples/integer_range_search.rs6
-rw-r--r--examples/iterating_docs_and_positions.rs5
-rw-r--r--examples/multiple_producer.rs15
-rw-r--r--examples/snippet.rs9
-rw-r--r--examples/stop_words.rs4
-rw-r--r--query-grammar/Cargo.toml16
-rw-r--r--query-grammar/src/lib.rs17
-rw-r--r--query-grammar/src/occur.rs (renamed from src/query/occur.rs)43
-rw-r--r--query-grammar/src/query_grammar.rs380
-rw-r--r--query-grammar/src/user_input_ast.rs (renamed from src/query/query_parser/user_input_ast.rs)56
-rw-r--r--src/collector/count_collector.rs6
-rw-r--r--src/collector/facet_collector.rs6
-rw-r--r--src/collector/mod.rs1
-rw-r--r--src/collector/multi_collector.rs6
-rw-r--r--src/collector/top_score_collector.rs30
-rw-r--r--src/common/composite_file.rs7
-rw-r--r--src/common/counting_writer.rs9
-rw-r--r--src/common/mod.rs19
-rw-r--r--src/core/index.rs36
-rw-r--r--src/core/index_meta.rs1
-rw-r--r--src/core/segment_id.rs55
-rw-r--r--src/directory/directory.rs8
-rw-r--r--src/directory/footer.rs213
-rw-r--r--src/directory/managed_directory.rs101
-rw-r--r--src/directory/mmap_directory.rs17
-rw-r--r--src/directory/mod.rs36
-rw-r--r--src/directory/ram_directory.rs11
-rw-r--r--src/fastfield/multivalued/writer.rs4
-rw-r--r--src/fastfield/writer.rs7
-rw-r--r--src/indexer/index_writer.rs14
-rw-r--r--src/indexer/segment_updater.rs6
-rw-r--r--src/indexer/segment_writer.rs2
-rwxr-xr-xsrc/lib.rs25
-rw-r--r--src/macros.rs6
-rw-r--r--src/postings/postings_writer.rs7
-rw-r--r--src/postings/serializer.rs5
-rw-r--r--src/query/automaton_weight.rs18
-rw-r--r--src/query/fuzzy_query.rs30
-rw-r--r--src/query/intersection.rs2
-rw-r--r--src/query/intersection_two.rs2
-rw-r--r--src/query/mod.rs3
-rw-r--r--src/query/query_parser/logical_ast.rs1
-rw-r--r--src/query/query_parser/mod.rs2
-rw-r--r--src/query/query_parser/query_grammar.rs284
-rw-r--r--src/query/query_parser/query_parser.rs127
-rw-r--r--src/query/query_parser/stemmer.rs44
-rw-r--r--src/query/range_query.rs57
-rw-r--r--src/query/regex_query.rs102
-rw-r--r--src/query/term_query/mod.rs14
-rw-r--r--src/query/term_query/term_query.rs15
-rw-r--r--src/query/union.rs2
-rw-r--r--src/reader/mod.rs3
-rw-r--r--src/reader/pool.rs4
-rw-r--r--src/schema/facet.rs4
-rw-r--r--src/schema/field_entry.rs4
-rw-r--r--src/schema/field_type.rs19
-rw-r--r--src/schema/index_record_option.rs16
-rw-r--r--src/schema/schema.rs114
-rw-r--r--src/schema/term.rs27
-rw-r--r--src/schema/value.rs10
-rw-r--r--src/snippet/mod.rs55
-rw-r--r--src/store/writer.rs3
-rw-r--r--src/tokenizer/alphanum_only.rs3
-rw-r--r--src/tokenizer/mod.rs12
-rw-r--r--src/tokenizer/ngram_tokenizer.rs3
-rw-r--r--src/tokenizer/remove_long.rs3
-rw-r--r--src/tokenizer/stop_word_filter.rs3
-rw-r--r--src/tokenizer/tokenizer.rs55
-rw-r--r--src/tokenizer/tokenizer_manager.rs13
-rw-r--r--tests/failpoints/mod.rs6
80 files changed, 1489 insertions, 863 deletions
diff --git a/.travis.yml b/.travis.yml
index 9fc2578..c4c59cd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -47,6 +47,7 @@ matrix:
before_install:
- set -e
- rustup self update
+ - rustup component add rustfmt
install:
- sh ci/install.sh
@@ -60,6 +61,7 @@ before_script:
script:
- bash ci/script.sh
+ - cargo fmt --all -- --check
before_deploy:
- sh ci/before_deploy.sh
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f74923..a0df3a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,19 @@ Tantivy 0.11.0
=====================
- Added f64 field. Internally reuse u64 code the same way i64 does (@fdb-hiroshima)
+- Various bugfixes in the query parser.
+ - Better handling of hyphens in query parser. (#609)
+ - Better handling of whitespace.
+- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types e.g. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
+- API change around `Box<BoxableTokenizer>`. See detail in #629
+- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
+- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
+
+## How to update?
+
+- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
+- Regexes are now compiled when the `RegexQuery` instance is built. As a result, building a
+`RegexQuery` can now return an error, and handling the `Result` is required.
Tantivy 0.10.2
diff --git a/Cargo.toml b/Cargo.toml
index d422534..6528cf6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
-version = "0.10.2"
+version = "0.11.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -15,8 +15,9 @@ edition = "2018"
[dependencies]
base64 = "0.10.0"
byteorder = "1.0"
-once_cell = "0.2"
-regex = "1.0"
+crc32fast = "1.2.0"
+once_cell = "1.0"
+regex ={version = "1.3.0", default-features = false, features = ["std"]}
tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
@@ -24,8 +25,6 @@ snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0"
log = "0.4"
-combine = ">=3.6.0,<4.0.0"
-tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
@@ -36,13 +35,14 @@ levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
-crossbeam = "0.5"
+crossbeam = "0.7"
futures = "0.1"
futures-cpupool = "0.1"
owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
+tantivy-query-grammar = { path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.2"
fnv = "1.0.6"
@@ -81,6 +81,9 @@ failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
+[workspace]
+members = ["query-grammar"]
+
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }
@@ -88,7 +91,6 @@ travis-ci = { repository = "tantivy-search/tantivy" }
version = "0.3"
features = ["failpoints"]
-
# Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points
# in a different binary.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..05f0f44
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,3 @@
+test:
+ echo "Run test only... No examples."
+ cargo test --tests --lib
diff --git a/ci/script.sh b/ci/script.sh
index 9f3cf88..87f8902 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -7,7 +7,7 @@ set -ex
main() {
if [ ! -z $CODECOV ]; then
echo "Codecov"
- cargo build --verbose && cargo coverage --verbose && bash <(curl -s https://codecov.io/bash) -s target/kcov
+ cargo build --verbose && cargo coverage --verbose --all && bash <(curl -s https://codecov.io/bash) -s target/kcov
else
echo "Build"
cross build --target $TARGET
@@ -15,7 +15,8 @@ main() {
return
fi
echo "Test"
- cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
+ cross test --target $TARGET --no-default-features --features mmap
+ cross test --target $TARGET --no-default-features --features mmap query-grammar
fi
for example in $(ls examples/*.rs)
do
diff --git a/examples/basic_search.rs b/examples/basic_search.rs
index 416f86f..c8ac36c 100644
--- a/examples/basic_search.rs
+++ b/examples/basic_search.rs
@@ -5,26 +5,23 @@
//
// We will :
// - define our schema
-// = create an index in a directory
-// - index few documents in our index
-// - search for the best document matchings "sea whale"
-// - retrieve the best document original content.
+// - create an index in a directory
+// - index a few documents into our index
+// - search for the best document matching a basic query
+// - retrieve the best document's original content.
// ---
// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
-use tantivy::Index;
-use tantivy::ReloadPolicy;
-use tempdir::TempDir;
+use tantivy::{doc, Index, ReloadPolicy};
+use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the
// sake of this example
- let index_path = TempDir::new("tantivy_example_dir")?;
+ let index_path = TempDir::new()?;
// # Defining the schema
//
@@ -33,7 +30,7 @@ fn main() -> tantivy::Result<()> {
// and for each field, its type and "the way it should
// be indexed".
- // first we need to define a schema ...
+ // First we need to define a schema ...
let mut schema_builder = Schema::builder();
// Our first field is title.
@@ -48,7 +45,7 @@ fn main() -> tantivy::Result<()> {
//
// `STORED` means that the field will also be saved
// in a compressed, row-oriented key-value store.
- // This store is useful to reconstruct the
+ // This store is useful for reconstructing the
// documents that were selected during the search phase.
schema_builder.add_text_field("title", TEXT | STORED);
@@ -57,8 +54,7 @@ fn main() -> tantivy::Result<()> {
// need to be able to retrieve it
// for our application.
//
- // We can make our index lighter and
- // by omitting `STORED` flag.
+ // We can make our index lighter by omitting the `STORED` flag.
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
@@ -71,7 +67,7 @@ fn main() -> tantivy::Result<()> {
// with our schema in the directory.
let index = Index::create_in_dir(&index_path, schema.clone())?;
- // To insert document we need an index writer.
+ // To insert a document we will need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
@@ -149,8 +145,8 @@ fn main() -> tantivy::Result<()> {
// At this point our documents are not searchable.
//
//
- // We need to call .commit() explicitly to force the
- // index_writer to finish processing the documents in the queue,
+ // We need to call `.commit()` explicitly to force the
+ // `index_writer` to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
// the existence of new documents.
//
@@ -162,14 +158,14 @@ fn main() -> tantivy::Result<()> {
// persistently indexed.
//
// In the scenario of a crash or a power failure,
- // tantivy behaves as if has rolled back to its last
+ // tantivy behaves as if it has rolled back to its last
// commit.
// # Searching
//
// ### Searcher
//
- // A reader is required to get search the index.
+ // A reader is required first in order to search an index.
// It acts as a `Searcher` pool that reloads itself,
// depending on a `ReloadPolicy`.
//
@@ -185,7 +181,7 @@ fn main() -> tantivy::Result<()> {
// We now need to acquire a searcher.
//
- // A searcher points to snapshotted, immutable version of the index.
+ // A searcher points to a snapshotted, immutable version of the index.
//
// Some search experience might require more than
// one query. Using the same searcher ensures that all of these queries will run on the
@@ -205,7 +201,7 @@ fn main() -> tantivy::Result<()> {
// in both title and body.
let query_parser = QueryParser::for_index(&index, vec![title, body]);
- // QueryParser may fail if the query is not in the right
+ // `QueryParser` may fail if the query is not in the right
// format. For user facing applications, this can be a problem.
// A ticket has been opened regarding this problem.
let query = query_parser.parse_query("sea whale")?;
@@ -221,7 +217,7 @@ fn main() -> tantivy::Result<()> {
//
// We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents
- // is the role of the TopDocs.
+ // is the role of the `TopDocs` collector.
// We can now perform our query.
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
diff --git a/examples/custom_collector.rs b/examples/custom_collector.rs
index e63eb9f..c277ede 100644
--- a/examples/custom_collector.rs
+++ b/examples/custom_collector.rs
@@ -9,15 +9,12 @@
// ---
// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::SegmentReader;
-use tantivy::{Index, TantivyError};
+use tantivy::{doc, Index, SegmentReader, TantivyError};
#[derive(Default)]
struct Stats {
diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs
index 5730adb..4db6d10 100644
--- a/examples/custom_tokenizer.rs
+++ b/examples/custom_tokenizer.rs
@@ -2,14 +2,11 @@
//
// In this example, we'll see how to define a tokenizer pipeline
// by aligning a bunch of `TokenFilter`.
-
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
-use tantivy::Index;
+use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// # Defining the schema
diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs
index 82fdd90..1eda6ce 100644
--- a/examples/deleting_updating_documents.rs
+++ b/examples/deleting_updating_documents.rs
@@ -8,13 +8,10 @@
//
// ---
// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
-use tantivy::Index;
-use tantivy::IndexReader;
+use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document
// given its id from our index.
diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs
index 98e0a27..7ac67c3 100644
--- a/examples/faceted_search.rs
+++ b/examples/faceted_search.rs
@@ -12,17 +12,16 @@
// ---
// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::*;
-use tantivy::Index;
+use tantivy::{doc, Index};
+use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the
// sake of this example
- let index_path = TempDir::new("tantivy_facet_example_dir")?;
+ let index_path = TempDir::new()?;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
@@ -74,5 +73,3 @@ fn main() -> tantivy::Result<()> {
Ok(())
}
-
-use tempdir::TempDir;
diff --git a/examples/integer_range_search.rs b/examples/integer_range_search.rs
index dea3145..12edd6e 100644
--- a/examples/integer_range_search.rs
+++ b/examples/integer_range_search.rs
@@ -2,14 +2,10 @@
//
// Below is an example of creating an indexed integer field in your schema
// You can use RangeQuery to get a Count of all occurrences in a given range.
-
-#[macro_use]
-extern crate tantivy;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
-use tantivy::Index;
-use tantivy::Result;
+use tantivy::{doc, Index, Result};
fn run() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs
index 4668de3..0be84ec 100644
--- a/examples/iterating_docs_and_positions.rs
+++ b/examples/iterating_docs_and_positions.rs
@@ -9,11 +9,8 @@
// ---
// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
use tantivy::schema::*;
-use tantivy::Index;
-use tantivy::{DocId, DocSet, Postings};
+use tantivy::{doc, DocId, DocSet, Index, Postings};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
diff --git a/examples/mu