diff options
author | Kornel <kornel@geekhood.net> | 2019-01-19 15:49:35 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2019-01-19 15:49:35 +0000 |
commit | c8a2cedac5a70a346309cf31fc924a7fdce94d48 (patch) | |
tree | 088bcabfc98caf4534fea08925483b4e42518e27 /search_index | |
parent | 35bff1dccf02bbb67a726590c2ff9fd55ff854fc (diff) |
Move dir
Diffstat (limited to 'search_index')
-rw-r--r-- | search_index/Cargo.toml | 15 | ||||
-rw-r--r-- | search_index/README.md | 3 | ||||
-rw-r--r-- | search_index/src/lib_search_index.rs | 185 |
3 files changed, 203 insertions, 0 deletions
diff --git a/search_index/Cargo.toml b/search_index/Cargo.toml new file mode 100644 index 0000000..a3ccde4 --- /dev/null +++ b/search_index/Cargo.toml @@ -0,0 +1,15 @@ +[package] +version = "0.3.1" +edition = "2018" +name = "search_index" +authors = ["Kornel <kornel@geekhood.net>"] + +[lib] +name = "search_index" +path = "src/lib_search_index.rs" + +[dependencies] +tantivy = "0.8" +kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" } +rayon = "1.0.3" +rand = "0.6.1" diff --git a/search_index/README.md b/search_index/README.md new file mode 100644 index 0000000..55006bc --- /dev/null +++ b/search_index/README.md @@ -0,0 +1,3 @@ +# Tantivy index of crates.rs + +This is a part of the [crates.rs project](https://gitlab.com/crates.rs/crates.rs). diff --git a/search_index/src/lib_search_index.rs b/search_index/src/lib_search_index.rs new file mode 100644 index 0000000..5e79afd --- /dev/null +++ b/search_index/src/lib_search_index.rs @@ -0,0 +1,185 @@ +use std::{fs, path::Path}; +use std::cmp::Ordering; +use tantivy::{self, collector::TopDocs, query::QueryParser, schema::*, Index, IndexWriter}; + +const CRATE_SCORE_MAX: f64 = 1000000.0; + +pub struct CrateSearchIndex { + /// as-is + crate_name_field: Field, + /// space-separated + keywords_field: Field, + description_field: Field, + readme_field: Field, + /// raw number + monthly_downloads: Field, + /// semver string + crate_version: Field, + /// number in range 0..=SCORE_MAX denoting desirability of the crate + crate_score: Field, + + tantivy_index: Index, +} + +#[derive(Debug, Clone)] +pub struct CrateFound { + pub crate_name: String, + pub description: String, + pub keywords: String, + pub score: f32, + pub version: String, + pub monthly_downloads: u64, +} + +pub struct Indexer { + index: CrateSearchIndex, + writer: IndexWriter, +} + +impl CrateSearchIndex { + pub fn new(index_dir: impl AsRef<Path>) -> tantivy::Result<Self> { + let index_dir = index_dir.as_ref().join("tantivy"); + if !index_dir.exists() { + return Self::reset_db_to_empty(&index_dir); + } + let tantivy_index = Index::open_in_dir(index_dir)?; + let schema = tantivy_index.schema(); + + Ok(Self { + tantivy_index, + crate_name_field: schema.get_field("crate_name").expect("schema"), + keywords_field: schema.get_field("keywords").expect("schema"), + description_field: schema.get_field("description").expect("schema"), + readme_field: schema.get_field("readme").expect("schema"), + monthly_downloads: schema.get_field("monthly_downloads").expect("schema"), + crate_version: schema.get_field("crate_version").expect("schema"), + crate_score: schema.get_field("crate_score").expect("schema"), + }) + } + + fn reset_db_to_empty(index_dir: &Path) -> tantivy::Result<Self> { + let _ = fs::create_dir_all(index_dir); + + let mut schema_builder = SchemaBuilder::default(); + + let crate_name_field = schema_builder.add_text_field("crate_name", STRING | STORED); // STRING means stored literally + let keywords_field = schema_builder.add_text_field("keywords", TEXT); + let description_field = schema_builder.add_text_field("description", TEXT | STORED); + let text_field_indexing = TextFieldIndexing::default().set_tokenizer("en_stem").set_index_option(IndexRecordOption::WithFreqs); + let text_options = TextOptions::default().set_indexing_options(text_field_indexing); + let readme_field = schema_builder.add_text_field("readme", text_options); + let crate_version = schema_builder.add_text_field("crate_version", STRING | STORED); + let monthly_downloads = schema_builder.add_u64_field("monthly_downloads", INT_STORED); + let crate_score = schema_builder.add_u64_field("crate_score", INT_STORED); + + let schema = schema_builder.build(); + let tantivy_index = Index::create_in_dir(index_dir, schema)?; + tantivy_index.load_searchers()?; + + Ok(Self { tantivy_index, crate_name_field, keywords_field, description_field, readme_field, monthly_downloads, crate_version, crate_score }) + } + + pub fn search(&self, query_text: &str, limit: usize) -> tantivy::Result<Vec<CrateFound>> { + let query_text = query_text.trim(); + let mut query_parser = QueryParser::for_index(&self.tantivy_index, vec![ + self.crate_name_field, self.keywords_field, self.description_field, self.readme_field, + ]); + query_parser.set_conjunction_by_default(); + + let query = query_parser.parse_query(query_text) + .or_else(|_| { + let mangled_query: String = query_text.chars().map(|ch| { + if ch.is_alphanumeric() {ch} else {' '} + }).collect(); + query_parser.parse_query(mangled_query.trim()) + })?; + + let searcher = self.tantivy_index.searcher(); + let top_docs = searcher.search(&*query, &TopDocs::with_limit(limit+limit/3))?; + + let mut docs = top_docs.into_iter().enumerate().map(|(i, (score, doc_address))| { + let retrieved_doc = searcher.doc(doc_address)?; + let mut doc = self.tantivy_index.schema().to_named_doc(&retrieved_doc).0; + let mut base_score = take_int(doc.get("crate_score")) as f64; + // first few crates can be ordered by tantivy's relevance + let position_bonus = if i < 4 {CRATE_SCORE_MAX / 8.} else {0.}; + let crate_name = take_string(doc.remove("crate_name")); + // bonus for exact match + if crate_name == query_text { + base_score += CRATE_SCORE_MAX/10.; + } + Ok(CrateFound { + score: (score as f64 * (base_score + position_bonus)) as f32, + crate_name, + description: take_string(doc.remove("description")), + keywords: take_string(doc.remove("keywords")), + version: take_string(doc.remove("crate_version")), + monthly_downloads: take_int(doc.get("monthly_downloads")), + }) + }) + .collect::<tantivy::Result<Vec<_>>>()?; + + // re-sort using our base score + docs.sort_by(|a,b| b.score.partial_cmp(&a.score).unwrap_or(Ordering::Equal)); + docs.truncate(limit); // search picked a few more results to cut out chaff using crate_score + Ok(docs) + } +} + +fn take_string(val: Option<Vec<Value>>) -> String { + match val { + Some(mut val) => match val.remove(0) { + Value::Str(s) => s, + _ => panic!("invalid value type"), + }, + _ => String::new(), + } +} + +fn take_int(val: Option<&Vec<Value>>) -> u64 { + match val { + Some(val) => match val.get(0) { + Some(Value::U64(s)) => *s, + Some(Value::I64(s)) => *s as u64, + _ => panic!("invalid value type"), + }, + _ => 0, + } +} + +impl Indexer { + pub fn new(index: CrateSearchIndex) -> tantivy::Result<Self> { + Ok(Self { writer: index.tantivy_index.writer(250_000_000)?, index }) + } + + /// score is float 0..=1 range + pub fn add(&mut self, crate_name: &str, version: &str, description: &str, keywords: &[&str], readme: Option<&str>, monthly_downloads: u64, score: f64) { + // delete old doc if any + let crate_name_term = Term::from_field_text(self.index.crate_name_field, crate_name); + self.writer.delete_term(crate_name_term); + + // index new one + let mut doc = Document::default(); + doc.add_text(self.index.crate_name_field, crate_name); + doc.add_text(self.index.keywords_field, &keywords.join(", ")); + doc.add_text(self.index.description_field, description); + if let Some(readme) = readme { + doc.add_text(self.index.readme_field, readme); + } + doc.add_text(self.index.crate_version, version); + doc.add_u64(self.index.monthly_downloads, monthly_downloads); + doc.add_u64(self.index.crate_score, (score * CRATE_SCORE_MAX).ceil() as u64); + self.writer.add_document(doc); + } + + pub fn commit(&mut self) -> tantivy::Result<()> { + self.writer.commit()?; + self.index.tantivy_index.load_searchers()?; + Ok(()) + } + + pub fn bye(self) -> tantivy::Result<CrateSearchIndex> { + self.writer.wait_merging_threads()?; + Ok(self.index) + } +} |