From 056a0728c775e37460ed00791ad503e03a88f3d6 Mon Sep 17 00:00:00 2001
From: Matthias Beyer
Date: Sat, 17 Apr 2021 14:37:46 +0200
Subject: Initial import

Signed-off-by: Matthias Beyer
---
 src/cli.rs    |  36 ++++++++++++++++++++
 src/config.rs |   8 +++++
 src/main.rs   | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/schema.rs |  16 +++++++++
 4 files changed, 167 insertions(+)
 create mode 100644 src/cli.rs
 create mode 100644 src/config.rs
 create mode 100644 src/main.rs
 create mode 100644 src/schema.rs

(limited to 'src')

diff --git a/src/cli.rs b/src/cli.rs
new file mode 100644
index 0000000..9098cfe
--- /dev/null
+++ b/src/cli.rs
@@ -0,0 +1,36 @@
+use clap::App;
+use clap::Arg;
+use clap::crate_authors;
+use clap::crate_version;
+
+pub fn app<'a>() -> App<'a, 'a> {
+
+    App::new("fss")
+        .author(crate_authors!())
+        .version(crate_version!())
+        .about("Filesystemsearch")
+
+        .subcommand(App::new("index")
+            .version(crate_version!())
+            .about("Index a file")
+            .arg(Arg::with_name("file")
+                .required(true)
+                .multiple(true)
+                .value_name("FILE")
+                .help("Index these files")
+            )
+        )
+
+        .subcommand(App::new("search")
+            .version(crate_version!())
+            .about("Search for a file")
+            .arg(Arg::with_name("term")
+                .required(true)
+                .multiple(true)
+                .value_name("TERM")
+                .help("Search with these terms")
+            )
+        )
+}
+
+
diff --git a/src/config.rs b/src/config.rs
new file mode 100644
index 0000000..27dddb8
--- /dev/null
+++ b/src/config.rs
@@ -0,0 +1,8 @@
+use std::path::PathBuf;
+
+#[derive(Debug, serde::Deserialize, getset::Getters)]
+pub struct Config {
+    #[getset(get = "pub")]
+    database_path: PathBuf
+}
+
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..8fd5fb2
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,107 @@
+use std::io::Write;
+use std::path::PathBuf;
+
+use anyhow::Context;
+use anyhow::Error;
+use anyhow::Result;
+use anyhow::anyhow;
+use itertools::Itertools;
+
+mod config;
+mod cli;
+mod schema;
+
+fn main() -> Result<()> {
+    let cli = crate::cli::app();
+    let _ = env_logger::try_init()?;
+    let mut config = ::config::Config::default();
+    {
+        let xdg = xdg::BaseDirectories::with_prefix("fss")?;
+        let xdg_config = xdg.find_config_file("config.toml")
+            .ok_or_else(|| anyhow!("No configuration file found with XDG: {}", xdg.get_config_home().display()))?;
+
+        log::debug!("Configuration file found with XDG: {}", xdg_config.display());
+        config.merge(::config::File::from(xdg_config).required(false))
+            .context("Failed to load config.toml from XDG configuration directory")?;
+    }
+    let config = config.try_into::<crate::config::Config>()?;
+
+    let index_path = tantivy::directory::MmapDirectory::open(config.database_path())?;
+    let schema = crate::schema::schema();
+
+    let index = tantivy::Index::open_or_create(index_path, schema.clone())?;
+
+    let field_path = schema.get_field("path")
+        .ok_or_else(|| anyhow!("BUG"))?;
+    let field_ft = schema.get_field("ft")
+        .ok_or_else(|| anyhow!("BUG"))?;
+    let field_body = schema.get_field("body")
+        .ok_or_else(|| anyhow!("BUG"))?;
+
+    match cli.get_matches().subcommand() {
+        ("index", Some(mtch)) => {
+            let mut index_writer = index.writer(50_000_000)?;
+            mtch.values_of("file")
+                .unwrap() // safe by clap
+                .map(|filepath| {
+                    let path_str = String::from(filepath);
+                    let path = PathBuf::from(&path_str);
+
+                    let filetype = path.extension()
+                        .map(ToOwned::to_owned)
+                        .and_then(|osstr| osstr.to_str().map(|s| s.to_string()))
+                        .ok_or_else(|| anyhow!("Path {} is not UTF8", filepath))?;
+
+                    let mut doc = tantivy::Document::default();
+                    doc.add_text(field_path, &path_str);
+                    doc.add_text(field_ft, &filetype);
+
+                    doc.add_text(field_body, std::fs::read_to_string(path)?);
+
+                    index_writer.add_document(doc);
+                    Ok(())
+                })
+                .collect::<Result<Vec<_>>>()?;
+
+            index_writer.commit()?;
+            Ok(())
+        },
+
+        ("search", Some(mtch)) => {
+            let query_str = mtch.values_of("term")
+                .unwrap() // safe by clap
+                .join(" ");
+
+            let reader = index
+                .reader_builder()
+                .reload_policy(tantivy::ReloadPolicy::OnCommit)
+                .try_into()?;
+
+            let searcher = reader.searcher();
+            let query_parser = tantivy::query::QueryParser::for_index(&index, vec![field_path.clone(), field_ft, field_body]);
+            let query = query_parser.parse_query(&query_str)?;
+
+            let top_docs = searcher.search(&query, &tantivy::collector::TopDocs::with_limit(10))?;
+            let mut output = std::io::stdout();
+
+            top_docs.into_iter()
+                .map(|(_score, adr)| {
+                    let retrieved_doc = searcher.doc(adr)?;
+                    retrieved_doc.get_all(field_path)
+                        .map(|value| {
+                            value.text().ok_or_else(|| anyhow!("Not a text value.."))
+                        })
+                        .map_ok(|txt| {
+                            writeln!(output, "{}", txt).map_err(Error::from)
+                        })
+                        .collect::<Result<Vec<_>>>()
+                })
+                .collect::<Result<Vec<Vec<_>>>>()
+                .map(|_| ())
+        },
+
+        (_other, _) => {
+            unimplemented!()
+        },
+    }
+}
diff --git a/src/schema.rs b/src/schema.rs
new file mode 100644
index 0000000..9a137eb
--- /dev/null
+++ b/src/schema.rs
@@ -0,0 +1,16 @@
+use tantivy::schema::*;
+
+pub fn schema() -> Schema {
+    let mut schema_builder = Schema::builder();
+
+    let body_options = TextOptions::default()
+        .set_stored()
+        .set_indexing_options(TextFieldIndexing::default()
+            .set_tokenizer("default")
+            .set_index_option(IndexRecordOption::WithFreqsAndPositions));
+
+    schema_builder.add_text_field("path", STRING | STORED);
+    schema_builder.add_text_field("ft", STRING | STORED);
+    schema_builder.add_text_field("body", body_options );
+    schema_builder.build()
+}
-- 
cgit v1.2.3