diff options
author | Kornel <kornel@geekhood.net> | 2019-07-28 01:05:41 +0100 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2019-07-28 01:31:20 +0100 |
commit | 23d90445324bb904431b6333c253c311e32c8a1b (patch) | |
tree | 14a5e614d3ef8cb56493d8bb45111364686a73a1 | |
parent | ffc2cfc648414d497e88cc146a27c770861a4a91 (diff) |
Correct text extraction
-rw-r--r-- | crate_db/Cargo.toml | 1 | ||||
-rw-r--r-- | crate_db/src/lib_crate_db.rs | 22 | ||||
-rw-r--r-- | kitchen_sink/src/lib_kitchen_sink.rs | 2 | ||||
-rw-r--r-- | reindex/src/bin/reindex_crates.rs | 13 | ||||
m--------- | render_readme | 0 | ||||
-rw-r--r-- | rich_crate/src/rich_crate_version.rs | 6 |
6 files changed, 22 insertions, 22 deletions
diff --git a/crate_db/Cargo.toml b/crate_db/Cargo.toml index f76a110..e6f5a73 100644 --- a/crate_db/Cargo.toml +++ b/crate_db/Cargo.toml @@ -20,3 +20,4 @@ chrono = "0.4.2" thread_local = "0.3.6" parking_lot = "0.9" rake = { git = "https://github.com/kornelski/rake-rs" } +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.3" } diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs index 52c76d6..527460e 100644 --- a/crate_db/src/lib_crate_db.rs +++ b/crate_db/src/lib_crate_db.rs @@ -13,11 +13,11 @@ extern crate lazy_static; use chrono::prelude::*; use failure::ResultExt; use rich_crate::Include; -use rich_crate::Markup; use rich_crate::Origin; use rich_crate::Repo; use rich_crate::RichCrate; use rich_crate::RichCrateVersion; +use render_readme::Renderer; use rusqlite::*; use std::cell::RefCell; use std::collections::HashMap; @@ -129,7 +129,7 @@ impl CrateDb { insert_keyword.add_synonyms(&self.tag_synonyms); { - let d = Self::extract_text_phrases(&c); + let d = Self::extract_text_phrases(&c, &Renderer::new(None)); let mut sw = rake::StopWords::new(); sw.reserve(STOPWORDS.len()); sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine @@ -814,7 +814,7 @@ impl CrateDb { } // returns an array of lowercase phrases - fn extract_text_phrases(krate: &RichCrateVersion) -> Vec<(f64, String)> { + fn extract_text_phrases(krate: &RichCrateVersion, renderer: &Renderer) -> Vec<(f64, String)> { let mut out = Vec::new(); let mut len = 0; if let Some(s) = krate.description() { @@ -827,9 +827,7 @@ impl CrateDb { } if let Ok(Some(r)) = krate.readme() { // render readme to DOM, extract nodes - let sub = match r.markup { - Markup::Markdown(ref s) | Markup::Rst(ref s) => s, - }; + let sub = renderer.visible_text(&r.markup); for par in sub.split('\n') { if len > 200 { break; @@ -840,13 +838,15 @@ impl CrateDb { continue; } let par = par.replace("http://", " ").replace("https://", " "); - len += par.len(); - out.push((0.4, par.to_lowercase())); - } - } - out + if !par.is_empty() { + len += par.len(); + out.push((0.4, par.to_lowercase())); } + } } + out + } +} pub enum RepoChange { Removed { crate_name: String, weight: f64 }, diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 4b0d191..f82ccd8 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -639,7 +639,7 @@ impl KitchenSink { pub fn is_readme_short(&self, readme: Result<Option<&Readme>, ()>) -> bool { if let Ok(Some(ref r)) = readme { match r.markup { - Markup::Markdown(ref s) | Markup::Rst(ref s) => s.len() < 1000, + Markup::Markdown(ref s) | Markup::Rst(ref s) | Markup::Html(ref s) => s.len() < 1000, } } else { true diff --git a/reindex/src/bin/reindex_crates.rs b/reindex/src/bin/reindex_crates.rs index 189daa8..6419969 100644 --- a/reindex/src/bin/reindex_crates.rs +++ b/reindex/src/bin/reindex_crates.rs @@ -7,7 +7,7 @@ use rand::{seq::SliceRandom, thread_rng}; use ranking::CrateTemporalInputs; use ranking::CrateVersionInputs; use rayon; -use render_readme::{Markup, Renderer}; +use render_readme::Renderer; use search_index::*; use std::collections::HashSet; use std::sync::mpsc; @@ -31,12 +31,13 @@ fn main() { let (tx, rx) = mpsc::sync_channel(64); let index_thread = std::thread::spawn({ + let renderer = renderer.clone(); move || -> Result<(), failure::Error> { let mut n = 0; let mut next_n = 100; while let Ok((ver, downloads_per_month, score)) = rx.recv() { if stopped() {break;} - index_search(&mut indexer, &ver, downloads_per_month, score)?; + index_search(&mut indexer, &renderer, &ver, downloads_per_month, score)?; n += 1; if n == next_n { next_n *= 2; @@ -117,7 +118,7 @@ fn index_crate(crates: &KitchenSink, c: &Origin, renderer: &Renderer, search_sen Ok(v) } -fn index_search(indexer: &mut Indexer, k: &RichCrateVersion, downloads_per_month: usize, score: f64) -> Result<(), failure::Error> { +fn index_search(indexer: &mut Indexer, renderer: &Renderer, k: &RichCrateVersion, downloads_per_month: usize, score: f64) -> Result<(), failure::Error> { let keywords: Vec<_> = k.keywords(Include::Cleaned).collect(); let mut lib_tmp = None; @@ -125,13 +126,11 @@ fn index_search(indexer: &mut Indexer, k: &RichCrateVersion, downloads_per_month lib_tmp = k.lib_file_markdown(); lib_tmp.as_ref() }).map(|markup| { - match markup { - Markup::Markdown(ref s) | Markup::Rst(ref s) => s.as_str(), - } + renderer.visible_text(markup) }); let version = k.version(); - indexer.add(k.short_name(), version, k.description().unwrap_or(""), &keywords, readme, downloads_per_month as u64, score); + indexer.add(k.short_name(), version, k.description().unwrap_or(""), &keywords, readme.as_ref().map(|s| s.as_str()), downloads_per_month as u64, score); Ok(()) } diff --git a/render_readme b/render_readme -Subproject bb36743b5340cdeaf94b351272ac9c63e1241ee +Subproject 8fc48291857f3b52b94df310aeb9134ff7f8384 diff --git a/rich_crate/src/rich_crate_version.rs b/rich_crate/src/rich_crate_version.rs index 873a7c8..10a2d4f 100644 --- a/rich_crate/src/rich_crate_version.rs +++ b/rich_crate/src/rich_crate_version.rs @@ -7,6 +7,7 @@ pub use cargo_toml::{DepsSet, Edition, FeatureSet, MaintenanceStatus, TargetDeps use categories::Categories; use crates_index::Version; use repo_url::Repo; +use render_readme::Renderer; use semver; use std::borrow::Cow; use std::collections::BTreeMap; @@ -152,9 +153,8 @@ impl RichCrateVersion { } }; if let Ok(Some(r)) = self.readme() { - match r.markup { - Markup::Markdown(ref s) | Markup::Rst(ref s) => add_words(s), - } + let s = Renderer::new(None).visible_text(&r.markup); + add_words(&s); } add_words(self.short_name()); if let Some(s) = self.description() {add_words(s);} |