diff options
-rw-r--r-- | crate_db/src/lib_crate_db.rs | 14 | ||||
-rw-r--r-- | kitchen_sink/src/lib_kitchen_sink.rs | 4 | ||||
-rw-r--r-- | ranking/src/lib_ranking.rs | 30 | ||||
-rw-r--r-- | reindex/src/bin/reindex_crates.rs | 59 | ||||
-rw-r--r-- | reindex/src/bin/reindex_search.rs | 31 |
5 files changed, 96 insertions, 42 deletions
diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs index 8dd926b..df5141d 100644 --- a/crate_db/src/lib_crate_db.rs +++ b/crate_db/src/lib_crate_db.rs @@ -100,7 +100,7 @@ impl CrateDb { } /// Add data of the latest version of a crate to the index - pub fn index_latest(&self, c: &RichCrateVersion, deps_stats: &[(&str, f32)], (is_build, is_dev): (bool, bool)) -> FResult<()> { + pub fn index_latest(&self, c: &RichCrateVersion, deps_stats: &[(&str, f32)], score: f64, (is_build, is_dev): (bool, bool)) -> FResult<()> { let origin = c.origin().to_str(); let mut insert_keyword = KeywordInsert::new()?; @@ -202,7 +202,7 @@ impl CrateDb { let (categories, had_explicit_categories) = { let keywords = insert_keyword.keywords.iter().map(|(k,_)| k.to_string()); - self.extract_crate_categories(&tx, c, keywords, is_important_ish)? + self.extract_crate_categories(&tx, c, keywords, score, is_important_ish)? }; if !had_explicit_categories { @@ -236,7 +236,7 @@ impl CrateDb { /// (rank-relevance, relevance, slug) /// /// Rank relevance is normalized and biased towards one top category - fn extract_crate_categories(&self, conn: &Connection, c: &RichCrateVersion, keywords: impl Iterator<Item=String>, is_important_ish: bool) -> FResult<(Vec<(f64, f64, String)>, bool)> { + fn extract_crate_categories(&self, conn: &Connection, c: &RichCrateVersion, keywords: impl Iterator<Item=String>, score: f64, is_important_ish: bool) -> FResult<(Vec<(f64, f64, String)>, bool)> { let (explicit_categories, invalid_categories): (Vec<_>, Vec<_>) = c.category_slugs(Include::AuthoritativeOnly) .map(|k| k.to_string()) .partition(|slug| { @@ -274,13 +274,12 @@ impl CrateDb { .unwrap_or(0.) .max(0.3); // prevents div/0, ensures odd choices stay low - let is_sys = c.is_sys(); let categories = categories .into_iter() .map(|(relevance_weight, slug)| { let rank_weight = relevance_weight/max_weight * if relevance_weight >= max_weight*0.99 {1.} else {0.4} // a crate is only in 1 category - * if is_sys {0.92} else {1.}; // rank sys crates below their high-level wrappers // TODO do same for derive helpers + * (0.5 + score * 0.5); // so far the score is a bit dodgy, so apply it lightly (rank_weight, relevance_weight, slug) }) .collect(); @@ -666,9 +665,10 @@ impl CrateDb { /// Returns recent_downloads and weight/importance as well pub fn top_crates_in_category_partially_ranked(&self, slug: &str, limit: u32) -> FResult<Vec<(Origin, u32, f64)>> { self.with_connection(|conn| { - // sort by relevance to the category, downrank for being removed from crates + // sort by relevance to the category, downrank for being crappy (later also downranked for being removed from crates) + // low number of downloads is mostly by rank, rather than downloads let mut query = conn.prepare_cached( - "SELECT k.origin, k.recent_downloads, (k.recent_downloads * c.rank_weight) as w + "SELECT k.origin, k.recent_downloads, ((k.recent_downloads + 2000) * c.rank_weight) as w FROM categories c JOIN crates k on c.crate_id = k.id WHERE c.slug = ?1 diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 0fe9614..60c755d 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -925,7 +925,7 @@ impl KitchenSink { Ok(()) } - pub fn index_crate_highest_version(&self, v: &RichCrateVersion) -> CResult<()> { + pub fn index_crate_highest_version(&self, v: &RichCrateVersion, score: f64) -> CResult<()> { if stopped() {Err(KitchenSinkErr::Stopped)?;} // direct deps are used as extra keywords for similarity matching, @@ -945,7 +945,7 @@ impl KitchenSink { } } } - self.crate_db.index_latest(v, &weighed_deps, self.is_build_or_dev(v))?; + self.crate_db.index_latest(v, &weighed_deps, score, self.is_build_or_dev(v))?; Ok(()) } diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs index 34064a4..09e8b2c 100644 --- a/ranking/src/lib_ranking.rs +++ b/ranking/src/lib_ranking.rs @@ -80,12 +80,12 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score { s.frac("description len", 20, (cr.description.len() as f64 / 300.).min(1.)); // build.rs slows compilation down, so better not use it unless necessary (links means a sys create, so somewhat justified) - s.n("build.rs", 20, if !cr.has_build_rs && !cr.has_links {20} else if cr.has_links {10} else {0}); + s.n("build.rs", 10, if !cr.has_build_rs && !cr.has_links {10} else if cr.has_links {5} else {0}); // users report examples are super valuable - s.has("has_examples", 100, cr.has_examples); + s.has("has_examples", 50, cr.has_examples); // probably less buggy than if winging it - s.has("has_tests", 70, cr.has_tests); + s.has("has_tests", 50, cr.has_tests); // probably optimized s.has("has_benches", 10, cr.has_benches); @@ -106,7 +106,7 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score { // it's the best practice, may help building old versions of the project // s.has("has_lockfile", 5, cr.has_lockfile); // assume it's CI, which helps improve quality - s.has("has_badges", 10, cr.has_badges); + s.has("has_badges", 20, cr.has_badges); // not official // s.has("has_changelog", 5, cr.has_changelog); @@ -124,7 +124,7 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score { // TODO: being nightly should be a negative score s.has("works on stable", 20, !cr.is_nightly); // fresh - s.has("2018 edition", 10, cr.edition != Edition::E2015); + s.has("2018 edition", 5, cr.edition != Edition::E2015); // license proliferation is bad s.has("useful license", 10, if cr.is_app { @@ -253,26 +253,26 @@ fn versions_score(ver: &[CrateVersion]) -> Score { let semver = ver.iter().filter(|s| !s.yanked).filter_map(|s| SemVer::parse(&s.num).ok()).collect::<Vec<_>>(); s.has("more than one release", 20, semver.len() > 1); - if !semver.is_empty() { // all yanked + if semver.is_empty() { // all yanked return s; } let oldest = ver.iter().map(|v| &v.created_at).min().and_then(|s| s.parse::<DateTime<Utc>>().ok()); let newest = ver.iter().map(|v| &v.created_at).max().and_then(|s| s.parse::<DateTime<Utc>>().ok()); if let (Some(oldest), Some(newest)) = (oldest, newest) { - s.n("crate development time", 40, (newest - oldest).num_days() / 11); + s.n("development history", 40, (newest - oldest).num_days() / 11); } // don't count 0.0.x s.n("number of non-experimental releases", 15, semver.iter().filter(|v| (v.major > 0 || v.minor > 0) && v.pre.is_empty()).count() as u32); // patch releases are correlated with stable, polished code - s.n("patch releases", 20, 5 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32); + s.n("patch releases", 20, 4 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32); s.n("a high patch release", 10, semver.iter().map(|v| v.patch as u32).max().unwrap_or(0)); // for 0.x crates it's hard to knwo what is a patch release - s.has("an unstable patch/feature release", 10, semver.iter().any(|v| v.major == 0 && v.patch > 1)); + s.has("an unstable patch/feature release", 8, semver.iter().any(|v| v.major == 0 && v.patch > 1)); // careful release process is a sign of maturity s.has("a prerelease", 5, semver.iter().any(|v| !v.pre.is_empty())); - s.has("a stable release", 15, semver.iter().any(|v| v.major > 0 && v.major < 20)); + s.has("a stable release", 10, semver.iter().any(|v| v.major > 0 && v.major < 20)); s.has("yanked", 2, ver.iter().any(|v| v.yanked)); // author cares to remove bad versions s } @@ -280,7 +280,7 @@ fn versions_score(ver: &[CrateVersion]) -> Score { fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score { let mut s = Score::new(); s.n("bus factor", 5, owners.len() as u32); - s.n("more than one owner", 5, owners.len() > 1); + s.n("more than one owner", 8, owners.len() > 1); s.n("authors", 5, authors.len() as u32); s } @@ -288,10 +288,10 @@ fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score { pub fn crate_score_version(cr: &CrateVersionInputs) -> Score { let mut score = Score::new(); - score.group("Cargo.toml", 1, cargo_toml_score(cr)); - score.group("README", 1, readme_score(cr.readme)); - score.group("Versions", 1, versions_score(cr.versions)); - score.group("Authors/Owners", 1, authors_score(cr.authors, cr.owners)); + score.group("Cargo.toml", 2, cargo_toml_score(cr)); + score.group("README", 4, readme_score(cr.readme)); + score.group("Versions", 4, versions_score(cr.versions)); + score.group("Authors/Owners", 3, authors_score(cr.authors, cr.owners)); score } diff --git a/reindex/src/bin/reindex_crates.rs b/reindex/src/bin/reindex_crates.rs index 1162376..4f5cb96 100644 --- a/reindex/src/bin/reindex_crates.rs +++ b/reindex/src/bin/reindex_crates.rs @@ -1,3 +1,6 @@ +use ranking::CrateVersionInputs; +use kitchen_sink::RichCrate; +use render_readme::Renderer; use either::*; use failure; use kitchen_sink::{self, stopped, CrateData, KitchenSink, Origin, RichCrateVersion}; @@ -16,6 +19,7 @@ fn main() { std::process::exit(1); }, }); + let renderer = Arc::new(Renderer::new(None)); let everything = std::env::args().nth(1).map_or(false, |a| a == "--all"); let repos = !everything; @@ -34,12 +38,13 @@ fn main() { return; } let crates = Arc::clone(&crates); + let renderer = Arc::clone(&renderer); s1.spawn(move |s2| { if stopped() { return; } print!("{} ", i); - match index_crate(&crates, &k) { + match index_crate(&crates, &k, &renderer) { Ok(v) => { if repos { s2.spawn(move |_| { @@ -65,14 +70,62 @@ fn main() { }); } -fn index_crate(crates: &KitchenSink, c: &Origin) -> Result<RichCrateVersion, failure::Error> { +fn index_crate(crates: &KitchenSink, c: &Origin, renderer: &Renderer) -> Result<RichCrateVersion, failure::Error> { let v = crates.rich_crate_version(c, CrateData::FullNoDerived)?; - crates.index_crate_highest_version(&v)?; let k = crates.rich_crate(c)?; + let score = crate_base_score(&k, &v, renderer); + crates.index_crate_highest_version(&v, score)?; crates.index_crate(&k)?; Ok(v) } + +fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 { + let readme = k.readme().ok().and_then(|r| r).map(|readme| { + renderer.page_node(&readme.markup, None, false) + }); + let mut score = ranking::crate_score_version(&CrateVersionInputs { + versions: all.versions(), + description: k.description().unwrap_or(""), + readme: readme.as_ref(), + owners: all.owners(), + authors: k.authors(), + edition: k.edition(), + is_app: k.is_app(), + has_build_rs: k.has_buildrs(), + has_links: k.links().is_some(), + has_documentation_link: k.documentation().is_some(), + has_homepage_link: k.homepage().is_some(), + has_repository_link: k.repository().is_some(), + has_keywords: k.has_own_keywords(), + has_categories: k.has_own_categories(), + has_features: !k.features().is_empty(), + has_examples: k.has_examples(), + has_benches: k.has_benches(), + has_tests: k.has_tests(), + // has_lockfile: k.has_lockfile(), + // has_changelog: k.has_changelog(), + license: k.license().unwrap_or(""), + has_badges: k.has_badges(), + maintenance: k.maintenance(), + is_nightly: k.is_nightly(), + }).total(); + + + // there's usually a non-macro/non-sys sibling + if k.is_proc_macro() || k.is_sys() { + score *= 0.9; + } + + // k bye + if k.is_yanked() { + score *= 0.001; + } + + score +} + + fn print_res<T>(res: Result<T, failure::Error>) { if let Err(e) = res { eprintln!("••• Error: {}", e); diff --git a/reindex/src/bin/reindex_search.rs b/reindex/src/bin/reindex_search.rs index 94a849e..083bda1 100644 --- a/reindex/src/bin/reindex_search.rs +++ b/reindex/src/bin/reindex_search.rs @@ -73,7 +73,7 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) let readme = k.readme().ok().and_then(|r| r).map(|readme| { renderer.page_node(&readme.markup, None, false) }); - ranking::crate_score_version(&CrateVersionInputs { + let mut score = ranking::crate_score_version(&CrateVersionInputs { versions: all.versions(), description: k.description().unwrap_or(""), readme: readme.as_ref(), @@ -98,7 +98,20 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) has_badges: k.has_badges(), maintenance: k.maintenance(), is_nightly: k.is_nightly(), - }).total() + }).total(); + + + // there's usually a non-macro/non-sys sibling + if k.is_proc_macro() || k.is_sys() { + score *= 0.9; + } + + // k bye + if k.is_yanked() { + score *= 0.001; + } + + score } fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> { @@ -119,19 +132,7 @@ fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCr // based on crate's own content and metadata let base_score = crate_base_score(all, k, renderer); - let mut score = (0.5 + pop_score) * base_score; - - // there's usually a non-macro sibling - if k.is_proc_macro() { - score *= 0.9; - } - - // k bye - if k.is_yanked() { - score *= 0.001; - } - - score = score.min(1.0); // keep it in the range + let score = ((0.5 + pop_score) * base_score).min(1.0); println!("{:0.3} {:0.3} {}: {}", score, base_score, k.short_name(), k.description().unwrap_or("")); |