summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--crate_db/src/lib_crate_db.rs14
-rw-r--r--kitchen_sink/src/lib_kitchen_sink.rs4
-rw-r--r--ranking/src/lib_ranking.rs30
-rw-r--r--reindex/src/bin/reindex_crates.rs59
-rw-r--r--reindex/src/bin/reindex_search.rs31
5 files changed, 96 insertions, 42 deletions
diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs
index 8dd926b..df5141d 100644
--- a/crate_db/src/lib_crate_db.rs
+++ b/crate_db/src/lib_crate_db.rs
@@ -100,7 +100,7 @@ impl CrateDb {
}
/// Add data of the latest version of a crate to the index
- pub fn index_latest(&self, c: &RichCrateVersion, deps_stats: &[(&str, f32)], (is_build, is_dev): (bool, bool)) -> FResult<()> {
+ pub fn index_latest(&self, c: &RichCrateVersion, deps_stats: &[(&str, f32)], score: f64, (is_build, is_dev): (bool, bool)) -> FResult<()> {
let origin = c.origin().to_str();
let mut insert_keyword = KeywordInsert::new()?;
@@ -202,7 +202,7 @@ impl CrateDb {
let (categories, had_explicit_categories) = {
let keywords = insert_keyword.keywords.iter().map(|(k,_)| k.to_string());
- self.extract_crate_categories(&tx, c, keywords, is_important_ish)?
+ self.extract_crate_categories(&tx, c, keywords, score, is_important_ish)?
};
if !had_explicit_categories {
@@ -236,7 +236,7 @@ impl CrateDb {
/// (rank-relevance, relevance, slug)
///
/// Rank relevance is normalized and biased towards one top category
- fn extract_crate_categories(&self, conn: &Connection, c: &RichCrateVersion, keywords: impl Iterator<Item=String>, is_important_ish: bool) -> FResult<(Vec<(f64, f64, String)>, bool)> {
+ fn extract_crate_categories(&self, conn: &Connection, c: &RichCrateVersion, keywords: impl Iterator<Item=String>, score: f64, is_important_ish: bool) -> FResult<(Vec<(f64, f64, String)>, bool)> {
let (explicit_categories, invalid_categories): (Vec<_>, Vec<_>) = c.category_slugs(Include::AuthoritativeOnly)
.map(|k| k.to_string())
.partition(|slug| {
@@ -274,13 +274,12 @@ impl CrateDb {
.unwrap_or(0.)
.max(0.3); // prevents div/0, ensures odd choices stay low
- let is_sys = c.is_sys();
let categories = categories
.into_iter()
.map(|(relevance_weight, slug)| {
let rank_weight = relevance_weight/max_weight
* if relevance_weight >= max_weight*0.99 {1.} else {0.4} // a crate is only in 1 category
- * if is_sys {0.92} else {1.}; // rank sys crates below their high-level wrappers // TODO do same for derive helpers
+ * (0.5 + score * 0.5); // so far the score is a bit dodgy, so apply it lightly
(rank_weight, relevance_weight, slug)
})
.collect();
@@ -666,9 +665,10 @@ impl CrateDb {
/// Returns recent_downloads and weight/importance as well
pub fn top_crates_in_category_partially_ranked(&self, slug: &str, limit: u32) -> FResult<Vec<(Origin, u32, f64)>> {
self.with_connection(|conn| {
- // sort by relevance to the category, downrank for being removed from crates
+ // sort by relevance to the category, downrank for being crappy (later also downranked for being removed from crates)
+ // the constant added to recent_downloads means crates with few downloads are ordered mostly by rank_weight rather than by downloads
let mut query = conn.prepare_cached(
- "SELECT k.origin, k.recent_downloads, (k.recent_downloads * c.rank_weight) as w
+ "SELECT k.origin, k.recent_downloads, ((k.recent_downloads + 2000) * c.rank_weight) as w
FROM categories c
JOIN crates k on c.crate_id = k.id
WHERE c.slug = ?1
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs
index 0fe9614..60c755d 100644
--- a/kitchen_sink/src/lib_kitchen_sink.rs
+++ b/kitchen_sink/src/lib_kitchen_sink.rs
@@ -925,7 +925,7 @@ impl KitchenSink {
Ok(())
}
- pub fn index_crate_highest_version(&self, v: &RichCrateVersion) -> CResult<()> {
+ pub fn index_crate_highest_version(&self, v: &RichCrateVersion, score: f64) -> CResult<()> {
if stopped() {Err(KitchenSinkErr::Stopped)?;}
// direct deps are used as extra keywords for similarity matching,
@@ -945,7 +945,7 @@ impl KitchenSink {
}
}
}
- self.crate_db.index_latest(v, &weighed_deps, self.is_build_or_dev(v))?;
+ self.crate_db.index_latest(v, &weighed_deps, score, self.is_build_or_dev(v))?;
Ok(())
}
diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs
index 34064a4..09e8b2c 100644
--- a/ranking/src/lib_ranking.rs
+++ b/ranking/src/lib_ranking.rs
@@ -80,12 +80,12 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score {
s.frac("description len", 20, (cr.description.len() as f64 / 300.).min(1.));
// build.rs slows compilation down, so better not use it unless necessary (links means a sys crate, so somewhat justified)
- s.n("build.rs", 20, if !cr.has_build_rs && !cr.has_links {20} else if cr.has_links {10} else {0});
+ s.n("build.rs", 10, if !cr.has_build_rs && !cr.has_links {10} else if cr.has_links {5} else {0});
// users report examples are super valuable
- s.has("has_examples", 100, cr.has_examples);
+ s.has("has_examples", 50, cr.has_examples);
// probably less buggy than if winging it
- s.has("has_tests", 70, cr.has_tests);
+ s.has("has_tests", 50, cr.has_tests);
// probably optimized
s.has("has_benches", 10, cr.has_benches);
@@ -106,7 +106,7 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score {
// it's the best practice, may help building old versions of the project
// s.has("has_lockfile", 5, cr.has_lockfile);
// assume it's CI, which helps improve quality
- s.has("has_badges", 10, cr.has_badges);
+ s.has("has_badges", 20, cr.has_badges);
// not official
// s.has("has_changelog", 5, cr.has_changelog);
@@ -124,7 +124,7 @@ fn cargo_toml_score(cr: &CrateVersionInputs) -> Score {
// TODO: being nightly should be a negative score
s.has("works on stable", 20, !cr.is_nightly);
// fresh
- s.has("2018 edition", 10, cr.edition != Edition::E2015);
+ s.has("2018 edition", 5, cr.edition != Edition::E2015);
// license proliferation is bad
s.has("useful license", 10, if cr.is_app {
@@ -253,26 +253,26 @@ fn versions_score(ver: &[CrateVersion]) -> Score {
let semver = ver.iter().filter(|s| !s.yanked).filter_map(|s| SemVer::parse(&s.num).ok()).collect::<Vec<_>>();
s.has("more than one release", 20, semver.len() > 1);
- if !semver.is_empty() { // all yanked
+ if semver.is_empty() { // all yanked
return s;
}
let oldest = ver.iter().map(|v| &v.created_at).min().and_then(|s| s.parse::<DateTime<Utc>>().ok());
let newest = ver.iter().map(|v| &v.created_at).max().and_then(|s| s.parse::<DateTime<Utc>>().ok());
if let (Some(oldest), Some(newest)) = (oldest, newest) {
- s.n("crate development time", 40, (newest - oldest).num_days() / 11);
+ s.n("development history", 40, (newest - oldest).num_days() / 11);
}
// don't count 0.0.x
s.n("number of non-experimental releases", 15, semver.iter().filter(|v| (v.major > 0 || v.minor > 0) && v.pre.is_empty()).count() as u32);
// patch releases are correlated with stable, polished code
- s.n("patch releases", 20, 5 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32);
+ s.n("patch releases", 20, 4 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32);
s.n("a high patch release", 10, semver.iter().map(|v| v.patch as u32).max().unwrap_or(0));
// for 0.x crates it's hard to know what is a patch release
- s.has("an unstable patch/feature release", 10, semver.iter().any(|v| v.major == 0 && v.patch > 1));
+ s.has("an unstable patch/feature release", 8, semver.iter().any(|v| v.major == 0 && v.patch > 1));
// careful release process is a sign of maturity
s.has("a prerelease", 5, semver.iter().any(|v| !v.pre.is_empty()));
- s.has("a stable release", 15, semver.iter().any(|v| v.major > 0 && v.major < 20));
+ s.has("a stable release", 10, semver.iter().any(|v| v.major > 0 && v.major < 20));
s.has("yanked", 2, ver.iter().any(|v| v.yanked)); // author cares to remove bad versions
s
}
@@ -280,7 +280,7 @@ fn versions_score(ver: &[CrateVersion]) -> Score {
fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score {
let mut s = Score::new();
s.n("bus factor", 5, owners.len() as u32);
- s.n("more than one owner", 5, owners.len() > 1);
+ s.n("more than one owner", 8, owners.len() > 1);
s.n("authors", 5, authors.len() as u32);
s
}
@@ -288,10 +288,10 @@ fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score {
pub fn crate_score_version(cr: &CrateVersionInputs) -> Score {
let mut score = Score::new();
- score.group("Cargo.toml", 1, cargo_toml_score(cr));
- score.group("README", 1, readme_score(cr.readme));
- score.group("Versions", 1, versions_score(cr.versions));
- score.group("Authors/Owners", 1, authors_score(cr.authors, cr.owners));
+ score.group("Cargo.toml", 2, cargo_toml_score(cr));
+ score.group("README", 4, readme_score(cr.readme));
+ score.group("Versions", 4, versions_score(cr.versions));
+ score.group("Authors/Owners", 3, authors_score(cr.authors, cr.owners));
score
}
diff --git a/reindex/src/bin/reindex_crates.rs b/reindex/src/bin/reindex_crates.rs
index 1162376..4f5cb96 100644
--- a/reindex/src/bin/reindex_crates.rs
+++ b/reindex/src/bin/reindex_crates.rs
@@ -1,3 +1,6 @@
+use ranking::CrateVersionInputs;
+use kitchen_sink::RichCrate;
+use render_readme::Renderer;
use either::*;
use failure;
use kitchen_sink::{self, stopped, CrateData, KitchenSink, Origin, RichCrateVersion};
@@ -16,6 +19,7 @@ fn main() {
std::process::exit(1);
},
});
+ let renderer = Arc::new(Renderer::new(None));
let everything = std::env::args().nth(1).map_or(false, |a| a == "--all");
let repos = !everything;
@@ -34,12 +38,13 @@ fn main() {
return;
}
let crates = Arc::clone(&crates);
+ let renderer = Arc::clone(&renderer);
s1.spawn(move |s2| {
if stopped() {
return;
}
print!("{} ", i);
- match index_crate(&crates, &k) {
+ match index_crate(&crates, &k, &renderer) {
Ok(v) => {
if repos {
s2.spawn(move |_| {
@@ -65,14 +70,62 @@ fn main() {
});
}
-fn index_crate(crates: &KitchenSink, c: &Origin) -> Result<RichCrateVersion, failure::Error> {
+fn index_crate(crates: &KitchenSink, c: &Origin, renderer: &Renderer) -> Result<RichCrateVersion, failure::Error> {
let v = crates.rich_crate_version(c, CrateData::FullNoDerived)?;
- crates.index_crate_highest_version(&v)?;
let k = crates.rich_crate(c)?;
+ let score = crate_base_score(&k, &v, renderer);
+ crates.index_crate_highest_version(&v, score)?;
crates.index_crate(&k)?;
Ok(v)
}
+
+fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 {
+ let readme = k.readme().ok().and_then(|r| r).map(|readme| {
+ renderer.page_node(&readme.markup, None, false)
+ });
+ let mut score = ranking::crate_score_version(&CrateVersionInputs {
+ versions: all.versions(),
+ description: k.description().unwrap_or(""),
+ readme: readme.as_ref(),
+ owners: all.owners(),
+ authors: k.authors(),
+ edition: k.edition(),
+ is_app: k.is_app(),
+ has_build_rs: k.has_buildrs(),
+ has_links: k.links().is_some(),
+ has_documentation_link: k.documentation().is_some(),
+ has_homepage_link: k.homepage().is_some(),
+ has_repository_link: k.repository().is_some(),
+ has_keywords: k.has_own_keywords(),
+ has_categories: k.has_own_categories(),
+ has_features: !k.features().is_empty(),
+ has_examples: k.has_examples(),
+ has_benches: k.has_benches(),
+ has_tests: k.has_tests(),
+ // has_lockfile: k.has_lockfile(),
+ // has_changelog: k.has_changelog(),
+ license: k.license().unwrap_or(""),
+ has_badges: k.has_badges(),
+ maintenance: k.maintenance(),
+ is_nightly: k.is_nightly(),
+ }).total();
+
+
+ // there's usually a non-macro/non-sys sibling
+ if k.is_proc_macro() || k.is_sys() {
+ score *= 0.9;
+ }
+
+ // k bye
+ if k.is_yanked() {
+ score *= 0.001;
+ }
+
+ score
+}
+
+
fn print_res<T>(res: Result<T, failure::Error>) {
if let Err(e) = res {
eprintln!("••• Error: {}", e);
diff --git a/reindex/src/bin/reindex_search.rs b/reindex/src/bin/reindex_search.rs
index 94a849e..083bda1 100644
--- a/reindex/src/bin/reindex_search.rs
+++ b/reindex/src/bin/reindex_search.rs
@@ -73,7 +73,7 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer)
let readme = k.readme().ok().and_then(|r| r).map(|readme| {
renderer.page_node(&readme.markup, None, false)
});
- ranking::crate_score_version(&CrateVersionInputs {
+ let mut score = ranking::crate_score_version(&CrateVersionInputs {
versions: all.versions(),
description: k.description().unwrap_or(""),
readme: readme.as_ref(),
@@ -98,7 +98,20 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer)
has_badges: k.has_badges(),
maintenance: k.maintenance(),
is_nightly: k.is_nightly(),
- }).total()
+ }).total();
+
+
+ // there's usually a non-macro/non-sys sibling
+ if k.is_proc_macro() || k.is_sys() {
+ score *= 0.9;
+ }
+
+ // k bye
+ if k.is_yanked() {
+ score *= 0.001;
+ }
+
+ score
}
fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> {
@@ -119,19 +132,7 @@ fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCr
// based on crate's own content and metadata
let base_score = crate_base_score(all, k, renderer);
- let mut score = (0.5 + pop_score) * base_score;
-
- // there's usually a non-macro sibling
- if k.is_proc_macro() {
- score *= 0.9;
- }
-
- // k bye
- if k.is_yanked() {
- score *= 0.001;
- }
-
- score = score.min(1.0); // keep it in the range
+ let score = ((0.5 + pop_score) * base_score).min(1.0);
println!("{:0.3} {:0.3} {}: {}", score, base_score, k.short_name(), k.description().unwrap_or(""));