summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKornel <kornel@geekhood.net>2019-03-29 14:59:36 +0000
committerKornel <kornel@geekhood.net>2019-03-29 21:38:06 +0000
commita9302819102902bb8044d493ba4707f4a4689c29 (patch)
tree74fe3cb8666fb88676f9611529ca05a15e108700
parent5f9d39a4f449953351c765a072118a27797ff121 (diff)
Expand ranking to include time and removals
-rw-r--r--kitchen_sink/src/lib_kitchen_sink.rs14
-rw-r--r--ranking/src/lib_ranking.rs84
-rw-r--r--ranking/src/scorer.rs8
-rw-r--r--reindex/src/bin/reindex_crates.rs54
4 files changed, 127 insertions, 33 deletions
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs
index 264c736..a7b235e 100644
--- a/kitchen_sink/src/lib_kitchen_sink.rs
+++ b/kitchen_sink/src/lib_kitchen_sink.rs
@@ -354,6 +354,12 @@ impl KitchenSink {
}
}
+ /// Fudge-factor score proportional to how many times a crate has been removed from some project
+ pub fn crate_removals(&self, origin: &Origin) -> Option<f64> {
+ self.removals
+ .get(|| self.crate_db.removals().expect("fetch crate removals"))
+ .get(origin).cloned()
+ }
pub fn downloads_per_month(&self, origin: &Origin) -> CResult<Option<usize>> {
self.downloads_recent(origin).map(|dl| dl.map(|n| n/3))
@@ -1331,13 +1337,7 @@ impl KitchenSink {
Ok(match cache.entry(slug.to_owned()) {
Occupied(e) => Arc::clone(e.get()),
Vacant(e) => {
- let some_extra_for_removals = 30 + wanted_num/10;
- let mut crates = self.crate_db.top_crates_in_category_partially_ranked(slug, wanted_num + some_extra_for_removals)?;
- let removals = self.removals.get(|| self.crate_db.removals().unwrap());
- for c in &mut crates {
- c.2 /= 300. + removals.get(&c.0).cloned().unwrap_or(2.);
- }
- crates.sort_by(|a, b| b.2.partial_cmp(&a.2).expect("nan?"));
+ let crates = self.crate_db.top_crates_in_category_partially_ranked(slug, wanted_num)?;
let crates: Vec<_> = crates.into_iter().map(|(o, r, _)| (o, r)).take(wanted_num as usize).collect();
let res = Arc::new(crates);
e.insert(Arc::clone(&res));
diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs
index 09e8b2c..5e05a80 100644
--- a/ranking/src/lib_ranking.rs
+++ b/ranking/src/lib_ranking.rs
@@ -48,25 +48,30 @@ pub struct CrateVersionInputs<'a> {
}
/// Changes over time, but doesn't depend on crate's own ranking
-pub struct CrateTemporalInputs {
- /// 1.0 fresh, 0.0 totally outdated and deprecated
- pub dependency_freshness: Vec<f32>,
- pub recent_downloads: u32,
- pub recent_downloads_minus_most_downloaded_user: u32,
+pub struct CrateTemporalInputs<'a> {
+ pub versions: &'a [CrateVersion],
+ // 1.0 fresh, 0.0 totally outdated and deprecated
+ // pub dependency_freshness: Vec<f32>,
+ pub downloads_per_month: u32,
+ /// Looking at downloads of direct dependencies.
+ /// This way internal derive/impl/core crates that have one big user get 0 here.
+ pub downloads_per_month_minus_most_downloaded_user: u32,
+ pub is_app: bool,
pub has_docs_rs: bool,
+ pub is_nightly: bool,
// low priority, because it's unranked! it'll be re-evaluated later
- pub number_of_reverse_deps: u32,
+ pub number_of_direct_reverse_deps: u32,
+ /// use max(runtime, dev, build), because the crate is going to be one of these kinds
+ pub number_of_indirect_reverse_deps: u32,
+ /// Includes non-optional (i.e. it's the upper bound, not just the optional ones)
+ pub number_of_indirect_reverse_optional_deps: u32,
// most recent commit
// avg time issues are left unanswered?
-}
-
-/// Crate's own base ranking influences these rankings
-pub struct CrateContextInputs {
- pub crate_score_context_free: f64,
- pub owner_pageranks: Vec<f32>,
- pub reverse_deps_rankings: Vec<f32>,
+ // pub crate_score_context_free: f64,
+ // pub owner_pageranks: Vec<f32>,
+ // pub reverse_deps_rankings: Vec<f32>,
}
pub struct Env {
@@ -296,11 +301,56 @@ pub fn crate_score_version(cr: &CrateVersionInputs) -> Score {
score
}
-// pub fn crate_score_temporal(inputs: &CrateTemporalInputs) -> Score {
-// let mut score = Score::new();
+pub fn crate_score_temporal(cr: &CrateTemporalInputs) -> Score {
+ let mut score = Score::new();
-// score
-// }
+ let newest = cr.versions.iter().max_by_key(|v| &v.created_at).expect("at least 1 ver?");
+ let freshness_score = match newest.created_at.parse::<DateTime<Utc>>() {
+ Ok(latest_date) => {
+ // Assume higher versions, and especially patch versions, mean the crate is more mature
+ // and needs fewer updates
+ let version_stability_interval = match SemVer::parse(&newest.num) {
+ Ok(ref ver) if ver.patch > 3 && ver.major > 0 => 500,
+ Ok(ref ver) if ver.patch > 3 => 350,
+ Ok(ref ver) if ver.patch > 0 => 250,
+ Ok(ref ver) if ver.major > 0 => 200,
+ Ok(ref ver) if ver.minor > 3 => 150,
+ _ => 80,
+ };
+ let expected_update_interval = version_stability_interval.min(cr.versions.len() as i64 * 50) / if cr.is_nightly {2} else {1};
+ let age = (Utc::now() - latest_date).num_days();
+ let days_past_expiration_date = (age - expected_update_interval).max(0);
+ // score decays for a ~year after the crate should have been updated
+ let decay_days = if cr.is_nightly {60} else {200} + expected_update_interval/2;
+ (decay_days - days_past_expiration_date).max(0) as f64 / (decay_days as f64)
+ },
+ Err(e) => {
+ eprintln!("Release time parse error: {}", e);
+ 0.
+ }
+ };
+ score.frac("Freshness of latest release", 8, freshness_score);
+
+ // Low numbers are just bots/noise.
+ let downloads = (cr.downloads_per_month as f64 - 100.).max(0.) + 100.;
+ let downloads_cleaned = (cr.downloads_per_month_minus_most_downloaded_user as f64 - 50.).max(0.) + 50.;
+ // distribution of downloads follows power law.
+ // apps have a much harder time getting high download numbers.
+ let pop = (downloads.log2() - 6.6) / (if cr.is_app {1.0} else {2.0});
+ let pop_cleaned = (downloads_cleaned.log2() - 5.6) / (if cr.is_app {1.0} else {2.0});
+ assert!(pop > 0.);
+ assert!(pop_cleaned > 0.);
+ // FIXME: max should be based on the most downloaded crate?
+ score.score_f("Downloads", 8., pop/2.);
+ score.score_f("Downloads (cleaned)", 18., pop_cleaned);
+
+ score.score_f("Direct rev deps", 10., (cr.number_of_direct_reverse_deps as f64).sqrt());
+ let indirect = 1. + cr.number_of_indirect_reverse_optional_deps as f64 / 3.;
+ score.score_f("Indirect rev deps", 10., indirect.log2());
+
+ score.has("docs.rs", 1, cr.has_docs_rs);
+ score
+}
// pub fn crate_score_contextual(inputs: &CrateContextInputs) -> Score {
// let mut score = Score::new();
diff --git a/ranking/src/scorer.rs b/ranking/src/scorer.rs
index e476a52..329709f 100644
--- a/ranking/src/scorer.rs
+++ b/ranking/src/scorer.rs
@@ -42,12 +42,8 @@ impl Score {
pub fn score_f(&mut self, for_what: &'static str, max_score: f64, n: impl Into<f64>) -> ScoreAdj<'_> {
let n = n.into();
self.total += max_score;
- if n > 0. {
- self.scores.push((n, max_score, for_what));
- ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) }
- } else {
- ScoreAdj::default()
- }
+ self.scores.push((n.max(0.), max_score, for_what));
+ ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) }
}
/// Start a new group of scores, and `max_score` is the max total score of the group
diff --git a/reindex/src/bin/reindex_crates.rs b/reindex/src/bin/reindex_crates.rs
index ff7b3cd..c06cdc9 100644
--- a/reindex/src/bin/reindex_crates.rs
+++ b/reindex/src/bin/reindex_crates.rs
@@ -1,3 +1,4 @@
+use ranking::CrateTemporalInputs;
use ranking::CrateVersionInputs;
use kitchen_sink::RichCrate;
use render_readme::Renderer;
@@ -73,18 +74,18 @@ fn main() {
fn index_crate(crates: &KitchenSink, c: &Origin, renderer: &Renderer) -> Result<RichCrateVersion, failure::Error> {
let v = crates.rich_crate_version(c, CrateData::FullNoDerived)?;
let k = crates.rich_crate(c)?;
- let score = crate_base_score(&k, &v, renderer);
+ let score = crate_overall_score(crates, &k, &v, renderer);
crates.index_crate_highest_version(&v, score)?;
crates.index_crate(&k, score)?;
Ok(v)
}
-fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 {
+fn crate_overall_score(crates: &KitchenSink, all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 {
let readme = k.readme().ok().and_then(|r| r).map(|readme| {
renderer.page_node(&readme.markup, None, false)
});
- let mut score = ranking::crate_score_version(&CrateVersionInputs {
+ let base_score = ranking::crate_score_version(&CrateVersionInputs {
versions: all.versions(),
description: k.description().unwrap_or(""),
readme: readme.as_ref(),
@@ -111,6 +112,53 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer)
is_nightly: k.is_nightly(),
}).total();
+ let downloads_per_month = crates.downloads_per_month_or_equivalent(all.origin()).expect("dl numbers").unwrap_or(0) as u32;
+
+ let mut temp_inp = CrateTemporalInputs {
+ versions: all.versions(),
+ downloads_per_month,
+ downloads_per_month_minus_most_downloaded_user: downloads_per_month,
+ is_app: k.is_app(),
+ has_docs_rs: crates.has_docs_rs(k.short_name(), k.version()),
+ number_of_direct_reverse_deps: 0,
+ number_of_indirect_reverse_deps: 0,
+ number_of_indirect_reverse_optional_deps: 0,
+ };
+
+ let mut direct_rev_deps = 0;
+ let mut indirect_reverse_optional_deps = 0;
+ if let Some(deps) = crates.dependents_stats_of_crates_io_crate(k.short_name()) {
+ direct_rev_deps = deps.direct as u32;
+ indirect_reverse_optional_deps = (deps.runtime.def as u32 + deps.runtime.opt as u32)
+ .max(deps.dev as u32)
+ .max(deps.build.def as u32 + deps.build.opt as u32);
+
+ temp_inp.number_of_direct_reverse_deps = direct_rev_deps;
+ temp_inp.number_of_indirect_reverse_deps = deps.runtime.def.max(deps.build.def).into();
+ temp_inp.number_of_indirect_reverse_optional_deps = indirect_reverse_optional_deps;
+ let biggest = deps.rev_dep_names.iter()
+ .filter_map(|name| crates.downloads_per_month(&Origin::from_crates_io_name(name)).ok().and_then(|x| x))
+ .max().unwrap_or(0);
+ temp_inp.downloads_per_month_minus_most_downloaded_user = downloads_per_month.saturating_sub(biggest as u32);
+ }
+
+ let removals_divisor = if let Some(removals_weighed) = crates.crate_removals(k.origin()) {
+
+ // count some indirect/optional deps in case removals have been due to moving the crate behind another facade
+ // +20 is a fudge factor to smooth out noisy data for rarely used crates.
+ // We don't care about a small number of removals, only a mass exodus from big dead crates.
+ let effective_rev_deps = 20. + (direct_rev_deps as f64).max(indirect_reverse_optional_deps as f64 / 5.);
+ let removals_ratio = removals_weighed / (effective_rev_deps * 3.);
+ // if it's used more than removed, ratio < 1 is fine.
+ removals_ratio.max(1.).min(3.)
+ } else {
+ 1.
+ };
+
+ let temp_score = ranking::crate_score_temporal(&temp_inp);
+ let temp_score = temp_score.total();
+
+ let mut score = (base_score + temp_score) * 0.5 / removals_divisor;
// there's usually a non-macro/non-sys sibling
if k.is_proc_macro() || k.is_sys() {