summary | refs | log | tree | commit | diff | stats
path: root/ranking
diff options
context:
space:
mode:
author Kornel <kornel@geekhood.net> 2019-03-29 14:59:36 +0000
committer Kornel <kornel@geekhood.net> 2019-03-29 21:38:06 +0000
commit a9302819102902bb8044d493ba4707f4a4689c29 (patch)
tree 74fe3cb8666fb88676f9611529ca05a15e108700 /ranking
parent 5f9d39a4f449953351c765a072118a27797ff121 (diff)
Expand ranking to include time and removals
Diffstat (limited to 'ranking')
-rw-r--r-- ranking/src/lib_ranking.rs 84
-rw-r--r-- ranking/src/scorer.rs 8
2 files changed, 69 insertions, 23 deletions
diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs
index 09e8b2c..5e05a80 100644
--- a/ranking/src/lib_ranking.rs
+++ b/ranking/src/lib_ranking.rs
@@ -48,25 +48,30 @@ pub struct CrateVersionInputs<'a> {
}
/// Changes over time, but doesn't depend on crate's own ranking
-pub struct CrateTemporalInputs {
- /// 1.0 fresh, 0.0 totally outdated and deprecated
- pub dependency_freshness: Vec<f32>,
- pub recent_downloads: u32,
- pub recent_downloads_minus_most_downloaded_user: u32,
+pub struct CrateTemporalInputs<'a> {
+ pub versions: &'a [CrateVersion],
+ // 1.0 fresh, 0.0 totally outdated and deprecated
+ // pub dependency_freshness: Vec<f32>,
+ pub downloads_per_month: u32,
+ /// Looking at downloads of direct dependencies.
+ /// This way internal derive/impl/core crates that have one big user get 0 here.
+ pub downloads_per_month_minus_most_downloaded_user: u32,
+ pub is_app: bool,
pub has_docs_rs: bool,
+ pub is_nightly: bool,
// low priority, because it's unranked! it'll be re-evaluated later
- pub number_of_reverse_deps: u32,
+ pub number_of_direct_reverse_deps: u32,
+ /// use max(runtime, dev, build), because the crate is going to be one of these kinds
+ pub number_of_indirect_reverse_deps: u32,
+ /// Includes non-optional (i.e. it's the upper bound, not just the optional ones)
+ pub number_of_indirect_reverse_optional_deps: u32,
// most recent commit
// avg time issues are left unanswered?
-}
-
-/// Crate's own base ranking influences these rankings
-pub struct CrateContextInputs {
- pub crate_score_context_free: f64,
- pub owner_pageranks: Vec<f32>,
- pub reverse_deps_rankings: Vec<f32>,
+ // pub crate_score_context_free: f64,
+ // pub owner_pageranks: Vec<f32>,
+ // pub reverse_deps_rankings: Vec<f32>,
}
pub struct Env {
@@ -296,11 +301,56 @@ pub fn crate_score_version(cr: &CrateVersionInputs) -> Score {
score
}
-// pub fn crate_score_temporal(inputs: &CrateTemporalInputs) -> Score {
-// let mut score = Score::new();
+pub fn crate_score_temporal(cr: &CrateTemporalInputs) -> Score {
+ let mut score = Score::new();
-// score
-// }
+ let newest = cr.versions.iter().max_by_key(|v| &v.created_at).expect("at least 1 ver?");
+ let freshness_score = match newest.created_at.parse::<DateTime<Utc>>() {
+ Ok(latest_date) => {
+ // Assume higher versions, and especially patch versions, mean the crate is more mature
+ // and needs fewer updates
+ let version_stability_interval = match SemVer::parse(&newest.num) {
+ Ok(ref ver) if ver.patch > 3 && ver.major > 0 => 500,
+ Ok(ref ver) if ver.patch > 3 => 350,
+ Ok(ref ver) if ver.patch > 0 => 250,
+ Ok(ref ver) if ver.major > 0 => 200,
+ Ok(ref ver) if ver.minor > 3 => 150,
+ _ => 80,
+ };
+ let expected_update_interval = version_stability_interval.min(cr.versions.len() as i64 * 50) / if cr.is_nightly {2} else {1};
+ let age = (Utc::now() - latest_date).num_days();
+ let days_past_expiration_date = (age - expected_update_interval).max(0);
+ // score decays for a ~year after the crate should have been updated
+ let decay_days = if cr.is_nightly {60} else {200} + expected_update_interval/2;
+ (decay_days - days_past_expiration_date).max(0) as f64 / (decay_days as f64)
+ },
+ Err(e) => {
+ eprintln!("Release time parse error: {}", e);
+ 0.
+ }
+ };
+ score.frac("Freshness of latest release", 8, freshness_score);
+
+ // Low numbers are just bots/noise.
+ let downloads = (cr.downloads_per_month as f64 - 100.).max(0.) + 100.;
+ let downloads_cleaned = (cr.downloads_per_month_minus_most_downloaded_user as f64 - 50.).max(0.) + 50.;
+ // distribution of downloads follows a power law.
+ // it is much harder for apps to get high download numbers.
+ let pop = (downloads.log2() - 6.6) / (if cr.is_app {1.0} else {2.0});
+ let pop_cleaned = (downloads_cleaned.log2() - 5.6) / (if cr.is_app {1.0} else {2.0});
+ assert!(pop > 0.);
+ assert!(pop_cleaned > 0.);
+ // FIXME: max should be based on the most downloaded crate?
+ score.score_f("Downloads", 8., pop/2.);
+ score.score_f("Downloads (cleaned)", 18., pop_cleaned);
+
+ score.score_f("Direct rev deps", 10., (cr.number_of_direct_reverse_deps as f64).sqrt());
+ let indirect = 1. + cr.number_of_indirect_reverse_optional_deps as f64 / 3.;
+ score.score_f("Indirect rev deps", 10., indirect.log2());
+
+ score.has("docs.rs", 1, cr.has_docs_rs);
+ score
+}
// pub fn crate_score_contextual(inputs: &CrateContextInputs) -> Score {
// let mut score = Score::new();
diff --git a/ranking/src/scorer.rs b/ranking/src/scorer.rs
index e476a52..329709f 100644
--- a/ranking/src/scorer.rs
+++ b/ranking/src/scorer.rs
@@ -42,12 +42,8 @@ impl Score {
pub fn score_f(&mut self, for_what: &'static str, max_score: f64, n: impl Into<f64>) -> ScoreAdj<'_> {
let n = n.into();
self.total += max_score;
- if n > 0. {
- self.scores.push((n, max_score, for_what));
- ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) }
- } else {
- ScoreAdj::default()
- }
+ self.scores.push((n.max(0.), max_score, for_what));
+ ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) }
}
/// Start a new group of scores, and `max_score` is the max total score of the group