diff options
author | Kornel <kornel@geekhood.net> | 2019-03-29 14:59:36 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2019-03-29 21:38:06 +0000 |
commit | a9302819102902bb8044d493ba4707f4a4689c29 (patch) | |
tree | 74fe3cb8666fb88676f9611529ca05a15e108700 | |
parent | 5f9d39a4f449953351c765a072118a27797ff121 (diff) |
Expand ranking to include time and removals
-rw-r--r-- | kitchen_sink/src/lib_kitchen_sink.rs | 14 | ||||
-rw-r--r-- | ranking/src/lib_ranking.rs | 84 | ||||
-rw-r--r-- | ranking/src/scorer.rs | 8 | ||||
-rw-r--r-- | reindex/src/bin/reindex_crates.rs | 54 |
4 files changed, 127 insertions, 33 deletions
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 264c736..a7b235e 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -354,6 +354,12 @@ impl KitchenSink { } } + /// Fudge-factor score proportional to how many times a crate has been removed from some project + pub fn crate_removals(&self, origin: &Origin) -> Option<f64> { + self.removals + .get(|| self.crate_db.removals().expect("fetch crate removals")) + .get(origin).cloned() + } pub fn downloads_per_month(&self, origin: &Origin) -> CResult<Option<usize>> { self.downloads_recent(origin).map(|dl| dl.map(|n| n/3)) @@ -1331,13 +1337,7 @@ impl KitchenSink { Ok(match cache.entry(slug.to_owned()) { Occupied(e) => Arc::clone(e.get()), Vacant(e) => { - let some_extra_for_removals = 30 + wanted_num/10; - let mut crates = self.crate_db.top_crates_in_category_partially_ranked(slug, wanted_num + some_extra_for_removals)?; - let removals = self.removals.get(|| self.crate_db.removals().unwrap()); - for c in &mut crates { - c.2 /= 300. 
+ removals.get(&c.0).cloned().unwrap_or(2.); - } - crates.sort_by(|a, b| b.2.partial_cmp(&a.2).expect("nan?")); + let crates = self.crate_db.top_crates_in_category_partially_ranked(slug, wanted_num)?; let crates: Vec<_> = crates.into_iter().map(|(o, r, _)| (o, r)).take(wanted_num as usize).collect(); let res = Arc::new(crates); e.insert(Arc::clone(&res)); diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs index 09e8b2c..5e05a80 100644 --- a/ranking/src/lib_ranking.rs +++ b/ranking/src/lib_ranking.rs @@ -48,25 +48,30 @@ pub struct CrateVersionInputs<'a> { } /// Changes over time, but doesn't depend on crate's own ranking -pub struct CrateTemporalInputs { - /// 1.0 fresh, 0.0 totally outdated and deprecated - pub dependency_freshness: Vec<f32>, - pub recent_downloads: u32, - pub recent_downloads_minus_most_downloaded_user: u32, +pub struct CrateTemporalInputs<'a> { + pub versions: &'a [CrateVersion], + // 1.0 fresh, 0.0 totally outdated and deprecated + // pub dependency_freshness: Vec<f32>, + pub downloads_per_month: u32, + /// Looking at downloads of direct dependencies. + /// This way internal derive/impl/core crates that have one big user get 0 here. + pub downloads_per_month_minus_most_downloaded_user: u32, + pub is_app: bool, pub has_docs_rs: bool, + pub is_nightly: bool, // low priority, because it's unranked! it'll be re-evaluated later - pub number_of_reverse_deps: u32, + pub number_of_direct_reverse_deps: u32, + /// use max(runtime, dev, build), because the crate is going to be one of these kinds + pub number_of_indirect_reverse_deps: u32, + /// Includes non-optional (i.e. it's the upper bound, not just the optional ones) + pub number_of_indirect_reverse_optional_deps: u32, // most recent commit // avg time issues are left unanswered? 
-} - -/// Crate's own base ranking influences these rankings -pub struct CrateContextInputs { - pub crate_score_context_free: f64, - pub owner_pageranks: Vec<f32>, - pub reverse_deps_rankings: Vec<f32>, + // pub crate_score_context_free: f64, + // pub owner_pageranks: Vec<f32>, + // pub reverse_deps_rankings: Vec<f32>, } pub struct Env { @@ -296,11 +301,56 @@ pub fn crate_score_version(cr: &CrateVersionInputs) -> Score { score } -// pub fn crate_score_temporal(inputs: &CrateTemporalInputs) -> Score { -// let mut score = Score::new(); +pub fn crate_score_temporal(cr: &CrateTemporalInputs) -> Score { + let mut score = Score::new(); -// score -// } + let newest = cr.versions.iter().max_by_key(|v| &v.created_at).expect("at least 1 ver?"); + let freshness_score = match newest.created_at.parse::<DateTime<Utc>>() { + Ok(latest_date) => { + // Assume higher versions, and especially patch versions, mean the crate is more mature + // and needs fewer updates + let version_stability_interval = match SemVer::parse(&newest.num) { + Ok(ref ver) if ver.patch > 3 && ver.major > 0 => 500, + Ok(ref ver) if ver.patch > 3 => 350, + Ok(ref ver) if ver.patch > 0 => 250, + Ok(ref ver) if ver.major > 0 => 200, + Ok(ref ver) if ver.minor > 3 => 150, + _ => 80, + }; + let expected_update_interval = version_stability_interval.min(cr.versions.len() as i64 * 50) / if cr.is_nightly {2} else {1}; + let age = (Utc::now() - latest_date).num_days(); + let days_past_expiration_date = (age - expected_update_interval).max(0); + // score decays for a ~year after the crate should have been updated + let decay_days = if cr.is_nightly {60} else {200} + expected_update_interval/2; + (decay_days - days_past_expiration_date).max(0) as f64 / (decay_days as f64) + }, + Err(e) => { + eprintln!("Release time parse error: {}", e); + 0. + } + }; + score.frac("Freshness of latest release", 8, freshness_score); + + // Low numbers are just bots/noise. + let downloads = (cr.downloads_per_month as f64 - 100.).max(0.) 
+ 100.; + let downloads_cleaned = (cr.downloads_per_month_minus_most_downloaded_user as f64 - 50.).max(0.) + 50.; + // distribution of downloads follows power law. + // apps have much harder to get high download numbers. + let pop = (downloads.log2() - 6.6) / (if cr.is_app {1.0} else {2.0}); + let pop_cleaned = (downloads_cleaned.log2() - 5.6) / (if cr.is_app {1.0} else {2.0}); + assert!(pop > 0.); + assert!(pop_cleaned > 0.); + // FIXME: max should be based on the most downloaded crate? + score.score_f("Downloads", 8., pop/2.); + score.score_f("Downloads (cleaned)", 18., pop_cleaned); + + score.score_f("Direct rev deps", 10., (cr.number_of_direct_reverse_deps as f64).sqrt()); + let indirect = 1. + cr.number_of_indirect_reverse_optional_deps as f64 / 3.; + score.score_f("Indirect rev deps", 10., indirect.log2()); + + score.has("docs.rs", 1, cr.has_docs_rs); + score +} // pub fn crate_score_contextual(inputs: &CrateContextInputs) -> Score { // let mut score = Score::new(); diff --git a/ranking/src/scorer.rs b/ranking/src/scorer.rs index e476a52..329709f 100644 --- a/ranking/src/scorer.rs +++ b/ranking/src/scorer.rs @@ -42,12 +42,8 @@ impl Score { pub fn score_f(&mut self, for_what: &'static str, max_score: f64, n: impl Into<f64>) -> ScoreAdj<'_> { let n = n.into(); self.total += max_score; - if n > 0. 
{ - self.scores.push((n, max_score, for_what)); - ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) } - } else { - ScoreAdj::default() - } + self.scores.push((n.max(0.), max_score, for_what)); + ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) } } /// Start a new group of scores, and `max_score` is the max total score of the group diff --git a/reindex/src/bin/reindex_crates.rs b/reindex/src/bin/reindex_crates.rs index ff7b3cd..c06cdc9 100644 --- a/reindex/src/bin/reindex_crates.rs +++ b/reindex/src/bin/reindex_crates.rs @@ -1,3 +1,4 @@ +use ranking::CrateTemporalInputs; use ranking::CrateVersionInputs; use kitchen_sink::RichCrate; use render_readme::Renderer; @@ -73,18 +74,18 @@ fn main() { fn index_crate(crates: &KitchenSink, c: &Origin, renderer: &Renderer) -> Result<RichCrateVersion, failure::Error> { let v = crates.rich_crate_version(c, CrateData::FullNoDerived)?; let k = crates.rich_crate(c)?; - let score = crate_base_score(&k, &v, renderer); + let score = crate_overall_score(crates, &k, &v, renderer); crates.index_crate_highest_version(&v, score)?; crates.index_crate(&k, score)?; Ok(v) } -fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 { +fn crate_overall_score(crates: &KitchenSink, all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 { let readme = k.readme().ok().and_then(|r| r).map(|readme| { renderer.page_node(&readme.markup, None, false) }); - let mut score = ranking::crate_score_version(&CrateVersionInputs { + let base_score = ranking::crate_score_version(&CrateVersionInputs { versions: all.versions(), description: k.description().unwrap_or(""), readme: readme.as_ref(), @@ -111,6 +112,53 @@ fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) is_nightly: k.is_nightly(), }).total(); + let downloads_per_month = crates.downloads_per_month_or_equivalent(all.origin()).expect("dl numbers").unwrap_or(0) as u32; + + let mut temp_inp = CrateTemporalInputs { + 
versions: all.versions(), + downloads_per_month, + downloads_per_month_minus_most_downloaded_user: downloads_per_month, + is_app: k.is_app(), + has_docs_rs: crates.has_docs_rs(k.short_name(), k.version()), + number_of_direct_reverse_deps: 0, + number_of_indirect_reverse_deps: 0, + number_of_indirect_reverse_optional_deps: 0, + }; + + let mut direct_rev_deps = 0; + let mut indirect_reverse_optional_deps = 0; + if let Some(deps) = crates.dependents_stats_of_crates_io_crate(k.short_name()) { + direct_rev_deps = deps.direct as u32; + indirect_reverse_optional_deps = (deps.runtime.def as u32 + deps.runtime.opt as u32) + .max(deps.dev as u32) + .max(deps.build.def as u32 + deps.build.opt as u32); + + temp_inp.number_of_direct_reverse_deps = direct_rev_deps; + temp_inp.number_of_indirect_reverse_deps = deps.runtime.def.max(deps.build.def).into(); + temp_inp.number_of_indirect_reverse_optional_deps = indirect_reverse_optional_deps; + let biggest = deps.rev_dep_names.iter() + .filter_map(|name| crates.downloads_per_month(&Origin::from_crates_io_name(name)).ok().and_then(|x| x)) + .max().unwrap_or(0); + temp_inp.downloads_per_month_minus_most_downloaded_user = downloads_per_month.saturating_sub(biggest as u32); + } + + let removals_divisor = if let Some(removals_weighed) = crates.crate_removals(k.origin()) { + + // count some indirect/optional deps in case removals have been due to moving the crate behind another facade + // +20 is a fudge factor to smooth out noisy data for rarely used crates. + // We don't care about small amount of removals, only mass exodus from big dead crates. + let effective_rev_deps = 20. + (direct_rev_deps as f64).max(indirect_reverse_optional_deps as f64 / 5.); + let removals_ratio = removals_weighed / (effective_rev_deps * 3.); + // if it's used more than removed, ratio < 1 is fine. + removals_ratio.max(1.).min(3.) + } else { + 1. 
+ }; + + let temp_score = ranking::crate_score_temporal(&temp_inp); + let temp_score = temp_score.total(); + + let mut score = (base_score + temp_score) * 0.5 / removals_divisor; // there's usually a non-macro/non-sys sibling if k.is_proc_macro() || k.is_sys() { |