summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKornel <kornel@geekhood.net>2019-03-09 21:03:06 +0000
committerKornel <kornel@geekhood.net>2019-03-13 23:45:09 +0000
commitf9945a13adb38206cf84187bbded9f898f208052 (patch)
tree0677c733a8f29b0b212cce14d4294f466ba6dbc0
parent4d4a96f88548b28bc5cafb7858e59975e7ca7bdc (diff)
New ranking
m---------cargo_toml0
-rw-r--r--front_end/Cargo.toml2
-rw-r--r--front_end/src/bin/crate_pages.rs4
-rw-r--r--front_end/src/bin/serp.rs2
-rw-r--r--front_end/src/bin/website.rs2
-rw-r--r--front_end/src/crate_page.rs4
-rw-r--r--kitchen_sink/Cargo.toml6
-rw-r--r--kitchen_sink/src/index.rs2
-rw-r--r--kitchen_sink/src/lib_kitchen_sink.rs2
-rw-r--r--ranking/Cargo.toml4
-rw-r--r--ranking/src/lib_ranking.rs337
-rw-r--r--ranking/src/main.rs51
-rw-r--r--ranking/src/scorer.rs95
-rw-r--r--reindex/Cargo.toml6
-rw-r--r--reindex/src/bin/reindex_search.rs71
m---------render_readme0
-rw-r--r--rich_crate/Cargo.toml6
-rw-r--r--rich_crate/src/rich_crate.rs6
-rw-r--r--rich_crate/src/rich_crate_version.rs40
-rw-r--r--server/Cargo.toml2
-rw-r--r--server/src/main.rs2
21 files changed, 538 insertions, 106 deletions
diff --git a/cargo_toml b/cargo_toml
-Subproject 545d7341e6849600fb0cfd2067bcb3cc7f1e3c6
+Subproject e3f80e07bc6b8988f72d226658ed4e657898e95
diff --git a/front_end/Cargo.toml b/front_end/Cargo.toml
index 3f0bded..a63d001 100644
--- a/front_end/Cargo.toml
+++ b/front_end/Cargo.toml
@@ -15,7 +15,7 @@ ructe = "0.5.6"
[dependencies]
kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" }
rich_crate = { path = "../rich_crate" }
-render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.5.0" }
+render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.0" }
categories = { path = "../categories" }
udedokei = { path = "../udedokei" }
search_index = { path = "../search_index" }
diff --git a/front_end/src/bin/crate_pages.rs b/front_end/src/bin/crate_pages.rs
index 82d1ceb..f9b7089 100644
--- a/front_end/src/bin/crate_pages.rs
+++ b/front_end/src/bin/crate_pages.rs
@@ -20,7 +20,7 @@ fn main() {
}
fn is_useful1(allver: &RichCrate) -> bool {
- if allver.versions().count() < 2 {
+ if allver.versions().len() < 2 {
eprintln!("{} one release", allver.name());
return false;
}
@@ -64,7 +64,7 @@ fn run(filter: Option<String>) -> Result<(), failure::Error> {
let crates = Arc::new(kitchen_sink::KitchenSink::new_default()?);
// crates.prewarm();
let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", crates.main_cache_dir().join("images.db"))?);
- let markup = &Renderer::new_filter(Highlighter::new(), image_filter);
+ let markup = &Renderer::new_filter(Some(Highlighter::new()), image_filter);
rayon::scope(move |s1| {
for origin in crates.all_crates() {
if let Some(ref filter) = filter {
diff --git a/front_end/src/bin/serp.rs b/front_end/src/bin/serp.rs
index 9d86d50..9114135 100644
--- a/front_end/src/bin/serp.rs
+++ b/front_end/src/bin/serp.rs
@@ -28,7 +28,7 @@ fn run() -> Result<(), failure::Error> {
println!("http://localhost:3000/search");
let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", "../data/images.db")?);
- let markup = Renderer::new_filter(Highlighter::new(), image_filter);
+ let markup = Renderer::new_filter(Some(Highlighter::new()), image_filter);
front_end::render_serp_page(&mut f, &query, &results, &markup)?;
Ok(())
diff --git a/front_end/src/bin/website.rs b/front_end/src/bin/website.rs
index 5a3fbf5..b28fd32 100644
--- a/front_end/src/bin/website.rs
+++ b/front_end/src/bin/website.rs
@@ -40,7 +40,7 @@ fn run() -> Result<(), failure::Error> {
let crates = KitchenSink::new_default().context("init caches, data, etc.")?;
let done_pages = Mutex::new(HashSet::with_capacity(5000));
let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", crates.main_cache_dir().join("images.db"))?);
- let markup = Renderer::new_filter(Highlighter::new(), image_filter);
+ let markup = Renderer::new_filter(Some(Highlighter::new()), image_filter);
println!("Generating homepage and category pages…");
let (res1, res2) = rayon::join(
diff --git a/front_end/src/crate_page.rs b/front_end/src/crate_page.rs
index 70ac3be..e527247 100644
--- a/front_end/src/crate_page.rs
+++ b/front_end/src/crate_page.rs
@@ -559,7 +559,7 @@ impl<'a> CratePage<'a> {
}
pub fn all_versions(&self) -> impl Iterator<Item = Version<'a>> {
- self.all.versions().map(|v| Version {
+ self.all.versions().iter().map(|v| Version {
yanked: v.yanked,
num: &v.num,
semver: SemVer::parse(&v.num).expect("semver parse"),
@@ -568,7 +568,7 @@ impl<'a> CratePage<'a> {
}
pub fn published_date(&self) -> DateTime<FixedOffset> {
- let min_iso_date = self.all.versions().map(|v| &v.created_at).min().expect("any version in the crate");
+ let min_iso_date = self.all.versions().iter().map(|v| &v.created_at).min().expect("any version in the crate");
DateTime::parse_from_rfc3339(min_iso_date).expect("created_at parse")
}
diff --git a/kitchen_sink/Cargo.toml b/kitchen_sink/Cargo.toml
index 50195dc..e0082cb 100644
--- a/kitchen_sink/Cargo.toml
+++ b/kitchen_sink/Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2018"
name = "kitchen_sink"
-version = "0.7.0"
+version = "0.7.1"
authors = ["Kornel <kornel@geekhood.net>"]
publish = false
@@ -19,11 +19,11 @@ crate_files = { path = "../crate_files", version = "0.2" }
user_db = { path = "../user_db", version = "0.3" }
crate_db = { path = "../crate_db", version = "0.4.0" }
categories = { path = "../categories" }
-rich_crate = { path = "../rich_crate", version = "0.4.1" }
+rich_crate = { path = "../rich_crate", version = "0.4.2" }
simple_cache = { git = "https://gitlab.com/crates.rs/simple_cache.git", version = "0.7.0" }
lazyonce = "0.3.0"
repo_url = { git = "https://gitlab.com/crates.rs/repo_url.git", version = "0.3.0" }
-cargo_toml = "0.6.4"
+cargo_toml = "0.6.5"
serde = "1.0.43"
serde_derive = "1.0.70"
serde_json = "1.0.33"
diff --git a/kitchen_sink/src/index.rs b/kitchen_sink/src/index.rs
index 8d50a0b..f3129d9 100644
--- a/kitchen_sink/src/index.rs
+++ b/kitchen_sink/src/index.rs
@@ -386,5 +386,5 @@ fn index_test() {
let stats = idx.deps_stats();
assert!(stats.total > 13800);
let lode = stats.counts.get("lodepng").unwrap();
- assert_eq!(10, lode.runtime.0);
+ assert_eq!(11, lode.runtime.0);
}
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs
index 8067057..0fe9614 100644
--- a/kitchen_sink/src/lib_kitchen_sink.rs
+++ b/kitchen_sink/src/lib_kitchen_sink.rs
@@ -311,7 +311,7 @@ impl KitchenSink {
self.rich_crate(o).ok()
})
.filter(move |k| {
- let latest = k.versions().map(|v| v.created_at.as_str()).max().unwrap_or("");
+ let latest = k.versions().iter().map(|v| v.created_at.as_str()).max().unwrap_or("");
if let Ok(timestamp) = DateTime::parse_from_rfc3339(latest) {
timestamp.timestamp() >= min_timestamp as i64
} else {
diff --git a/ranking/Cargo.toml b/ranking/Cargo.toml
index e8425d8..4719915 100644
--- a/ranking/Cargo.toml
+++ b/ranking/Cargo.toml
@@ -10,7 +10,9 @@ path = "src/lib_ranking.rs"
[dependencies]
rayon = "1.0.3"
-kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" }
rich_crate = { path = "../rich_crate" }
fxhash = "0.2.1"
chrono = "0.4.6"
+cargo_toml = "0.6.5"
+render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.1" }
+semver = "0.9.0"
diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs
index 47de967..34064a4 100644
--- a/ranking/src/lib_ranking.rs
+++ b/ranking/src/lib_ranking.rs
@@ -1,4 +1,333 @@
-// mod scorer;
-// pub use self::scorer::*;
-mod authorrank;
-pub use crate::authorrank::*;
+mod scorer;
+use render_readme::Handle;
+use render_readme::NodeData;
+use rich_crate::Edition;
+use rich_crate::Author;
+use rich_crate::CrateVersion;
+use cargo_toml::MaintenanceStatus;
+use rich_crate::CrateOwner;
+use semver::Version as SemVer;
+pub use self::scorer::*;
+use chrono::prelude::*;
+
+/// Inputs for the version-dependent part of the score, consumed by `crate_score_version`.
+///
+/// Only changes when a new version is released
+pub struct CrateVersionInputs<'a> {
+ /// All releases of the crate (yanked ones included; `versions_score` filters them itself)
+ pub versions: &'a [CrateVersion],
+ /// Crate description; its byte length is scored
+ pub description: &'a str,
+ /// README as a DOM tree, `None` if there is no README to score
+ pub readme: Option<&'a Handle>,
+ pub owners: &'a [CrateOwner],
+ pub authors: &'a [Author],
+ /// Anything other than the 2015 edition earns points
+ pub edition: Edition,
+ /// Selects which license families are considered "useful" (apps vs libraries)
+ pub is_app: bool,
+ pub has_build_rs: bool,
+ /// A crate with `links` is treated as a sys crate, which partly justifies a build script
+ pub has_links: bool,
+ pub has_documentation_link: bool,
+ pub has_homepage_link: bool,
+ pub has_repository_link: bool,
+ pub has_keywords: bool,
+ pub has_categories: bool,
+ pub has_features: bool,
+ pub has_examples: bool,
+ pub has_benches: bool,
+ pub has_tests: bool,
+ // pub has_lockfile: bool,
+ // pub has_changelog: bool,
+ /// License string; matched by substring (e.g. "MIT", "GPL")
+ pub license: &'a str,
+ pub has_badges: bool,
+ pub maintenance: MaintenanceStatus,
+ /// `true` loses the "works on stable" points
+ pub is_nightly: bool,
+
+ // Ideas for future inputs:
+ // (relative) weight of dependencies?
+
+ // rust loc
+ // #[test] cases
+ // assert! calls
+ // comments ratio (normalized to project size)
+
+ // look for deprecated in the description
+}
+
+/// Changes over time, but doesn't depend on crate's own ranking
+///
+/// NOTE(review): no scoring function in this file reads this struct yet
+/// (`crate_score_temporal` is still commented out).
+pub struct CrateTemporalInputs {
+ /// 1.0 fresh, 0.0 totally outdated and deprecated
+ pub dependency_freshness: Vec<f32>,
+ pub recent_downloads: u32,
+ pub recent_downloads_minus_most_downloaded_user: u32,
+ pub has_docs_rs: bool,
+
+ // low priority, because it's unranked! it'll be re-evaluated later
+ pub number_of_reverse_deps: u32,
+
+ // Ideas for future inputs:
+ // most recent commit
+ // avg time issues are left unanswered?
+}
+
+/// Crate's own base ranking influences these rankings
+///
+/// NOTE(review): no scoring function in this file reads this struct yet
+/// (`crate_score_contextual` is still commented out).
+pub struct CrateContextInputs {
+ /// presumably the crate's score computed without these contextual inputs — TODO confirm
+ pub crate_score_context_free: f64,
+ pub owner_pageranks: Vec<f32>,
+ pub reverse_deps_rankings: Vec<f32>,
+}
+
+/// Global figures, presumably meant for normalizing per-crate numbers — TODO confirm.
+///
+/// NOTE(review): not referenced by any function in this file yet.
+pub struct Env {
+ pub max_recent_downloads: u32,
+ pub max_crates: u32,
+}
+
+/// Scores the crate's manifest-level metadata (description, links, targets, maintenance
+/// status, edition, license).
+///
+/// Every criterion adds its max points to the total whether or not it is met,
+/// so `total()` of the returned score is the earned fraction of all criteria here.
+fn cargo_toml_score(cr: &CrateVersionInputs) -> Score {
+ let mut s = Score::new();
+
+ // full marks at 300 bytes of description, proportionally less below that
+ s.frac("description len", 20, (cr.description.len() as f64 / 300.).min(1.));
+
+ // build.rs slows compilation down, so better not use it unless necessary (links means a sys crate, so somewhat justified)
+ s.n("build.rs", 20, if !cr.has_build_rs && !cr.has_links {20} else if cr.has_links {10} else {0});
+
+ // users report examples are super valuable
+ s.has("has_examples", 100, cr.has_examples);
+ // probably less buggy than if winging it
+ s.has("has_tests", 70, cr.has_tests);
+ // probably optimized
+ s.has("has_benches", 10, cr.has_benches);
+
+ // docs are very important (TODO: this may be redundant with docs.rs)
+ s.has("has_documentation_link", 30, cr.has_documentation_link);
+ s.has("has_homepage_link", 30, cr.has_homepage_link);
+
+ // we care about being able to analyze
+ s.has("has_repository_link", 20, cr.has_repository_link);
+
+ // helps crates.rs show crate in the right place
+ s.has("has_keywords", 10, cr.has_keywords);
+ s.has("has_categories", 5, cr.has_categories);
+
+ // probably non-trivial crate
+ s.has("has_features", 5, cr.has_features);
+
+ // it's the best practice, may help building old versions of the project
+ // s.has("has_lockfile", 5, cr.has_lockfile);
+ // assume it's CI, which helps improve quality
+ s.has("has_badges", 10, cr.has_badges);
+
+ // not official
+ // s.has("has_changelog", 5, cr.has_changelog);
+
+ // an unspecified status (`None`) ranks above the explicitly low-effort ones
+ s.n("maintenance status", 30, match cr.maintenance {
+ MaintenanceStatus::ActivelyDeveloped => 30,
+ MaintenanceStatus::Experimental => 25,
+ MaintenanceStatus::None => 20,
+ MaintenanceStatus::PassivelyMaintained => 10,
+ MaintenanceStatus::AsIs => 5,
+ MaintenanceStatus::LookingForMaintainer => 4,
+ MaintenanceStatus::Deprecated => 0,
+ });
+
+ // TODO: being nightly should be a negative score
+ s.has("works on stable", 20, !cr.is_nightly);
+ // fresh
+ s.has("2018 edition", 10, cr.edition != Edition::E2015);
+
+ // license proliferation is bad
+ // (substring match on the license string, so e.g. "LGPL" also matches "GPL")
+ s.has("useful license", 10, if cr.is_app {
+ // for end-user apps assume user freedom > developer freedom
+ cr.license.contains("GPL") || cr.license.contains("CC-BY-SA") || cr.license.contains("MPL")
+ } else {
+ // for libs assume developer freedom > user freedom
+ cr.license.contains("MIT") || cr.license.contains("BSD") || cr.license.contains("Apache") || cr.license.contains("CC0")
+ });
+
+ s
+}
+
+/// Counters collected by `fill_props` while walking a rendered README.
+#[derive(Default)]
+struct MarkupProps {
+ /// Bytes of trimmed text outside of code, plus 1 per non-empty text node
+ text_len: usize,
+ /// Bytes of trimmed text inside `<pre>`/`<code>`, plus 1 per non-empty text node
+ code_len: usize,
+ /// Number of `<li>` and `<tr>` elements
+ list_or_table_rows: u16,
+ /// Number of `<img>` elements whose `src` is not a badge URL
+ images: u16,
+ /// Number of `<pre>` elements
+ pre_blocks: u16,
+ /// Number of `<h1>`..`<h5>` elements
+ sections: u16,
+}
+
+/// Heuristic check whether `url` looks like a status badge (CI, coverage, version, etc.)
+/// rather than real content.
+///
+/// The scheme and a few common subdomain prefixes are stripped first, then the remainder
+/// is compared against known badge hosts and known badge file-name endings.
+fn is_badge_url(url: &str) -> bool {
+    // Stripped one after another, in this order; each is removed as many times as it repeats
+    const IGNORED_PREFIXES: &[&str] = &[
+        "http://", "https://", "www.", "flat.", "images.", "img.", "api.", "ci.", "build.",
+    ];
+    const BADGE_HOSTS: &[&str] = &[
+        "appveyor.com",
+        "badge.",
+        "badgen.",
+        "badges.",
+        "codecov.io",
+        "coveralls.io",
+        "docs.rs",
+        "gitlab.com",
+        "isitmaintained.com",
+        "meritbadge",
+        "microbadger",
+        "ohloh.net",
+        "openhub.net",
+        "repostatus.org",
+        "shields.io",
+        "snapcraft.io",
+        "spearow.io",
+        "travis-ci.",
+        "zenodo.org",
+    ];
+    const BADGE_ENDINGS: &[&str] = &[
+        "?branch=master",
+        "/pipeline.svg",
+        "/coverage.svg",
+        "/build.svg",
+        "badge.svg",
+        "badge.png",
+    ];
+
+    let bare = IGNORED_PREFIXES.iter().fold(url, |rest, &prefix| rest.trim_start_matches(prefix));
+
+    BADGE_HOSTS.iter().any(|&host| bare.starts_with(host))
+        || BADGE_ENDINGS.iter().any(|&ending| bare.ends_with(ending))
+}
+
+/// Recursively walks the DOM subtree rooted at `node` and accumulates counters in `props`.
+///
+/// `in_code` says whether `node` is inside a `<pre>` or `<code>` element; it is passed
+/// by value, so setting it here affects only this node's descendants.
+fn fill_props(node: &Handle, props: &mut MarkupProps, mut in_code: bool) {
+ match node.data {
+ NodeData::Text {ref contents} => {
+ // length in bytes, ignoring surrounding whitespace; whitespace-only nodes count as nothing
+ let len = contents.borrow().trim().len();
+ if len > 0 {
+ if in_code {
+ props.code_len += len + 1; // +1 to account for separators that were trimmed
+ } else {
+ props.text_len += len + 1;
+ }
+ }
+ return; // has no children
+ },
+ NodeData::Element {ref name, ref attrs, ..} => {
+ match name.local.get(..).unwrap() {
+ "img" => {
+ if let Some(src) = attrs.borrow().iter().find(|a| a.name.local.get(..).unwrap() == "src") {
+ if is_badge_url(&src.value) {
+ return; // don't count badges
+ }
+ }
+ // an <img> without src is still counted
+ props.images += 1;
+ return;
+ },
+ "li" | "tr" => props.list_or_table_rows += 1,
+ "a" => {
+ // a link to a badge service: skip the whole subtree, including its text
+ if let Some(href) = attrs.borrow().iter().find(|a| a.name.local.get(..).unwrap() == "href") {
+ if is_badge_url(&href.value) {
+ return; // don't count badge image children
+ }
+ }
+ },
+ "pre" => {
+ in_code = true;
+ props.pre_blocks += 1;
+ },
+ "code" => in_code = true,
+ // h6 is not counted
+ "h1" | "h2" | "h3" | "h4" | "h5" => props.sections += 1,
+ _ => {},
+ }
+ },
+ // other node kinds contribute nothing themselves, but their children are still visited
+ _ => {},
+ }
+ for child in node.children.borrow().iter() {
+ fill_props(child, props, in_code);
+ }
+}
+
+/// Scores the README by the amount of prose and code, and by its structure
+/// (code blocks, images, headings, list/table rows).
+///
+/// A missing README is scored like an empty one: every criterion still adds to
+/// the maximum, none earns anything.
+fn readme_score(readme: Option<&Handle>) -> Score {
+    let mut markup = MarkupProps::default();
+    if let Some(dom) = readme {
+        fill_props(dom, &mut markup, false);
+    }
+
+    let text_fraction = (markup.text_len as f64 / 3000.).min(1.0);
+    let code_fraction = (markup.code_len as f64 / 2000.).min(1.0);
+    // people really like seeing a code example
+    let has_real_code = markup.code_len > 150 && markup.pre_blocks > 0;
+
+    let mut score = Score::new();
+    score.frac("text length", 75, text_fraction);
+    score.frac("code length", 100, code_fraction);
+    score.has("has code", 30, has_real_code);
+    score.n("code blocks", 25, markup.pre_blocks * 5);
+    // I like pages with logos
+    score.n("images", 35, markup.images * 25);
+    score.n("sections", 30, markup.sections * 4);
+    score.n("list or table rows", 25, markup.list_or_table_rows * 2);
+    score
+}
+
+/// Scores the crate's release history: number and kind of releases, and how long
+/// the crate has been developed.
+///
+/// Only non-yanked releases with a parseable semver number are analyzed. If there are
+/// none, only the (failed) "more than one release" criterion is recorded, so the
+/// group scores 0.
+fn versions_score(ver: &[CrateVersion]) -> Score {
+    let mut s = Score::new();
+    let semver = ver.iter().filter(|v| !v.yanked).filter_map(|v| SemVer::parse(&v.num).ok()).collect::<Vec<_>>();
+    s.has("more than one release", 20, semver.len() > 1);
+
+    // all yanked (or unparseable): nothing left to analyze
+    if semver.is_empty() {
+        return s;
+    }
+
+    // Dates are compared as strings, which works for same-format ISO timestamps.
+    // Yanked releases are included here on purpose: they're still part of the history.
+    let oldest = ver.iter().map(|v| &v.created_at).min().and_then(|s| s.parse::<DateTime<Utc>>().ok());
+    let newest = ver.iter().map(|v| &v.created_at).max().and_then(|s| s.parse::<DateTime<Utc>>().ok());
+    if let (Some(oldest), Some(newest)) = (oldest, newest) {
+        // one point per 11 days, i.e. full marks after ~440 days
+        s.n("crate development time", 40, (newest - oldest).num_days() / 11);
+    }
+    // don't count 0.0.x
+    s.n("number of non-experimental releases", 15, semver.iter().filter(|v| (v.major > 0 || v.minor > 0) && v.pre.is_empty()).count() as u32);
+
+    // patch releases are correlated with stable, polished code
+    s.n("patch releases", 20, 5 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32);
+    s.n("a high patch release", 10, semver.iter().map(|v| v.patch as u32).max().unwrap_or(0));
+    // for 0.x crates it's hard to know what is a patch release
+    s.has("an unstable patch/feature release", 10, semver.iter().any(|v| v.major == 0 && v.patch > 1));
+    // careful release process is a sign of maturity
+    s.has("a prerelease", 5, semver.iter().any(|v| !v.pre.is_empty()));
+    s.has("a stable release", 15, semver.iter().any(|v| v.major > 0 && v.major < 20));
+    s.has("yanked", 2, ver.iter().any(|v| v.yanked)); // author cares to remove bad versions
+    s
+}
+
+/// Scores how many people stand behind the crate.
+///
+/// `owners` can publish the crate, `authors` are the ones listed in the manifest.
+fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score {
+    let mut s = Score::new();
+    // one point per owner, capped at 5
+    s.n("bus factor", 5, owners.len() as u32);
+    // all-or-nothing criterion, so `has` (passing the bool to `n` would award just 1 of 5 points)
+    s.has("more than one owner", 5, owners.len() > 1);
+    s.n("authors", 5, authors.len() as u32);
+    s
+}
+
+/// Computes the part of the crate's score that depends only on its releases
+/// (manifest, README, version history, authors/owners).
+///
+/// Each group is normalized to its own 0..=1 fraction first, and all four groups
+/// carry equal weight in the result.
+pub fn crate_score_version(cr: &CrateVersionInputs) -> Score {
+    let groups = [
+        ("Cargo.toml", cargo_toml_score(cr)),
+        ("README", readme_score(cr.readme)),
+        ("Versions", versions_score(cr.versions)),
+        ("Authors/Owners", authors_score(cr.authors, cr.owners)),
+    ];
+
+    let mut combined = Score::new();
+    for (label, group_score) in groups.iter() {
+        combined.group(*label, 1, group_score);
+    }
+    combined
+}
+
+// pub fn crate_score_temporal(inputs: &CrateTemporalInputs) -> Score {
+// let mut score = Score::new();
+
+// score
+// }
+
+// pub fn crate_score_contextual(inputs: &CrateContextInputs) -> Score {
+// let mut score = Score::new();
+
+// score
+// }
+
+// Renders a small markdown document and checks every counter `fill_props` collects.
+#[test]
+fn test_readme_score() {
+ let ren = render_readme::Renderer::new(None);
+ let dom = ren.page_node(&render_readme::Markup::Markdown("# hello world [link](http://hrefval)
+![img](imgsrc)
+![badg](http://travis-ci.org/badge.svg)
+
+```
+code
+```
+
+* list
+* items
+".into()), None, false);
+ let mut p = Default::default();
+ fill_props(&dom, &mut p, false);
+ // two images in the markdown, but the travis-ci one is a badge and is not counted
+ assert_eq!(p.images, 1);
+ assert_eq!(p.sections, 1);
+ assert_eq!(p.list_or_table_rows, 2);
+ assert_eq!(p.pre_blocks, 1);
+ // "code" plus 1 for the trimmed separator
+ assert_eq!(p.code_len, 5);
+ // presumably "hello world", "link", "list" and "items", each plus 1
+ assert_eq!(p.text_len, 28);
+}
diff --git a/ranking/src/main.rs b/ranking/src/main.rs
deleted file mode 100644
index f2bc30b..0000000
--- a/ranking/src/main.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-#![allow(unused)]
-
-use kitchen_sink::{KitchenSink, CrateData};
-
-fn main() {
- let mut crates = KitchenSink::new_default().unwrap();
- // crates.cache_only(true);
-
- let (authors, deps) = rayon::join(
- || ranking::do_author_pr(&crates).unwrap(),
- || crates_by_rev_dep(&crates));
-
- let mut top: Vec<_> = authors.iter().collect();
- top.sort_by(|a,b| b.1.partial_cmp(&a.1).unwrap());
- top.truncate(100);
- for (author, score) in top {
- println!("{}: {:0.4}", author, score);
- }
-
- let mut by_risk: Vec<_> = deps.into_iter().filter(|&(_, rev_deps, _)| rev_deps > 5).map(|(name, rev_deps, owners)| {
- // most trusted finds most risky crates by unvetted authors.
- // least trusted would find crates with weakest links
- // (which is useful too, but too soon to address when we have almost no reviews for anything yet)
- let most_trusted = owners.into_iter().filter_map(|o| authors.get(&*o).cloned()).fold(0., |a:f64,b:f64| a.max(b));
- let risk = (rev_deps as f64) / (0.000001 + most_trusted);
- (name, risk)
- }).collect();
-
- by_risk.sort_by(|a,b| b.1.partial_cmp(&a.1).unwrap());
- by_risk.truncate(200);
- for (s, a) in by_risk {
- println!("{} {}", s, a);
- }
-}
-
-fn crates_by_rev_dep(crates: &KitchenSink) -> Vec<(&str, u32, Vec<Box<str>>)> {
- let mut res = Vec::new();
- for k in crates.all_crates_io_crates().values() {
- let name = k.name();
- if let Some(rev) = crates.dependents_stats_of_crates_io_crate(name) {
- if let Ok(owners) = crates.crates_io_crate_owners(name, k.latest_version().version()) {
- let rev_dep_count = rev.runtime.0 as u32 * 2 + rev.runtime.1 as u32 + rev.build.0 as u32 * 2 + rev.build.1 as u32 + rev.dev as u32 / 2;
- let owners = owners.into_iter()
- .filter_map(|o| o.github_login().map(|l| l.to_ascii_lowercase().into_boxed_str()))
- .collect();
- res.push((name, rev_dep_count, owners));
- }
- }
- }
- res
-}
diff --git a/ranking/src/scorer.rs b/ranking/src/scorer.rs
new file mode 100644
index 0000000..e476a52
--- /dev/null
+++ b/ranking/src/scorer.rs
@@ -0,0 +1,95 @@
+use std::borrow::Borrow;
+
+/// Score card: collects points for named criteria and reports the earned fraction.
+#[derive(Debug, Clone, Default)]
+pub struct Score {
+    /// `(earned points, max points, criterion name)` for every criterion that earned more than 0
+    scores: Vec<(f64, f64, &'static str)>,
+    /// Sum of the max points of every criterion evaluated so far, earned or not
+    total: f64,
+}
+
+/// Handle to the score that was just recorded, allowing it to be adjusted in place.
+///
+/// It's empty (adjustments are no-ops) when the criterion earned nothing and wasn't recorded.
+#[derive(Debug, Default)]
+pub struct ScoreAdj<'a> {
+    score: Option<&'a mut f64>,
+}
+
+impl Score {
+    /// Creates an empty score card
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add score if it has the given property
+    #[inline]
+    pub fn has(&mut self, for_what: &'static str, score: u32, has_it: bool) -> ScoreAdj<'_> {
+        self.score_f(for_what, score as f64, if has_it { score as f64 } else { 0. })
+    }
+
+    /// Add this much score, up to the max
+    ///
+    /// `n` above `max_score` is stored as-is, and capped only when `total()` is computed.
+    #[inline]
+    pub fn n(&mut self, for_what: &'static str, max_score: u32, n: impl Into<i64>) -> ScoreAdj<'_> {
+        self.score_f(for_what, max_score as f64, n.into() as f64)
+    }
+
+    /// Add `max_score` * `n` where n is in 0..1
+    ///
+    /// # Panics
+    ///
+    /// If `n` is outside of the 0..=1 range (this includes NaN).
+    pub fn frac(&mut self, for_what: &'static str, max_score: u32, n: impl Into<f64>) -> ScoreAdj<'_> {
+        let n = n.into();
+        assert!(n >= 0.);
+        assert!(n <= 1.);
+        let max_score = max_score as f64;
+        self.score_f(for_what, max_score, n * max_score)
+    }
+
+    /// Add `n` of `max_score` points
+    ///
+    /// `max_score` always counts towards the maximum, but the criterion is recorded
+    /// (and adjustable via the returned handle) only when `n` is greater than 0.
+    #[inline]
+    pub fn score_f(&mut self, for_what: &'static str, max_score: f64, n: impl Into<f64>) -> ScoreAdj<'_> {
+        let n = n.into();
+        self.total += max_score;
+        if n > 0. {
+            self.scores.push((n, max_score, for_what));
+            ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) }
+        } else {
+            ScoreAdj::default()
+        }
+    }
+
+    /// Add another score card as a single criterion worth `max_score` points,
+    /// earning the fraction that the `group` has earned in total
+    pub fn group(&mut self, for_what: &'static str, max_score: u32, group: impl Borrow<Score>) -> ScoreAdj<'_> {
+        self.frac(for_what, max_score, group.borrow().total())
+    }
+
+    /// Get total score as a fraction (0..=1) of the maximum that could have been earned
+    ///
+    /// Each criterion is clamped to its own `0..=max` range first, so adjustments
+    /// can't push a criterion over its limit. A score card with nothing to earn
+    /// reports 0 rather than NaN (which would make `group` panic).
+    pub fn total(&self) -> f64 {
+        if self.total > 0. {
+            let sum = self.scores.iter().map(|&(v, limit, _)| v.max(0.).min(limit)).sum::<f64>();
+            sum / self.total
+        } else {
+            0.
+        }
+    }
+}
+
+impl<'a> ScoreAdj<'a> {
+    /// Multiply the recorded score by `by`
+    pub fn mul(&mut self, by: f64) {
+        self.adj(|n| n * by)
+    }
+
+    /// Replace the recorded score with `adj_with(score)`; does nothing if no score was recorded
+    pub fn adj(&mut self, adj_with: impl FnOnce(f64) -> f64) {
+        if let Some(s) = self.score.as_mut() {
+            **s = adj_with(**s);
+        }
+    }
+}
+
+// Exercises `has`, `n`, `frac` and `group`, checking `total()` after each step.
+#[test]
+fn scores() {
+ let mut s1 = Score::new();
+ s1.has("foo", 5, true);
+ assert_eq!(1., s1.total());
+ // an unmet criterion still raises the maximum: 5 of 20
+ s1.has("bar", 15, false);
+ assert!(s1.total() <= 0.26);
+ assert!(s1.total() >= 0.24);
+ let mut s2 = Score::new();
+ // 5 of 10 plus 14 of 28
+ s2.n("baz", 10, 5);
+ s2.frac("baz2", 28, 0.5);
+ assert!(s2.total() >= 0.49);
+ assert!(s2.total() <= 0.51);
+ let mut s3 = Score::new();
+ // groups are weighted: (100 * 0.25 + 10 * 0.5) / 110
+ s3.group("prev", 100, s1);
+ s3.group("prev", 10, s2);
+ assert!(s3.total() >= 0.26);
+ assert!(s3.total() <= 0.28);
+}
diff --git a/reindex/Cargo.toml b/reindex/Cargo.toml
index 58710b1..44d3b6c 100644
--- a/reindex/Cargo.toml
+++ b/reindex/Cargo.toml
@@ -1,5 +1,5 @@
[package]
-version = "0.3.0"
+version = "0.3.1"
edition = "2018"
name = "reindex"
authors = ["Kornel <kornel@geekhood.net>"]
@@ -7,11 +7,13 @@ authors = ["Kornel <kornel@geekhood.net>"]
[dependencies]
crate_db = { path = "../crate_db", version = "0.4.0" }
github_info = { path = "../github_info", version = "0.8.0" }
-kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" }
+kitchen_sink = { path = "../kitchen_sink", version = "0.7.1" }
repo_url = { git = "https://gitlab.com/crates.rs/repo_url.git" }
user_db = { path = "../user_db", version = "0.3" }
failure = "0.1.1"
rayon = "1.0.3"
rand = "0.6"
search_index = { path = "../search_index" }
+render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git" }
+ranking = { path = "../ranking" }
either = "1.5.0"
diff --git a/reindex/src/bin/reindex_search.rs b/reindex/src/bin/reindex_search.rs
index 6b897b3..94a849e 100644
--- a/reindex/src/bin/reindex_search.rs
+++ b/reindex/src/bin/reindex_search.rs
@@ -13,6 +13,8 @@ use kitchen_sink::stopped;
use std::sync::Arc;
use std::sync::mpsc;
use std::thread;
+use ranking::CrateVersionInputs;
+use render_readme::Renderer;
fn main() {
if let Err(e) = run() {
@@ -49,8 +51,9 @@ fn run() -> Result<(), failure::Error> {
let mut n = 0;
let mut next_n = 100;
+ let renderer = Renderer::new(None);
while let Ok((all, ver)) = rx.recv() {
- index(&mut indexer, &all, &ver, crates2.downloads_per_month_or_equivalent(all.origin())?.unwrap_or(0))?;
+ index(&mut indexer, &renderer, &all, &ver, crates2.downloads_per_month_or_equivalent(all.origin())?.unwrap_or(0))?;
if stopped() {break;}
n += 1;
if n == next_n {
@@ -65,7 +68,40 @@ fn run() -> Result<(), failure::Error> {
t.join().unwrap()
}
-fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> {
+
+/// Computes the crate's base score (0..=1) from its own content and metadata,
+/// without taking popularity into account.
+fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 {
+    // A README that is missing or failed to load just earns nothing in the README group
+    let readme_dom = match k.readme() {
+        Ok(Some(readme)) => Some(renderer.page_node(&readme.markup, None, false)),
+        _ => None,
+    };
+
+    let inputs = CrateVersionInputs {
+        versions: all.versions(),
+        description: k.description().unwrap_or(""),
+        readme: readme_dom.as_ref(),
+        owners: all.owners(),
+        authors: k.authors(),
+        edition: k.edition(),
+        is_app: k.is_app(),
+        has_build_rs: k.has_buildrs(),
+        has_links: k.links().is_some(),
+        has_documentation_link: k.documentation().is_some(),
+        has_homepage_link: k.homepage().is_some(),
+        has_repository_link: k.repository().is_some(),
+        has_keywords: k.has_own_keywords(),
+        has_categories: k.has_own_categories(),
+        has_features: !k.features().is_empty(),
+        has_examples: k.has_examples(),
+        has_benches: k.has_benches(),
+        has_tests: k.has_tests(),
+        // has_lockfile: k.has_lockfile(),
+        // has_changelog: k.has_changelog(),
+        license: k.license().unwrap_or(""),
+        has_badges: k.has_badges(),
+        maintenance: k.maintenance(),
+        is_nightly: k.is_nightly(),
+    };
+
+    ranking::crate_score_version(&inputs).total()
+}
+
+fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> {
let keywords: Vec<_> = k.keywords(Include::Cleaned).collect();
let readme = match k.readme() {
@@ -78,31 +114,12 @@ fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularit
// Base score is from downloads per month.
// apps have it harder to get download numbers
- let mut score = ((popularity+10) as f64).log2() / (if k.is_app() {7.0} else {14.0});
-
- // Try to get rid of junk crates
- if !version.starts_with("0.0.") && !version.starts_with("0.1.0") {
- score += 1.;
- }
- let releases = all.versions().count().min(10);
- if releases > 1 {
- score += releases as f64 / 10.0;
- }
-
- // bus factor
- if k.authors().len() > 1 {
- score += 0.1;
- }
+ let pop_score = ((popularity+10) as f64).log2() / (if k.is_app() {7.0} else {14.0});
- // Prefer stable crates
- if version.starts_with("0.") {
- score *= 0.9;
- }
+ // based on crate's own content and metadata
+ let base_score = crate_base_score(all, k, renderer);
- // long descriptions are better
- if k.description().map_or(false, |d| d.len() > 50) {
- score += 0.1;
- }
+ let mut score = (0.5 + pop_score) * base_score;
// there's usually a non-macro sibling
if k.is_proc_macro() {
@@ -114,9 +131,9 @@ fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularit
score *= 0.001;
}
- score = (score / 4.0).min(1.0); // keep it in the range
+ score = score.min(1.0); // keep it in the range
- println!("{:0.3} {}: {}", score, k.short_name(), k.description().unwrap_or(""));
+ println!("{:0.3} {:0.3} {}: {}", score, base_score, k.short_name(), k.description().unwrap_or(""));
indexer.add(k.short_name(), version, k.description().unwrap_or(""), &keywords, readme, popularity as u64, score);
Ok(())
diff --git a/render_readme b/render_readme
-Subproject c1ad7be04973af1003fbaa903f6a608878b5a15
+Subproject abced45509f74ddddb2f1b07a6b275b5e6b0885
diff --git a/rich_crate/Cargo.toml b/rich_crate/Cargo.toml
index e4f9d2d..9d04343 100644
--- a/rich_crate/Cargo.toml
+++ b/rich_crate/Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2018"
name = "rich_crate"
-version = "0.4.1"
+version = "0.4.2"
authors = ["Kornel <kornel@geekhood.net>"]
description = "Crate struct enriched with additional crates.rs metadata"
license = "Apache-2.0 OR MIT"
@@ -9,11 +9,11 @@ license = "Apache-2.0 OR MIT"
[dependencies]
crates-index = "0.12.0"
crates_io_client = { git = "https://gitlab.com/crates.rs/crates_io_client.git", version = "0.6.0" }
-render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.5.0" }
+render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.0" }
categories = { path = "../categories" }
udedokei = { path = "../udedokei" }
cargo_author = "1.0.0"
-cargo_toml = "0.6.4"
+cargo_toml = "0.6.5"