diff options
m--------- | cargo_toml | 0 | ||||
-rw-r--r-- | front_end/Cargo.toml | 2 | ||||
-rw-r--r-- | front_end/src/bin/crate_pages.rs | 4 | ||||
-rw-r--r-- | front_end/src/bin/serp.rs | 2 | ||||
-rw-r--r-- | front_end/src/bin/website.rs | 2 | ||||
-rw-r--r-- | front_end/src/crate_page.rs | 4 | ||||
-rw-r--r-- | kitchen_sink/Cargo.toml | 6 | ||||
-rw-r--r-- | kitchen_sink/src/index.rs | 2 | ||||
-rw-r--r-- | kitchen_sink/src/lib_kitchen_sink.rs | 2 | ||||
-rw-r--r-- | ranking/Cargo.toml | 4 | ||||
-rw-r--r-- | ranking/src/lib_ranking.rs | 337 | ||||
-rw-r--r-- | ranking/src/main.rs | 51 | ||||
-rw-r--r-- | ranking/src/scorer.rs | 95 | ||||
-rw-r--r-- | reindex/Cargo.toml | 6 | ||||
-rw-r--r-- | reindex/src/bin/reindex_search.rs | 71 | ||||
m--------- | render_readme | 0 | ||||
-rw-r--r-- | rich_crate/Cargo.toml | 6 | ||||
-rw-r--r-- | rich_crate/src/rich_crate.rs | 6 | ||||
-rw-r--r-- | rich_crate/src/rich_crate_version.rs | 40 | ||||
-rw-r--r-- | server/Cargo.toml | 2 | ||||
-rw-r--r-- | server/src/main.rs | 2 |
21 files changed, 538 insertions, 106 deletions
diff --git a/cargo_toml b/cargo_toml -Subproject 545d7341e6849600fb0cfd2067bcb3cc7f1e3c6 +Subproject e3f80e07bc6b8988f72d226658ed4e657898e95 diff --git a/front_end/Cargo.toml b/front_end/Cargo.toml index 3f0bded..a63d001 100644 --- a/front_end/Cargo.toml +++ b/front_end/Cargo.toml @@ -15,7 +15,7 @@ ructe = "0.5.6" [dependencies] kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" } rich_crate = { path = "../rich_crate" } -render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.5.0" } +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.0" } categories = { path = "../categories" } udedokei = { path = "../udedokei" } search_index = { path = "../search_index" } diff --git a/front_end/src/bin/crate_pages.rs b/front_end/src/bin/crate_pages.rs index 82d1ceb..f9b7089 100644 --- a/front_end/src/bin/crate_pages.rs +++ b/front_end/src/bin/crate_pages.rs @@ -20,7 +20,7 @@ fn main() { } fn is_useful1(allver: &RichCrate) -> bool { - if allver.versions().count() < 2 { + if allver.versions().len() < 2 { eprintln!("{} one release", allver.name()); return false; } @@ -64,7 +64,7 @@ fn run(filter: Option<String>) -> Result<(), failure::Error> { let crates = Arc::new(kitchen_sink::KitchenSink::new_default()?); // crates.prewarm(); let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", crates.main_cache_dir().join("images.db"))?); - let markup = &Renderer::new_filter(Highlighter::new(), image_filter); + let markup = &Renderer::new_filter(Some(Highlighter::new()), image_filter); rayon::scope(move |s1| { for origin in crates.all_crates() { if let Some(ref filter) = filter { diff --git a/front_end/src/bin/serp.rs b/front_end/src/bin/serp.rs index 9d86d50..9114135 100644 --- a/front_end/src/bin/serp.rs +++ b/front_end/src/bin/serp.rs @@ -28,7 +28,7 @@ fn run() -> Result<(), failure::Error> { println!("http://localhost:3000/search"); let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", "../data/images.db")?); - let markup = Renderer::new_filter(Highlighter::new(), image_filter); + let markup = Renderer::new_filter(Some(Highlighter::new()), image_filter); front_end::render_serp_page(&mut f, &query, &results, &markup)?; Ok(()) diff --git a/front_end/src/bin/website.rs b/front_end/src/bin/website.rs index 5a3fbf5..b28fd32 100644 --- a/front_end/src/bin/website.rs +++ b/front_end/src/bin/website.rs @@ -40,7 +40,7 @@ fn run() -> Result<(), failure::Error> { let crates = KitchenSink::new_default().context("init caches, data, etc.")?; let done_pages = Mutex::new(HashSet::with_capacity(5000)); let image_filter = Arc::new(ImageOptimAPIFilter::new("czjpqfbdkz", crates.main_cache_dir().join("images.db"))?); - let markup = Renderer::new_filter(Highlighter::new(), image_filter); + let markup = Renderer::new_filter(Some(Highlighter::new()), image_filter); println!("Generating homepage and category pages…"); let (res1, res2) = rayon::join( diff --git a/front_end/src/crate_page.rs b/front_end/src/crate_page.rs index 70ac3be..e527247 100644 --- a/front_end/src/crate_page.rs +++ b/front_end/src/crate_page.rs @@ -559,7 +559,7 @@ impl<'a> CratePage<'a> { } pub fn all_versions(&self) -> impl Iterator<Item = Version<'a>> { - self.all.versions().map(|v| Version { + self.all.versions().iter().map(|v| Version { yanked: v.yanked, num: &v.num, semver: SemVer::parse(&v.num).expect("semver parse"), @@ -568,7 +568,7 @@ impl<'a> CratePage<'a> { } pub fn published_date(&self) -> DateTime<FixedOffset> { - let min_iso_date = self.all.versions().map(|v| &v.created_at).min().expect("any version in the crate"); + let min_iso_date = self.all.versions().iter().map(|v| &v.created_at).min().expect("any version in the crate"); DateTime::parse_from_rfc3339(min_iso_date).expect("created_at parse") } diff --git a/kitchen_sink/Cargo.toml b/kitchen_sink/Cargo.toml index 50195dc..e0082cb 100644 --- a/kitchen_sink/Cargo.toml +++ b/kitchen_sink/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2018" name = "kitchen_sink" -version = "0.7.0" +version = "0.7.1" authors = ["Kornel <kornel@geekhood.net>"] publish = false @@ -19,11 +19,11 @@ crate_files = { path = "../crate_files", version = "0.2" } user_db = { path = "../user_db", version = "0.3" } crate_db = { path = "../crate_db", version = "0.4.0" } categories = { path = "../categories" } -rich_crate = { path = "../rich_crate", version = "0.4.1" } +rich_crate = { path = "../rich_crate", version = "0.4.2" } simple_cache = { git = "https://gitlab.com/crates.rs/simple_cache.git", version = "0.7.0" } lazyonce = "0.3.0" repo_url = { git = "https://gitlab.com/crates.rs/repo_url.git", version = "0.3.0" } -cargo_toml = "0.6.4" +cargo_toml = "0.6.5" serde = "1.0.43" serde_derive = "1.0.70" serde_json = "1.0.33" diff --git a/kitchen_sink/src/index.rs b/kitchen_sink/src/index.rs index 8d50a0b..f3129d9 100644 --- a/kitchen_sink/src/index.rs +++ b/kitchen_sink/src/index.rs @@ -386,5 +386,5 @@ fn index_test() { let stats = idx.deps_stats(); assert!(stats.total > 13800); let lode = stats.counts.get("lodepng").unwrap(); - assert_eq!(10, lode.runtime.0); + assert_eq!(11, lode.runtime.0); } diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 8067057..0fe9614 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -311,7 +311,7 @@ impl KitchenSink { self.rich_crate(o).ok() }) .filter(move |k| { - let latest = k.versions().map(|v| v.created_at.as_str()).max().unwrap_or(""); + let latest = k.versions().iter().map(|v| v.created_at.as_str()).max().unwrap_or(""); if let Ok(timestamp) = DateTime::parse_from_rfc3339(latest) { timestamp.timestamp() >= min_timestamp as i64 } else { diff --git a/ranking/Cargo.toml b/ranking/Cargo.toml index e8425d8..4719915 100644 --- a/ranking/Cargo.toml +++ b/ranking/Cargo.toml @@ -10,7 +10,9 @@ path = "src/lib_ranking.rs" [dependencies] rayon = "1.0.3" -kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" } rich_crate = { path = "../rich_crate" } fxhash = "0.2.1" chrono = "0.4.6" +cargo_toml = "0.6.5" +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.1" } +semver = "0.9.0" diff --git a/ranking/src/lib_ranking.rs b/ranking/src/lib_ranking.rs index 47de967..34064a4 100644 --- a/ranking/src/lib_ranking.rs +++ b/ranking/src/lib_ranking.rs @@ -1,4 +1,333 @@ -// mod scorer; -// pub use self::scorer::*; -mod authorrank; -pub use crate::authorrank::*; +mod scorer; +use render_readme::Handle; +use render_readme::NodeData; +use rich_crate::Edition; +use rich_crate::Author; +use rich_crate::CrateVersion; +use cargo_toml::MaintenanceStatus; +use rich_crate::CrateOwner; +use semver::Version as SemVer; +pub use self::scorer::*; +use chrono::prelude::*; + +/// Only changes when a new version is released +pub struct CrateVersionInputs<'a> { + pub versions: &'a [CrateVersion], + pub description: &'a str, + pub readme: Option<&'a Handle>, + pub owners: &'a [CrateOwner], + pub authors: &'a [Author], + pub edition: Edition, + pub is_app: bool, + pub has_build_rs: bool, + pub has_links: bool, + pub has_documentation_link: bool, + pub has_homepage_link: bool, + pub has_repository_link: bool, + pub has_keywords: bool, + pub has_categories: bool, + pub has_features: bool, + pub has_examples: bool, + pub has_benches: bool, + pub has_tests: bool, + // pub has_lockfile: bool, + // pub has_changelog: bool, + pub license: &'a str, + pub has_badges: bool, + pub maintenance: MaintenanceStatus, + pub is_nightly: bool, + + // (relative) weight of dependencies? + + // rust loc + // #[test] cases + // assert! calls + // comments ratio (normalized to project size) + + // look for deprecated in the description +} + +/// Changes over time, but doesn't depend on crate's own ranking +pub struct CrateTemporalInputs { + /// 1.0 fresh, 0.0 totally outdated and deprecated + pub dependency_freshness: Vec<f32>, + pub recent_downloads: u32, + pub recent_downloads_minus_most_downloaded_user: u32, + pub has_docs_rs: bool, + + // low priority, because it's unranked! it'll be re-evaluated later + pub number_of_reverse_deps: u32, + + // most recent commit + // avg time issues are left unanswered? +} + +/// Crate's own base ranking influences these rankings +pub struct CrateContextInputs { + pub crate_score_context_free: f64, + pub owner_pageranks: Vec<f32>, + pub reverse_deps_rankings: Vec<f32>, +} + +pub struct Env { + pub max_recent_downloads: u32, + pub max_crates: u32, +} + +fn cargo_toml_score(cr: &CrateVersionInputs) -> Score { + let mut s = Score::new(); + + s.frac("description len", 20, (cr.description.len() as f64 / 300.).min(1.)); + + // build.rs slows compilation down, so better not use it unless necessary (links means a sys create, so somewhat justified) + s.n("build.rs", 20, if !cr.has_build_rs && !cr.has_links {20} else if cr.has_links {10} else {0}); + + // users report examples are super valuable + s.has("has_examples", 100, cr.has_examples); + // probably less buggy than if winging it + s.has("has_tests", 70, cr.has_tests); + // probably optimized + s.has("has_benches", 10, cr.has_benches); + + // docs are very important (TODO: this may be redundant with docs.rs) + s.has("has_documentation_link", 30, cr.has_documentation_link); + s.has("has_homepage_link", 30, cr.has_homepage_link); + + // we care about being able to analyze + s.has("has_repository_link", 20, cr.has_repository_link); + + // helps crates.rs show crate in the right place + s.has("has_keywords", 10, cr.has_keywords); + s.has("has_categories", 5, cr.has_categories); + + // probably non-trivial crate + s.has("has_features", 5, cr.has_features); + + // it's the best practice, may help building old versions of the project + // s.has("has_lockfile", 5, cr.has_lockfile); + // assume it's CI, which helps improve quality + s.has("has_badges", 10, cr.has_badges); + + // not official + // s.has("has_changelog", 5, cr.has_changelog); + + s.n("maintenance status", 30, match cr.maintenance { + MaintenanceStatus::ActivelyDeveloped => 30, + MaintenanceStatus::Experimental => 25, + MaintenanceStatus::None => 20, + MaintenanceStatus::PassivelyMaintained => 10, + MaintenanceStatus::AsIs => 5, + MaintenanceStatus::LookingForMaintainer => 4, + MaintenanceStatus::Deprecated => 0, + }); + + // TODO: being nightly should be a negative score + s.has("works on stable", 20, !cr.is_nightly); + // fresh + s.has("2018 edition", 10, cr.edition != Edition::E2015); + + // license proliferation is bad + s.has("useful license", 10, if cr.is_app { + // for end-user apps assume user freedom > developer freedom + cr.license.contains("GPL") || cr.license.contains("CC-BY-SA") || cr.license.contains("MPL") + } else { + // for libs assume developer freedom > user freedom + cr.license.contains("MIT") || cr.license.contains("BSD") || cr.license.contains("Apache") || cr.license.contains("CC0") + }); + + s +} + +#[derive(Default)] +struct MarkupProps { + text_len: usize, + code_len: usize, + list_or_table_rows: u16, + images: u16, + pre_blocks: u16, + sections: u16, +} + +fn is_badge_url(url: &str) -> bool { + let url = url.trim_start_matches("http://").trim_start_matches("https://") + .trim_start_matches("www.") + .trim_start_matches("flat.") + .trim_start_matches("images.") + .trim_start_matches("img.") + .trim_start_matches("api.") + .trim_start_matches("ci.") + .trim_start_matches("build."); + url.starts_with("appveyor.com") || + url.starts_with("badge.") || + url.starts_with("badgen.") || + url.starts_with("badges.") || + url.starts_with("codecov.io") || + url.starts_with("coveralls.io") || + url.starts_with("docs.rs") || + url.starts_with("gitlab.com") || + url.starts_with("isitmaintained.com") || + url.starts_with("meritbadge") || + url.starts_with("microbadger") || + url.starts_with("ohloh.net") || + url.starts_with("openhub.net") || + url.starts_with("repostatus.org") || + url.starts_with("shields.io") || + url.starts_with("snapcraft.io") || + url.starts_with("spearow.io") || + url.starts_with("travis-ci.") || + url.starts_with("zenodo.org") || + url.ends_with("?branch=master") || + url.ends_with("/pipeline.svg") || + url.ends_with("/coverage.svg") || + url.ends_with("/build.svg") || + url.ends_with("badge.svg") || + url.ends_with("badge.png") +} + +fn fill_props(node: &Handle, props: &mut MarkupProps, mut in_code: bool) { + match node.data { + NodeData::Text {ref contents} => { + let len = contents.borrow().trim().len(); + if len > 0 { + if in_code { + props.code_len += len + 1; // +1 to account for separators that were trimmed + } else { + props.text_len += len + 1; + } + } + return; // has no children + }, + NodeData::Element {ref name, ref attrs, ..} => { + match name.local.get(..).unwrap() { + "img" => { + if let Some(src) = attrs.borrow().iter().find(|a| a.name.local.get(..).unwrap() == "src") { + if is_badge_url(&src.value) { + return; // don't count badges + } + } + props.images += 1; + return; + }, + "li" | "tr" => props.list_or_table_rows += 1, + "a" => { + if let Some(href) = attrs.borrow().iter().find(|a| a.name.local.get(..).unwrap() == "href") { + if is_badge_url(&href.value) { + return; // don't count badge image children + } + } + }, + "pre" => { + in_code = true; + props.pre_blocks += 1; + }, + "code" => in_code = true, + "h1" | "h2" | "h3" | "h4" | "h5" => props.sections += 1, + _ => {}, + } + }, + _ => {}, + } + for child in node.children.borrow().iter() { + fill_props(child, props, in_code); + } +} + +fn readme_score(readme: Option<&Handle>) -> Score { + let mut s = Score::new(); + let mut props = Default::default(); + if let Some(readme) = readme { + fill_props(readme, &mut props, false); + } + s.frac("text length", 75, (props.text_len as f64 /3000.).min(1.0)); + s.frac("code length", 100, (props.code_len as f64 /2000.).min(1.0)); + s.has("has code", 30, props.code_len > 150 && props.pre_blocks > 0); // people really like seeing a code example + s.n("code blocks", 25, props.pre_blocks * 5); + s.n("images", 35, props.images * 25); // I like pages with logos + s.n("sections", 30, props.sections * 4); + s.n("list or table rows", 25, props.list_or_table_rows * 2); + s +} + +fn versions_score(ver: &[CrateVersion]) -> Score { + let mut s = Score::new(); + let semver = ver.iter().filter(|s| !s.yanked).filter_map(|s| SemVer::parse(&s.num).ok()).collect::<Vec<_>>(); + s.has("more than one release", 20, semver.len() > 1); + + if !semver.is_empty() { // all yanked + return s; + } + + let oldest = ver.iter().map(|v| &v.created_at).min().and_then(|s| s.parse::<DateTime<Utc>>().ok()); + let newest = ver.iter().map(|v| &v.created_at).max().and_then(|s| s.parse::<DateTime<Utc>>().ok()); + if let (Some(oldest), Some(newest)) = (oldest, newest) { + s.n("crate development time", 40, (newest - oldest).num_days() / 11); + } + // don't count 0.0.x + s.n("number of non-experimental releases", 15, semver.iter().filter(|v| (v.major > 0 || v.minor > 0) && v.pre.is_empty()).count() as u32); + + // patch releases are correlated with stable, polished code + s.n("patch releases", 20, 5 * semver.iter().filter(|v| v.major > 0 && v.patch > 0).count() as u32); + s.n("a high patch release", 10, semver.iter().map(|v| v.patch as u32).max().unwrap_or(0)); + // for 0.x crates it's hard to knwo what is a patch release + s.has("an unstable patch/feature release", 10, semver.iter().any(|v| v.major == 0 && v.patch > 1)); + // careful release process is a sign of maturity + s.has("a prerelease", 5, semver.iter().any(|v| !v.pre.is_empty())); + s.has("a stable release", 15, semver.iter().any(|v| v.major > 0 && v.major < 20)); + s.has("yanked", 2, ver.iter().any(|v| v.yanked)); // author cares to remove bad versions + s +} + +fn authors_score(authors: &[Author], owners: &[CrateOwner]) -> Score { + let mut s = Score::new(); + s.n("bus factor", 5, owners.len() as u32); + s.n("more than one owner", 5, owners.len() > 1); + s.n("authors", 5, authors.len() as u32); + s +} + +pub fn crate_score_version(cr: &CrateVersionInputs) -> Score { + let mut score = Score::new(); + + score.group("Cargo.toml", 1, cargo_toml_score(cr)); + score.group("README", 1, readme_score(cr.readme)); + score.group("Versions", 1, versions_score(cr.versions)); + score.group("Authors/Owners", 1, authors_score(cr.authors, cr.owners)); + + score +} + +// pub fn crate_score_temporal(inputs: &CrateTemporalInputs) -> Score { +// let mut score = Score::new(); + +// score +// } + +// pub fn crate_score_contextual(inputs: &CrateContextInputs) -> Score { +// let mut score = Score::new(); + +// score +// } + +#[test] +fn test_readme_score() { + let ren = render_readme::Renderer::new(None); + let dom = ren.page_node(&render_readme::Markup::Markdown("# hello world [link](http://hrefval) +![img](imgsrc) +![badg](http://travis-ci.org/badge.svg) + +``` +code +``` + +* list +* items +".into()), None, false); + let mut p = Default::default(); + fill_props(&dom, &mut p, false); + assert_eq!(p.images, 1); + assert_eq!(p.sections, 1); + assert_eq!(p.list_or_table_rows, 2); + assert_eq!(p.pre_blocks, 1); + assert_eq!(p.code_len, 5); + assert_eq!(p.text_len, 28); +} diff --git a/ranking/src/main.rs b/ranking/src/main.rs deleted file mode 100644 index f2bc30b..0000000 --- a/ranking/src/main.rs +++ /dev/null @@ -1,51 +0,0 @@ -#![allow(unused)] - -use kitchen_sink::{KitchenSink, CrateData}; - -fn main() { - let mut crates = KitchenSink::new_default().unwrap(); - // crates.cache_only(true); - - let (authors, deps) = rayon::join( - || ranking::do_author_pr(&crates).unwrap(), - || crates_by_rev_dep(&crates)); - - let mut top: Vec<_> = authors.iter().collect(); - top.sort_by(|a,b| b.1.partial_cmp(&a.1).unwrap()); - top.truncate(100); - for (author, score) in top { - println!("{}: {:0.4}", author, score); - } - - let mut by_risk: Vec<_> = deps.into_iter().filter(|&(_, rev_deps, _)| rev_deps > 5).map(|(name, rev_deps, owners)| { - // most trusted finds most risky crates by unvetted authors. - // least trusted would find crates with weakest links - // (which is useful too, but too soon to address when we have almost no reviews for anything yet) - let most_trusted = owners.into_iter().filter_map(|o| authors.get(&*o).cloned()).fold(0., |a:f64,b:f64| a.max(b)); - let risk = (rev_deps as f64) / (0.000001 + most_trusted); - (name, risk) - }).collect(); - - by_risk.sort_by(|a,b| b.1.partial_cmp(&a.1).unwrap()); - by_risk.truncate(200); - for (s, a) in by_risk { - println!("{} {}", s, a); - } -} - -fn crates_by_rev_dep(crates: &KitchenSink) -> Vec<(&str, u32, Vec<Box<str>>)> { - let mut res = Vec::new(); - for k in crates.all_crates_io_crates().values() { - let name = k.name(); - if let Some(rev) = crates.dependents_stats_of_crates_io_crate(name) { - if let Ok(owners) = crates.crates_io_crate_owners(name, k.latest_version().version()) { - let rev_dep_count = rev.runtime.0 as u32 * 2 + rev.runtime.1 as u32 + rev.build.0 as u32 * 2 + rev.build.1 as u32 + rev.dev as u32 / 2; - let owners = owners.into_iter() - .filter_map(|o| o.github_login().map(|l| l.to_ascii_lowercase().into_boxed_str())) - .collect(); - res.push((name, rev_dep_count, owners)); - } - } - } - res -} diff --git a/ranking/src/scorer.rs b/ranking/src/scorer.rs new file mode 100644 index 0000000..e476a52 --- /dev/null +++ b/ranking/src/scorer.rs @@ -0,0 +1,95 @@ +use std::borrow::Borrow; + +#[derive(Debug, Clone, Default)] +pub struct Score { + scores: Vec<(f64, f64, &'static str)>, + total: f64, +} + +#[derive(Debug, Default)] +pub struct ScoreAdj<'a> { + score: Option<&'a mut f64>, +} + +impl Score { + pub fn new() -> Self { + Self::default() + } + + #[inline] + /// Add score if it has the given property + pub fn has(&mut self, for_what: &'static str, score: u32, has_it: bool) -> ScoreAdj<'_> { + self.score_f(for_what, score as f64, if has_it { score as f64 } else { 0. }) + } + + #[inline] + /// Add this much score, up to the max + pub fn n(&mut self, for_what: &'static str, max_score: u32, n: impl Into<i64>) -> ScoreAdj<'_> { + self.score_f(for_what, max_score as f64, n.into() as f64) + } + + /// Add `max_score` * `n` where n is in 0..1 + pub fn frac(&mut self, for_what: &'static str, max_score: u32, n: impl Into<f64>) -> ScoreAdj<'_> { + let n = n.into(); + assert!(n >= 0.); + assert!(n <= 1.); + let max_score = max_score as f64; + self.score_f(for_what, max_score, n * max_score) + } + + #[inline] + /// Add `n` of `max_score` points + pub fn score_f(&mut self, for_what: &'static str, max_score: f64, n: impl Into<f64>) -> ScoreAdj<'_> { + let n = n.into(); + self.total += max_score; + if n > 0. { + self.scores.push((n, max_score, for_what)); + ScoreAdj { score: self.scores.last_mut().map(|(s, ..)| s) } + } else { + ScoreAdj::default() + } + } + + /// Start a new group of scores, and `max_score` is the max total score of the group + pub fn group<'a>(&mut self, for_what: &'static str, max_score: u32, group: impl Borrow<Score>) -> ScoreAdj<'_> { + self.frac(for_what, max_score, group.borrow().total()) + } + + /// Get total score + pub fn total(&self) -> f64 { + let sum = self.scores.iter().map(|&(v, limit, _)| v.max(0.).min(limit)).sum::<f64>(); + sum / self.total as f64 + } +} + +impl<'a> ScoreAdj<'a> { + pub fn mul(&mut self, by: f64) { + self.adj(|n| n * by) + } + + pub fn adj(&mut self, adj_with: impl FnOnce(f64) -> f64) { + if let Some(s) = self.score.as_mut() { + **s = adj_with(**s); + } + } +} + +#[test] +fn scores() { + let mut s1 = Score::new(); + s1.has("foo", 5, true); + assert_eq!(1., s1.total()); + s1.has("bar", 15, false); + assert!(s1.total() <= 0.26); + assert!(s1.total() >= 0.24); + let mut s2 = Score::new(); + s2.n("baz", 10, 5); + s2.frac("baz2", 28, 0.5); + assert!(s2.total() >= 0.49); + assert!(s2.total() <= 0.51); + let mut s3 = Score::new(); + s3.group("prev", 100, s1); + s3.group("prev", 10, s2); + assert!(s3.total() >= 0.26); + assert!(s3.total() <= 0.28); +} diff --git a/reindex/Cargo.toml b/reindex/Cargo.toml index 58710b1..44d3b6c 100644 --- a/reindex/Cargo.toml +++ b/reindex/Cargo.toml @@ -1,5 +1,5 @@ [package] -version = "0.3.0" +version = "0.3.1" edition = "2018" name = "reindex" authors = ["Kornel <kornel@geekhood.net>"] @@ -7,11 +7,13 @@ authors = ["Kornel <kornel@geekhood.net>"] [dependencies] crate_db = { path = "../crate_db", version = "0.4.0" } github_info = { path = "../github_info", version = "0.8.0" } -kitchen_sink = { path = "../kitchen_sink", version = "0.7.0" } +kitchen_sink = { path = "../kitchen_sink", version = "0.7.1" } repo_url = { git = "https://gitlab.com/crates.rs/repo_url.git" } user_db = { path = "../user_db", version = "0.3" } failure = "0.1.1" rayon = "1.0.3" rand = "0.6" search_index = { path = "../search_index" } +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git" } +ranking = { path = "../ranking" } either = "1.5.0" diff --git a/reindex/src/bin/reindex_search.rs b/reindex/src/bin/reindex_search.rs index 6b897b3..94a849e 100644 --- a/reindex/src/bin/reindex_search.rs +++ b/reindex/src/bin/reindex_search.rs @@ -13,6 +13,8 @@ use kitchen_sink::stopped; use std::sync::Arc; use std::sync::mpsc; use std::thread; +use ranking::CrateVersionInputs; +use render_readme::Renderer; fn main() { if let Err(e) = run() { @@ -49,8 +51,9 @@ fn run() -> Result<(), failure::Error> { let mut n = 0; let mut next_n = 100; + let renderer = Renderer::new(None); while let Ok((all, ver)) = rx.recv() { - index(&mut indexer, &all, &ver, crates2.downloads_per_month_or_equivalent(all.origin())?.unwrap_or(0))?; + index(&mut indexer, &renderer, &all, &ver, crates2.downloads_per_month_or_equivalent(all.origin())?.unwrap_or(0))?; if stopped() {break;} n += 1; if n == next_n { @@ -65,7 +68,40 @@ fn run() -> Result<(), failure::Error> { t.join().unwrap() } -fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> { + +fn crate_base_score(all: &RichCrate, k: &RichCrateVersion, renderer: &Renderer) -> f64 { + let readme = k.readme().ok().and_then(|r| r).map(|readme| { + renderer.page_node(&readme.markup, None, false) + }); + ranking::crate_score_version(&CrateVersionInputs { + versions: all.versions(), + description: k.description().unwrap_or(""), + readme: readme.as_ref(), + owners: all.owners(), + authors: k.authors(), + edition: k.edition(), + is_app: k.is_app(), + has_build_rs: k.has_buildrs(), + has_links: k.links().is_some(), + has_documentation_link: k.documentation().is_some(), + has_homepage_link: k.homepage().is_some(), + has_repository_link: k.repository().is_some(), + has_keywords: k.has_own_keywords(), + has_categories: k.has_own_categories(), + has_features: !k.features().is_empty(), + has_examples: k.has_examples(), + has_benches: k.has_benches(), + has_tests: k.has_tests(), + // has_lockfile: k.has_lockfile(), + // has_changelog: k.has_changelog(), + license: k.license().unwrap_or(""), + has_badges: k.has_badges(), + maintenance: k.maintenance(), + is_nightly: k.is_nightly(), + }).total() +} + +fn index(indexer: &mut Indexer, renderer: &Renderer, all: &RichCrate, k: &RichCrateVersion, popularity: usize) -> Result<(), failure::Error> { let keywords: Vec<_> = k.keywords(Include::Cleaned).collect(); let readme = match k.readme() { @@ -78,31 +114,12 @@ fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularit // Base score is from donwloads per month. // apps have it harder to get download numbers - let mut score = ((popularity+10) as f64).log2() / (if k.is_app() {7.0} else {14.0}); - - // Try to get rid of junk crates - if !version.starts_with("0.0.") && !version.starts_with("0.1.0") { - score += 1.; - } - let releases = all.versions().count().min(10); - if releases > 1 { - score += releases as f64 / 10.0; - } - - // bus factor - if k.authors().len() > 1 { - score += 0.1; - } + let pop_score = ((popularity+10) as f64).log2() / (if k.is_app() {7.0} else {14.0}); - // Prefer stable crates - if version.starts_with("0.") { - score *= 0.9; - } + // based on crate's own content and metadata + let base_score = crate_base_score(all, k, renderer); - // long descriptions are better - if k.description().map_or(false, |d| d.len() > 50) { - score += 0.1; - } + let mut score = (0.5 + pop_score) * base_score; // there's usually a non-macro sibling if k.is_proc_macro() { @@ -114,9 +131,9 @@ fn index(indexer: &mut Indexer, all: &RichCrate, k: &RichCrateVersion, popularit score *= 0.001; } - score = (score / 4.0).min(1.0); // keep it in the range + score = score.min(1.0); // keep it in the range - println!("{:0.3} {}: {}", score, k.short_name(), k.description().unwrap_or("")); + println!("{:0.3} {:0.3} {}: {}", score, base_score, k.short_name(), k.description().unwrap_or("")); indexer.add(k.short_name(), version, k.description().unwrap_or(""), &keywords, readme, popularity as u64, score); Ok(()) diff --git a/render_readme b/render_readme -Subproject c1ad7be04973af1003fbaa903f6a608878b5a15 +Subproject abced45509f74ddddb2f1b07a6b275b5e6b0885 diff --git a/rich_crate/Cargo.toml b/rich_crate/Cargo.toml index e4f9d2d..9d04343 100644 --- a/rich_crate/Cargo.toml +++ b/rich_crate/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2018" name = "rich_crate" -version = "0.4.1" +version = "0.4.2" authors = ["Kornel <kornel@geekhood.net>"] description = "Crate struct enriched with additional crates.rs metadata" license = "Apache-2.0 OR MIT" @@ -9,11 +9,11 @@ license = "Apache-2.0 OR MIT" [dependencies] crates-index = "0.12.0" crates_io_client = { git = "https://gitlab.com/crates.rs/crates_io_client.git", version = "0.6.0" } -render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.5.0" } +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git", version = "0.6.0" } categories = { path = "../categories" } udedokei = { path = "../udedokei" } cargo_author = "1.0.0" -cargo_toml = "0.6.4" +cargo_toml = "0.6.5" repo_url = { git = "https://gitlab.com/crates.rs/repo_url.git" } semver = "0.9.0" parse_cfg = "2.0.0" diff --git a/rich_crate/src/rich_crate.rs b/rich_crate/src/rich_crate.rs index cb9f514..9c72094 100644 --- a/rich_crate/src/rich_crate.rs +++ b/rich_crate/src/rich_crate.rs @@ -1,4 +1,4 @@ -use crates_io_client::CrateOwner; +pub use crates_io_client::CrateOwner; pub use crates_io_client::DownloadWeek; use crate::Origin; @@ -49,7 +49,7 @@ impl RichCrate { &self.owners } - pub fn versions(&self) -> impl Iterator<Item |