diff options
author | Kornel <kornel@geekhood.net> | 2020-02-23 19:27:51 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2020-02-23 19:27:51 +0000 |
commit | 97a9e602b44871572d040e8208c63190a7e74612 (patch) | |
tree | c099ebd45c6fea0304dd55d848881ca1acfd3941 | |
parent | f617258e79b9bd02d267184bc0c33b8908cc39b9 (diff) |
Owner info from data dump
-rw-r--r-- | crates_io_client/src/crate_owners.rs | 9 | ||||
-rw-r--r-- | datadump/src/main.rs | 75 | ||||
-rw-r--r-- | kitchen_sink/src/lib_kitchen_sink.rs | 23 |
3 files changed, 98 insertions, 9 deletions
diff --git a/crates_io_client/src/crate_owners.rs b/crates_io_client/src/crate_owners.rs index 8df6c69..4dad6d7 100644 --- a/crates_io_client/src/crate_owners.rs +++ b/crates_io_client/src/crate_owners.rs @@ -25,6 +25,15 @@ pub struct CrateOwner { pub url: String, // "https://github.com/rust-bus", pub name: Option<String>, // "maintainers", pub avatar: Option<String>, // "https://avatars1.githubusercontent.com/u/38887296?v=4" + + #[serde(default)] + pub github_id: Option<u32>, + + #[serde(default)] + pub invited_at: Option<String>, + + #[serde(default)] + pub invited_by_github_id: Option<u32>, } impl CrateOwner { diff --git a/datadump/src/main.rs b/datadump/src/main.rs index 09018d8..e25963d 100644 --- a/datadump/src/main.rs +++ b/datadump/src/main.rs @@ -1,7 +1,10 @@ #![allow(unused)] #![allow(dead_code)] +use std::convert::TryInto; use chrono::prelude::*; use kitchen_sink::KitchenSink; +use kitchen_sink::OwnerKind; +use kitchen_sink::CrateOwner; use libflate::gzip::Decoder; use serde_derive::Deserialize; use std::collections::HashMap; @@ -15,6 +18,7 @@ type BoxErr = Box<dyn std::error::Error + Sync + Send>; #[tokio::main] async fn main() -> Result<(), BoxErr> { + tokio::runtime::Handle::current().spawn(async move { let mut a = Archive::new(Decoder::new(BufReader::new(File::open("db-dump.tar.gz")?))?); let ksink = KitchenSink::new_default().await?; @@ -70,9 +74,16 @@ async fn main() -> Result<(), BoxErr> { index_downloads(crates, versions, &downloads, &ksink)?; } } + if let (Some(crates), Some(teams), Some(users)) = (&crates, &teams, &users) { + if let Some(crate_owners) = crate_owners.take() { + eprintln!("Indexing {} owners", crate_owners.len()); + index_owners(crates, crate_owners, teams, users, &ksink)?; + } + } } } Ok(()) + }).await.unwrap() } #[inline(never)] @@ -93,6 +104,50 @@ fn index_downloads(crates: &CratesMap, versions: &VersionsMap, downloads: &Versi Ok(()) } +#[inline(never)] +fn index_owners(crates: &CratesMap, owners: CrateOwners, teams: &Teams, users: &Users, ksink: &KitchenSink) -> Result<(), BoxErr> { + for (crate_id, owners) in owners { + if let Some(k) = crates.get(&crate_id) { + let owners: Vec<_> = owners.into_iter().map(|o| { + let invited_by_github_id = o.created_by_id.and_then(|id| users.get(&id).map(|u| u.github_id as u32).or_else(|| teams.get(&id).map(|t| t.github_id))); + match o.owner_kind { + 0 => { + let u = users.get(&o.owner_id).expect("owner consistency"); + CrateOwner { + id: o.owner_id as _, + login: u.login.to_owned(), + invited_at: Some(o.created_at), + invited_by_github_id, + github_id: u.github_id.try_into().ok(), + name: Some(u.name.to_owned()), + avatar: None, + url: String::new(), + kind: OwnerKind::User, + } + }, + 1 => { + let u = teams.get(&o.owner_id).expect("owner consistency"); + CrateOwner { + id: o.owner_id as _, + login: u.login.to_owned(), + invited_at: Some(o.created_at), + github_id: Some(u.github_id), + invited_by_github_id, + name: Some(u.name.to_owned()), + avatar: None, + url: String::new(), + kind: OwnerKind::Team, + } + }, + _ => panic!("bad owner type"), + } + }).collect(); + ksink.set_crates_io_crate_owners(&k.to_ascii_lowercase(), owners).map_err(|_| "ugh")?; + } + } + Ok(()) +} + #[derive(Deserialize)] struct CrateOwnerRow { crate_id: u32, @@ -102,14 +157,16 @@ struct CrateOwnerRow { owner_kind: u8, } +type CrateOwners = HashMap<u32, Vec<CrateOwnerRow>>; + #[inline(never)] -fn parse_crate_owners(file: impl Read) -> Result<HashMap<u32, CrateOwnerRow>, BoxErr> { +fn parse_crate_owners(file: impl Read) -> Result<CrateOwners, BoxErr> { let mut csv = csv::ReaderBuilder::new().has_headers(true).flexible(false).from_reader(file); let mut out = HashMap::with_capacity(NUM_CRATES); for r in csv.records() { let r = r?; let r = r.deserialize::<CrateOwnerRow>(None).map_err(|e| format!("wat? {:#?} {}", r, e))?; - out.insert(r.crate_id, r); + out.entry(r.crate_id).or_insert_with(|| Vec::with_capacity(1)).push(r); } Ok(out) } @@ -119,12 +176,14 @@ struct TeamRow { avatar: String, github_id: u32, id: u32, - login: String, - name: String, + login: String, // in the funny format + name: String, // human str } +type Teams = HashMap<u32, TeamRow>; + #[inline(never)] -fn parse_teams(file: impl Read) -> Result<HashMap<u32, TeamRow>, BoxErr> { +fn parse_teams(file: impl Read) -> Result<Teams, BoxErr> { let mut csv = csv::ReaderBuilder::new().has_headers(true).flexible(false).from_reader(file); let mut out = HashMap::with_capacity(NUM_CRATES); for r in csv.records() { @@ -138,14 +197,16 @@ fn parse_teams(file: impl Read) -> Result<HashMap<u32, TeamRow>, BoxErr> { #[derive(Deserialize)] struct UserRow { avatar: String, - github_id: i32, // -1 happens :( + github_id: i32, // there is -1 :( login: String, id: u32, name: String, } +type Users = HashMap<u32, UserRow>; + #[inline(never)] -fn parse_users(file: impl Read) -> Result<HashMap<u32, UserRow>, BoxErr> { +fn parse_users(file: impl Read) -> Result<Users, BoxErr> { let mut csv = csv::ReaderBuilder::new().has_headers(true).flexible(false).from_reader(file); let mut out = HashMap::with_capacity(NUM_CRATES); for r in csv.records() { diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 70ec683..ce480f7 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -46,7 +46,7 @@ use categories::Category; use chrono::prelude::*; use chrono::DateTime; use crate_db::{builddb::BuildDb, CrateDb, CrateVersionData, RepoChange}; -use crates_io_client::CrateOwner; +pub use crates_io_client::CrateOwner; use double_checked_cell_async::DoubleCheckedCell; use failure::ResultExt; use futures::future::join_all; @@ -165,6 +165,7 @@ pub struct KitchenSink { main_cache_dir: PathBuf, yearly: AllDownloads, category_overrides: HashMap<String, Vec<Cow<'static, str>>>, + crates_io_owners_cache: TempCache<Vec<CrateOwner>>, } impl KitchenSink { @@ -205,6 +206,7 @@ impl KitchenSink { yearly: AllDownloads::new(&main_cache_dir), main_cache_dir, category_overrides: Self::load_category_overrides(&data_path.join("category_overrides.txt"))?, + crates_io_owners_cache: TempCache::new(&data_path.join("cio-owners.tmp"))?, }) }) } @@ -424,6 +426,10 @@ impl KitchenSink { login: o.login, kind: OwnerKind::User, // FIXME: crates-io uses teams, and we'd need to find the right team? is "owners" a guaranteed thing? name: o.name, + github_id: Some(o.id), + + invited_at: None, + invited_by_github_id: None, } }).collect(), format!("github/{}/{}", repo.owner, package), @@ -1713,7 +1719,12 @@ impl KitchenSink { async fn crate_owners(&self, krate: &RichCrateVersion) -> CResult<Vec<CrateOwner>> { match krate.origin() { - Origin::CratesIo(name) => self.crates_io_crate_owners(name, krate.version()).await, + Origin::CratesIo(name) => { + if let Some(o) = self.crates_io_owners_cache.get(krate.name())? { + return Ok(o); + } + self.crates_io_crate_owners(name, krate.version()).await + }, Origin::GitLab {..} => Ok(vec![]), Origin::GitHub {repo, ..} => Ok(vec![ CrateOwner { @@ -1725,6 +1736,10 @@ impl KitchenSink { login: repo.owner.to_string(), kind: OwnerKind::User, // FIXME: crates-io uses teams, and we'd need to find the right team? is "owners" a guaranteed thing? name: None, + + invited_at: None, + github_id: None, + invited_by_github_id: None, } ]), } @@ -1734,6 +1749,10 @@ impl KitchenSink { Ok(self.crates_io.crate_owners(crate_name, version).await.context("crate_owners")?.unwrap_or_default()) } + pub fn set_crates_io_crate_owners(&self, crate_name: &str, owners: Vec<CrateOwner>) -> Result<(), ()> { + self.crates_io_owners_cache.set(crate_name, owners).map_err(drop) + } + // Sorted from the top, returns origins pub async fn top_crates_in_category(&self, slug: &str) -> CResult<Arc<Vec<Origin>>> { { |