From e29ffdaa0bc1ae3906f88111397bd0776797046c Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 11 Mar 2020 22:47:11 +0000 Subject: Handle missing github IDs --- crates_io_client/src/crate_owners.rs | 1 - datadump/src/main.rs | 2 -- github_info/Cargo.toml | 2 +- github_info/src/lib_github.rs | 6 ++-- github_info/src/model.rs | 21 ++++++++--- github_v3/Cargo.toml | 2 +- github_v3/examples/users.rs | 18 ++++++++-- github_v3/src/lib.rs | 2 -- kitchen_sink/src/lib_kitchen_sink.rs | 69 ++++++++++++++++++++++-------------- server/Cargo.toml | 2 +- 10 files changed, 82 insertions(+), 43 deletions(-) diff --git a/crates_io_client/src/crate_owners.rs b/crates_io_client/src/crate_owners.rs index c3861e7..c1d6c1d 100644 --- a/crates_io_client/src/crate_owners.rs +++ b/crates_io_client/src/crate_owners.rs @@ -22,7 +22,6 @@ pub enum OwnerKind { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CrateOwner { - pub id: usize, // 362, pub login: String, // "github:rust-bus:maintainers", pub kind: OwnerKind, // "team" || "user" pub url: Option, // "https://github.com/rust-bus", diff --git a/datadump/src/main.rs b/datadump/src/main.rs index d002cb5..1f9df2e 100644 --- a/datadump/src/main.rs +++ b/datadump/src/main.rs @@ -133,7 +133,6 @@ async fn index_owners(crates: &CratesMap, owners: CrateOwners, teams: &Teams, us return None; } CrateOwner { - id: o.owner_id as _, login: u.login.to_owned(), invited_at: Some(invited_at), invited_by_github_id, @@ -147,7 +146,6 @@ async fn index_owners(crates: &CratesMap, owners: CrateOwners, teams: &Teams, us 1 => { let u = teams.get(&o.owner_id).expect("owner consistency"); CrateOwner { - id: o.owner_id as _, login: u.login.to_owned(), invited_at: Some(invited_at), github_id: Some(u.github_id), diff --git a/github_info/Cargo.toml b/github_info/Cargo.toml index ecddb8e..d223869 100644 --- a/github_info/Cargo.toml +++ b/github_info/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2018" name = "github_info" -version = "0.9.0" +version = "0.9.1" authors = ["Kornel "] [lib] diff --git a/github_info/src/lib_github.rs b/github_info/src/lib_github.rs index dafb2a0..d3b1cea 100644 --- a/github_info/src/lib_github.rs +++ b/github_info/src/lib_github.rs @@ -71,9 +71,9 @@ impl GitHub { client: github_v3::Client::new(Some(token)), user_orgs: TempCache::new(&cache_path.as_ref().with_file_name("github_user_orgs.bin"))?, orgs: TempCache::new(&cache_path.as_ref().with_file_name("github_orgs2.bin"))?, - users: TempCache::new(&cache_path.as_ref().with_file_name("github_users2.bin"))?, - commits: TempCache::new(&cache_path.as_ref().with_file_name("github_commits.bin"))?, - releases: TempCache::new(&cache_path.as_ref().with_file_name("github_releases.bin"))?, + users: TempCache::new(&cache_path.as_ref().with_file_name("github_users3.bin"))?, + commits: TempCache::new(&cache_path.as_ref().with_file_name("github_commits2.bin"))?, + releases: TempCache::new(&cache_path.as_ref().with_file_name("github_releases2.bin"))?, contribs: TempCache::new(&cache_path.as_ref().with_file_name("github_contribs.bin"))?, repos: TempCache::new(&cache_path.as_ref().with_file_name("github_repos2.bin"))?, emails: TempCache::new(&cache_path.as_ref().with_file_name("github_emails.bin"))?, diff --git a/github_info/src/model.rs b/github_info/src/model.rs index b7a8a25..ebc0454 100644 --- a/github_info/src/model.rs +++ b/github_info/src/model.rs @@ -68,6 +68,19 @@ pub struct User { pub created_at: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MinimalUser { + pub id: Option, + pub login: String, + pub name: Option, + pub avatar_url: Option, // "https://avatars0.githubusercontent.com/u/1111?v=4", + pub gravatar_id: Option, // "", + pub html_url: String, // "https://github.com/zzzz", + #[serde(rename = "type")] + pub user_type: UserType, + pub created_at: Option, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ContribWeek { #[serde(rename = "w")] @@ -89,7 +102,7 @@ pub struct SearchResults { pub struct UserContrib { pub total: u32, pub weeks: Vec, - pub author: Option, + pub author: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -111,8 +124,8 @@ pub struct GitCommit { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CommitMeta { pub sha: String, // TODO: deserialize to bin - pub author: Option, - pub committer: Option, + pub author: Option, + pub committer: Option, pub commit: GitCommit, // parents: [{sha}] } @@ -137,7 +150,7 @@ pub struct GitHubRepo { pub has_pages: bool, pub archived: bool, pub default_branch: Option, - pub owner: Option, + pub owner: Option, #[serde(default)] pub topics: Vec, diff --git a/github_v3/Cargo.toml b/github_v3/Cargo.toml index 617bb0b..d4192f9 100644 --- a/github_v3/Cargo.toml +++ b/github_v3/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "github_v3" description = "Async GitHub API v3 client" -version = "0.3.0" +version = "0.3.1" authors = ["Kornel "] keywords = ["github", "rest-api", "async"] categories = ["web-programming", "web-programming::http-client"] diff --git a/github_v3/examples/users.rs b/github_v3/examples/users.rs index 14d0f66..f2b76c2 100644 --- a/github_v3/examples/users.rs +++ b/github_v3/examples/users.rs @@ -1,10 +1,24 @@ -use futures::StreamExt; use github_v3::*; +use serde_derive::*; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct User { + pub id: u32, + pub login: String, + pub name: Option, + pub avatar_url: Option, + pub gravatar_id: Option, + pub html_url: String, + pub blog: Option, + #[serde(rename = "type")] + pub user_type: String, + pub created_at: Option, +} #[tokio::main] async fn main() -> Result<(), GHError> { let gh = Client::new_from_env(); - let mut users = gh.get().path("users").send().await?.array::(); + let mut users = gh.get().path("users").send().await?.array::(); while let Some(Ok(user)) = users.next().await { println!("User! {:#?}", user); diff --git a/github_v3/src/lib.rs b/github_v3/src/lib.rs index e1cb472..4fb75bb 100644 --- a/github_v3/src/lib.rs +++ b/github_v3/src/lib.rs @@ -9,8 +9,6 @@ use std::sync::atomic::Ordering::SeqCst; use std::sync::Arc; use std::time::{Duration, SystemTime}; -pub mod model; - pub struct Response { res: reqwest::Response, client: Arc, diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index af015f0..bc20d75 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -3,6 +3,7 @@ #[macro_use] extern crate serde_derive; +use github_info::MinimalUser; use futures::stream::StreamExt; mod yearly; pub use crate::yearly::*; @@ -553,13 +554,12 @@ impl KitchenSink { let versions = self.get_repo_versions(origin, &host, &cachebust).await?; Ok(RichCrate::new(origin.clone(), gh.owner.into_iter().map(|o| { CrateOwner { - id: 0, avatar: o.avatar_url, url: Some(o.html_url), login: o.login, kind: OwnerKind::User, // FIXME: crates-io uses teams, and we'd need to find the right team? is "owners" a guaranteed thing? name: o.name, - github_id: Some(o.id), + github_id: o.id, invited_at: None, invited_by_github_id: None, @@ -1287,6 +1287,17 @@ impl KitchenSink { Ok(self.crate_db.top_keyword(&krate.origin()).await?) } + /// Maintenance: add user to local db index + pub(crate) async fn index_user_m(&self, user: &MinimalUser, commit: &GitCommitAuthor) -> CResult<()> { + if stopped() {Err(KitchenSinkErr::Stopped)?;} + let user = self.gh.user_by_login(&user.login).await?.ok_or_else(|| KitchenSinkErr::AuthorNotFound(user.login.clone()))?; + if !self.user_db.email_has_github(&commit.email)? { + println!("{} => {}", commit.email, user.login); + self.user_db.index_user(&user, Some(&commit.email), commit.name.as_ref().map(|s| s.as_str()))?; + } + Ok(()) + } + /// Maintenance: add user to local db index pub fn index_user(&self, user: &User, commit: &GitCommitAuthor) -> CResult<()> { if stopped() {Err(KitchenSinkErr::Stopped)?;} @@ -1501,20 +1512,20 @@ impl KitchenSink { self.crate_db.index_repo_crates(repo, manif).await.context("index rev repo")?; let mut changes = Vec::new(); - if let Repo { host: RepoHost::GitHub(ref repo), .. } = repo { - if let Some(commits) = self.repo_commits(repo, as_of_version).await? { - for c in commits { - if let Some(a) = c.author { - self.index_user(&a, &c.commit.author)?; - } - if let Some(a) = c.committer { - self.index_user(&a, &c.commit.committer)?; - } + if let Repo { host: RepoHost::GitHub(ref repo), .. } = repo { + if let Some(commits) = self.repo_commits(repo, as_of_version).await? { + for c in commits { + if let Some(a) = c.author { + self.index_user_m(&a, &c.commit.author).await?; + } + if let Some(a) = c.committer { + self.index_user_m(&a, &c.commit.committer).await?; } } } + } - if stopped() {Err(KitchenSinkErr::Stopped)?;} + if stopped() {Err(KitchenSinkErr::Stopped)?;} tokio::task::block_in_place(|| { crate_git_checkout::find_dependency_changes(&checkout, |added, removed, age| { @@ -1646,7 +1657,7 @@ impl KitchenSink { if contributors.len() >= 100 { hit_max_contributor_count = true; } - let mut by_login = HashMap::new(); + let mut by_login: HashMap = HashMap::new(); for contr in contributors { if let Some(author) = contr.author { if author.user_type == UserType::Bot { @@ -1657,8 +1668,17 @@ impl KitchenSink { w.commits as f64 + ((w.added + w.deleted*2) as f64).sqrt() }).sum::(); - by_login.entry(author.login.to_lowercase()) - .or_insert((0., author)).0 += count; + use std::collections::hash_map::Entry; + match by_login.entry(author.login.to_ascii_lowercase()) { + Entry::Vacant(e) => { + if let Ok(Some(user)) = self.gh.user_by_login(&author.login).await { + e.insert((count, user)); + } + }, + Entry::Occupied(mut e) => { + e.get_mut().0 += count; + }, + } } } Ok((hit_max_contributor_count, by_login)) @@ -1890,7 +1910,6 @@ impl KitchenSink { Origin::GitLab {..} => Ok(vec![]), Origin::GitHub {repo, ..} => Ok(vec![ CrateOwner { - id: 0, avatar: None, // FIXME: read from GH url: Some(format!("https://github.com/{}", repo.owner)), @@ -1962,10 +1981,6 @@ impl KitchenSink { /// To make categories more varied, lower score of crates by same authors, with same keywords async fn knock_duplicates(&self, crates: &mut Vec<(Origin, f64)>) { - let mut seen_owners = HashMap::new(); - let mut seen_keywords = HashMap::new(); - let mut seen_owner_keywords = HashMap::new(); - let with_owners = futures::stream::iter(crates.drain(..)) .filter_map(|(o, score)| async move { let c = match self.rich_crate_version_async(&o).await { @@ -1991,22 +2006,24 @@ impl KitchenSink { let mut top_keywords: Vec<_> = top_keywords.into_iter().collect(); top_keywords.sort_by(|a, b| b.1.cmp(&a.1)); let top_keywords: HashSet<_> = top_keywords.iter().copied().take((top_keywords.len() / 10).min(10).max(2)).map(|(k, _)| k.to_string()).collect(); - eprintln!("top cat keywords {:?}", top_keywords); crates.clear(); - for (origin, score, owners, keywords) in with_owners { + let mut seen_owners = HashMap::new(); + let mut seen_keywords = HashMap::new(); + let mut seen_owner_keywords = HashMap::new(); + for (origin, score, owners, keywords) in &with_owners { let mut weight_sum = 0; let mut score_sum = 0.0; for owner in owners.iter().take(5) { - let n = seen_owners.entry(owner.id).or_insert(0u32); + let n = seen_owners.entry(&owner.login).or_insert(0u32); score_sum += (*n).saturating_sub(3) as f64; // authors can have a few crates with no penalty weight_sum += 2; *n += 2; } - let primary_owner_id = owners.get(0).map(|o| o.id).unwrap_or(0); + let primary_owner_id = owners.get(0).map(|o| o.login.as_str()).unwrap_or(""); for keyword in keywords.into_iter().take(5) { // obvious keywords are too repetitive and affect innocent crates - if !top_keywords.contains(&keyword) { + if !top_keywords.contains(keyword.as_str()) { let n = seen_keywords.entry(keyword.clone()).or_insert(0u32); score_sum += (*n).saturating_sub(4) as f64; // keywords are expected to repeat a bit weight_sum += 1; @@ -2025,7 +2042,7 @@ impl KitchenSink { // +7 here allows some duplication, and penalizes harder only after a few crates // adding original score means it'll never get lower than 1/3rd let new_score = score * 0.5 + (score + 7.) / (7. + dupe_points); - crates.push((origin, new_score)); + crates.push((origin.to_owned(), new_score)); } crates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal)); } diff --git a/server/Cargo.toml b/server/Cargo.toml index fb51c2b..f47ba6f 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "crates-server" -version = "0.13.0" +version = "0.13.1" authors = ["Kornel "] edition = "2018" description = "Crates.rs web server" -- cgit v1.2.3