summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kornel <kornel@geekhood.net> 2020-03-11 22:47:11 +0000
committer Kornel <kornel@geekhood.net> 2020-03-11 22:48:37 +0000
commit e29ffdaa0bc1ae3906f88111397bd0776797046c (patch)
tree 094e837e1a3402468f29dbbc773db31ec407960c
parent 2b91c72b7d1e8a7c44251b66164ca8cd296bbd19 (diff)
Handle missing github IDs
-rw-r--r-- crates_io_client/src/crate_owners.rs 1
-rw-r--r-- datadump/src/main.rs 2
-rw-r--r-- github_info/Cargo.toml 2
-rw-r--r-- github_info/src/lib_github.rs 6
-rw-r--r-- github_info/src/model.rs 21
-rw-r--r-- github_v3/Cargo.toml 2
-rw-r--r-- github_v3/examples/users.rs 18
-rw-r--r-- github_v3/src/lib.rs 2
-rw-r--r-- kitchen_sink/src/lib_kitchen_sink.rs 69
-rw-r--r-- server/Cargo.toml 2
10 files changed, 82 insertions, 43 deletions
diff --git a/crates_io_client/src/crate_owners.rs b/crates_io_client/src/crate_owners.rs
index c3861e7..c1d6c1d 100644
--- a/crates_io_client/src/crate_owners.rs
+++ b/crates_io_client/src/crate_owners.rs
@@ -22,7 +22,6 @@ pub enum OwnerKind {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrateOwner {
- pub id: usize, // 362,
pub login: String, // "github:rust-bus:maintainers",
pub kind: OwnerKind, // "team" || "user"
pub url: Option<String>, // "https://github.com/rust-bus",
diff --git a/datadump/src/main.rs b/datadump/src/main.rs
index d002cb5..1f9df2e 100644
--- a/datadump/src/main.rs
+++ b/datadump/src/main.rs
@@ -133,7 +133,6 @@ async fn index_owners(crates: &CratesMap, owners: CrateOwners, teams: &Teams, us
return None;
}
CrateOwner {
- id: o.owner_id as _,
login: u.login.to_owned(),
invited_at: Some(invited_at),
invited_by_github_id,
@@ -147,7 +146,6 @@ async fn index_owners(crates: &CratesMap, owners: CrateOwners, teams: &Teams, us
1 => {
let u = teams.get(&o.owner_id).expect("owner consistency");
CrateOwner {
- id: o.owner_id as _,
login: u.login.to_owned(),
invited_at: Some(invited_at),
github_id: Some(u.github_id),
diff --git a/github_info/Cargo.toml b/github_info/Cargo.toml
index ecddb8e..d223869 100644
--- a/github_info/Cargo.toml
+++ b/github_info/Cargo.toml
@@ -1,7 +1,7 @@
[package]
edition = "2018"
name = "github_info"
-version = "0.9.0"
+version = "0.9.1"
authors = ["Kornel <kornel@geekhood.net>"]
[lib]
diff --git a/github_info/src/lib_github.rs b/github_info/src/lib_github.rs
index dafb2a0..d3b1cea 100644
--- a/github_info/src/lib_github.rs
+++ b/github_info/src/lib_github.rs
@@ -71,9 +71,9 @@ impl GitHub {
client: github_v3::Client::new(Some(token)),
user_orgs: TempCache::new(&cache_path.as_ref().with_file_name("github_user_orgs.bin"))?,
orgs: TempCache::new(&cache_path.as_ref().with_file_name("github_orgs2.bin"))?,
- users: TempCache::new(&cache_path.as_ref().with_file_name("github_users2.bin"))?,
- commits: TempCache::new(&cache_path.as_ref().with_file_name("github_commits.bin"))?,
- releases: TempCache::new(&cache_path.as_ref().with_file_name("github_releases.bin"))?,
+ users: TempCache::new(&cache_path.as_ref().with_file_name("github_users3.bin"))?,
+ commits: TempCache::new(&cache_path.as_ref().with_file_name("github_commits2.bin"))?,
+ releases: TempCache::new(&cache_path.as_ref().with_file_name("github_releases2.bin"))?,
contribs: TempCache::new(&cache_path.as_ref().with_file_name("github_contribs.bin"))?,
repos: TempCache::new(&cache_path.as_ref().with_file_name("github_repos2.bin"))?,
emails: TempCache::new(&cache_path.as_ref().with_file_name("github_emails.bin"))?,
diff --git a/github_info/src/model.rs b/github_info/src/model.rs
index b7a8a25..ebc0454 100644
--- a/github_info/src/model.rs
+++ b/github_info/src/model.rs
@@ -69,6 +69,19 @@ pub struct User {
}
#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MinimalUser {
+ pub id: Option<u32>,
+ pub login: String,
+ pub name: Option<String>,
+ pub avatar_url: Option<String>, // "https://avatars0.githubusercontent.com/u/1111?v=4",
+ pub gravatar_id: Option<String>, // "",
+ pub html_url: String, // "https://github.com/zzzz",
+ #[serde(rename = "type")]
+ pub user_type: UserType,
+ pub created_at: Option<String>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContribWeek {
#[serde(rename = "w")]
pub week_timestamp: u32,
@@ -89,7 +102,7 @@ pub struct SearchResults<T> {
pub struct UserContrib {
pub total: u32,
pub weeks: Vec<ContribWeek>,
- pub author: Option<User>,
+ pub author: Option<MinimalUser>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -111,8 +124,8 @@ pub struct GitCommit {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CommitMeta {
pub sha: String, // TODO: deserialize to bin
- pub author: Option<User>,
- pub committer: Option<User>,
+ pub author: Option<MinimalUser>,
+ pub committer: Option<MinimalUser>,
pub commit: GitCommit,
// parents: [{sha}]
}
@@ -137,7 +150,7 @@ pub struct GitHubRepo {
pub has_pages: bool,
pub archived: bool,
pub default_branch: Option<String>,
- pub owner: Option<User>,
+ pub owner: Option<MinimalUser>,
#[serde(default)]
pub topics: Vec<String>,
diff --git a/github_v3/Cargo.toml b/github_v3/Cargo.toml
index 617bb0b..d4192f9 100644
--- a/github_v3/Cargo.toml
+++ b/github_v3/Cargo.toml
@@ -1,7 +1,7 @@
[package]
name = "github_v3"
description = "Async GitHub API v3 client"
-version = "0.3.0"
+version = "0.3.1"
authors = ["Kornel <kornel@geekhood.net>"]
keywords = ["github", "rest-api", "async"]
categories = ["web-programming", "web-programming::http-client"]
diff --git a/github_v3/examples/users.rs b/github_v3/examples/users.rs
index 14d0f66..f2b76c2 100644
--- a/github_v3/examples/users.rs
+++ b/github_v3/examples/users.rs
@@ -1,10 +1,24 @@
-use futures::StreamExt;
use github_v3::*;
+use serde_derive::*;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct User {
+ pub id: u32,
+ pub login: String,
+ pub name: Option<String>,
+ pub avatar_url: Option<String>,
+ pub gravatar_id: Option<String>,
+ pub html_url: String,
+ pub blog: Option<String>,
+ #[serde(rename = "type")]
+ pub user_type: String,
+ pub created_at: Option<String>,
+}
#[tokio::main]
async fn main() -> Result<(), GHError> {
let gh = Client::new_from_env();
- let mut users = gh.get().path("users").send().await?.array::<model::User>();
+ let mut users = gh.get().path("users").send().await?.array::<User>();
while let Some(Ok(user)) = users.next().await {
println!("User! {:#?}", user);
diff --git a/github_v3/src/lib.rs b/github_v3/src/lib.rs
index e1cb472..4fb75bb 100644
--- a/github_v3/src/lib.rs
+++ b/github_v3/src/lib.rs
@@ -9,8 +9,6 @@ use std::sync::atomic::Ordering::SeqCst;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
-pub mod model;
-
pub struct Response {
res: reqwest::Response,
client: Arc<ClientInner>,
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs
index af015f0..bc20d75 100644
--- a/kitchen_sink/src/lib_kitchen_sink.rs
+++ b/kitchen_sink/src/lib_kitchen_sink.rs
@@ -3,6 +3,7 @@
#[macro_use]
extern crate serde_derive;
+use github_info::MinimalUser;
use futures::stream::StreamExt;
mod yearly;
pub use crate::yearly::*;
@@ -553,13 +554,12 @@ impl KitchenSink {
let versions = self.get_repo_versions(origin, &host, &cachebust).await?;
Ok(RichCrate::new(origin.clone(), gh.owner.into_iter().map(|o| {
CrateOwner {
- id: 0,
avatar: o.avatar_url,
url: Some(o.html_url),
login: o.login,
kind: OwnerKind::User, // FIXME: crates-io uses teams, and we'd need to find the right team? is "owners" a guaranteed thing?
name: o.name,
- github_id: Some(o.id),
+ github_id: o.id,
invited_at: None,
invited_by_github_id: None,
@@ -1288,6 +1288,17 @@ impl KitchenSink {
}
/// Maintenance: add user to local db index
+ pub(crate) async fn index_user_m(&self, user: &MinimalUser, commit: &GitCommitAuthor) -> CResult<()> {
+ if stopped() {Err(KitchenSinkErr::Stopped)?;}
+ let user = self.gh.user_by_login(&user.login).await?.ok_or_else(|| KitchenSinkErr::AuthorNotFound(user.login.clone()))?;
+ if !self.user_db.email_has_github(&commit.email)? {
+ println!("{} => {}", commit.email, user.login);
+ self.user_db.index_user(&user, Some(&commit.email), commit.name.as_ref().map(|s| s.as_str()))?;
+ }
+ Ok(())
+ }
+
+ /// Maintenance: add user to local db index
pub fn index_user(&self, user: &User, commit: &GitCommitAuthor) -> CResult<()> {
if stopped() {Err(KitchenSinkErr::Stopped)?;}
if !self.user_db.email_has_github(&commit.email)? {
@@ -1501,20 +1512,20 @@ impl KitchenSink {
self.crate_db.index_repo_crates(repo, manif).await.context("index rev repo")?;
let mut changes = Vec::new();
- if let Repo { host: RepoHost::GitHub(ref repo), .. } = repo {
- if let Some(commits) = self.repo_commits(repo, as_of_version).await? {
- for c in commits {
- if let Some(a) = c.author {
- self.index_user(&a, &c.commit.author)?;
- }
- if let Some(a) = c.committer {
- self.index_user(&a, &c.commit.committer)?;
- }
+ if let Repo { host: RepoHost::GitHub(ref repo), .. } = repo {
+ if let Some(commits) = self.repo_commits(repo, as_of_version).await? {
+ for c in commits {
+ if let Some(a) = c.author {
+ self.index_user_m(&a, &c.commit.author).await?;
+ }
+ if let Some(a) = c.committer {
+ self.index_user_m(&a, &c.commit.committer).await?;
}
}
}
+ }
- if stopped() {Err(KitchenSinkErr::Stopped)?;}
+ if stopped() {Err(KitchenSinkErr::Stopped)?;}
tokio::task::block_in_place(|| {
crate_git_checkout::find_dependency_changes(&checkout, |added, removed, age| {
@@ -1646,7 +1657,7 @@ impl KitchenSink {
if contributors.len() >= 100 {
hit_max_contributor_count = true;
}
- let mut by_login = HashMap::new();
+ let mut by_login: HashMap<String, (f64, User)> = HashMap::new();
for contr in contributors {
if let Some(author) = contr.author {
if author.user_type == UserType::Bot {
@@ -1657,8 +1668,17 @@ impl KitchenSink {
w.commits as f64 +
((w.added + w.deleted*2) as f64).sqrt()
}).sum::<f64>();
- by_login.entry(author.login.to_lowercase())
- .or_insert((0., author)).0 += count;
+ use std::collections::hash_map::Entry;
+ match by_login.entry(author.login.to_ascii_lowercase()) {
+ Entry::Vacant(e) => {
+ if let Ok(Some(user)) = self.gh.user_by_login(&author.login).await {
+ e.insert((count, user));
+ }
+ },
+ Entry::Occupied(mut e) => {
+ e.get_mut().0 += count;
+ },
+ }
}
}
Ok((hit_max_contributor_count, by_login))
@@ -1890,7 +1910,6 @@ impl KitchenSink {
Origin::GitLab {..} => Ok(vec![]),
Origin::GitHub {repo, ..} => Ok(vec![
CrateOwner {
- id: 0,
avatar: None,
// FIXME: read from GH
url: Some(format!("https://github.com/{}", repo.owner)),
@@ -1962,10 +1981,6 @@ impl KitchenSink {
/// To make categories more varied, lower score of crates by same authors, with same keywords
async fn knock_duplicates(&self, crates: &mut Vec<(Origin, f64)>) {
- let mut seen_owners = HashMap::new();
- let mut seen_keywords = HashMap::new();
- let mut seen_owner_keywords = HashMap::new();
-
let with_owners = futures::stream::iter(crates.drain(..))
.filter_map(|(o, score)| async move {
let c = match self.rich_crate_version_async(&o).await {
@@ -1991,22 +2006,24 @@ impl KitchenSink {
let mut top_keywords: Vec<_> = top_keywords.into_iter().collect();
top_keywords.sort_by(|a, b| b.1.cmp(&a.1));
let top_keywords: HashSet<_> = top_keywords.iter().copied().take((top_keywords.len() / 10).min(10).max(2)).map(|(k, _)| k.to_string()).collect();
- eprintln!("top cat keywords {:?}", top_keywords);
crates.clear();
- for (origin, score, owners, keywords) in with_owners {
+ let mut seen_owners = HashMap::new();
+ let mut seen_keywords = HashMap::new();
+ let mut seen_owner_keywords = HashMap::new();
+ for (origin, score, owners, keywords) in &with_owners {
let mut weight_sum = 0;
let mut score_sum = 0.0;
for owner in owners.iter().take(5) {
- let n = seen_owners.entry(owner.id).or_insert(0u32);
+ let n = seen_owners.entry(&owner.login).or_insert(0u32);
score_sum += (*n).saturating_sub(3) as f64; // authors can have a few crates with no penalty
weight_sum += 2;
*n += 2;
}
- let primary_owner_id = owners.get(0).map(|o| o.id).unwrap_or(0);
+ let primary_owner_id = owners.get(0).map(|o| o.login.as_str()).unwrap_or("");
for keyword in keywords.into_iter().take(5) {
// obvious keywords are too repetitive and affect innocent crates
- if !top_keywords.contains(&keyword) {
+ if !top_keywords.contains(keyword.as_str()) {
let n = seen_keywords.entry(keyword.clone()).or_insert(0u32);
score_sum += (*n).saturating_sub(4) as f64; // keywords are expected to repeat a bit
weight_sum += 1;
@@ -2025,7 +2042,7 @@ impl KitchenSink {
// +7 here allows some duplication, and penalizes harder only after a few crates
// adding original score means it'll never get lower than 1/3rd
let new_score = score * 0.5 + (score + 7.) / (7. + dupe_points);
- crates.push((origin, new_score));
+ crates.push((origin.to_owned(), new_score));
}
crates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
}
diff --git a/server/Cargo.toml b/server/Cargo.toml
index fb51c2b..f47ba6f 100644
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "crates-server"
-version = "0.13.0"
+version = "0.13.1"
authors = ["Kornel <kornel@geekhood.net>"]
edition = "2018"
description = "Crates.rs web server"