diff options
author | Kornel <kornel@geekhood.net> | 2020-02-28 12:07:41 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2020-02-28 12:07:41 +0000 |
commit | 80dcfbb5a159e94b95775fff8d5866e4cfb7336b (patch) | |
tree | 477b56a2f297ef566e7c201aaa24752f553ea654 | |
parent | c513639d48166d440e0887fbc607ba9b07ec3659 (diff) | |
parent | 8b92b5c9c8e68bdcf9de1e21b2e2813c1147d99d (diff) |
Merge crate_git_checkout
-rw-r--r-- | crate_git_checkout/Cargo.toml | 19 | ||||
-rw-r--r-- | crate_git_checkout/README.md | 3 | ||||
-rw-r--r-- | crate_git_checkout/src/crate_git_checkout.rs | 387 | ||||
-rw-r--r-- | crate_git_checkout/src/iter.rs | 78 |
4 files changed, 487 insertions, 0 deletions
diff --git a/crate_git_checkout/Cargo.toml b/crate_git_checkout/Cargo.toml new file mode 100644 index 0000000..9c4ef3c --- /dev/null +++ b/crate_git_checkout/Cargo.toml @@ -0,0 +1,19 @@ +[package] +edition = "2018" +description = "Analyze git repository containing Cargo crates" +name = "crate_git_checkout" +version = "0.4.3" +authors = ["Kornel <kornel@geekhood.net>"] + +[lib] +name = "crate_git_checkout" +path = "src/crate_git_checkout.rs" + +[dependencies] +git2 = "0.11.0" +cargo_toml = "0.8.0" +render_readme = { git = "https://gitlab.com/crates.rs/render_readme.git" } +repo_url = { path = "../repo_url" } +failure = "0.1.6" +urlencoding = "1.0.0" +lazy_static = "1.4.0" diff --git a/crate_git_checkout/README.md b/crate_git_checkout/README.md new file mode 100644 index 0000000..e7764ca --- /dev/null +++ b/crate_git_checkout/README.md @@ -0,0 +1,3 @@ +# Searching bare git checkouts + +Helper functions to get `README` and `Cargo.toml` files out of a cloned repository, using libgit2. diff --git a/crate_git_checkout/src/crate_git_checkout.rs b/crate_git_checkout/src/crate_git_checkout.rs new file mode 100644 index 0000000..f49735a --- /dev/null +++ b/crate_git_checkout/src/crate_git_checkout.rs @@ -0,0 +1,387 @@ +use crate::iter::HistoryIter; +use cargo_toml::{Manifest, Package}; +use failure; +use git2; +use git2::build::RepoBuilder; +use git2::Commit; +pub use git2::Oid; +pub use git2::Repository; +use git2::{Blob, ObjectType, Reference, Tree}; +use lazy_static::lazy_static; +use render_readme; +use render_readme::{Markup, Readme}; +use repo_url::Repo; +use std::collections::hash_map::Entry::Vacant; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io; +use std::path::Path; +use std::process::Command; +use std::sync::Arc; +use std::sync::Mutex; +use urlencoding; + +mod iter; + +lazy_static! { + static ref GLOBAL_LOCK: Mutex<HashMap<String, Arc<Mutex<()>>>> = Mutex::new(HashMap::new()); +} + +#[derive(Debug, Clone)] +pub struct ParseError(pub String); + +fn commit_history_iter<'a>(repo: &Repository, commit: &Reference<'a>) -> Result<HistoryIter<'a>, git2::Error> { + if repo.is_shallow() { + repo.find_remote("origin")?.fetch(&["master"], None, None)?; + } + Ok(HistoryIter::new(commit.peel_to_commit()?)) +} + +pub fn checkout(repo: &Repo, base_path: &Path) -> Result<Repository, git2::Error> { + let repo = get_repo(repo, base_path)?; + Ok(repo) +} + +#[inline] +pub fn iter_blobs<F>(repo: &Repository, at: Option<Oid>, cb: F) -> Result<(), failure::Error> + where F: FnMut(&str, &Tree<'_>, &str, Blob<'_>) -> Result<(), failure::Error> +{ + let tree = if let Some(oid) = at { + repo.find_tree(oid)? + } else { + let head = repo.head()?; + head.peel_to_tree()? + }; + iter_blobs_in_tree(repo, &tree, cb) +} + +#[inline] +pub fn iter_blobs_in_tree<F>(repo: &Repository, tree: &Tree<'_>, mut cb: F) -> Result<(), failure::Error> + where F: FnMut(&str, &Tree<'_>, &str, Blob<'_>) -> Result<(), failure::Error> +{ + iter_blobs_recurse(repo, tree, &mut String::with_capacity(500), &mut cb)?; + Ok(()) +} + +fn iter_blobs_recurse<F>(repo: &Repository, tree: &Tree<'_>, path: &mut String, cb: &mut F) -> Result<(), failure::Error> + where F: FnMut(&str, &Tree<'_>, &str, Blob<'_>) -> Result<(), failure::Error> +{ + for i in tree.iter() { + let name = match i.name() { + Some(n) => n, + _ => continue, + }; + match i.kind() { + Some(ObjectType::Tree) => { + let sub = repo.find_tree(i.id())?; + let pre_len = path.len(); + if !path.is_empty() { + path.push('/'); + } + path.push_str(name); + iter_blobs_recurse(repo, &sub, path, cb)?; + path.truncate(pre_len); + }, + Some(ObjectType::Blob) => { + cb(path, tree, name, repo.find_blob(i.id())?)?; + }, + _ => {}, + } + } + Ok(()) +} + +fn get_repo(repo: &Repo, base_path: &Path) -> Result<Repository, git2::Error> { + // ensure one clone per dir at a time + let lock = GLOBAL_LOCK.lock().unwrap().entry(repo.canonical_git_url().to_string()).or_insert_with(|| Arc::new(Mutex::new(()))).clone(); + let _lock = lock.lock().unwrap(); + + let shallow = false; + let url = &*repo.canonical_git_url(); + + let repo_path = base_path.join(urlencoding::encode(url)); + + match Repository::open(&repo_path) { + Ok(repo) => Ok(repo), + Err(err) => { + if !url.starts_with("http://") && !url.starts_with("https://") && !url.starts_with("git@github.com:") { + eprintln!("Rejecting non-HTTP git URL: {}", url); + return Err(err); + } + if err.code() == git2::ErrorCode::Exists { + if let Ok(repo) = Repository::open(&repo_path) { + return Ok(repo); + } + let _ = fs::remove_dir_all(&repo_path); + } + if shallow { + let ok = Command::new("git") + .arg("clone") + .arg("--depth=64") + .arg("--config").arg("core.askPass=true") + .arg("--") + .arg(&*url) + .arg(&repo_path) + .output() + .map(|output| output.status.success()) + .unwrap_or(false); + if ok { + return Repository::open(repo_path); + } + } + let mut ch = RepoBuilder::new(); + ch.bare(true); + // no support for depth=1! + ch.clone(&url, &repo_path) + }, + } +} + +/// Returns (path, Tree Oid, Cargo.toml) +pub fn find_manifests(repo: &Repository) -> Result<(Vec<(String, Oid, Manifest)>, Vec<ParseError>), failure::Error> { + let head = repo.head()?; + let tree = head.peel_to_tree()?; + find_manifests_in_tree(&repo, &tree) +} + +struct GitFS<'a, 'b> { + repo: &'b Repository, + tree: &'b Tree<'a>, +} + +impl cargo_toml::AbstractFilesystem for GitFS<'_, '_> { + fn file_names_in(&self, dir_path: &str) -> Result<HashSet<Box<str>>, io::Error> { + self.file_names_in_tree(&self.tree, Some(dir_path)) + } +} + +impl GitFS<'_, '_> { + fn file_names_in_tree(&self, curr_dir: &Tree<'_>, dir_path: Option<&str>) -> Result<HashSet<Box<str>>, io::Error> { + if let Some(dir_path) = dir_path { + let mut parts = dir_path.splitn(2, '/'); + let subdir_name = parts.next().unwrap(); + let rest = parts.next(); + for item in curr_dir.iter() { + if item.name() == Some(subdir_name) { + if let Ok(tree) = self.repo.find_tree(item.id()) { + return self.file_names_in_tree(&tree, rest); + } + } + } + Ok(HashSet::new()) // dir not found + } else { + let mut res = HashSet::new(); + for item in curr_dir.iter() { + if let Some(n) = item.name() { + res.insert(n.into()); + } + } + Ok(res) + } + } +} + +/// Path, tree Oid, parsed TOML +fn find_manifests_in_tree(repo: &Repository, start_tree: &Tree<'_>) -> Result<(Vec<(String, Oid, Manifest)>, Vec<ParseError>), failure::Error> { + let mut tomls = Vec::with_capacity(8); + let mut warnings = Vec::new(); + iter_blobs_in_tree(repo, start_tree, |inner_path, inner_tree, name, blob| { + if name == "Cargo.toml" { + match Manifest::from_slice(blob.content()) { + Ok(mut toml) => { + toml.complete_from_abstract_filesystem(GitFS { repo, tree: inner_tree })?; + if toml.package.is_some() { + tomls.push((inner_path.to_owned(), inner_tree.id(), toml)) + } + }, + Err(err) => { + warnings.push(ParseError(format!("Can't parse {}/{}/{}: {}", repo.path().display(), inner_path, name, err))); + }, + } + } + Ok(()) + })?; + Ok((tomls, warnings)) +} + +pub fn path_in_repo(repo: &Repository, crate_name: &str) -> Result<Option<(String, Oid, Manifest)>, failure::Error> { + let head = repo.head()?; + let tree = head.peel_to_tree()?; + path_in_repo_in_tree(repo, &tree, crate_name) +} + +fn path_in_repo_in_tree(repo: &Repository, tree: &Tree<'_>, crate_name: &str) -> Result<Option<(String, Oid, Manifest)>, failure::Error> { + Ok(find_manifests_in_tree(repo, tree)?.0 + .into_iter() + .find(|(_, _, manifest)| manifest.package.as_ref().map_or(false, |p| p.name == crate_name))) +} + +#[derive(Debug, Copy, Clone, Default)] +struct State { + since: Option<usize>, + until: Option<usize>, +} + +pub type PackageVersionTimestamps = HashMap<String, HashMap<String, i64>>; + +pub fn find_versions(repo: &Repository) -> Result<PackageVersionTimestamps, failure::Error> { + let mut package_versions: PackageVersionTimestamps = HashMap::with_capacity(4); + for commit in repo.tag_names(None)?.iter() + .filter_map(|s| s) + .filter_map(|tag| repo.find_reference(&format!("refs/tags/{}", tag)).map_err(|e| eprintln!("bad tag {}: {}", tag, e)).ok()) + .filter_map(|r| r.peel_to_commit().map_err(|e| eprintln!("bad ref/tag: {}", e)).ok()) + { + for (_, _, manifest) in find_manifests_in_tree(&repo, &commit.tree()?)?.0 { + if let Some(pkg) = manifest.package { + add_package(&mut package_versions, pkg, &commit); + } + } + } + + eprintln!("no tags, falling back to slow versions"); + if package_versions.is_empty() { + return find_dependency_changes(repo, |_, _, _| {}); + } + + Ok(package_versions) +} + +fn add_package(package_versions: &mut PackageVersionTimestamps, pkg: Package, commit: &Commit) { + // Find oldest occurence of each version, assuming it's a release date + let time_epoch = commit.time().seconds(); + let ver_time = package_versions.entry(pkg.name).or_insert_with(HashMap::new) + .entry(pkg.version).or_insert(time_epoch); + *ver_time = (*ver_time).min(time_epoch); +} + +/// Callback gets added, removed, number of commits ago. +pub fn find_dependency_changes(repo: &Repository, mut cb: impl FnMut(HashSet<String>, HashSet<String>, usize)) -> Result<PackageVersionTimestamps, failure::Error> { + let head = repo.head()?; + + let mut newer_deps: HashMap<String, State> = HashMap::with_capacity(100); + let mut package_versions: PackageVersionTimestamps = HashMap::with_capacity(4); + + // iterates from the latest! + // The generation number here is not quite accurate (due to diamond-shaped histories), + // but I need the fiction of it being linerar for this implementation. + // A recursive implementation could do it better, maybe. + let commits = commit_history_iter(&repo, &head)?.filter(|c| !c.is_merge).map(|c| c.commit); + for (age, commit) in commits.enumerate().take(1000) { + // All deps in a repo, because we traverse history once per repo, not once per crate, + // and because moving of deps between internal crates doesn't count. + let mut older_deps = HashSet::with_capacity(100); + for (_, _, manifest) in find_manifests_in_tree(&repo, &commit.tree()?)?.0 { + // Find oldest occurence of each version, assuming it's a release date + if let Some(pkg) = manifest.package { + add_package(&mut package_versions, pkg, &commit); + } + + older_deps.extend(manifest.dependencies.into_iter().map(|(k, _)| k)); + older_deps.extend(manifest.dev_dependencies.into_iter().map(|(k, _)| k)); + older_deps.extend(manifest.build_dependencies.into_iter().map(|(k, _)| k)); + } + + let mut added = HashSet::with_capacity(10); + let mut removed = HashSet::with_capacity(10); + + for (dep, state) in &mut newer_deps { + // if it's Some(), it's going to be added in the future! so it's not there now + // (as a side effect if dependency is added, removed, then re-added, it tracks only the most recent add/remove) + if state.since.is_none() && !older_deps.contains(dep) { + added.insert(dep.clone()); + state.since = Some(age); + } + } + + for dep in older_deps { + if let Vacant(e) = newer_deps.entry(dep) { + if age > 0 { + removed.insert(e.key().clone()); + e.insert(State { since: None, until: Some(age) }); + } else { + e.insert(State::default()); + } + } + } + + cb(added, removed, age); + } + Ok(package_versions) +} + +// FIXME: buggy, barely works +pub fn find_readme(repo: &Repository, package: &Package) -> Result<Option<Readme>, failure::Error> { + let head = repo.head()?; + let tree = head.peel_to_tree()?; + let mut readme = None; + let mut found_best = false; // it'll find many readmes, including fallbacks + + let mut prefix = path_in_repo_in_tree(&repo, &tree, &package.name)?; + if let Some((ref mut prefix, ..)) = prefix { + if !prefix.ends_with('/') { + prefix.push('/'); + } + } + let prefix = prefix.as_ref().map(|(s, ..)| s.as_str()).unwrap_or(""); + + iter_blobs_in_tree(&repo, &tree, |base, _inner_tree, name, blob| { + if found_best { + return Ok(()); // done + } + let is_correct_dir = base.starts_with(prefix); + let rel_path = if is_correct_dir { + &base[prefix.len()..] + } else if readme.is_none() { + base + } else { + return Ok(()); // don't search bad dirs if there's some readme already + }; + let rel_path_name = Path::new(rel_path).join(name); + if is_readme_filename(&rel_path_name, Some(package)) { + let text = String::from_utf8_lossy(blob.content()).to_string(); + let markup = if rel_path_name.extension().map_or(false, |e| e == "rst") { + Markup::Rst(text) + } else { + Markup::Markdown(text) + }; + readme = Some(readme_from_repo(markup, &package.repository, base)); + found_best = is_correct_dir; + } + Ok(()) + })?; + Ok(readme) +} + +fn readme_from_repo(markup: Markup, repo_url: &Option<String>, base_dir_in_repo: &str) -> Readme { + let repo = repo_url.as_ref().and_then(|url| Repo::new(url).ok()); + let base_url = repo.as_ref().map(|r| r.readme_base_url(base_dir_in_repo)); + let base_image_url = repo.map(|r| r.readme_base_image_url(base_dir_in_repo)); + + Readme::new(markup, base_url, base_image_url) +} + +/// Check if given filename is a README. If `package` is missing, guess. +fn is_readme_filename(path: &Path, package: Option<&Package>) -> bool { + path.to_str().map_or(false, |s| { + if let Some(&Package { readme: Some(ref r), .. }) = package { + let r = r.trim_start_matches("../"); // hacky hack + r.eq_ignore_ascii_case(s) // crates published on Mac have this + } else { + render_readme::is_readme_filename(path) + } + }) +} + +#[test] +fn git_fs() { + let repo = Repository::open(".git").expect("own git repo"); + let (m, w) = find_manifests(&repo).expect("has manifests"); + assert_eq!(1, m.len()); + assert_eq!(0, w.len()); + assert_eq!("", &m[0].0); + let manif = &m[0].2; + let pkg = manif.package.as_ref().expect("package"); + assert_eq!("crate_git_checkout", &pkg.name); + assert!(manif.lib.is_some()); + assert_eq!(0, manif.bin.len()); +} diff --git a/crate_git_checkout/src/iter.rs b/crate_git_checkout/src/iter.rs new file mode 100644 index 0000000..aab3142 --- /dev/null +++ b/crate_git_checkout/src/iter.rs @@ -0,0 +1,78 @@ +use git2::Commit; +use git2::Oid; +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::collections::HashSet; + +pub struct HistoryIter<'repo> { + seen: HashSet<Oid>, + to_visit: BinaryHeap<Generation<'repo>>, +} + +pub struct HistoryItem<'repo> { + pub commit: Commit<'repo>, + pub is_merge: bool, +} + +struct Generation<'repo> { + num: u32, + nth: u32, + commit: Commit<'repo>, +} + +impl<'repo> HistoryIter<'repo> { + pub fn new(start: Commit<'repo>) -> Self { + let mut to_visit = BinaryHeap::with_capacity(16); + to_visit.push(Generation{ + commit:start, + num:0, nth:0, + }); + Self { + seen: HashSet::with_capacity(500), + to_visit, + } + } +} + +impl<'repo> Iterator for HistoryIter<'repo> { + type Item = HistoryItem<'repo>; + + fn next(&mut self) -> Option<Self::Item> { + if let Some(Generation { commit, num, .. }) = self.to_visit.pop() { + let seen = &mut self.seen; // technically needed only after merges + let mut is_merge = false; + self.to_visit.extend(commit.parents() + .take(1) + .filter(|commit| { + seen.insert(commit.id()) + }) + .enumerate() + .map(|(nth, commit)| { + if nth > 0 {is_merge = true;} + Generation {num: num+1, nth: nth as u32, commit} + })); + Some(HistoryItem { + commit, is_merge, + }) + } else { + None + } + } +} + +impl<'repo> PartialEq for Generation<'repo> { + fn eq(&self, other: &Generation<'_>) -> bool { + other.num == self.num && self.nth == other.nth + } +} +impl<'repo> PartialOrd for Generation<'repo> { + fn partial_cmp(&self, other: &Generation<'_>) -> Option<Ordering> { + Some(self.cmp(other)) + } +} +impl<'repo> Eq for Generation<'repo> {} +impl<'repo> Ord for Generation<'repo> { + fn cmp(&self, other: &Generation<'_>) -> Ordering { + other.num.cmp(&self.num).then(other.nth.cmp(&self.nth)) + } +} |