use std::cmp; use std::ffi::OsStr; use std::fmt; use std::fs::{self, FileType, Metadata}; use std::io; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::thread; use std::time::Duration; use std::vec; use crossbeam::sync::MsQueue; use same_file::Handle; use walkdir::{self, WalkDir}; use dir::{Ignore, IgnoreBuilder}; use gitignore::GitignoreBuilder; use overrides::Override; use types::Types; use {Error, PartialErrorBuilder}; /// A directory entry with a possible error attached. /// /// The error typically refers to a problem parsing ignore files in a /// particular directory. #[derive(Debug)] pub struct DirEntry { dent: DirEntryInner, err: Option, } impl DirEntry { /// The full path that this entry represents. pub fn path(&self) -> &Path { self.dent.path() } /// Whether this entry corresponds to a symbolic link or not. pub fn path_is_symlink(&self) -> bool { self.dent.path_is_symlink() } /// Returns true if and only if this entry corresponds to stdin. /// /// i.e., The entry has depth 0 and its file name is `-`. pub fn is_stdin(&self) -> bool { self.dent.is_stdin() } /// Return the metadata for the file that this entry points to. pub fn metadata(&self) -> Result { self.dent.metadata() } /// Return the file type for the file that this entry points to. /// /// This entry doesn't have a file type if it corresponds to stdin. pub fn file_type(&self) -> Option { self.dent.file_type() } /// Return the file name of this entry. /// /// If this entry has no file name (e.g., `/`), then the full path is /// returned. pub fn file_name(&self) -> &OsStr { self.dent.file_name() } /// Returns the depth at which this entry was created relative to the root. pub fn depth(&self) -> usize { self.dent.depth() } /// Returns the underlying inode number if one exists. /// /// If this entry doesn't have an inode number, then `None` is returned. #[cfg(unix)] pub fn ino(&self) -> Option { self.dent.ino() } /// Returns an error, if one exists, associated with processing this entry. /// /// An example of an error is one that occurred while parsing an ignore /// file. pub fn error(&self) -> Option<&Error> { self.err.as_ref() } fn new_stdin() -> DirEntry { DirEntry { dent: DirEntryInner::Stdin, err: None, } } fn new_walkdir(dent: walkdir::DirEntry, err: Option) -> DirEntry { DirEntry { dent: DirEntryInner::Walkdir(dent), err: err, } } fn new_raw(dent: DirEntryRaw, err: Option) -> DirEntry { DirEntry { dent: DirEntryInner::Raw(dent), err: err, } } } /// DirEntryInner is the implementation of DirEntry. /// /// It specifically represents three distinct sources of directory entries: /// /// 1. From the walkdir crate. /// 2. Special entries that represent things like stdin. /// 3. From a path. /// /// Specifically, (3) has to essentially re-create the DirEntry implementation /// from WalkDir. #[derive(Debug)] enum DirEntryInner { Stdin, Walkdir(walkdir::DirEntry), Raw(DirEntryRaw), } impl DirEntryInner { fn path(&self) -> &Path { use self::DirEntryInner::*; match *self { Stdin => Path::new(""), Walkdir(ref x) => x.path(), Raw(ref x) => x.path(), } } fn path_is_symlink(&self) -> bool { use self::DirEntryInner::*; match *self { Stdin => false, Walkdir(ref x) => x.path_is_symlink(), Raw(ref x) => x.path_is_symlink(), } } fn is_stdin(&self) -> bool { match *self { DirEntryInner::Stdin => true, _ => false, } } fn metadata(&self) -> Result { use self::DirEntryInner::*; match *self { Stdin => { let err = Error::Io(io::Error::new( io::ErrorKind::Other, " has no metadata")); Err(err.with_path("")) } Walkdir(ref x) => { x.metadata().map_err(|err| { Error::Io(io::Error::from(err)).with_path(x.path()) }) } Raw(ref x) => x.metadata(), } } fn file_type(&self) -> Option { use self::DirEntryInner::*; match *self { Stdin => None, Walkdir(ref x) => Some(x.file_type()), Raw(ref x) => Some(x.file_type()), } } fn file_name(&self) -> &OsStr { use self::DirEntryInner::*; match *self { Stdin => OsStr::new(""), Walkdir(ref x) => x.file_name(), Raw(ref x) => x.file_name(), } } fn depth(&self) -> usize { use self::DirEntryInner::*; match *self { Stdin => 0, Walkdir(ref x) => x.depth(), Raw(ref x) => x.depth(), } } #[cfg(unix)] fn ino(&self) -> Option { use walkdir::DirEntryExt; use self::DirEntryInner::*; match *self { Stdin => None, Walkdir(ref x) => Some(x.ino()), Raw(ref x) => Some(x.ino()), } } } /// DirEntryRaw is essentially copied from the walkdir crate so that we can /// build `DirEntry`s from whole cloth in the parallel iterator. struct DirEntryRaw { /// The path as reported by the `fs::ReadDir` iterator (even if it's a /// symbolic link). path: PathBuf, /// The file type. Necessary for recursive iteration, so store it. ty: FileType, /// Is set when this entry was created from a symbolic link and the user /// expects the iterator to follow symbolic links. follow_link: bool, /// The depth at which this entry was generated relative to the root. depth: usize, /// The underlying inode number (Unix only). #[cfg(unix)] ino: u64, } impl fmt::Debug for DirEntryRaw { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Leaving out FileType because it doesn't have a debug impl // in Rust 1.9. We could add it if we really wanted to by manually // querying each possibly file type. Meh. ---AG f.debug_struct("DirEntryRaw") .field("path", &self.path) .field("follow_link", &self.follow_link) .field("depth", &self.depth) .finish() } } impl DirEntryRaw { fn path(&self) -> &Path { &self.path } fn path_is_symlink(&self) -> bool { self.ty.is_symlink() || self.follow_link } fn metadata(&self) -> Result { if self.follow_link { fs::metadata(&self.path) } else { fs::symlink_metadata(&self.path) }.map_err(|err| Error::Io(io::Error::from(err)).with_path(&self.path)) } fn file_type(&self) -> FileType { self.ty } fn file_name(&self) -> &OsStr { self.path.file_name().unwrap_or_else(|| self.path.as_os_str()) } fn depth(&self) -> usize { self.depth } #[cfg(unix)] fn ino(&self) -> u64 { self.ino } fn from_entry( depth: usize, ent: &fs::DirEntry, ) -> Result { let ty = ent.file_type().map_err(|err| { let err = Error::Io(io::Error::from(err)).with_path(ent.path()); Error::WithDepth { depth: depth, err: Box::new(err), } })?; Ok(DirEntryRaw::from_entry_os(depth, ent, ty)) } #[cfg(not(unix))] fn from_entry_os( depth: usize, ent: &fs::DirEntry, ty: fs::FileType, ) -> DirEntryRaw { DirEntryRaw { path: ent.path(), ty: ty, follow_link: false, depth: depth, } } #[cfg(unix)] fn from_entry_os( depth: usize, ent: &fs::DirEntry, ty: fs::FileType, ) -> DirEntryRaw { use std::os::unix::fs::DirEntryExt; DirEntryRaw { path: ent.path(), ty: ty, follow_link: false, depth: depth, ino: ent.ino(), } } #[cfg(not(unix))] fn from_link(depth: usize, pb: PathBuf) -> Result { let md = fs::metadata(&pb).map_err(|err| { Error::Io(err).with_path(&pb) })?; Ok(DirEntryRaw { path: pb, ty: md.file_type(), follow_link: true, depth: depth, }) } #[cfg(unix)] fn from_link(depth: usize, pb: PathBuf) -> Result { use std::os::unix::fs::MetadataExt; let md = fs::metadata(&pb).map_err(|err| { Error::Io(err).with_path(&pb) })?; Ok(DirEntryRaw { path: pb, ty: md.file_type(), follow_link: true, depth: depth, ino: md.ino(), }) } } /// WalkBuilder builds a recursive directory iterator. /// /// The builder supports a large number of configurable options. This includes /// specific glob overrides, file type matching, toggling whether hidden /// files are ignored or not, and of course, support for respecting gitignore /// files. /// /// By default, all ignore files found are respected. This includes `.ignore`, /// `.gitignore`, `.git/info/exclude` and even your global gitignore /// globs, usually found in `$XDG_CONFIG_HOME/git/ignore`. /// /// Some standard recursive directory options are also supported, such as /// limiting the recursive depth or whether to follow symbolic links (disabled /// by default). /// /// # Ignore rules /// /// There are many rules that influence whether a particular file or directory /// is skipped by this iterator. Those rules are documented here. Note that /// the rules assume a default configuration. /// /// * First, glob overrides are checked. If a path matches a glob override, /// then matching stops. The path is then only skipped if the glob that matched /// the path is an ignore glob. (An override glob is a whitelist glob unless it /// starts with a `!`, in which case it is an ignore glob.) /// * Second, ignore files are checked. Ignore files currently only come from /// git ignore files (`.gitignore`, `.git/info/exclude` and the configured /// global gitignore file), plain `.ignore` files, which have the same format /// as gitignore files, or explicitly added ignore files. The precedence order /// is: `.ignore`, `.gitignore`, `.git/info/exclude`, global gitignore and /// finally explicitly added ignore files. Note that precedence between /// different types of ignore files is not impacted by the directory hierarchy; /// any `.ignore` file overrides all `.gitignore` files. Within each precedence /// level, more nested ignore files have a higher precedence than less nested /// ignore files. /// * Third, if the previous step yields an ignore match, then all matching /// is stopped and the path is skipped. If it yields a whitelist match, then /// matching continues. A whitelist match can be overridden by a later matcher. /// * Fourth, unless the path is a directory, the file type matcher is run on /// the path. As above, if it yields an ignore match, then all matching is /// stopped and the path is skipped. If it yields a whitelist match, then /// matching continues. /// * Fifth, if the path hasn't been whitelisted and it is hidden, then the /// path is skipped. /// * Sixth, unless the path is a directory, the size of the file is compared /// against the max filesize limit. If it exceeds the limit, it is skipped. /// * Seventh, if the path has made it this far then it is yielded in the /// iterator. #[derive(Clone)] pub struct WalkBuilder { paths: Vec, ig_builder: IgnoreBuilder, parents: bool, max_depth: Option, max_filesize: Option, follow_links: bool, sorter: Option cmp::Ordering + Send + Sync + 'static >>, threads: usize, } impl fmt::Debug for WalkBuilder { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("WalkBuilder") .field("paths", &self.paths) .field("ig_builder", &self.ig_builder) .field("parents", &self.parents) .field("max_depth", &self.max_depth) .field("max_filesize", &self.max_filesize) .field("follow_links", &self.follow_links) .field("threads", &self.threads) .finish() } } impl WalkBuilder { /// Create a new builder for a recursive directory iterator for the /// directory given. /// /// Note that if you want to traverse multiple different directories, it /// is better to call `add` on this builder than to create multiple /// `Walk` values. pub fn new>(path: P) -> WalkBuilder { WalkBuilder { paths: vec![path.as_ref().to_path_buf()], ig_builder: IgnoreBuilder::new(), parents: true, max_depth: None, max_filesize: None, follow_links: false, sorter: None, threads: 0, } } /// Build a new `Walk` iterator. pub fn build(&self) -> Walk { let follow_links = self.follow_links; let max_depth = self.max_depth; let cmp = self.sorter.clone(); let its = self.paths.iter().map(move |p| { if p == Path::new("-") { (p.to_path_buf(), None) } else { let mut wd = WalkDir::new(p); wd = wd.follow_links(follow_links || p.is_file()); if let Some(max_depth) = max_depth { wd = wd.max_depth(max_depth); } if let Some(ref cmp) = cmp { let cmp = cmp.clone(); wd = wd.sort_by(move |a, b| { cmp(a.file_name(), b.file_name()) }); } (p.to_path_buf(), Some(WalkEventIter::from(wd))) } }).collect::>().into_iter(); let ig_root = self.ig_builder.build(); Walk { its: its, it: None, ig_root: ig_root.clone(), ig: ig_root.clone(), max_filesize: self.max_filesize, parents: self.parents, } } /// Build a new `WalkParallel` iterator. /// /// Note that this *doesn't* return something that implements `Iterator`. /// Instead, the returned value must be run with a closure. e.g., /// `builder.build_parallel().run(|| |path| println!("{:?}", path))`. pub fn build_parallel(&self) -> WalkParallel { WalkParallel { paths: self.paths.clone().into_iter(), ig_root: self.ig_builder.build(), max_depth: self.max_depth, max_filesize: self.max_filesize, follow_links: self.follow_links, parents: self.parents, threads: self.threads, } } /// Add a file path to the iterator. /// /// Each additional file path added is traversed recursively. This should /// be preferred over building multiple `Walk` iterators since this /// enables reusing resources across iteration. pub fn add>(&mut self, path: P) -> &mut WalkBuilder { self.paths.push(path.as_ref().to_path_buf()); self } /// The maximum depth to recurse. /// /// The default, `None`, imposes no depth restriction. pub fn max_depth(&mut self, depth: Option) -> &mut WalkBuilder { self.max_depth = depth; self } /// Whether to follow symbolic links or not. pub fn follow_links(&mut self, yes: bool) -> &mut WalkBuilder { self.follow_links = yes; self } /// Whether to ignore files above the specified limit. pub fn max_filesize(&mut self, filesize: Option) -> &mut WalkBuilder { self.max_filesize = filesize; self } /// The number of threads to use for traversal. /// /// Note that this only has an effect when using `build_parallel`. /// /// The default setting is `0`, which chooses the number of threads /// automatically using heuristics. pub fn threads(&mut self, n: usize) -> &mut WalkBuilder { self.threads = n; self } /// Add an ignore file to the matcher. /// /// This has lower precedence than all other sources of ignore rules. /// /// If there was a problem adding the ignore file, then an error is /// returned. Note that the error may indicate *partial* failure. For /// example, if an ignore file contains an invalid glob, all other globs /// are still applied. pub fn add_ignore>(&mut self, path: P) -> Option { let mut builder = GitignoreBuilder::new(""); let mut errs = PartialErrorBuilder::default(); errs.maybe_push(builder.add(path)); match builder.build() { Ok(gi) => { self.ig_builder.add_ignore(gi); } Err(err) => { errs.push(err); } } errs.into_error_option() } /// Add an override matcher. /// /// By default, no override matcher is used. /// /// This overrides any previous setting. pub fn overrides(&mut self, overrides: Override) -> &mut WalkBuilder { self.ig_builder.overrides(overrides); self } /// Add a file type matcher. /// /// By default, no file type matcher is used. /// /// This overrides any previous setting. pub fn types(&mut self, types: Types) -> &mut WalkBuilder { self.ig_builder.types(types); self } /// Enables all the standard ignore filters. /// /// This toggles, as a group, all the filters that are enabled by default: /// /// - [hidden()](#method.hidden) /// - [parents()](#method.parents) /// - [ignore()](#method.ignore) /// - [git_ignore()](#method.git_ignore) /// - [git_global()](#method.git_global) /// - [git_exclude()](#method.git_exclude) /// /// They may still be toggled individually after calling this function. /// /// This is (by definition) enabled by default. pub fn standard_filters(&mut self, yes: bool) -> &mut WalkBuilder { self.hidden(yes) .parents(yes) .ignore(yes) .git_ignore(yes) .git_global(yes) .git_exclude(yes) } /// Enables ignoring hidden files. /// /// This is enabled by default. pub fn hidden(&mut self, yes: bool) -> &mut WalkBuilder { self.ig_builder.hidden(yes); self } /// Enables reading ignore files from parent directories. /// /// If this is enabled, then the parent directories of each file path given /// are traversed for ignore files (subject to the ignore settings on /// this builder). Note that file paths are canonicalized with respect to /// the current working directory in order to determine parent directories. /// /// This is enabled by default. pub fn parents(&mut self, yes: bool) -> &mut WalkBuilder { self.parents = yes; self } /// Enables reading `.ignore` files. /// /// `.ignore` files have the same semantics as `gitignore` files and are /// supported by search tools such as ripgrep and The Silver Searcher. /// /// This is enabled by default. pub fn ignore(&mut self, yes: bool) -> &mut WalkBuilder { self.ig_builder.ignore(yes); self } /// Enables reading a global gitignore file, whose path is specified in /// git's `core.excludesFile` config option. /// /// Git's config file location is `$HOME/.gitconfig`. If `$HOME/.gitconfig` /// does not exist or does not specify `core.excludesFile`, then /// `$XDG_CONFIG_HOME/git/ignore` is read. If `$XDG_CONFIG_HOME` is not /// set or is empty, then `$HOME/.config/git/ignore` is used instead. /// /// This is enabled by default. pub fn git_global(&mut self, yes: bool) -> &mut WalkBuilder { self.ig_builder.git_global(yes); self } /// Enables reading `.gitignore` files. /// /// `.gitignore` files have match semantics as described in the `gitignore` /// man page. /// /// This is enabled by default. pub fn git_ignore(&mut self, yes: bool) -> &mut WalkBuilder { self.ig_builder.git_ignore(yes); self } /// Enables reading `.git/info/exclude` files. /// /// `.git/info/exclude` files have match semantics as described in the /// `gitignore` man page. /// /// This is enabled by default. pub fn git_exclude(&mut self, yes: bool) -> &mut WalkBuilder { self.ig_builder.git_exclude(yes); self } /// Set a function for sorting directory entries by file name. /// /// If a compare function is set, the resulting iterator will return all /// paths in sorted order. The compare function will be called to compare /// names from entries from the same directory using only the name of the /// entry. /// /// Note that this is not used in the parallel iterator. pub fn sort_by_file_name(&mut self, cmp: F) -> &mut WalkBuilder where F: Fn(&OsStr, &OsStr) -> cmp::Ordering + Send + Sync + 'static { self.sorter = Some(Arc::new(cmp)); self } } /// Walk is a recursive directory iterator over file paths in one or more /// directories. /// /// Only file and directory paths matching the rules are returned. By default, /// ignore files like `.gitignore` are respected. The precise matching rules /// and precedence is explained in the documentation for `WalkBuilder`. pub struct Walk { its: vec::IntoIter<(PathBuf, Option)>, it: Option, ig_root: Ignore, ig: Ignore, max_filesize: Option, parents: bool, } impl Walk { /// Creates a new recursive directory iterator for the file path given. /// /// Note that this uses default settings, which include respecting /// `.gitignore` files. To configure the iterator, use `WalkBuilder` /// instead. pub fn new>(path: P) -> Walk { WalkBuilder::new(path).build() } fn skip_entry(&self, ent: &walkdir::DirEntry) -> bool { if ent.depth() == 0 { return false; } let is_dir = ent.file_type().is_dir(); let max_size = self.max_filesize; let should_skip_path = skip_path(&self.ig, ent.path(), is_dir); let should_skip_filesize = if !is_dir && max_size.is_some() { skip_filesize(max_size.unwrap(), ent.path(), &ent.metadata().ok()) } else { false }; should_skip_path || should_skip_filesize } } impl Iterator for Walk { type Item = Result; #[inline(always)] fn next(&mut self) -> Option> { loop { let ev = match self.it.as_mut().and_then(|it| it.next()) { Some(ev) => ev, None => { match self.its.next() { None => return None, Some((_, None)) => { return Some(Ok(DirEntry::new_stdin())); } Some((path, Some(it))) => { self.it = Some(it); if self.parents && path.is_dir() { let (ig, err) = self.ig_root.add_parents(path); self.ig = ig; if let Some(err) = err { return Some(Err(err)); } } else { self.ig = self.ig_root.clone(); } } } continue; } }; match ev { Err(err) => { return Some(Err(Error::from_walkdir(err))); } Ok(WalkEvent::Exit) => { self.ig = self.ig.parent().unwrap(); } Ok(WalkEvent::Dir(ent)) => { if self.skip_entry(&ent) { self.it.as_mut().unwrap().it.skip_current_dir(); // Still need to push this on the stack because // we'll get a WalkEvent::Exit event for this dir. // We don't care if it errors though. let (igtmp, _) = self.ig.add_child(ent.path()); self.ig = igtmp; continue; } let (igtmp, err) = self.ig.add_child(ent.path()); self.ig = igtmp; return Some(Ok(DirEntry::new_walkdir(ent, err))); } Ok(WalkEvent::File(ent)) => { if self.skip_entry(&ent) { continue; } return Some(Ok(DirEntry::new_walkdir(ent, None))); } } } } } /// WalkEventIter transforms a WalkDir iterator into an iterator that more /// accurately describes the directory tree. Namely, it emits events that are /// one of three types: directory, file or "exit." An "exit" event means that /// the entire contents of a directory have been enumerated. struct WalkEventIter { depth: usize, it: walkdir::IntoIter, next: Option>, } #[derive(Debug)] enum WalkEvent { Dir(walkdir::DirEntry), File(walkdir::DirEntry), Exit, } impl From for WalkEventIter { fn from(it: WalkDir) -> WalkEventIter { WalkEventIter { depth: 0, it: it.into_iter(), next: None } } } impl Iterator for WalkEventIter { type Item = walkdir::Result; #[inline(always)] fn next(&mut self) -> Option> { let dent = self.next.take().or_else(|| self.it.next()); let depth = match dent { None => 0, Some(Ok(ref dent)) => dent.depth(), Some(Err(ref err)) => err.depth(), }; if depth < self.depth { self.depth -= 1; self.next = dent; return Some(Ok(WalkEvent::Exit)); } self.depth = depth; match dent { None => None, Some(Err(err)) => Some(Err(err)), Some(Ok(dent)) => { if dent.file_type().is_dir() { self.depth += 1; Some(Ok(WalkEvent::Dir(dent))) } else { Some(Ok(WalkEvent::File(dent))) } } } } } /// WalkState is used in the parallel recursive directory iterator to indicate /// whether walking should continue as normal, skip descending into a /// particular directory or quit the walk entirely. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum WalkState { /// Continue walking as normal. Continue, /// If the directory entry given is a directory, don't descend into it. /// In all other cases, this has no effect. Skip, /// Quit the entire iterator as soon as possible. /// /// Note that this is an inherently asynchronous action. It is possible /// for more entries to be yielded even after instructing the iterator /// to quit. Quit, } impl WalkState { fn is_quit(&self) -> bool { *self == WalkState::Quit } } /// WalkParallel is a parallel recursive directory iterator over files paths /// in one or more directories. /// /// Only file and directory paths matching the rules are returned. By default, /// ignore files like `.gitignore` are respected. The precise matching rules /// and precedence is explained in the documentation for `WalkBuilder`. /// /// Unlike `Walk`, this uses multiple threads for traversing a directory. pub struct WalkParallel { paths: vec::IntoIter, ig_root: Ignore, parents: bool, max_filesize: Option, max_depth: Option, follow_links: bool, threads: usize, } impl WalkParallel { /// Execute the parallel recursive directory iterator. `mkf` is called /// for each thread used for iteration. The function produced by `mkf` /// is then in turn called for each visited file path. pub fn run( self, mut mkf: F, ) where F: FnMut() -> Box) -> WalkState + Send + 'static> { let mut f = mkf(); let threads = self.threads(); let queue = Arc::new(MsQueue::new()); let mut any_work = false; // Send the initial set of root paths to the pool of workers. // Note that we only send directories. For files, we send to them the // callback directly. for path in self.paths { let dent = if path == Path::new("-") { DirEntry::new_stdin() } else { match DirEntryRaw::from_link(0, path) { Ok(dent) => DirEntry::new_raw(dent, None), Err(err) => { if f(Err(err)).is_quit() { return; } continue; } } }; queue.push(Message::Work(Work { dent: dent, ignore: self.ig_root.clone(), })); any_work = true; } // ... but there's no need to start workers if we don't need them. if !any_work { return; } // Create the workers and then wait for them to finish. let num_waiting = Arc::new(AtomicUsize::new(0)); let num_quitting = Arc::new(AtomicUsize::new(0)); let quit_now = Arc::new(AtomicBool::new(false)); let mut handles = vec![]; for _ in 0..threads { let worker = Worker { f: mkf(), queue: queue.clone(), quit_now: quit_now.clone(), is_waiting: false, is_quitting: false, num_waiting: num_waiting.clone(), num_quitting: num_quitting.clone(), threads: threads, parents: self.parents, max_depth: self.max_depth, max_filesize: self.max_filesize, follow_links: self.follow_links, }; handles.push(thread::spawn(|| worker.run())); } for handle in handles { handle.join().unwrap(); } } fn threads(&self) -> usize { if self.threads == 0 { 2 } else { self.threads } } } /// Message is the set of instructions that a worker knows how to process. enum Message { /// A work item corresponds to a directory that should be descended into. /// Work items for entries that should be skipped or ignored should not /// be produced. Work(Work), /// This instruction indicates that the worker should start quitting. Quit, } /// A unit of work for each worker to process. /// /// Each unit of work corresponds to a directory that should be descended /// into. struct Work { /// The directory entry. dent: DirEntry, /// Any ignore matchers that have been built for this directory's parents. ignore: Ignore, } impl Work { /// Returns true if and only if this work item is a directory. fn is_dir(&self) -> bool { self.dent.file_type().map_or(false, |t| t.is_dir()) } /// Adds ignore rules for parent directories. /// /// Note that this only applies to entries at depth 0. On all other /// entries, this is a no-op. fn add_parents(&mut self) -> Option { if self.dent.depth() > 0 { return None; } // At depth 0, the path of this entry is a root path, so we can // use it directly to add parent ignore rules. let (ig, err) = self.ignore.add_parents(self.dent.path()); self.ignore = ig; err } /// Reads the directory contents of this work item and adds ignore /// rules for this directory. /// /// If there was a problem with reading the directory contents, then /// an error is returned. If there was a problem reading the ignore /// rules for this directory, then the error is attached to this /// work item's directory entry. fn read_dir(&mut self) -> Result { let readdir = match fs::read_dir(self.dent.path()) { Ok(readdir) => readdir, Err(err) => { let err = Error::from(err) .with_path(self.dent.path()) .with_depth(self.dent.depth()); return Err(err); } }; let (ig, err) = self.ignore.add_child(self.dent.path()); self.ignore = ig; self.dent.err = err; Ok(readdir) } } /// A worker is responsible for descending into directories, updating the /// ignore matchers, producing new work and invoking the caller's callback. /// /// Note that a worker is *both* a producer and a consumer. struct Worker { /// The caller's callback. f: Box) -> WalkState + Send + 'static>, /// A queue of work items. This is multi-producer and multi-consumer. queue: Arc>, /// Whether all workers should quit at the next opportunity. Note that /// this is distinct from quitting because of exhausting the contents of /// a directory. Instead, this is used when the caller's callback indicates /// that the iterator should quit immediately. quit_now: Arc, /// Whether this worker is waiting for more work. is_waiting: bool, /// Whether this worker has started to quit. is_quitting: bool, /// The number of workers waiting for more work. num_waiting: Arc, /// The number of workers waiting to quit. num_quitting: Arc, /// The total number of workers. threads: usize, /// Whether to create ignore matchers for parents of caller specified /// directories. parents: bool, /// The maximum depth of directories to descend. A value of `0` means no /// descension at all. max_depth: Option, /// The maximum size a searched file can be (in bytes). If a file exceeds /// this size it will be skipped. max_filesize: Option, /// Whether to follow symbolic links or not. When this is enabled, loop /// detection is performed. follow_links: bool, } impl Worker { /// Runs this worker until there is no more work left to do. /// /// The worker will call the caller's callback for all entries that aren't /// skipped by the ignore matcher. fn run(mut self) { while let Some(mut work) = self.get_work() { // If the work is not a directory, then we can just execute the // caller's callback immediately and move on. if !work.is_dir() { if (self.f)(Ok(work.dent)).is_quit() { self.quit_now(); return; } continue; } if self.parents { if let Some(err) = work.add_parents() { if (self.f)(Err(err)).is_quit() { self.quit_now(); return; } } } let readdir = match work.read_dir() { Ok(readdir) => readdir, Err(err) => { if (self.f)(Err(err)).is_quit() { self.quit_now(); return; } continue; } }; let depth = work.dent.depth(); match (self.f)(Ok(work.dent)) { WalkState::Continue => {} WalkState::Skip => continue, WalkState::Quit => { self.quit_now(); return; } } if self.max_depth.map_or(false, |max| depth >= max) { continue; } for result in readdir { if self.run_one(&work.ignore, depth + 1, result).is_quit() { self.quit_now(); return; } } } } /// Runs the worker on a single entry from a directory iterator. /// /// If the entry is a path that should be ignored, then this is a no-op. /// Otherwise, the entry is pushed on to the queue. (The actual execution /// of the callback happens in `run`.) /// /// If an error occurs while reading the entry, then it is sent to the /// caller's callback. /// /// `ig` is the `Ignore` matcher for the parent directory. `depth` should /// be the depth of this entry. `result` should be the item yielded by /// a directory iterator. fn run_one( &mut self, ig: &Ignore, depth: usize, result: Result, ) -> WalkState { let fs_dent = match result { Ok(fs_dent) => fs_dent, Err(err) => { return (self.f)(Err(Error::from(err).with_depth(depth))); } }; let mut dent = match DirEntryRaw::from_entry(depth, &fs_dent) { Ok(dent) => DirEntry::new_raw(dent, None), Err(err) => { return (self.f)(Err(err)); } }; let is_symlink = dent.file_type().map_or(false, |ft| ft.is_symlink()); if self.follow_links && is_symlink { let path = dent.path().to_path_buf(); dent = match DirEntryRaw::from_link(depth, path) { Ok(dent) => DirEntry::new_raw(dent, None), Err(err) => { return (self.f)(Err(err)); } }; if dent.file_type().map_or(false, |ft| ft.is_dir()) { if let Err(err) = check_symlink_loop(ig, dent.path(), depth) { return (self.f)(Err(err)); } } } let is_dir = dent.file_type().map_or(false, |ft| ft.is_dir()); let max_size = self.max_filesize; let should_skip_path = skip_path(ig, dent.path(), is_dir); let should_skip_filesize = if !is_dir && max_size.is_some() { skip_filesize(max_size.unwrap(), dent.path(), &dent.metadata().ok()) } else { false }; if !should_skip_path && !should_skip_filesize { self.queue.push(Message::Work(Work { dent: dent, ignore: ig.clone(), })); } WalkState::Continue } /// Returns the next directory to descend into. /// /// If all work has been exhausted, then this returns None. The worker /// should then subsequently quit. fn get_work(&mut self) -> Option { loop { if self.is_quit_now() { return None; } match self.queue.try_pop() { Some(Message::Work(work)) => { self.waiting(false); self.quitting(false); return Some(work); } Some(Message::Quit) => { // We can't just quit because a Message::Quit could be // spurious. For example, it's possible to observe that // all workers are waiting even if there's more work to // be done. // // Therefore, we do a bit of a dance to wait until all // workers have signaled that they're ready to quit before // actually quitting. // // If the Quit message turns out to be spurious, then the // loop below will break and we'll go back to looking for // more work. self.waiting(true); self.quitting(true); while !self.is_quit_now() { let nwait = self.num_waiting(); let nquit = self.num_quitting(); // If the number of waiting workers dropped, then // abort our attempt to quit. if nwait < self.threads { break; } // If all workers are in this quit loop, then we // can stop. if nquit == self.threads { return None; } // Otherwise, spin. } } None => { self.waiting(true); self.quitting(false); if self.num_waiting() == self.threads { for _ in 0..self.threads { self.queue.push(Message::Quit); } } else { // You're right to consider this suspicious, but it's // a useful heuristic to permit producers to catch up // to consumers without burning the CPU. It is also // useful as a means to prevent burning the CPU if only // one worker is left doing actual work. It's not // perfect and it doesn't leave the CPU completely // idle, but it's not clear what else we can do. :-/ thread::sleep(Duration::from_millis(1)); } } } } } /// Indicates that all workers should quit immediately. fn quit_now(&self) { self.quit_now.store(true, Ordering::SeqCst); } /// Returns true if this worker should quit immediately. fn is_quit_now(&self) -> bool { self.quit_now.load(Ordering::SeqCst) } /// Returns the total number of workers waiting for work. fn num_waiting(&self) -> usize { self.num_waiting.load(Ordering::SeqCst) } /// Returns the total number of workers ready to quit. fn num_quitting(&self) -> usize { self.num_quitting.load(Ordering::SeqCst) } /// Sets this worker's "quitting" state to the value of `yes`. fn quitting(&mut self, yes: bool) { if yes { if !self.is_quitting { self.is_quitting = true; self.num_quitting.fetch_add(1, Ordering::SeqCst); } } else { if self.is_quitting { self.is_quitting = false; self.num_quitting.fetch_sub(1, Ordering::SeqCst); } } } /// Sets this worker's "waiting" state to the value of `yes`. fn waiting(&mut self, yes: bool) { if yes { if !self.is_waiting { self.is_waiting = true; self.num_waiting.fetch_add(1, Ordering::SeqCst); } } else { if self.is_waiting { self.is_waiting = false; self.num_waiting.fetch_sub(1, Ordering::SeqCst); } } } } fn check_symlink_loop( ig_parent: &Ignore, child_path: &Path, child_depth: usize, ) -> Result<(), Error> { let hchild = Handle::from_path(child_path).map_err(|err| { Error::from(err).with_path(child_path).with_depth(child_depth) })?; for ig in ig_parent.parents().take_while(|ig| !ig.is_absolute_parent()) { let h = Handle::from_path(ig.path()).map_err(|err| { Error::from(err).with_path(child_path).with_depth(child_depth) })?; if hchild == h { return Err(Error::Loop { ancestor: ig.path().to_path_buf(), child: child_path.to_path_buf(), }.with_depth(child_depth)); } } Ok(()) } // Before calling this function, make sure that you ensure that is really // necessary as the arguments imply a file stat. fn skip_filesize( max_filesize: u64, path: &Path, ent: &Option ) -> bool { let filesize = match *ent { Some(ref md) => Some(md.len()), None => None }; if let Some(fs) = filesize { if fs > max_filesize { debug!("ignoring {}: {} bytes", path.display(), fs); true } else { false } } else { false } } fn skip_path(ig: &Ignore, path: &Path, is_dir: bool) -> bool { let m = ig.matched(path, is_dir); if m.is_ignore() { debug!("ignoring {}: {:?}", path.display(), m); true } else if m.is_whitelist() { debug!("whitelisting {}: {:?}", path.display(), m); false } else { false } } #[cfg(test)] mod tests { use std::fs::{self, File}; use std::io::Write; use std::path::Path; use std::sync::{Arc, Mutex}; use tempdir::TempDir; use super::{WalkBuilder, WalkState}; fn wfile>(path: P, contents: &str) { let mut file = File::create(path).unwrap(); file.write_all(contents.as_bytes()).unwrap(); } fn wfile_size>(path: P, size: u64) { let file = File::create(path).unwrap(); file.set_len(size).unwrap(); } #[cfg(unix)] fn symlink, Q: AsRef>(src: P, dst: Q) { use std::os::unix::fs::symlink; symlink(src, dst).unwrap(); } fn mkdirp>(path: P) { fs::create_dir_all(path).unwrap(); } fn normal_path(unix: &str) -> String { if cfg!(windows) { unix.replace("\\", "/") } else { unix.to_string() } } fn walk_collect(prefix: &Path, builder: &WalkBuilder) -> Vec { let mut paths = vec![]; for result in builder.build() { let dent = match result { Err(_) => continue, Ok(dent) => dent, }; let path = dent.path().strip_prefix(prefix).unwrap(); if path.as_os_str().is_empty() { continue; } paths.push(normal_path(path.to_str().unwrap())); } paths.sort(); paths } fn walk_collect_parallel( prefix: &Path, builder: &WalkBuilder, ) -> Vec { let paths = Arc::new(Mutex::new(vec![])); let prefix = Arc::new(prefix.to_path_buf()); builder.build_parallel().run(|| { let paths = paths.clone(); let prefix = prefix.clone(); Box::new(move |result| { let dent = match result { Err(_) => return WalkState::Continue, Ok(dent) => dent, }; let path = dent.path().strip_prefix(&**prefix).unwrap(); if path.as_os_str().is_empty() { return WalkState::Continue; } let mut paths = paths.lock().unwrap(); paths.push(normal_path(path.to_str().unwrap())); WalkState::Continue }) }); let mut paths = paths.lock().unwrap(); paths.sort(); paths.to_vec() } fn mkpaths(paths: &[&str]) -> Vec { let mut paths: Vec<_> = paths.iter().map(|s| s.to_string()).collect(); paths.sort(); paths } fn assert_paths( prefix: &Path, builder: &WalkBuilder, expected: &[&str], ) { let got = walk_collect(prefix, builder); assert_eq!(got, mkpaths(expected)); let got = walk_collect_parallel(prefix, builder); assert_eq!(got, mkpaths(expected)); } #[test] fn no_ignores() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a/b/c")); mkdirp(td.path().join("x/y")); wfile(td.path().join("a/b/foo"), ""); wfile(td.path().join("x/y/foo"), ""); assert_paths(td.path(), &WalkBuilder::new(td.path()), &[ "x", "x/y", "x/y/foo", "a", "a/b", "a/b/foo", "a/b/c", ]); } #[test] fn gitignore() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a")); wfile(td.path().join(".gitignore"), "foo"); wfile(td.path().join("foo"), ""); wfile(td.path().join("a/foo"), ""); wfile(td.path().join("bar"), ""); wfile(td.path().join("a/bar"), ""); assert_paths(td.path(), &WalkBuilder::new(td.path()), &[ "bar", "a", "a/bar", ]); } #[test] fn explicit_ignore() { let td = TempDir::new("walk-test-").unwrap(); let igpath = td.path().join(".not-an-ignore"); mkdirp(td.path().join("a")); wfile(&igpath, "foo"); wfile(td.path().join("foo"), ""); wfile(td.path().join("a/foo"), ""); wfile(td.path().join("bar"), ""); wfile(td.path().join("a/bar"), ""); let mut builder = WalkBuilder::new(td.path()); assert!(builder.add_ignore(&igpath).is_none()); assert_paths(td.path(), &builder, &["bar", "a", "a/bar"]); } #[test] fn gitignore_parent() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a")); wfile(td.path().join(".gitignore"), "foo"); wfile(td.path().join("a/foo"), ""); wfile(td.path().join("a/bar"), ""); let root = td.path().join("a"); assert_paths(&root, &WalkBuilder::new(&root), &["bar"]); } #[test] fn max_depth() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a/b/c")); wfile(td.path().join("foo"), ""); wfile(td.path().join("a/foo"), ""); wfile(td.path().join("a/b/foo"), ""); wfile(td.path().join("a/b/c/foo"), ""); let mut builder = WalkBuilder::new(td.path()); assert_paths(td.path(), &builder, &[ "a", "a/b", "a/b/c", "foo", "a/foo", "a/b/foo", "a/b/c/foo", ]); assert_paths(td.path(), builder.max_depth(Some(0)), &[]); assert_paths(td.path(), builder.max_depth(Some(1)), &["a", "foo"]); assert_paths(td.path(), builder.max_depth(Some(2)), &[ "a", "a/b", "foo", "a/foo", ]); } #[test] fn max_filesize() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a/b")); wfile_size(td.path().join("foo"), 0); wfile_size(td.path().join("bar"), 400); wfile_size(td.path().join("baz"), 600); wfile_size(td.path().join("a/foo"), 600); wfile_size(td.path().join("a/bar"), 500); wfile_size(td.path().join("a/baz"), 200); let mut builder = WalkBuilder::new(td.path()); assert_paths(td.path(), &builder, &[ "a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz", ]); assert_paths(td.path(), builder.max_filesize(Some(0)), &[ "a", "a/b", "foo" ]); assert_paths(td.path(), builder.max_filesize(Some(500)), &[ "a", "a/b", "foo", "bar", "a/bar", "a/baz" ]); assert_paths(td.path(), builder.max_filesize(Some(50000)), &[ "a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz", ]); } #[cfg(unix)] // because symlinks on windows are weird #[test] fn symlinks() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a/b")); symlink(td.path().join("a/b"), td.path().join("z")); wfile(td.path().join("a/b/foo"), ""); let mut builder = WalkBuilder::new(td.path()); assert_paths(td.path(), &builder, &[ "a", "a/b", "a/b/foo", "z", ]); assert_paths(td.path(), &builder.follow_links(true), &[ "a", "a/b", "a/b/foo", "z", "z/foo", ]); } #[cfg(unix)] // because symlinks on windows are weird #[test] fn symlink_loop() { let td = TempDir::new("walk-test-").unwrap(); mkdirp(td.path().join("a/b")); symlink(td.path().join("a"), td.path().join("a/b/c")); let mut builder = WalkBuilder::new(td.path()); assert_paths(td.path(), &builder, &[ "a", "a/b", "a/b/c", ]); assert_paths(td.path(), &builder.follow_links(true), &[ "a", "a/b", ]); } }