summaryrefslogtreecommitdiffstats
path: root/src/file_sum
diff options
context:
space:
mode:
authorCanop <cano.petrole@gmail.com>2020-06-25 18:00:06 +0200
committerCanop <cano.petrole@gmail.com>2020-06-25 18:00:06 +0200
commitf35ee714052f0bdbe3dbd6cc673e883be882fe1f (patch)
treeaabdf2f2ecf03d6e9d4964dd642de7c486cc2eda /src/file_sum
parent3bc5358107189817b93e6b575e358dbd44d64703 (diff)
deep last modified date computed in background on directories
Diffstat (limited to 'src/file_sum')
-rw-r--r--src/file_sum/mod.rs150
-rw-r--r--src/file_sum/sum_computation.rs152
2 files changed, 302 insertions, 0 deletions
diff --git a/src/file_sum/mod.rs b/src/file_sum/mod.rs
new file mode 100644
index 0000000..fa6eaf5
--- /dev/null
+++ b/src/file_sum/mod.rs
@@ -0,0 +1,150 @@
+/// compute the summed size of directories.
+/// A cache is used to avoid recomputing the
+/// same directories again and again.
+/// Hard links are checked to avoid counting
+/// twice an inode.
+///
+
+mod sum_computation;
+
+use {
+ crate::task_sync::Dam,
+ std::{
+ collections::HashMap,
+ fs::Metadata,
+ ops::AddAssign,
+ path::{Path, PathBuf},
+ sync::Mutex,
+ time::UNIX_EPOCH,
+ },
+};
+
// unit suffixes used by to_size_string, for sizes in powers of 1000
const SUM_NAMES: &[&str] = &["", "K", "M", "G", "T", "P", "E", "Z", "Y"];

lazy_static! {
    // process-wide cache mapping a directory path to its computed sum;
    // filled by FileSum::from_dir and emptied by clear_cache
    static ref SUM_CACHE_MUTEX: Mutex<HashMap<PathBuf, FileSum>> = Mutex::new(HashMap::new());
}
+
+pub fn clear_cache() {
+ let mut sum_cache = SUM_CACHE_MUTEX.lock().unwrap();
+ sum_cache.clear();
+}
+
/// Extract a file's last modification time as whole seconds since
/// the Unix Epoch, or 0 when it can't be determined (no mtime
/// available, or an mtime before the Epoch).
pub fn extract_seconds(md: &Metadata) -> u64 {
    md.modified()
        .ok()
        .and_then(|system_time| system_time.duration_since(UNIX_EPOCH).ok())
        .map_or(0, |duration| duration.as_secs())
}
+
/// Consolidated numbers for a file or a whole directory:
/// disk usage, file count and most recent modification date.
#[derive(Debug, Copy, Clone)]
pub struct FileSum {
    real_size: u64, // bytes, the space it takes on disk
    sparse: bool, // only for non directories: tells whether the file is sparse
    count: usize, // number of files
    modified: u64, // seconds from Epoch to last modification, or 0 if there was an error
}
+
+impl FileSum {
+ pub fn new(
+ real_size: u64,
+ sparse: bool,
+ count: usize,
+ modified: u64,
+ ) -> Self {
+ Self { real_size, sparse, count, modified }
+ }
+
+ pub fn zero() -> Self {
+ Self::new(0, false, 0, 0)
+ }
+
+ pub fn incr(&mut self) {
+ self.count += 1;
+ }
+
+ /// return the sum of the given file, which is assumed
+ /// to be a normal file (ie not a directory)
+ pub fn from_file(path: &Path) -> Self {
+ sum_computation::compute_file_sum(path)
+ }
+
+ /// Return the sum of the directory, either by computing it of by
+ /// fetching it from cache.
+ /// If the lifetime expires before complete computation, None is returned.
+ pub fn from_dir(path: &Path, dam: &Dam) -> Option<Self> {
+ let mut sum_cache = SUM_CACHE_MUTEX.lock().unwrap();
+ match sum_cache.get(path) {
+ Some(sum) => Some(*sum),
+ None => {
+ let sum = time!(
+ Debug,
+ "sum computation",
+ path,
+ sum_computation::compute_dir_sum(path, dam),
+ );
+ if let Some(sum) = sum {
+ sum_cache.insert(PathBuf::from(path), sum);
+ }
+ sum
+ }
+ }
+ }
+
+ pub fn part_of_size(self, total: Self) -> f32 {
+ if total.real_size == 0 {
+ 0.0
+ } else {
+ self.real_size as f32 / total.real_size as f32
+ }
+ }
+ /// format a number of bytes as a string, for example 247K
+ pub fn to_size_string(self) -> String {
+ let mut v = self.real_size;
+ let mut i = 0;
+ while v >= 5000 && i < SUM_NAMES.len() - 1 {
+ v /= 1000;
+ i += 1;
+ }
+ format!("{}{}", v, &SUM_NAMES[i])
+ }
+ /// return the number of files (normally at least 1)
+ pub fn to_count(self) -> usize {
+ self.count
+ }
+ /// return the number of seconds from Epoch to last modification,
+ /// or 0 if the computation failed
+ pub fn to_seconds(self) -> u64 {
+ self.modified
+ }
+ /// return the size in bytes
+ pub fn to_size(self) -> u64 {
+ self.real_size
+ }
+ pub fn to_valid_seconds(self) -> Option<i64> {
+ if self.modified != 0 {
+ Some(self.modified as i64)
+ } else {
+ None
+ }
+ }
+ /// tell whether the file has holes (in which case the size displayed by
+ /// other tools may be greater than the "real" one returned by broot).
+ /// Not computed (return false) on windows or for directories.
+ pub fn is_sparse(self) -> bool {
+ self.sparse
+ }
+}
+
+impl AddAssign for FileSum {
+ #[allow(clippy::suspicious_op_assign_impl)]
+ fn add_assign(&mut self, other: Self) {
+ *self = Self::new(
+ self.real_size + other.real_size,
+ self.sparse | other.sparse,
+ self.count + other.count,
+ self.modified.max(other.modified),
+ );
+ }
+}
+
diff --git a/src/file_sum/sum_computation.rs b/src/file_sum/sum_computation.rs
new file mode 100644
index 0000000..6e7a099
--- /dev/null
+++ b/src/file_sum/sum_computation.rs
@@ -0,0 +1,152 @@
+use {
+ super::{extract_seconds, FileSum},
+ crate::task_sync::Dam,
+ crossbeam::channel,
+ std::{
+ collections::HashSet,
+ fs,
+ path::{Path, PathBuf},
+ sync::{
+ atomic::{AtomicIsize, Ordering},
+ Arc, Mutex,
+ },
+ thread,
+ },
+};
+
+#[cfg(unix)]
+use std::os::unix::fs::MetadataExt;
+
+const THREADS_COUNT: usize = 8;
+
+/// compute the consolidated numbers for a directory, with implementation
+/// varying depending on the OS:
+/// On unix, the computation is done on blocks of 512 bytes
+/// see https://doc.rust-lang.org/std/os/unix/fs/trait.MetadataExt.html#tymethod.blocks
/// compute the consolidated numbers for a directory, with implementation
/// varying depending on the OS:
/// On unix, the computation is done on blocks of 512 bytes
/// see https://doc.rust-lang.org/std/os/unix/fs/trait.MetadataExt.html#tymethod.blocks
///
/// Returns None when the dam receives an event before the computation
/// is finished (the partial result would be meaningless).
pub fn compute_dir_sum(path: &Path, dam: &Dam) -> Option<FileSum> {
    //debug!("compute size of dir {:?} --------------- ", path);

    // to avoid counting twice an inode, we store them in a set
    #[cfg(unix)]
    let inodes = Arc::new(Mutex::new(HashSet::<u64>::default()));

    // this MPMC channel contains the directory paths which must be handled.
    // A None means there's nothing left and the thread may send its result and stop
    let (dirs_sender, dirs_receiver) = channel::unbounded();

    // this MPMC channel is here for the threads to send their results
    // at end of computation
    let (thread_sum_sender, thread_sum_receiver) = channel::bounded(THREADS_COUNT);

    // busy is the number of directories which are either being processed or queued
    // We use this count to determine when threads can stop waiting for tasks
    let busy = Arc::new(AtomicIsize::new(1));
    dirs_sender.send(Some(PathBuf::from(path))).unwrap();

    // Each thread does a summation without merge and the data are merged
    // at the end (this avoids waiting for a mutex during computation)
    for _ in 0..THREADS_COUNT {
        let busy = Arc::clone(&busy);
        let (dirs_sender, dirs_receiver) = (dirs_sender.clone(), dirs_receiver.clone());

        #[cfg(unix)]
        let inodes = inodes.clone();

        // the observer lets the thread notice the computation was cancelled
        let observer = dam.observer();
        let thread_sum_sender = thread_sum_sender.clone();
        thread::spawn(move || {
            let mut thread_sum = FileSum::zero();
            loop {
                let o = dirs_receiver.recv();
                if let Ok(Some(open_dir)) = o {
                    if let Ok(entries) = fs::read_dir(&open_dir) {
                        for e in entries.flatten() {
                            if let Ok(md) = e.metadata() {
                                if md.is_dir() {
                                    // we add the directory to the channel of dirs needing
                                    // processing
                                    busy.fetch_add(1, Ordering::Relaxed);
                                    dirs_sender.send(Some(e.path())).unwrap();
                                } else {

                                    // hard links: count a multi-linked inode only once
                                    #[cfg(unix)]
                                    if md.nlink() > 1 {
                                        let mut inodes = inodes.lock().unwrap();
                                        if !inodes.insert(md.ino()) {
                                            // it was already in the set
                                            continue;
                                        }
                                    }

                                }

                                // NOTE(review): this accounting runs for sub-directory
                                // entries too (their own metadata is summed), not only
                                // for plain files — confirm this is intended
                                #[cfg(unix)]
                                let size = md.blocks() * 512;

                                #[cfg(not(unix))]
                                let size = md.len();

                                let seconds = extract_seconds(&md);
                                let entry_sum = FileSum::new(size, false, 1, seconds);
                                thread_sum += entry_sum;
                            } else {
                                // we can't measure much but we can count the file
                                thread_sum.incr();
                            }
                        }
                    }
                    // the directory we just handled is no longer busy
                    busy.fetch_sub(1, Ordering::Relaxed);
                }
                if busy.load(Ordering::Relaxed) < 1 {
                    dirs_sender.send(None).unwrap(); // to unlock the next waiting thread
                    break;
                }
                if observer.has_event() {
                    break; // computation cancelled
                }
            }
            thread_sum_sender.send(thread_sum).unwrap();
        });
    }
    // Wait for the threads to finish and consolidate their results.
    // The root directory's own metadata is added here, since the workers
    // only sum the entries found inside directories.
    let mut sum = compute_file_sum(path);
    for _ in 0..THREADS_COUNT {
        match thread_sum_receiver.recv() {
            Ok(thread_sum) => {
                sum += thread_sum;
            }
            Err(e) => {
                warn!("Error while recv summing thread result : {:?}", e);
            }
        }
    }
    if dam.has_event() {
        // cancelled: the (possibly partial) result must not be used
        return None;
    }
    Some(sum)
}
+
+/// compute the sum for a regular file (not a folder)
+pub fn compute_file_sum(path: &Path) -> FileSum {
+ match fs::metadata(path) {
+ Ok(md) => {
+ let seconds = extract_seconds(&md);
+
+ #[cfg(unix)]
+ {
+ let nominal_size = md.size();
+ let block_size = md.blocks() * 512;
+ FileSum::new(
+ block_size.min(nominal_size),
+ block_size < nominal_size,
+ 1,
+ seconds,
+ )
+ }
+
+ #[cfg(not(unix))]
+ FileSum::new(md.len(), false, 1, seconds)
+ }
+ Err(_) => FileSum::new(0, false, 1, 0),
+ }
+}