diff options
author | Jon Moroney <darakian@gmail.com> | 2020-08-02 15:41:03 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-02 15:41:03 -0700 |
commit | 89b2e8d439854f1379d97a8f159ae7c3c8a18933 (patch) | |
tree | 14974e15f898aadd7f624b95c269316e93023657 | |
parent | 7035276acddb21e9b13bc4350fd8d35c0e26ddc6 (diff) | |
parent | cb44786864148be2664e9fc6489ebf0029080cdb (diff) |
Merge pull request #35 from darakian/factor-out-fileinfo
Factor out fileinfo to its own file
-rw-r--r-- | src/fileinfo.rs | 211 | ||||
-rw-r--r-- | src/lib.rs | 212 | ||||
-rw-r--r-- | src/main.rs | 2 |
3 files changed, 214 insertions, 211 deletions
diff --git a/src/fileinfo.rs b/src/fileinfo.rs new file mode 100644 index 0000000..a0088f4 --- /dev/null +++ b/src/fileinfo.rs @@ -0,0 +1,211 @@ +use serde::ser::{Serialize, Serializer, SerializeStruct}; +use siphasher::sip128::Hasher128; +use std::hash::Hasher; +use std::path::PathBuf; +use std::cmp::Ordering; +use std::io::Read; +use std::fs; + +const BLOCK_SIZE: usize = 4096; + +#[derive(PartialEq)] +pub enum HashMode{ + Full, + Partial +} + +/// Serializable struct containing entries for a specific file. These structs will identify individual files as a collection of paths and associated hash and length data. +#[derive(Debug)] +pub struct Fileinfo{ + full_hash: Option<u128>, + partial_hash: Option<u128>, + file_length: u64, + pub(crate) file_paths: Vec<PathBuf>, +} + +impl Fileinfo{ + /// Creates a new Fileinfo collection struct. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// Fileinfo::new( + /// None, + /// None, + /// 3, + /// Path::new("./foo/bar.txt").to_path_buf() + /// ); + /// ``` + pub fn new(full_hash: Option<u128>, partial_hash: Option<u128>, length: u64, path: PathBuf) -> Self{ + Fileinfo{full_hash: full_hash, partial_hash: partial_hash, file_length: length, file_paths: vec![path]} + } + /// Gets the length of the files in the current collection. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); + /// let len = fi.get_length(); + /// assert_eq!(3, len); + /// ``` + pub fn get_length(&self) -> u64{ + self.file_length + } + /// Gets the hash of the full file if available. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// let fi = Fileinfo::new(Some(123), None, 3, Path::new("./foo/bar.txt").to_path_buf()); + /// let f_hash = fi.get_full_hash(); + /// assert_eq!(Some(123), f_hash); + /// ``` + pub fn get_full_hash(&self) -> Option<u128>{ + self.full_hash + } + pub(crate) fn set_full_hash(&mut self, hash: Option<u128>) -> (){ + self.full_hash = hash + } + /// Gets the hash of the partially read file if available. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// let fi = Fileinfo::new(None, Some(123), 3, Path::new("./foo/bar.txt").to_path_buf()); + /// let p_hash = fi.get_partial_hash(); + /// assert_eq!(Some(123), p_hash); + /// ``` + pub fn get_partial_hash(&self) -> Option<u128>{ + self.partial_hash + } + pub(crate) fn set_partial_hash(&mut self, hash: Option<u128>) -> (){ + self.partial_hash = hash + } + /// Gets a candidate name. This will be the name of the first file inserted into the collection and so can vary. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); + /// let some_name = fi.get_candidate_name(); + /// assert_eq!("bar.txt", some_name) + /// ``` + pub fn get_candidate_name(&self) -> &str{ + self.file_paths + .iter() + .next() + .unwrap() + .to_str() + .unwrap() + .rsplit("/") + .next() + .unwrap() + } + /// Gets all paths in the current collection. This can be used to get the names of each file with the string `rsplit("/")` method. + /// + /// # Examples + /// ``` + /// use std::path::Path; + /// use ddh::fileinfo::Fileinfo; + /// + /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); + /// let all_files = fi.get_paths(); + /// assert_eq!(&vec![Path::new("./foo/bar.txt").to_path_buf()], + /// all_files); + /// ``` + pub fn get_paths(&self) -> &Vec<PathBuf>{ + return &self.file_paths + } + + pub fn generate_hash(&mut self, mode: HashMode) -> Option<u128>{ + let mut hasher = siphasher::sip128::SipHasher::new(); + match fs::File::open( + self.file_paths + .iter() + .next() + .expect("Cannot read file path from struct") + ) { + Ok(mut f) => { + /* We want a read call to be "large" for two reasons + 1) Force filesystem read ahead behavior + 2) Fewer system calls for a given file. + Currently 16KB */ + let mut hash_buffer = [0;BLOCK_SIZE * 4]; + loop { + match f.read(&mut hash_buffer) { + Ok(n) if n>0 => hasher.write(&hash_buffer), + Ok(n) if n==0 => break, + Err(_e) => { + return None + }, + _ => panic!("Negative length read in hashing"), + } + if mode == HashMode::Partial{ + return Some(hasher.finish128().into()); + } + } + return Some(hasher.finish128().into()); + } + Err(_e) => { + return None + } + } + } +} + +impl Serialize for Fileinfo{ + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: Serializer, + { + let mut state = serializer.serialize_struct("Fileinfo", 4)?; + state.serialize_field("partial_hash", &self.partial_hash)?; + state.serialize_field("full_hash", &self.full_hash)?; + state.serialize_field("file_length", &self.file_length)?; + state.serialize_field("file_paths", &self.file_paths)?; + state.end() + } +} + +impl PartialEq for Fileinfo{ + fn eq(&self, other: &Fileinfo) -> bool { + (self.file_length==other.file_length)&& + (self.partial_hash==other.partial_hash)&& + (self.full_hash==other.full_hash) + } +} +impl Eq for Fileinfo{} + +impl PartialOrd for Fileinfo{ + fn partial_cmp(&self, other: &Fileinfo) -> Option<Ordering>{ + if self.full_hash.is_some() && other.full_hash.is_some(){ + Some(self.full_hash.cmp(&other.full_hash)) + } else if self.partial_hash.is_some() && other.partial_hash.is_some(){ + Some(self.partial_hash.cmp(&other.partial_hash)) + } else { + Some(self.file_length.cmp(&other.file_length)) + } + } +} + +impl Ord for Fileinfo{ + fn cmp(&self, other: &Fileinfo) -> Ordering { + if self.full_hash.is_some() && other.full_hash.is_some(){ + self.full_hash.cmp(&other.full_hash) + } else if self.partial_hash.is_some() && other.partial_hash.is_some(){ + self.partial_hash.cmp(&other.partial_hash) + } else { + self.file_length.cmp(&other.file_length) + } + } +} @@ -2,230 +2,22 @@ //! //! `ddh` is a collection of functions and structs to aid in analysing filesystem directories. -pub mod utils; +pub mod fileinfo; +use fileinfo::{Fileinfo, HashMode}; -use std::hash::{Hasher}; use std::fs::{self, DirEntry}; -use std::io::{Read}; use std::path::{PathBuf, Path}; -use std::cmp::Ordering; -use serde::ser::{Serialize, Serializer, SerializeStruct}; -use siphasher::sip128::Hasher128; use rayon::prelude::*; use std::sync::mpsc::{Sender, channel}; use std::collections::hash_map::{HashMap, Entry}; use std::io::{Error, ErrorKind}; use nohash_hasher::IntMap; -const BLOCK_SIZE: usize = 4096; - -#[derive(PartialEq)] -enum HashMode{ - Full, - Partial -} - enum ChannelPackage{ Success(Fileinfo), Fail(PathBuf, std::io::Error), } -/// Serializable struct containing entries for a specific file. These structs will identify individual files as a collection of paths and associated hash and length data. -#[derive(Debug)] -pub struct Fileinfo{ - full_hash: Option<u128>, - partial_hash: Option<u128>, - file_length: u64, - file_paths: Vec<PathBuf>, -} - -impl Fileinfo{ - /// Creates a new Fileinfo collection struct. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// Fileinfo::new( - /// None, - /// None, - /// 3, - /// Path::new("./foo/bar.txt").to_path_buf() - /// ); - /// ``` - pub fn new(full_hash: Option<u128>, partial_hash: Option<u128>, length: u64, path: PathBuf) -> Self{ - Fileinfo{full_hash: full_hash, partial_hash: partial_hash, file_length: length, file_paths: vec![path]} - } - /// Gets the length of the files in the current collection. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); - /// let len = fi.get_length(); - /// assert_eq!(3, len); - /// ``` - pub fn get_length(&self) -> u64{ - self.file_length - } - /// Gets the hash of the full file if available. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// let fi = Fileinfo::new(Some(123), None, 3, Path::new("./foo/bar.txt").to_path_buf()); - /// let f_hash = fi.get_full_hash(); - /// assert_eq!(Some(123), f_hash); - /// ``` - pub fn get_full_hash(&self) -> Option<u128>{ - self.full_hash - } - fn set_full_hash(&mut self, hash: Option<u128>) -> (){ - self.full_hash = hash - } - /// Gets the hash of the partially read file if available. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// let fi = Fileinfo::new(None, Some(123), 3, Path::new("./foo/bar.txt").to_path_buf()); - /// let p_hash = fi.get_partial_hash(); - /// assert_eq!(Some(123), p_hash); - /// ``` - pub fn get_partial_hash(&self) -> Option<u128>{ - self.partial_hash - } - fn set_partial_hash(&mut self, hash: Option<u128>) -> (){ - self.partial_hash = hash - } - /// Gets a candidate name. This will be the name of the first file inserted into the collection and so can vary. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); - /// let some_name = fi.get_candidate_name(); - /// assert_eq!("bar.txt", some_name) - /// ``` - pub fn get_candidate_name(&self) -> &str{ - self.file_paths - .iter() - .next() - .unwrap() - .to_str() - .unwrap() - .rsplit("/") - .next() - .unwrap() - } - /// Gets all paths in the current collection. This can be used to get the names of each file with the string `rsplit("/")` method. - /// - /// # Examples - /// ``` - /// use std::path::Path; - /// use ddh::Fileinfo; - /// - /// let fi = Fileinfo::new(None, None, 3, Path::new("./foo/bar.txt").to_path_buf()); - /// let all_files = fi.get_paths(); - /// assert_eq!(&vec![Path::new("./foo/bar.txt").to_path_buf()], - /// all_files); - /// ``` - pub fn get_paths(&self) -> &Vec<PathBuf>{ - return &self.file_paths - } - - fn generate_hash(&mut self, mode: HashMode) -> Option<u128>{ - let mut hasher = siphasher::sip128::SipHasher::new(); - match fs::File::open( - self.file_paths - .iter() - .next() - .expect("Cannot read file path from struct") - ) { - Ok(mut f) => { - /* We want a read call to be "large" for two reasons - 1) Force filesystem read ahead behavior - 2) Fewer system calls for a given file. - Currently 16KB */ - let mut hash_buffer = [0;BLOCK_SIZE * 4]; - loop { - match f.read(&mut hash_buffer) { - Ok(n) if n>0 => hasher.write(&hash_buffer), - Ok(n) if n==0 => break, - Err(_e) => { - return None - }, - _ => panic!("Negative length read in hashing"), - } - if mode == HashMode::Partial{ - return Some(hasher.finish128().into()); - } - } - return Some(hasher.finish128().into()); - } - Err(_e) => { - return None - } - } - } -} - -impl Serialize for Fileinfo{ - fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> - where - S: Serializer, - { - let mut state = serializer.serialize_struct("Fileinfo", 4)?; - state.serialize_field("partial_hash", &self.partial_hash)?; - state.serialize_field("full_hash", &self.full_hash)?; - state.serialize_field("file_length", &self.file_length)?; - state.serialize_field("file_paths", &self.file_paths)?; - state.end() - } -} - -impl PartialEq for Fileinfo{ - fn eq(&self, other: &Fileinfo) -> bool { - (self.file_length==other.file_length)&& - (self.partial_hash==other.partial_hash)&& - (self.full_hash==other.full_hash) - } -} -impl Eq for Fileinfo{} - -impl PartialOrd for Fileinfo{ - fn partial_cmp(&self, other: &Fileinfo) -> Option<Ordering>{ - if self.full_hash.is_some() && other.full_hash.is_some(){ - Some(self.full_hash.cmp(&other.full_hash)) - } else if self.partial_hash.is_some() && other.partial_hash.is_some(){ - Some(self.partial_hash.cmp(&other.partial_hash)) - } else { - Some(self.file_length.cmp(&other.file_length)) - } - } -} - -impl Ord for Fileinfo{ - fn cmp(&self, other: &Fileinfo) -> Ordering { - if self.full_hash.is_some() && other.full_hash.is_some(){ - self.full_hash.cmp(&other.full_hash) - } else if self.partial_hash.is_some() && other.partial_hash.is_some(){ - self.partial_hash.cmp(&other.partial_hash) - } else { - self.file_length.cmp(&other.file_length) - } - } -} - /// Constructs a list of unique files from a list of directories. /// /// # Examples diff --git a/src/main.rs b/src/main.rs index adb56b7..3fcbfc6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,7 +3,7 @@ use std::fs::{self}; use std::io::prelude::*; use clap::{Arg, App}; use rayon::prelude::*; -use ddh::{Fileinfo}; +use ddh::fileinfo::{Fileinfo}; use std::path::{PathBuf}; #[derive(Debug, Copy, Clone)] |