diff options
author | Vladimir Feinberg <vladimir.feinberg@gmail.com> | 2021-06-28 21:53:15 -0700 |
---|---|---|
committer | Vladimir Feinberg <vladimir.feinberg@gmail.com> | 2021-06-28 21:53:15 -0700 |
commit | fdde7e7a6ec3a14046dabc30de7051fab87c513f (patch) | |
tree | 9bd41c3939195d8e4f9cf58567a83c2892afc646 | |
parent | f22d454936fac7dfc6a91abae0809708d7f6e472 (diff) |
extend to multiple files
-rw-r--r-- | slb/src/fileblocks.rs | 24
-rw-r--r-- | slb/src/main.rs | 9
-rwxr-xr-x | test.sh | 11
3 files changed, 40 insertions, 4 deletions
diff --git a/slb/src/fileblocks.rs b/slb/src/fileblocks.rs
index da557e7..2cee2fa 100644
--- a/slb/src/fileblocks.rs
+++ b/slb/src/fileblocks.rs
@@ -50,6 +50,30 @@ impl FileChunk {
     }
 }
 
+/// Uses up to `max_chunks + paths.len()` chunks to chunkify multiple files.
+pub fn chunkify_multiple(paths: &[PathBuf], max_chunks: usize, min_size: usize) -> Vec<FileChunk> {
+    assert!(max_chunks > 0);
+    let sizes: Vec<usize> = paths
+        .iter()
+        .map(|path| {
+            fs::metadata(path)
+                .expect("metadata")
+                .len()
+                .try_into()
+                .unwrap()
+        })
+        .collect();
+    let avg_size = (sizes.iter().copied().sum::<usize>() / paths.len()).max(1);
+    paths
+        .iter()
+        .zip(sizes.into_iter())
+        .flat_map(|(path, sz)| {
+            let desired_chunks: usize = (sz / avg_size).max(1);
+            chunkify(&path, desired_chunks, min_size).into_iter()
+        })
+        .collect()
+}
+
 /// Returns a list of up to `max_chunks` file chunks splitting up the given
 /// file, roughly of the same size, which should be rougly at least
 /// `min_size`, newline aligned.
diff --git a/slb/src/main.rs b/slb/src/main.rs
index 508eea3..5036a3e 100644
--- a/slb/src/main.rs
+++ b/slb/src/main.rs
@@ -83,10 +83,9 @@ struct Opt {
     #[structopt(long)]
     folder: String,
 
-    // TODO: support multiple infiles and chunkify across them
-    /// The input file to read lines from.
+    /// The input files to read lines from.
     #[structopt(long)]
-    infile: PathBuf,
+    infile: Vec<PathBuf>,
 
     /// Output file prefixes.
     #[structopt(long)]
@@ -122,6 +121,7 @@ fn main() {
     let bufsize = opt.bufsize.unwrap_or(16) * 1024;
     let queuesize = 256;
+    assert!(!opt.infile.is_empty());
 
     // TODO: could play with mapper/folder parallelism, bufsize,
     // and queuesize being tuned. Ideally could do tuning automatically.
@@ -130,7 +130,8 @@
     // Allow enough chunks for parallelism but not so few the chunksize
     // is small.
-    let chunks = fileblocks::chunkify(&opt.infile, nthreads, bufsize);
+
+    let chunks = fileblocks::chunkify_multiple(&opt.infile, nthreads, bufsize);
     let nthreads = chunks.len(); // smaller b/c of min bufsize
     assert!(nthreads >= 1);
diff --git a/test.sh b/test.sh
@@ -37,5 +37,16 @@ for f in $testfiles ; do
     sort -k2nr -k1 -o "expected-$b" "expected-$b"
     sort -k2nr -k1 -o "actual-$b" "actual-$b"
     diff "expected-$b" "actual-$b" >/dev/null
+
+    split -n l/10 "$f" split-${b}-
+    "$cwd/target/release/slb" \
+        --mapper 'tr "[:space:]" "\n" | rg -v "^$"' \
+        --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
+        --infile split-${b}-* \
+        --outprefix "actual-split-$b."
+    cat actual-split-${b}.* > "actual-split-$b"
+    rm actual-split-${b}.*
+    sort -k2nr -k1 -o "actual-split-$b" "actual-split-$b"
+    diff "actual-$b" "actual-split-$b" >/dev/null
 done
 popd >/dev/null