author     Vladimir Feinberg <vladimir.feinberg@gmail.com>  2021-06-28 21:53:15 -0700
committer  Vladimir Feinberg <vladimir.feinberg@gmail.com>  2021-06-28 21:53:15 -0700
commit     fdde7e7a6ec3a14046dabc30de7051fab87c513f (patch)
tree       9bd41c3939195d8e4f9cf58567a83c2892afc646
parent     f22d454936fac7dfc6a91abae0809708d7f6e472 (diff)
extend to multiple files
-rw-r--r--  slb/src/fileblocks.rs  24
-rw-r--r--  slb/src/main.rs         9
-rwxr-xr-x  test.sh                11
3 files changed, 40 insertions, 4 deletions
diff --git a/slb/src/fileblocks.rs b/slb/src/fileblocks.rs
index da557e7..2cee2fa 100644
--- a/slb/src/fileblocks.rs
+++ b/slb/src/fileblocks.rs
@@ -50,6 +50,30 @@ impl FileChunk {
    }
}
+/// Uses up to `max_chunks + paths.len()` chunks to chunkify multiple files.
+pub fn chunkify_multiple(paths: &[PathBuf], max_chunks: usize, min_size: usize) -> Vec<FileChunk> {
+    assert!(max_chunks > 0);
+    let sizes: Vec<usize> = paths
+        .iter()
+        .map(|path| {
+            fs::metadata(path)
+                .expect("metadata")
+                .len()
+                .try_into()
+                .unwrap()
+        })
+        .collect();
+    let avg_size = (sizes.iter().copied().sum::<usize>() / max_chunks).max(1);
+    paths
+        .iter()
+        .zip(sizes.into_iter())
+        .flat_map(|(path, sz)| {
+            let desired_chunks: usize = (sz / avg_size).max(1);
+            chunkify(&path, desired_chunks, min_size).into_iter()
+        })
+        .collect()
+}
+
/// Returns a list of up to `max_chunks` file chunks splitting up the given
/// file, roughly of the same size, which should be roughly at least
/// `min_size`, newline aligned.
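
To make the apportionment concrete, here is a minimal standalone sketch of the size-proportional split the chunkify_multiple doc comment describes (apportion_chunks is a hypothetical name, not part of the crate, and the min_size/newline alignment enforced by chunkify itself is ignored here):

// Hypothetical sketch: spread `max_chunks` across files in proportion to size,
// giving every file at least one chunk, so the total stays within
// max_chunks + sizes.len().
fn apportion_chunks(sizes: &[usize], max_chunks: usize) -> Vec<usize> {
    assert!(max_chunks > 0 && !sizes.is_empty());
    let total: usize = sizes.iter().sum();
    // Target chunk size: total bytes divided over the requested chunk count.
    let avg_size = (total / max_chunks).max(1);
    sizes.iter().map(|&sz| (sz / avg_size).max(1)).collect()
}

fn main() {
    // Files of 10 MiB, 30 MiB, and 60 MiB spread over 8 chunks come out to
    // roughly [1, 2, 4] chunks (integer division rounds down).
    println!("{:?}", apportion_chunks(&[10 << 20, 30 << 20, 60 << 20], 8));
}
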
diff --git a/slb/src/main.rs b/slb/src/main.rs
index 508eea3..5036a3e 100644
--- a/slb/src/main.rs
+++ b/slb/src/main.rs
@@ -83,10 +83,9 @@ struct Opt {
    #[structopt(long)]
    folder: String,
-    // TODO: support multiple infiles and chunkify across them
-    /// The input file to read lines from.
+    /// The input files to read lines from.
    #[structopt(long)]
-    infile: PathBuf,
+    infile: Vec<PathBuf>,
    /// Output file prefixes.
    #[structopt(long)]
@@ -122,6 +121,7 @@ fn main() {
    let bufsize = opt.bufsize.unwrap_or(16) * 1024;
    let queuesize = 256;
+    assert!(!opt.infile.is_empty());
    // TODO: could play with mapper/folder parallelism, bufsize,
    // and queuesize being tuned. Ideally could do tuning automatically.
@@ -130,7 +130,8 @@ fn main() {
    // Allow enough chunks for parallelism but not so few the chunksize
    // is small.
-    let chunks = fileblocks::chunkify(&opt.infile, nthreads, bufsize);
+
+    let chunks = fileblocks::chunkify_multiple(&opt.infile, nthreads, bufsize);
    let nthreads = chunks.len(); // smaller b/c of min bufsize
    assert!(nthreads >= 1);
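
For reference, a minimal hypothetical structopt sketch (not slb's actual Opt struct, which carries more fields) showing how switching the field to Vec<PathBuf> lets a single --infile flag accept several paths, as the test below does with --infile split-${b}-*:

// Hypothetical sketch, assuming the structopt crate as a dependency.
use std::path::PathBuf;
use structopt::StructOpt;

#[derive(Debug, StructOpt)]
struct Opt {
    /// The input files to read lines from.
    #[structopt(long)]
    infile: Vec<PathBuf>,
}

fn main() {
    let opt = Opt::from_args();
    // Mirrors the assert added above: at least one input file is required.
    assert!(!opt.infile.is_empty());
    println!("{} input file(s): {:?}", opt.infile.len(), opt.infile);
}
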
diff --git a/test.sh b/test.sh
index c7ba4b3..f565dd9 100755
--- a/test.sh
+++ b/test.sh
@@ -37,5 +37,16 @@ for f in $testfiles ; do
    sort -k2nr -k1 -o "expected-$b" "expected-$b"
    sort -k2nr -k1 -o "actual-$b" "actual-$b"
    diff "expected-$b" "actual-$b" >/dev/null
+
+    split -n l/10 "$f" split-${b}-
+    "$cwd/target/release/slb" \
+        --mapper 'tr "[:space:]" "\n" | rg -v "^$"' \
+        --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
+        --infile split-${b}-* \
+        --outprefix "actual-split-$b."
+    cat actual-split-${b}.* > "actual-split-$b"
+    rm actual-split-${b}.*
+    sort -k2nr -k1 -o "actual-split-$b" "actual-split-$b"
+    diff "actual-$b" "actual-split-$b" >/dev/null
done
popd >/dev/null
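
The new test block relies on the word counts being invariant to splitting the input on line boundaries before feeding it to slb. A hypothetical Rust illustration of that invariant (not part of the repo):

// Count words in the whole text and in line-aligned pieces, then check that
// merging the per-piece counts reproduces the whole-text counts.
use std::collections::HashMap;

fn word_counts(text: &str) -> HashMap<&str, usize> {
    let mut counts = HashMap::new();
    for w in text.split_whitespace() {
        *counts.entry(w).or_insert(0) += 1;
    }
    counts
}

fn main() {
    let text = "a b c\nb c\nc\n";
    let whole = word_counts(text);

    let mut merged: HashMap<&str, usize> = HashMap::new();
    for piece in ["a b c\n", "b c\nc\n"] {
        for (w, n) in word_counts(piece) {
            *merged.entry(w).or_insert(0) += n;
        }
    }
    assert_eq!(whole, merged);
}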