author     Vladimir Feinberg <vladimir.feinberg@gmail.com>  2021-06-28 21:53:15 -0700
committer  Vladimir Feinberg <vladimir.feinberg@gmail.com>  2021-06-28 21:53:15 -0700
commit     fdde7e7a6ec3a14046dabc30de7051fab87c513f (patch)
tree       9bd41c3939195d8e4f9cf58567a83c2892afc646
parent     f22d454936fac7dfc6a91abae0809708d7f6e472 (diff)
extend to multiple files
-rw-r--r--  slb/src/fileblocks.rs  24
-rw-r--r--  slb/src/main.rs         9
-rwxr-xr-x  test.sh                11
3 files changed, 40 insertions, 4 deletions
diff --git a/slb/src/fileblocks.rs b/slb/src/fileblocks.rs
index da557e7..2cee2fa 100644
--- a/slb/src/fileblocks.rs
+++ b/slb/src/fileblocks.rs
@@ -50,6 +50,30 @@ impl FileChunk {
    }
}
+/// Uses up to `max_chunks + paths.len()` chunks to chunkify multiple files.
+pub fn chunkify_multiple(paths: &[PathBuf], max_chunks: usize, min_size: usize) -> Vec<FileChunk> {
+    assert!(max_chunks > 0);
+    let sizes: Vec<usize> = paths
+        .iter()
+        .map(|path| {
+            fs::metadata(path)
+                .expect("metadata")
+                .len()
+                .try_into()
+                .unwrap()
+        })
+        .collect();
+    let avg_size = (sizes.iter().copied().sum::<usize>() / max_chunks).max(1);
+    paths
+        .iter()
+        .zip(sizes.into_iter())
+        .flat_map(|(path, sz)| {
+            let desired_chunks: usize = (sz / avg_size).max(1);
+            chunkify(&path, desired_chunks, min_size).into_iter()
+        })
+        .collect()
+}
+
/// Returns a list of up to `max_chunks` file chunks splitting up the given
/// file, roughly of the same size, which should be roughly at least
/// `min_size`, newline aligned.
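
To make the apportionment concrete, here is a minimal standalone sketch of the size-proportional split the chunkify_multiple doc comment describes (apportion_chunks is a hypothetical name, not part of the crate, and the min_size/newline alignment enforced by chunkify itself is ignored here):

// Hypothetical sketch: spread `max_chunks` across files in proportion to size,
// giving every file at least one chunk, so the total stays within
// max_chunks + sizes.len().
fn apportion_chunks(sizes: &[usize], max_chunks: usize) -> Vec<usize> {
    assert!(max_chunks > 0 && !sizes.is_empty());
    let total: usize = sizes.iter().sum();
    // Target chunk size: total bytes divided over the requested chunk count.
    let avg_size = (total / max_chunks).max(1);
    sizes.iter().map(|&sz| (sz / avg_size).max(1)).collect()
}

fn main() {
    // Files of 10 MiB, 30 MiB, and 60 MiB spread over 8 chunks come out to
    // roughly [1, 2, 4] chunks (integer division rounds down).
    println!("{:?}", apportion_chunks(&[10 << 20, 30 << 20, 60 << 20], 8));
}
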
diff --git a/slb/src/main.rs b/slb/src/main.rs
index 508eea3..5036a3e 100644
--- a/slb/src/main.rs
+++ b/slb/src/main.rs
@@ -83,10 +83,9 @@ struct Opt {
    #[structopt(long)]
    folder: String,
-    // TODO: support multiple infiles and chunkify across them
-    /// The input file to read lines from.
+    /// The input files to read lines from.
    #[structopt(long)]
-    infile: PathBuf,
+    infile: Vec<PathBuf>,
    /// Output file prefixes.
    #[structopt(long)]
@@ -122,6 +121,7 @@ fn main() {
    let bufsize = opt.bufsize.unwrap_or(16) * 1024;
    let queuesize = 256;
+    assert!(!opt.infile.is_empty());
    // TODO: could play with mapper/folder parallelism, bufsize,
    // and queuesize being tuned. Ideally could do tuning automatically.
@@ -130,7 +130,8 @@ fn main() {
    // Allow enough chunks for parallelism but not so few the chunksize
    // is small.
-    let chunks = fileblocks::chunkify(&opt.infile, nthreads, bufsize);
+
+    let chunks = fileblocks::chunkify_multiple(&opt.infile, nthreads, bufsize);
    let nthreads = chunks.len(); // smaller b/c of min bufsize
    assert!(nthreads >= 1);
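
For reference, a minimal hypothetical structopt sketch (not slb's actual Opt struct, which carries more fields) showing how switching the field to Vec<PathBuf> lets a single --infile flag accept several paths, as the test below does with --infile split-${b}-*:

// Hypothetical sketch, assuming the structopt crate as a dependency.
use std::path::PathBuf;
use structopt::StructOpt;

#[derive(Debug, StructOpt)]
struct Opt {
    /// The input files to read lines from.
    #[structopt(long)]
    infile: Vec<PathBuf>,
}

fn main() {
    let opt = Opt::from_args();
    // Mirrors the assert added above: at least one input file is required.
    assert!(!opt.infile.is_empty());
    println!("{} input file(s): {:?}", opt.infile.len(), opt.infile);
}
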
diff --git a/test.sh b/test.sh
index c7ba4b3..f565dd9 100755
--- a/test.sh
+++ b/test.sh
@@ -37,5 +37,16 @@ for f in $testfiles ; do
    sort -k2nr -k1 -o "expected-$b" "expected-$b"
    sort -k2nr -k1 -o "actual-$b" "actual-$b"
    diff "expected-$b" "actual-$b" >/dev/null
+
+    split -n l/10 "$f" split-${b}-
+    "$cwd/target/release/slb" \
+        --mapper 'tr "[:space:]" "\n" | rg -v "^$"' \
+        --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
+        --infile split-${b}-* \
+        --outprefix "actual-split-$b."
+    cat actual-split-${b}.* > "actual-split-$b"
+    rm actual-split-${b}.*
+    sort -k2nr -k1 -o "actual-split-$b" "actual-split-$b"
+    diff "actual-$b" "actual-split-$b" >/dev/null
done
popd >/dev/null
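
The new test block relies on the word counts being invariant to splitting the input on line boundaries before feeding it to slb. A hypothetical Rust illustration of that invariant (not part of the repo):

// Count words in the whole text and in line-aligned pieces, then check that
// merging the per-piece counts reproduces the whole-text counts.
use std::collections::HashMap;

fn word_counts(text: &str) -> HashMap<&str, usize> {
    let mut counts = HashMap::new();
    for w in text.split_whitespace() {
        *counts.entry(w).or_insert(0) += 1;
    }
    counts
}

fn main() {
    let text = "a b c\nb c\nc\n";
    let whole = word_counts(text);

    let mut merged: HashMap<&str, usize> = HashMap::new();
    for piece in ["a b c\n", "b c\nc\n"] {
        for (w, n) in word_counts(piece) {
            *merged.entry(w).or_insert(0) += n;
        }
    }
    assert_eq!(whole, merged);
}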