diff options
author | Vladimir Feinberg <vladimir.feinberg@gmail.com> | 2021-06-28 21:53:30 -0700 |
---|---|---|
committer | Vladimir Feinberg <vladimir.feinberg@gmail.com> | 2021-06-28 21:53:30 -0700 |
commit | 70409a48fe0e55226e21c8aa16ee6f355ad7ae8b (patch) | |
tree | 33ad49d09445a8ee0be3e7b9446c8ee949e27c68 | |
parent | fdde7e7a6ec3a14046dabc30de7051fab87c513f (diff) | |
parent | b8565aad0766973fcf87736674c918fe29961634 (diff) |
Merge branch 'main' of github.com:vlad17/slb into main
-rw-r--r-- | README.md | 18 | ||||
-rw-r--r-- | examples/wc.awk | 8 | ||||
-rw-r--r-- | slb/Cargo.toml | 12 |
3 files changed, 25 insertions, 13 deletions
@@ -3,7 +3,6 @@ Like `parallel --pipe --roundrobin` but load balancing is performed based on input line hashing. When performing keyed aggregations in child processes this is crucial since then only one shard contains a given key. Here's a word count example on a 16-physical-cpu machine: ``` -cargo build --release curl -o enwik9.bz2 https://cs.fit.edu/~mmahoney/compression/enwik9.bz2 bunzip2 enwik9.bz2 examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9 @@ -11,7 +10,7 @@ examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9 /usr/bin/time -f "%e sec" awk -f examples/wc.awk enwik9.clean > wikawk.txt # 203.97 sec -/usr/bin/time -f "%e sec" target/release/slb \ +/usr/bin/time -f "%e sec" slb \ --mapper 'tr " " "\n" | rg -v "^$"' \ --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \ --infile enwik9.clean \ @@ -72,7 +71,7 @@ parallel --pipepart -a kdd12.tr wc -w | awk '{a+=$0}END{print a}' /usr/bin/time -f "%e sec %M KB" awk -f examples/svm-featurecount.awk kdd12.tr > results-awk.txt # 1032.18 sec 13721032 KB -/usr/bin/time -f "%e sec %M KB" target/release/slb \ +/usr/bin/time -f "%e sec %M KB" slb \ --mapper 'sed -E "s/^[^ ]+ //" | sed -E "s/:[^ ]+//g" | tr " " "\n" | rg -v "^$"' \ --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \ --infile kdd12.tr \ @@ -104,7 +103,7 @@ du -hs kdda /usr/bin/time -f "%e sec %M KB" awk -f examples/svm-countdistinct.awk kdda > cdawk.txt # 388.72 sec 23895104 KB -/usr/bin/time -f "%e sec %M KB" target/release/slb \ +/usr/bin/time -f "%e sec %M KB" slb \ --mapper 'sed -E "s/^[^ ]+ //" | tr " " "\n" | tr ":" " " | rg -v "^$"' \ --folder "awk '{if(!(\$1 in a)||length(a[\$1])<100)a[\$1][\$2]=1}END{for(k in a)print k,length(a[k])}'" \ --infile kdda \ @@ -127,8 +126,15 @@ Note the above examples demonstrate the convenience of the tool: The last point holds because `slb` ensures each parallel invocation receives a _unique partition_ of the key space. 
In turn, we use less memory because each folder is only tracking aggregates for its own key space and less code because we do not need to write a combiner that merges two maps. -To install locally, run +To install locally from `crates.io`, run ``` -cargo install --path slb/ +cargo install slb ``` + +## Dev Stuff + +Rudimentary testing via `./test.sh`. + +Re-publish to `crates.io` with `cd slb && cargo publish`. + diff --git a/examples/wc.awk b/examples/wc.awk index ca709df..426e5e7 100644 --- a/examples/wc.awk +++ b/examples/wc.awk @@ -1,11 +1,9 @@ #!/bin/awk # counts the number of appearances for each word in the stream -BEGIN { - RS = "[[:space:]]" -} 1 { - if ($0 != "") - a[$0]++ + for (i=1; i<=NF; i++) + if ($i != "") + a[$i]++ } END { for (k in a) diff --git a/slb/Cargo.toml b/slb/Cargo.toml index 278c63b..d447dfa 100644 --- a/slb/Cargo.toml +++ b/slb/Cargo.toml @@ -1,11 +1,19 @@ [package] name = "slb" -version = "0.1.0" +version = "0.1.1" authors = ["Vladimir Feinberg <vladimir.feinberg@gmail.com>"] edition = "2018" +description = "Sharded load balancing text-streaming Unix tool" +homepage = "https://github.com/vlad17/slb" +repository = "https://github.com/vlad17/slb" +readme = "../README.md" +keywords = ["stream", "text", "parallel", "cli"] +categories = ["command-line-utilities", "concurrency"] +license = "Apache-2.0" [dependencies] memchr = "2.3" structopt = "0.3" num_cpus = "1.13" -bstr = "0.2"
\ No newline at end of file +bstr = "0.2" + |