summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVladimir Feinberg <vladimir.feinberg@gmail.com>2021-06-28 21:53:30 -0700
committerVladimir Feinberg <vladimir.feinberg@gmail.com>2021-06-28 21:53:30 -0700
commit70409a48fe0e55226e21c8aa16ee6f355ad7ae8b (patch)
tree33ad49d09445a8ee0be3e7b9446c8ee949e27c68
parentfdde7e7a6ec3a14046dabc30de7051fab87c513f (diff)
parentb8565aad0766973fcf87736674c918fe29961634 (diff)
Merge branch 'main' of github.com:vlad17/slb into main
-rw-r--r--README.md18
-rw-r--r--examples/wc.awk8
-rw-r--r--slb/Cargo.toml12
3 files changed, 25 insertions, 13 deletions
diff --git a/README.md b/README.md
index a197174..680a804 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,6 @@
Like `parallel --pipe --roundrobin` but load balancing is performed based on input line hashing. When performing keyed aggregations in child processes, this is crucial, since then only one shard contains a given key. Here's a word count example on a 16-physical-cpu machine:
```
-cargo build --release
curl -o enwik9.bz2 https://cs.fit.edu/~mmahoney/compression/enwik9.bz2
bunzip2 enwik9.bz2
examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9
@@ -11,7 +10,7 @@ examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9
/usr/bin/time -f "%e sec" awk -f examples/wc.awk enwik9.clean > wikawk.txt
# 203.97 sec
-/usr/bin/time -f "%e sec" target/release/slb \
+/usr/bin/time -f "%e sec" slb \
--mapper 'tr " " "\n" | rg -v "^$"' \
--folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
--infile enwik9.clean \
@@ -72,7 +71,7 @@ parallel --pipepart -a kdd12.tr wc -w | awk '{a+=$0}END{print a}'
/usr/bin/time -f "%e sec %M KB" awk -f examples/svm-featurecount.awk kdd12.tr > results-awk.txt
# 1032.18 sec 13721032 KB
-/usr/bin/time -f "%e sec %M KB" target/release/slb \
+/usr/bin/time -f "%e sec %M KB" slb \
--mapper 'sed -E "s/^[^ ]+ //" | sed -E "s/:[^ ]+//g" | tr " " "\n" | rg -v "^$"' \
--folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
--infile kdd12.tr \
@@ -104,7 +103,7 @@ du -hs kdda
/usr/bin/time -f "%e sec %M KB" awk -f examples/svm-countdistinct.awk kdda > cdawk.txt
# 388.72 sec 23895104 KB
-/usr/bin/time -f "%e sec %M KB" target/release/slb \
+/usr/bin/time -f "%e sec %M KB" slb \
--mapper 'sed -E "s/^[^ ]+ //" | tr " " "\n" | tr ":" " " | rg -v "^$"' \
--folder "awk '{if(!(\$1 in a)||length(a[\$1])<100)a[\$1][\$2]=1}END{for(k in a)print k,length(a[k])}'" \
--infile kdda \
@@ -127,8 +126,15 @@ Note the above examples demonstrate the convenience of the tool:
The last point holds because `slb` ensures each parallel invocation receives a _unique partition_ of the key space. In turn, we use less memory, because each folder is only tracking aggregates for its own key space, and less code, because we do not need to write a combiner that merges two maps.
-To install locally, run
+To install locally from `crates.io`, run
```
-cargo install --path slb/
+cargo install slb
```
+
+## Dev Stuff
+
+Rudimentary testing via `./test.sh`.
+
+Re-publish to `crates.io` with `cd slb && cargo publish`.
+
diff --git a/examples/wc.awk b/examples/wc.awk
index ca709df..426e5e7 100644
--- a/examples/wc.awk
+++ b/examples/wc.awk
@@ -1,11 +1,9 @@
#!/bin/awk
# counts the number of appearances for each word in the stream
-BEGIN {
- RS = "[[:space:]]"
-}
1 {
- if ($0 != "")
- a[$0]++
+ for (i=1; i<=NF; i++)
+ if ($i != "")
+ a[$i]++
}
END {
for (k in a)
diff --git a/slb/Cargo.toml b/slb/Cargo.toml
index 278c63b..d447dfa 100644
--- a/slb/Cargo.toml
+++ b/slb/Cargo.toml
@@ -1,11 +1,19 @@
[package]
name = "slb"
-version = "0.1.0"
+version = "0.1.1"
authors = ["Vladimir Feinberg <vladimir.feinberg@gmail.com>"]
edition = "2018"
+description = "Sharded load balancing text-streaming Unix tool"
+homepage = "https://github.com/vlad17/slb"
+repository = "https://github.com/vlad17/slb"
+readme = "../README.md"
+keywords = ["stream", "text", "parallel", "cli"]
+categories = ["command-line-utilities", "concurrency"]
+license = "Apache-2.0"
[dependencies]
memchr = "2.3"
structopt = "0.3"
num_cpus = "1.13"
-bstr = "0.2" \ No newline at end of file
+bstr = "0.2"
+