Merge branch 'main' of github.com:vlad17/slb into main

author: Vladimir Feinberg <vladimir.feinberg@gmail.com> 2021-06-28 21:53:30 -0700
committer: Vladimir Feinberg <vladimir.feinberg@gmail.com> 2021-06-28 21:53:30 -0700
commit: 70409a48fe0e55226e21c8aa16ee6f355ad7ae8b (patch)
tree: 33ad49d09445a8ee0be3e7b9446c8ee949e27c68
parent: fdde7e7a6ec3a14046dabc30de7051fab87c513f (diff)
parent: b8565aad0766973fcf87736674c918fe29961634 (diff)
3 files changed, 25 insertions, 13 deletions
diff --git a/README.md b/README.md
index a197174..680a804 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,6 @@
 Like `parallel --pipe --roundrobin`, but load balancing is performed based on input line hashing. This is crucial when performing keyed aggregations in child processes, since then only one shard contains a given key. Here's a word count example on a 16-physical-cpu machine:
 
 ```
-cargo build --release
 curl -o enwik9.bz2 https://cs.fit.edu/~mmahoney/compression/enwik9.bz2
 bunzip2 enwik9.bz2
 examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9
@@ -11,7 +10,7 @@ examples/clean.sh < enwik9 > enwik9.clean ; rm enwik9
 /usr/bin/time -f "%e sec" awk -f examples/wc.awk enwik9.clean > wikawk.txt
 # 203.97 sec
 
-/usr/bin/time -f "%e sec" target/release/slb \
+/usr/bin/time -f "%e sec" slb \
   --mapper 'tr " " "\n" | rg -v "^$"' \
   --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
   --infile enwik9.clean \
@@ -72,7 +71,7 @@ parallel --pipepart -a kdd12.tr wc -w | awk '{a+=$0}END{print a}'
 /usr/bin/time -f "%e sec %M KB" awk -f examples/svm-featurecount.awk kdd12.tr > results-awk.txt
 # 1032.18 sec 13721032 KB
 
-/usr/bin/time -f "%e sec %M KB" target/release/slb \
+/usr/bin/time -f "%e sec %M KB" slb \
   --mapper 'sed -E "s/^[^ ]+ //" | sed -E "s/:[^ ]+//g" | tr " " "\n" | rg -v "^$"' \
   --folder "awk '{a[\$0]++}END{for(k in a)print k,a[k]}'" \
   --infile kdd12.tr \
@@ -104,7 +103,7 @@ du -hs kdda
 /usr/bin/time -f "%e sec %M KB" awk -f examples/svm-countdistinct.awk kdda > cdawk.txt
 # 388.72 sec 23895104 KB
 
-/usr/bin/time -f "%e sec %M KB" target/release/slb \
+/usr/bin/time -f "%e sec %M KB" slb \
   --mapper 'sed -E "s/^[^ ]+ //" | tr " " "\n" | tr ":" " " | rg -v "^$"' \
   --folder "awk '{if(!(\$1 in a)||length(a[\$1])<100)a[\$1][\$2]=1}END{for(k in a)print k,length(a[k])}'" \
   --infile kdda \
@@ -127,8 +126,15 @@ Note the above examples demonstrate the convenience of the tool:
 
 The last point holds because `slb` ensures each parallel invocation receives a _unique partition_ of the key space. In turn, we use less memory, because each folder is only tracking aggregates for its own key space, and less code, because we do not need to write a combiner that merges two maps.
 
-To install locally, run
+To install locally from `crates.io`, run
 
 ```
-cargo install --path slb/
+cargo install slb
 ```
+
+## Dev Stuff
+
+Rudimentary testing via `./test.sh`.
+
+Re-publish to `crates.io` with `cd slb && cargo publish`.
+
diff --git a/examples/wc.awk b/examples/wc.awk
index ca709df..426e5e7 100644
--- a/examples/wc.awk
+++ b/examples/wc.awk
@@ -1,11 +1,9 @@
 #!/bin/awk
 # counts the number of appearances for each word in the stream
-BEGIN {
-    RS = "[[:space:]]"
-}
 1 {
-    if ($0 != "")
-        a[$0]++
+    for (i=1; i<=NF; i++)
+        if ($i != "")
+            a[$i]++
 }
 END {
     for (k in a)
diff --git a/slb/Cargo.toml b/slb/Cargo.toml
index 278c63b..d447dfa 100644
--- a/slb/Cargo.toml
+++ b/slb/Cargo.toml
@@ -1,11 +1,19 @@
 [package]
 name = "slb"
-version = "0.1.0"
+version = "0.1.1"
 authors = ["Vladimir Feinberg <vladimir.feinberg@gmail.com>"]
 edition = "2018"
+description = "Sharded load balancing text-streaming Unix tool"
+homepage = "https://github.com/vlad17/slb"
+repository = "https://github.com/vlad17/slb"
+readme = "../README.md"
+keywords = ["stream", "text", "parallel", "cli"]
+categories = ["command-line-utilities", "concurrency"]
+license = "Apache-2.0"
 
 [dependencies]
 memchr = "2.3"
 structopt = "0.3"
 num_cpus = "1.13"
-bstr = "0.2"
-\ No newline at end of file
+bstr = "0.2"
+
author	Vladimir Feinberg <vladimir.feinberg@gmail.com>	2021-06-28 21:53:30 -0700
committer	Vladimir Feinberg <vladimir.feinberg@gmail.com>	2021-06-28 21:53:30 -0700
commit	70409a48fe0e55226e21c8aa16ee6f355ad7ae8b (patch)
tree	33ad49d09445a8ee0be3e7b9446c8ee949e27c68
parent	fdde7e7a6ec3a14046dabc30de7051fab87c513f (diff)
parent	b8565aad0766973fcf87736674c918fe29961634 (diff)