author    Andrew Gallant <jamslam@gmail.com>  2014-12-06 17:47:16 -0500
committer Andrew Gallant <jamslam@gmail.com>  2014-12-06 17:47:16 -0500
commit    31642acc645aca7d3f269db23bbafa7076c2727c (patch)
tree      0a94c82d23d56f4bb27ee1413283047d167972aa
parent    ccc4a24564ea33fca5f1b8ad2f95184d66c7ad60 (diff)
Add some basic benchmarks.
-rw-r--r--  BENCHMARKS.md             71
-rw-r--r--  README.md                  7
-rwxr-xr-x  scripts/benchmark-basic  100
3 files changed, 178 insertions, 0 deletions
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..5e313fb
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,71 @@
+These are some very basic and unscientific benchmarks of various commands
+provided by `xsv`. Please see below for more information.
+
+These benchmarks were run with
+[worldcitiespop_mil.csv](http://burntsushi.net/stuff/worldcitiespop_mil.csv),
+which is a random 1,000,000 row subset of the world city population dataset
+from the [Data Science Toolkit](https://github.com/petewarden/dstkdata).
+
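+To fetch the data set yourself, something like this should work:
+
+```
+$ curl -O http://burntsushi.net/stuff/worldcitiespop_mil.csv
+```
+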
+These benchmarks were run on an Intel i7-3930K (6 CPUs, 12 threads) with 32GB
+of memory.
+
+```
+count                   0.54 seconds   84.28 MB/sec
+flatten                 4.26 seconds   10.68 MB/sec
+flatten_condensed       4.40 seconds   10.34 MB/sec
+frequency               3.50 seconds   13.00 MB/sec
+index                   0.52 seconds   87.52 MB/sec
+sample_10               0.74 seconds   61.50 MB/sec
+sample_1000             0.71 seconds   64.10 MB/sec
+sample_100000           0.87 seconds   52.31 MB/sec
+search                  1.11 seconds   41.00 MB/sec
+select                  0.70 seconds   65.01 MB/sec
+sort                    3.79 seconds   12.00 MB/sec
+slice_one_middle        0.34 seconds   133.86 MB/sec
+slice_one_middle_index  0.01 seconds   4551.36 MB/sec
+stats                   1.56 seconds   29.17 MB/sec
+stats_index             0.29 seconds   156.94 MB/sec
+stats_everything        4.84 seconds   9.40 MB/sec
+stats_everything_index  3.58 seconds   12.71 MB/sec
+```
+
+
+### Details
+
+The purpose of these benchmarks is to provide a rough ballpark estimate of how
+fast each command is. My hope is that they can also catch significant
+performance regressions.
+
+The `count` command can be viewed as a sort of baseline of the fastest possible
+command that parses every record in CSV data.
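+
+For a rough sanity check of that baseline (assuming the data is at
+`/tmp/worldcitiespop_mil.csv`, where the benchmark script puts it):
+
+```
+$ time xsv count /tmp/worldcitiespop_mil.csv
+```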
+
+The benchmarks that end with `_index` are run with indexing enabled.
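+
+An index is built ahead of time with `xsv index`, which writes a `.idx` file
+next to the data:
+
+```
+$ xsv index /tmp/worldcitiespop_mil.csv   # creates worldcitiespop_mil.csv.idx
+```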
+
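+To reproduce these numbers, run `scripts/benchmark-basic` with a release
+build of `xsv` on your `PATH`; the script downloads the data set to /tmp if
+it isn't already there. An optional regex argument restricts which
+benchmarks are run:
+
+```
+$ ./scripts/benchmark-basic             # run all benchmarks
+$ ./scripts/benchmark-basic 'stats.*'   # run only the stats benchmarks
+```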
diff --git a/README.md b/README.md
index e948a75..4b010a0 100644
--- a/README.md
+++ b/README.md
@@ -323,6 +323,13 @@ Compilation will probably take 1-2 minutes depending on your machine. The
binary will end up in `./target/release/xsv`.
+### Benchmarks
+
+I've compiled some [very rough
+benchmarks](https://github.com/BurntSushi/xsv/blob/master/BENCHMARKS.md) of
+various `xsv` commands.
+
+
### Motivation
Here are several valid criticisms of this project:
diff --git a/scripts/benchmark-basic b/scripts/benchmark-basic
new file mode 100755
index 0000000..6abbc65
--- /dev/null
+++ b/scripts/benchmark-basic
@@ -0,0 +1,100 @@
+#!/bin/sh
+
+# This script does some very basic benchmarks with 'xsv' on a city population
+# data set (which is a strict subset of the `worldcitiespop` data set). If it
+# doesn't exist on your system, it will be downloaded to /tmp for you.
+#
+# These aren't meant to be overly rigorous, but they should be enough to catch
+# significant regressions.
+#
+# Make sure you're using an `xsv` generated by `cargo build --release`.
+
+set -e
+
+pat="$1"
+data=/tmp/worldcitiespop_mil.csv
+data_idx=/tmp/worldcitiespop_mil.csv.idx
+if [ ! -r "$data" ]; then
+ curl -sS http://burntsushi.net/stuff/worldcitiespop_mil.csv > "$data"
+fi
+data_size=$(stat --format '%s' "$data")
+
+# Measure wall-clock seconds of a command with `time -p`, clamping to a
+# minimum of 0.01 so the throughput division later never divides by zero.
+real_seconds() {
+ cmd=$(echo $@ "> /dev/null 2>&1")
+ t=$(
+ $(which time) -p sh -c "$cmd" 2>&1 \
+ | grep '^real' \
+ | awk '{print $2}')
+ if [ $(echo "$t < 0.01" | bc) = 1 ]; then
+ t=0.01
+ fi
+ echo $t
+}
+
+# Time a command three times, removing any index first; print the mean.
+benchmark() {
+ rm -f "$data_idx"
+ t1=$(real_seconds "$@")
+ rm -f "$data_idx"
+ t2=$(real_seconds "$@")
+ rm -f "$data_idx"
+ t3=$(real_seconds "$@")
+ echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc
+}
+
+# Time a command three times against a freshly built index; print the mean.
+benchmark_with_index() {
+ rm -f "$data_idx"
+ xsv index "$data"
+ t1=$(real_seconds "$@")
+ t2=$(real_seconds "$@")
+ t3=$(real_seconds "$@")
+ rm -f "$data_idx"
+ echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc
+}
+
+# run [--index] <name> <cmd...>: benchmark <cmd...> when <name> matches $pat.
+run() {
+ index=
+ while true; do
+ case "$1" in
+ --index) index="yes" && shift ;;
+ *) break ;;
+ esac
+ done
+ name="$1"
+ shift
+
+    if [ -z "$pat" ] || echo "$name" | grep -E -q "^$pat$"; then
+        if [ -z "$index" ]; then
+            t=$(benchmark "$@")
+        else
+            t=$(benchmark_with_index "$@")
+        fi
+        # Throughput: data size in bytes over mean seconds, scaled to MB (2^20).
+        mb_per=$(echo "scale=2; ($data_size / $t) / 2^20" | bc)
+        printf "%s\t%0.02f seconds\t%s MB/sec\n" $name $t $mb_per
+    fi
+}
+
+{
+ run count xsv count "$data"
+ run flatten xsv flatten "$data"
+ run flatten_condensed xsv flatten "$data" --condensed 50
+ run frequency xsv frequency "$data"
+ run index xsv index "$data"
+ run sample_10 xsv sample 10 "$data"
+ run sample_1000 xsv sample 1000 "$data"
+ run sample_100000 xsv sample 100000 "$data"
+ run search xsv search -s Country "'(?i)us'" "$data"
+ run select xsv select Country "$data"
+ run sort xsv sort -s AccentCity "$data"
+ run slice_one_middle xsv slice -i 500000 "$data"
+ run --index slice_one_middle_index xsv slice -i 500000 "$data"
+ run stats xsv stats "$data"
+ run --index stats_index xsv stats "$data"
+ run stats_everything xsv stats "$data" --everything
+ run --index stats_everything_index xsv stats "$data" --everything
+} | column -t -s' '