author    Andrew Gallant <jamslam@gmail.com>  2014-12-06 17:47:16 -0500
committer Andrew Gallant <jamslam@gmail.com>  2014-12-06 17:47:16 -0500
commit    31642acc645aca7d3f269db23bbafa7076c2727c (patch)
tree      0a94c82d23d56f4bb27ee1413283047d167972aa
parent    ccc4a24564ea33fca5f1b8ad2f95184d66c7ad60 (diff)
Add some basic benchmarks.
-rw-r--r--  BENCHMARKS.md             71
-rw-r--r--  README.md                  7
-rwxr-xr-x  scripts/benchmark-basic  100
3 files changed, 178 insertions, 0 deletions
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..5e313fb
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,71 @@
+These are some very basic and unscientific benchmarks of various commands
+provided by `xsv`. Please see below for more information.
+
+These benchmarks were run with
+[worldcitiespop_mil.csv](http://burntsushi.net/stuff/worldcitiespop_mil.csv),
+which is a random 1,000,000 row subset of the world city population dataset
+from the [Data Science Toolkit](https://github.com/petewarden/dstkdata).
+
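+To fetch the data set yourself, something like this should work:
+
+```
+$ curl -O http://burntsushi.net/stuff/worldcitiespop_mil.csv
+```
+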
+These benchmarks were run on an Intel i7-3930K (6 CPUs, 12 threads) with 32GB
+of memory.
+
+```
+count                   0.54 seconds   84.28 MB/sec
+flatten                 4.26 seconds   10.68 MB/sec
+flatten_condensed       4.40 seconds   10.34 MB/sec
+frequency               3.50 seconds   13.00 MB/sec
+index                   0.52 seconds   87.52 MB/sec
+sample_10               0.74 seconds   61.50 MB/sec
+sample_1000             0.71 seconds   64.10 MB/sec
+sample_100000           0.87 seconds   52.31 MB/sec
+search                  1.11 seconds   41.00 MB/sec
+select                  0.70 seconds   65.01 MB/sec
+sort                    3.79 seconds   12.00 MB/sec
+slice_one_middle        0.34 seconds   133.86 MB/sec
+slice_one_middle_index  0.01 seconds   4551.36 MB/sec
+stats                   1.56 seconds   29.17 MB/sec
+stats_index             0.29 seconds   156.94 MB/sec
+stats_everything        4.84 seconds   9.40 MB/sec
+stats_everything_index  3.58 seconds   12.71 MB/sec
+```
+
+
+### Details
+
+The purpose of these benchmarks is to provide a rough ballpark estimate of how
+fast each command is. My hope is that they can also catch significant
+performance regressions.
+
+The `count` command can be viewed as a sort of baseline of the fastest possible
+command that parses every record in CSV data.
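+
+For a rough sanity check of that baseline (assuming the data is at
+`/tmp/worldcitiespop_mil.csv`, where the benchmark script puts it):
+
+```
+$ time xsv count /tmp/worldcitiespop_mil.csv
+```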
+
+The benchmarks that end with `_index` are run with indexing enabled.
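+
+An index is built ahead of time with `xsv index`, which writes a `.idx` file
+next to the data:
+
+```
+$ xsv index /tmp/worldcitiespop_mil.csv   # creates worldcitiespop_mil.csv.idx
+```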
+
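+To reproduce these numbers, run `scripts/benchmark-basic` with a release
+build of `xsv` on your `PATH`; the script downloads the data set to /tmp if
+it isn't already there. An optional regex argument restricts which
+benchmarks are run:
+
+```
+$ ./scripts/benchmark-basic             # run all benchmarks
+$ ./scripts/benchmark-basic 'stats.*'   # run only the stats benchmarks
+```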
diff --git a/README.md b/README.md
index e948a75..4b010a0 100644
--- a/README.md
+++ b/README.md
@@ -323,6 +323,13 @@ Compilation will probably take 1-2 minutes depending on your machine. The
binary will end up in `./target/release/xsv`.
+### Benchmarks
+
+I've compiled some [very rough
+benchmarks](https://github.com/BurntSushi/xsv/blob/master/BENCHMARKS.md) of
+various `xsv` commands.
+
+
### Motivation
Here are several valid criticisms of this project:
diff --git a/scripts/benchmark-basic b/scripts/benchmark-basic
new file mode 100755
index 0000000..6abbc65
--- /dev/null
+++ b/scripts/benchmark-basic
@@ -0,0 +1,100 @@
+#!/bin/sh
+
+# This script does some very basic benchmarks with 'xsv' on a city population
+# data set (which is a strict subset of the `worldcitiespop` data set). If it
+# doesn't exist on your system, it will be downloaded to /tmp for you.
+#
+# These aren't meant to be overly rigorous, but they should be enough to catch
+# significant regressions.
+#
+# Make sure you're using an `xsv` generated by `cargo build --release`.
+
+set -e
+
+pat="$1"
+data=/tmp/worldcitiespop_mil.csv
+data_idx=/tmp/worldcitiespop_mil.csv.idx
+if [ ! -r "$data" ]; then
+ curl -sS http://burntsushi.net/stuff/worldcitiespop_mil.csv > "$data"
+fi
+data_size=$(stat --format '%s' "$data")
+
+# Measure wall-clock seconds of a command with `time -p`, clamping to a
+# minimum of 0.01 so the throughput division later never divides by zero.
+real_seconds() {
+ cmd=$(echo $@ "> /dev/null 2>&1")
+ t=$(
+ $(which time) -p sh -c "$cmd" 2>&1 \
+ | grep '^real' \
+ | awk '{print $2}')
+ if [ $(echo "$t < 0.01" | bc) = 1 ]; then
+ t=0.01
+ fi
+ echo $t
+}
+
+# Time a command three times, removing any index first; print the mean.
+benchmark() {
+ rm -f "$data_idx"
+ t1=$(real_seconds "$@")
+ rm -f "$data_idx"
+ t2=$(real_seconds "$@")
+ rm -f "$data_idx"
+ t3=$(real_seconds "$@")
+ echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc
+}
+
+# Time a command three times against a freshly built index; print the mean.
+benchmark_with_index() {
+ rm -f "$data_idx"
+ xsv index "$data"
+ t1=$(real_seconds "$@")
+ t2=$(real_seconds "$@")
+ t3=$(real_seconds "$@")
+ rm -f "$data_idx"
+ echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc
+}
+
+# run [--index] <name> <cmd...>: benchmark <cmd...> when <name> matches $pat.
+run() {
+ index=
+ while true; do
+ case "$1" in
+ --index) index="yes" && shift ;;
+ *) break ;;
+ esac
+ done
+ name="$1"
+ shift
+
+    if [ -z "$pat" ] || echo "$name" | grep -E -q "^$pat$"; then
+        if [ -z "$index" ]; then
+            t=$(benchmark "$@")
+        else
+            t=$(benchmark_with_index "$@")
+        fi
+        # Throughput: data size in bytes over mean seconds, scaled to MB (2^20).
+        mb_per=$(echo "scale=2; ($data_size / $t) / 2^20" | bc)
+        printf "%s\t%0.02f seconds\t%s MB/sec\n" $name $t $mb_per
+    fi
+}
+
+{
+ run count xsv count "$data"
+ run flatten xsv flatten "$data"
+ run flatten_condensed xsv flatten "$data" --condensed 50
+ run frequency xsv frequency "$data"
+ run index xsv index "$data"
+ run sample_10 xsv sample 10 "$data"
+ run sample_1000 xsv sample 1000 "$data"
+ run sample_100000 xsv sample 100000 "$data"
+ run search xsv search -s Country "'(?i)us'" "$data"
+ run select xsv select Country "$data"
+ run sort xsv sort -s AccentCity "$data"
+ run slice_one_middle xsv slice -i 500000 "$data"
+ run --index slice_one_middle_index xsv slice -i 500000 "$data"
+ run stats xsv stats "$data"
+ run --index stats_index xsv stats "$data"
+ run stats_everything xsv stats "$data" --everything
+ run --index stats_everything_index xsv stats "$data" --everything
+} | column -t -s' '