diff options
author | Andrew Gallant <jamslam@gmail.com> | 2014-12-06 17:47:16 -0500 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2014-12-06 17:47:16 -0500 |
commit | 31642acc645aca7d3f269db23bbafa7076c2727c (patch) | |
tree | 0a94c82d23d56f4bb27ee1413283047d167972aa | |
parent | ccc4a24564ea33fca5f1b8ad2f95184d66c7ad60 (diff) |
Add some basic benchmarks.
-rw-r--r-- | BENCHMARKS.md | 43 | ||||
-rw-r--r-- | README.md | 7 | ||||
-rwxr-xr-x | scripts/benchmark-basic | 94 |
3 files changed, 144 insertions, 0 deletions
diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 0000000..5e313fb --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,43 @@ +These are some very basic and unscientific benchmarks of various commands +provided by `xsv`. Please see below for more information. + +These benchmarks were run with +[worldcitiespop_mil.csv](http://burntsushi.net/stuff/worldcitiespop_mil.csv), +which is a random 1,000,000 row subset of the world city population dataset +from the [Data Science Toolkit](https://github.com/petewarden/dstkdata). + +These benchmarks were run on an Intel i3930K (6 CPUs, 12 threads) with 32GB of +memory. + +``` +count 0.54 seconds 84.28 MB/sec +flatten 4.26 seconds 10.68 MB/sec +flatten_condensed 4.40 seconds 10.34 MB/sec +frequency 3.50 seconds 13.00 MB/sec +index 0.52 seconds 87.52 MB/sec +sample_10 0.74 seconds 61.50 MB/sec +sample_1000 0.71 seconds 64.10 MB/sec +sample_100000 0.87 seconds 52.31 MB/sec +search 1.11 seconds 41.00 MB/sec +select 0.70 seconds 65.01 MB/sec +sort 3.79 seconds 12.00 MB/sec +slice_one_middle 0.34 seconds 133.86 MB/sec +slice_one_middle_index 0.01 seconds 4551.36 MB/sec +stats 1.56 seconds 29.17 MB/sec +stats_index 0.29 seconds 156.94 MB/sec +stats_everything 4.84 seconds 9.40 MB/sec +stats_everything_index 3.58 seconds 12.71 MB/sec +``` + + +### Details + +The purpose of these benchmarks is to provide a rough ballpark estimate of how +fast each command is. My hope is that they can also catch significant +performance regressions. + +The `count` command can be viewed as a sort of baseline of the fastest possible +command that parses every record in CSV data. + +The benchmarks that end with `_index` are run with indexing enabled. + @@ -323,6 +323,13 @@ Compilation will probably take 1-2 minutes depending on your machine. The binary will end up in `./target/release/xsv`. +### Benchmarks + +I've compiled some [very rough +benchmarks](https://github.com/BurntSushi/xsv/blob/master/BENCHMARKS.md) of various +`xsv` commands. 
+ + ### Motivation Here are several valid criticisms of this project: diff --git a/scripts/benchmark-basic b/scripts/benchmark-basic new file mode 100755 index 0000000..6abbc65 --- /dev/null +++ b/scripts/benchmark-basic @@ -0,0 +1,94 @@ +#!/bin/sh + +# This script does some very basic benchmarks with 'xsv' on a city population +# data set (which is a strict subset of the `worldcitiespop` data set). If it +# doesn't exist on your system, it will be downloaded to /tmp for you. +# +# These aren't meant to be overly rigorous, but they should be enough to catch +# significant regressions. +# +# Make sure you're using an `xsv` generated by `cargo build --release`. + +set -e + +pat="$1" +data=/tmp/worldcitiespop_mil.csv +data_idx=/tmp/worldcitiespop_mil.csv.idx +if [ ! -r "$data" ]; then + curl -sS http://burntsushi.net/stuff/worldcitiespop_mil.csv > "$data" +fi +data_size=$(stat --format '%s' "$data") + +function real_seconds { + cmd=$(echo $@ "> /dev/null 2>&1") + t=$( + $(which time) -p sh -c "$cmd" 2>&1 \ + | grep '^real' \ + | awk '{print $2}') + if [ $(echo "$t < 0.01" | bc) = 1 ]; then + t=0.01 + fi + echo $t +} + +function benchmark { + rm -f "$data_idx" + t1=$(real_seconds "$@") + rm -f "$data_idx" + t2=$(real_seconds "$@") + rm -f "$data_idx" + t3=$(real_seconds "$@") + echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc +} + +function benchmark_with_index { + rm -f "$data_idx" + xsv index "$data" + t1=$(real_seconds "$@") + t2=$(real_seconds "$@") + t3=$(real_seconds "$@") + rm -f "$data_idx" + echo "scale=2; ($t1 + $t2 + $t3) / 3" | bc +} + +function run { + index= + while true; do + case "$1" in + --index) index="yes" && shift ;; + *) break ;; + esac + done + name="$1" + shift + + if [ -z "$pat" ] || echo "$name" | grep -E -q "^$pat$"; then + if [ -z "$index" ]; then + t=$(benchmark "$@") + else + t=$(benchmark_with_index "$@") + fi + mb_per=$(echo "scale=2; ($data_size / $t) / 2^20" | bc) + printf "%s\t%0.02f seconds\t%s MB/sec\n" $name $t $mb_per + fi +} + +{ + run 
count xsv count "$data" + run flatten xsv flatten "$data" + run flatten_condensed xsv flatten "$data" --condensed 50 + run frequency xsv frequency "$data" + run index xsv index "$data" + run sample_10 xsv sample 10 "$data" + run sample_1000 xsv sample 1000 "$data" + run sample_100000 xsv sample 100000 "$data" + run search xsv search -s Country "'(?i)us'" "$data" + run select xsv select Country "$data" + run sort xsv sort -s AccentCity "$data" + run slice_one_middle xsv slice -i 500000 "$data" + run --index slice_one_middle_index xsv slice -i 500000 "$data" + run stats xsv stats "$data" + run --index stats_index xsv stats "$data" + run stats_everything xsv stats "$data" --everything + run --index stats_everything_index xsv stats "$data" --everything +} | column -t -s'	' |