path: root/benchsuite
author    Andrew Gallant <jamslam@gmail.com>    2016-09-11 01:05:36 -0400
committer Andrew Gallant <jamslam@gmail.com>    2016-09-11 01:05:36 -0400
commit    9bf7696ec8cacc74baaa4003cdfba0dab65245fd (patch)
tree      cefd7d96af71bf8c5753341ad2a2d5319f58cdbb /benchsuite
parent    cb0f8fd2fa7ed1770fc698ea2ac033e3bc7994d7 (diff)
Initial cut at a benchmark suite for CLI search tools. (0.0.17)
Diffstat (limited to 'benchsuite')
-rwxr-xr-x  benchsuite  918
1 files changed, 918 insertions, 0 deletions
diff --git a/benchsuite b/benchsuite
new file mode 100755
index 00000000..381b57fb
--- /dev/null
+++ b/benchsuite
@@ -0,0 +1,918 @@
+#!/usr/bin/env python3
+
+'''
+benchsuite is a benchmark runner for comparing command line search tools.
+'''
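+
+# Example usage (all of these flags are defined by the argument parser in
+# main() below):
+#
+#   ./benchsuite --download linux          # fetch and build the Linux corpus
+#   ./benchsuite 'linux_literal$'          # run benchmarks whose names match
+#   ./benchsuite --raw results.csv linux   # also dump raw samples to CSV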
+
+import argparse
+import csv
+import os
+import os.path as path
+from multiprocessing import cpu_count
+import re
+import statistics
+import subprocess
+import sys
+import time
+
+# Some constants for identifying the corpora we use to run tests.
+# We establish two very different kinds of corpora: a small number of large
+# files and a large number of small files. These are vastly different use
+# cases not only because of their performance characteristics, but also
+# because of the strategies each tool uses to increase the relevance of the
+# results it returns.
+
+SUBTITLES_DIR = 'subtitles'
+SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
+SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
+SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
+SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
+SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
+SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'
+
+LINUX_DIR = 'linux'
+LINUX_CLONE = 'git://github.com/BurntSushi/linux'
+
+
+def bench_linux_literal_default(suite_dir):
+ '''
+ Benchmark the speed of a literal using *default* settings.
+
+ This benchmark is purposefully unfair as a performance comparison,
+ but it is pedagogically useful for showing how each tool behaves
+ out of the box.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ # N.B. This is a purposefully unfair benchmark for illustrative purposes
+ # of how the default modes for each search tool differ.
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', pat]),
+ mkcmd('ag', ['ag', pat]),
+ # ucg reports the exact same matches as ag and rg even though it
+ # doesn't read gitignore files. Instead, it has a file whitelist
+ # that happens to match up exactly with the gitignores for this search.
+ mkcmd('ucg', ['ucg', pat]),
+ mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}),
+ mkcmd('pt', ['pt', pat]),
+ # sift reports an extra line here for a binary file matched.
+ mkcmd('sift', ['sift', pat]),
+ ])
+
+
+def bench_linux_literal(suite_dir):
+ '''
+ Benchmark the speed of a literal, attempting to be fair.
+
+ This tries to use the minimum set of options available in all tools
+ to test how fast they are. For example, it makes sure there is no
+ case insensitive matching and that line numbers are computed.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd('git grep', [
+ 'git', 'grep', '-I', '-n', pat,
+ ], env={'LC_ALL': 'C'}),
+ mkcmd('pt', ['pt', pat]),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_literal_casei(suite_dir):
+ '''
+ Benchmark the speed of a case insensitive literal search.
+
+ This is like the linux_literal benchmark, except we ask the
+ search tools to do case insensitive search.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
+ ]),
+ mkcmd('ag', ['ag', '-i', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
+ mkcmd('ucg', ['ucg', '-i', pat]),
+ mkcmd('git grep', [
+ 'git', 'grep', '-I', '-n', '-i', pat,
+ ], env={'LC_ALL': 'C'}),
+ # sift yields more matches than it should here. Specifically, it gets
+ # matches in Module.symvers and System.map in the repo root. Both of
+ # those files show up in the repo root's .gitignore file.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
+ ]),
+ ])
+
+
+def bench_linux_re_literal_suffix(suite_dir):
+ '''
+ Benchmark the speed of a literal inside a regex.
+
+ This, for example, inhibits a prefix byte optimization used
+ inside of Go's regex engine (relevant for sift and pt).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = '[A-Z]+_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_word(suite_dir):
+ r'''
+ Benchmark use of the -w ("match word") flag in each tool.
+
+ sift has a lot of trouble with this because the flag forces the
+ search into Go's regexp engine by surrounding the pattern with \b
+ assertions.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-w', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
+ ]),
+ mkcmd('ag', ['ag', '-s', '-w', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', '-w', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_greek(suite_dir):
+ '''
+ Benchmark matching of a Unicode category.
+
+ Only three tools (ripgrep, sift and pt) support this.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\p{Greek}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ # sift tries to search a bunch of PDF files and clutters up the
+ # results, even though --binary-skip is provided. They are excluded
+ # here explicitly, but don't have a measurable impact on performance.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip',
+ '--exclude-files', '.*',
+ '--exclude-files', '*.pdf',
+ pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_greek_casei(suite_dir):
+ '''
+ Benchmark matching of a Unicode category, case insensitively.
+
+ Only ripgrep gets this right (and it's still fast).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\p{Greek}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ # sift tries to search a bunch of PDF files and clutters up the
+ # results, even though --binary-skip is provided. They are excluded
+ # here explicitly, but don't have a measurable impact on performance.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip',
+ '--exclude-files', '.*',
+ '--exclude-files', '*.pdf',
+ pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_word(suite_dir):
+ r'''
+ Benchmark the Unicode-aware \w character class.
+
+ Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
+ this right. Everything else uses the standard ASCII interpretation
+ of \w.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\wAh'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', pat,
+ ]),
+ mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
+ mkcmd('ag-novcs (no Unicode)', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'en_US.UTF-8'},
+ ),
+ mkcmd(
+ 'git grep (no Unicode)',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift (no Unicode)', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_no_literal(suite_dir):
+ '''
+ Benchmark a regex that defeats all literal optimizations.
+
+ Most search patterns have some kind of literal in them, which
+ typically permits searches to take some shortcuts. Therefore, the
+ applicability of this benchmark is somewhat suspicious, but the
+ suite wouldn't feel complete without it.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
+ mkcmd('rg-novcs (no Unicode)', [
+ 'rg', '--no-ignore', '-n', '(?-u)' + pat,
+ ]),
+ mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
+ mkcmd('ag-novcs (no Unicode)', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'en_US.UTF-8'},
+ ),
+ mkcmd(
+ 'git grep (no Unicode)',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift (no Unicode)', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_alternates(suite_dir):
+ '''
+ Benchmark a small alternation of literals.
+
+ sift doesn't make the cut. It's more than 10x slower than the next
+ fastest result. The slowdown is likely because the Go regexp engine
+ doesn't do any literal optimizations for this case (there is no
+ common leading byte).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', pat,
+ ]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ ])
+
+
+def bench_linux_alternates_casei(suite_dir):
+ 'Benchmark a small alternation of literals case insensitively.'
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
+ ]),
+ mkcmd('ag', ['ag', '-i', pat]),
+ mkcmd('ag-novcs', [
+ 'ag', '--skip-vcs-ignores', '-i', pat,
+ ]),
+ mkcmd('ucg', ['ucg', '-i', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', '-i', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ ])
+
+
+# BREADCRUMBS(burntsushi): We should benchmark an alternation for `linux` as
+# well.
+
+def bench_sherlock(suite_dir):
+ 'TODO: Fix this and add more single file benchmarks.'
+ require(suite_dir, 'subtitles-en')
+ en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME)
+ pat = 'Sherlock'
+
+ return Benchmark(pattern=pat, commands=[
+ Command('rg', ['rg', pat, en]),
+ Command('grep', ['grep', '-a', pat, en])
+ ])
+
+
+class MissingDependencies(Exception):
+ '''
+ A missing dependency exception.
+
+ This exception occurs when running a benchmark that requires a
+ particular corpus that isn't available.
+
+ :ivar list(str) missing_names:
+ A list of missing dependency names. These names correspond to
+ names that can be used with the --download flag.
+ '''
+ def __init__(self, missing_names):
+ self.missing_names = missing_names
+
+ def __str__(self):
+ return 'MissingDependencies(%s)' % repr(self.missing_names)
+
+
+class Benchmark(object):
+ '''
+ A single benchmark corresponding to a grouping of commands.
+
+ The main purpose of a benchmark is to compare the performance
+ characteristics of a group of commands.
+ '''
+
+ def __init__(self, name=None, pattern=None, commands=None,
+ warmup_count=1, count=3, line_count=True):
+ '''
+ Create a single benchmark.
+
+ A single benchmark is composed of a set of commands that are
+ benchmarked and compared against one another. A benchmark may
+ have multiple commands that use the same search tool (but
+ probably should have something differentiating them).
+
+ The grouping of commands is a purely human driven process.
+
+ By default, the output of every command is sent to /dev/null.
+ Other types of behavior are available via the methods defined
+ on this benchmark.
+
+ :param str name:
+ A human readable string denoting the name of this
+ benchmark.
+ :param str pattern:
+ The pattern that is used in search.
+ :param list(Command) commands:
+ A list of commands to initialize this benchmark with. More
+ commands may be added before running the benchmark.
+ :param int warmup_count:
+ The number of times to run each command before recording
+ samples.
+ :param int count:
+ The number of samples to collect from each command.
+ :param bool line_count:
+ When set, the lines of each search are counted and included
+ in the samples produced.
+ '''
+ self.name = name
+ self.pattern = pattern
+ self.commands = commands or []
+ self.warmup_count = warmup_count
+ self.count = count
+ self.line_count = line_count
+
+ def run(self):
+ '''
+ Runs this benchmark and returns the results.
+
+ :rtype: Result
+ '''
+ result = Result(self)
+ for cmd in self.commands:
+ # Do a warmup first.
+ for _ in range(self.warmup_count):
+ self.run_one(cmd)
+ for _ in range(self.count):
+ result.add(cmd, **self.run_one(cmd))
+ return result
+
+ def run_one(self, cmd):
+ '''
+ Runs the given command exactly once.
+
+ Returns an object that includes the time taken by the command.
+ If this benchmark was configured to count the number of lines
+ returned, then the line count is also returned.
+
+ :param Command cmd: The command to run.
+ :returns:
+ A dict with two fields, duration and line_count.
+ The duration is in fractional seconds and is guaranteed
+ to be available. The line_count is set to None unless
+ line counting is enabled, in which case it is the number
+ of lines in the search output.
+ :rtype: dict
+ '''
+ cmd.kwargs['stderr'] = subprocess.DEVNULL
+ if self.line_count:
+ cmd.kwargs['stdout'] = subprocess.PIPE
+ else:
+ cmd.kwargs['stdout'] = subprocess.DEVNULL
+
+ start = time.time()
+ completed = cmd.run()
+ end = time.time()
+
+ line_count = None
+ if self.line_count:
+ line_count = completed.stdout.count(b'\n')
+ return {
+ 'duration': end - start,
+ 'line_count': line_count,
+ }
+
+
+class Result(object):
+ '''
+ The result of running a benchmark.
+
+ Benchmark results consist of a set of samples, where each sample
+ corresponds to a single run of a single command in the benchmark.
+ Various statistics can be computed from these samples such as mean
+ and standard deviation.
+ '''
+ def __init__(self, benchmark):
+ '''
+ Create a new set of results, initially empty.
+
+ :param Benchmark benchmark:
+ The benchmark that produced these results.
+ '''
+ self.benchmark = benchmark
+ self.samples = []
+
+ def add(self, cmd, duration, line_count=None):
+ '''
+ Add a new sample to this result set.
+
+ :param Command cmd:
+ The command that produced this sample.
+ :param float duration:
+ The duration, in seconds, that the command took to run.
+ :param int line_count:
+ The number of lines in the search output. This is optional.
+ '''
+ self.samples.append({
+ 'cmd': cmd,
+ 'duration': duration,
+ 'line_count': line_count,
+ })
+
+ def fastest_sample(self):
+ '''
+ Returns the fastest recorded sample.
+ '''
+ return min(self.samples, key=lambda s: s['duration'])
+
+ def fastest_cmd(self):
+ '''
+ Returns the fastest command according to its mean duration.
+ '''
+ means = []
+ for cmd in self.benchmark.commands:
+ mean, _ = self.distribution_for(cmd)
+ means.append((cmd, mean))
+ return min(means, key=lambda tup: tup[1])[0]
+
+ def samples_for(self, cmd):
+ 'Returns an iterable of samples for cmd'
+ yield from (s for s in self.samples if s['cmd'].name == cmd.name)
+
+ def line_counts_for(self, cmd):
+ '''
+ Returns the line counts recorded for the given command.
+
+ :returns:
+ A set of the distinct line counts recorded for cmd
+ (samples without a line count are excluded).
+ '''
+ return {s['line_count'] for s in self.samples_for(cmd)
+ if s['line_count'] is not None}
+
+ def distribution_for(self, cmd):
+ '''
+ Returns the distribution (mean +/- std) of the given command.
+
+ :rtype: (float, float)
+ :returns:
+ A tuple containing the mean and standard deviation, in that
+ order.
+ '''
+ mean = statistics.mean(
+ s['duration'] for s in self.samples_for(cmd))
+ stdev = statistics.stdev(
+ s['duration'] for s in self.samples_for(cmd))
+ return mean, stdev
+
+
+class Command(object):
+ def __init__(self, name, cmd, *args, **kwargs):
+ '''
+ Create a new command that is run as part of a benchmark.
+
+ *args and **kwargs are passed directly to ``subprocess.run``.
+ An exception to this is stdin/stdout/stderr. Output
+ redirection is completely controlled by the benchmark harness.
+ Trying to set them here will trigger an assert.
+
+ :param str name:
+ The human readable name of this command. This is
+ particularly useful if the same search tool is used
+ multiple times in the same benchmark with different
+ arguments.
+ :param list(str) cmd:
+ The command to run as a list of arguments (including the
+ command name itself).
+ '''
+ assert 'stdin' not in kwargs
+ assert 'stdout' not in kwargs
+ assert 'stderr' not in kwargs
+ self.name = name
+ self.cmd = cmd
+ self.args = args
+ self.kwargs = kwargs
+
+ def run(self):
+ '''
+ Runs this command and returns its status.
+
+ :rtype: subprocess.CompletedProcess
+ '''
+ return subprocess.run(self.cmd, *self.args, **self.kwargs)
+
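+# A minimal sketch of how the classes above fit together (illustrative only;
+# the real benchmarks are built by the bench_* functions near the top of this
+# file, and the corpus path below is a placeholder):
+#
+#   cmd = Command('rg', ['rg', 'PM_RESUME'], cwd='/path/to/linux')
+#   bench = Benchmark(name='example', pattern='PM_RESUME', commands=[cmd])
+#   result = bench.run()
+#   mean, stdev = result.distribution_for(cmd)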
+
+def eprint(*args, **kwargs):
+ 'Like print, but to stderr.'
+ kwargs['file'] = sys.stderr
+ print(*args, **kwargs)
+
+
+def run_cmd(cmd, *args, **kwargs):
+ '''
+ Print the command to stderr and run it.
+
+ If the command fails, a CalledProcessError is raised (check=True).
+ '''
+ eprint('# %s' % ' '.join(cmd))
+ kwargs['check'] = True
+ return subprocess.run(cmd, *args, **kwargs)
+
+
+def require(suite_dir, *names):
+ '''
+ Declare a dependency on the given names for a benchmark.
+
+ If any dependency doesn't exist, then a MissingDependencies
+ exception is raised.
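+
+ For example, ``require(suite_dir, 'subtitles-en')`` checks for the corpus
+ by calling ``has_subtitles_en(suite_dir)``.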
+ '''
+ errs = []
+ for name in names:
+ fun_name = name.replace('-', '_')
+ if not globals()['has_%s' % fun_name](suite_dir):
+ errs.append(name)
+ if len(errs) > 0:
+ raise MissingDependencies(errs)
+
+
+def download_linux(suite_dir):
+ 'Download and build the Linux kernel.'
+ checkout_dir = path.join(suite_dir, LINUX_DIR)
+ if not os.path.isdir(checkout_dir):
+ # Clone from my fork so that we always get the same corpus *and* still
+ # do a shallow clone. Shallow clones are much, much cheaper than
+ # full clones.
+ run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir])
+ # We want to build the kernel because the process of building it produces
+ # a lot of junk in the repository that a search tool probably shouldn't
+ # touch.
+ if not os.path.exists(path.join(checkout_dir, 'vmlinux')):
+ eprint('# Building Linux kernel...')
+ run_cmd(['make', 'defconfig'], cwd=checkout_dir)
+ run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)
+
+
+def has_linux(suite_dir):
+ 'Returns true if we believe the Linux kernel is built.'
+ checkout_dir = path.join(suite_dir, LINUX_DIR)
+ return path.exists(path.join(checkout_dir, 'vmlinux'))
+
+
+def download_subtitles_en(suite_dir):
+ 'Download and decompress English subtitles.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
+ en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
+
+ if not os.path.isdir(subtitle_dir):
+ os.makedirs(subtitle_dir)
+ if not os.path.exists(en_path):
+ if not os.path.exists(en_path_gz):
+ run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
+ run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir)
+
+
+def has_subtitles_en(suite_dir):
+ 'Returns true if English subtitles have been downloaded.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME))
+
+
+def download_subtitles_ru(suite_dir):
+ 'Download and decompress Russian subtitles.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
+ ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)
+
+ if not os.path.isdir(subtitle_dir):
+ os.makedirs(subtitle_dir)
+ if not os.path.exists(ru_path):
+ if not os.path.exists(ru_path_gz):
+ run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
+ run_cmd(['gunzip', ru_path_gz], cwd=subtitle_dir)
+
+
+def has_subtitles_ru(suite_dir):
+ 'Returns true if Russian subtitles have been downloaded.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ return path.exists(path.join(subtitle_dir, SUBTITLES_RU_NAME))
+
+
+def download(suite_dir, choices):
+ '''
+ Download choices into suite_dir.
+
+ Specifically, choices specifies a list of corpora to fetch.
+
+ :param str suite_dir:
+ The directory in which to download corpora.
+ :param list(str) choices:
+ A list of corpora to download. Available choices are:
+ all, linux, subtitles-en, subtitles-ru.
+ '''
+ for choice in choices:
+ if choice == 'linux':
+ download_linux(suite_dir)
+ elif choice == 'subtitles-en':
+ download_subtitles_en(suite_dir)
+ elif choice == 'subtitles-ru':
+ download_subtitles_ru(suite_dir)
+ elif choice == 'all':
+ download_linux(suite_dir)
+ download_subtitles_en(suite_dir)
+ download_subtitles_ru(suite_dir)
+ else:
+ eprint('Unrecognized download choice: %s' % choice)
+ sys.exit(1)
+
+
+def collect_benchmarks(suite_dir, filter_pat=None):
+ '''
+ Return an iterable of all runnable benchmarks.
+
+ :param str suite_dir:
+ The directory containing corpora.
+ :param str filter_pat:
+ A single regular expression that is used to filter benchmarks
+ by their name. When not specified, all benchmarks are run.
+ :returns:
+ An iterable over all runnable benchmarks. If a benchmark
+ requires corpora that are missing, then a log message is
+ emitted to stderr and it is not yielded.
+ '''
+ for fun in sorted(globals()):
+ if not fun.startswith('bench_'):
+ continue
+ name = re.sub('^bench_', '', fun)
+ if filter_pat is not None and not re.search(filter_pat, name):
+ continue
+ try:
+ benchmark = globals()[fun](suite_dir)
+ except MissingDependencies as e:
+ eprint(
+ 'missing: %s, skipping benchmark %s (try running with: %s)' % (
+ ', '.join(e.missing_names),
+ name,
+ ' '.join(['--download %s' % n for n in e.missing_names]),
+ ))
+ continue
+ benchmark.name = name
+ yield benchmark
+
+
+def main():
+ p = argparse.ArgumentParser(
+ description='Command line search tool benchmark suite.')
+ p.add_argument(
+ '--dir', metavar='PATH', default=os.getcwd(),
+ help='The directory in which to download data and perform searches.')
+ p.add_argument(
+ '--download', metavar='CORPUS', action='append',
+ choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
+ help='Download and prepare corpus data, then exit without running '
+ 'any benchmarks. Note that this command is intended to be '
+ 'idempotent. WARNING: This downloads over a gigabyte of data, '
+ 'and also includes building the Linux kernel. If "all" is used '
+ 'then the total uncompressed size is around 13 GB.')
+ p.add_argument(
+ '-f', '--force', action='store_true',
+ help='Overwrite existing files if there is a conflict.')
+ p.add_argument(
+ '--list', action='store_true',
+ help='List available benchmarks by name.')
+ p.add_argument(
+ '--raw', metavar='PATH',
+ help='Dump raw data (all samples collected) in CSV format to the '
+ 'file path provided.')
+ p.add_argument(
+ 'bench', metavar='PAT', nargs='?',
+ help='A regex pattern that will only run benchmarks that match.')
+ args = p.parse_args()
+
+ if args.download is not None and len(args.download) > 0:
+ download(args.dir, args.download)
+ sys.exit(0)
+
+ if not path.isdir(args.dir):
+ os.makedirs(args.dir)
+ if args.raw is not None and path.exists(args.raw) and not args.force:
+ eprint('File %s already exists (delete it or use --force)' % args.raw)
+ sys.exit(1)
+ raw_handle, raw_csv_wtr = None, None
+ if args.raw is not None:
+ fields = [
+ 'benchmark', 'warmup_iter', 'iter',
+ 'name', 'command', 'duration', 'lines', 'env',
+ ]
+ raw_handle = open(args.raw, 'w+')
+ raw_csv_wtr = csv.DictWriter(raw_handle, fields)
+ raw_csv_wtr.writeheader()
+
+ benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
+ for i, b in enumerate(benchmarks):
+ result = b.run()
+ fastest_cmd = result.fastest_cmd()
+ fastest_sample = result.fastest_sample()
+ max_name_len = max(len(cmd.name) for cmd in b.commands)
+
+ if i > 0:
+ print()
+ header = '%s (pattern: %s)' % (b.name, b.pattern)
+ print('%s\n%s' % (header, '-' * len(header)))
+ for cmd in b.commands:
+ name = cmd.name
+ mean, stdev = result.distribution_for(cmd)
+ line_counts = result.line_counts_for(cmd)
+ show_fast_cmd, show_line_counts = '', ''
+ if fastest_cmd.name == cmd.name:
+ show_fast_cmd = '*'
+ if fastest_sample['cmd'].name == cmd.name:
+ name += '*'
+ if len(line_counts) > 0:
+ counts = map(str, line_counts)
+ show_line_counts = ' (lines: %s)' % ', '.join(counts)
+ fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
+ print(fmt.format(
+ name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
+ mean=mean, stdev=stdev, lines=show_line_counts))
+ sys.stdout.flush()
+
+ if raw_csv_wtr is not None:
+ for sample in result.samples:
+ cmd, duration = sample['cmd'], sample['duration']
+ env = ' '.join(['%s=%s' % (k, v)
+ for k, v in cmd.kwargs.get('env', {}).items()])
+ raw_csv_wtr.writerow({
+ 'benchmark': b.name,
+ 'warmup_iter': b.warmup_count,
+ 'iter': b.count,
+ 'name': sample['cmd'].name,
+ 'command': ' '.join(cmd.cmd),
+ 'duration': duration,
+ 'lines': sample['line_count'] or '',
+ 'env': env,
+ })
+ raw_handle.flush()
+
+
+if __name__ == '__main__':
+ main()