path: root/benchsuite
author    Andrew Gallant <jamslam@gmail.com>    2016-09-11 01:05:36 -0400
committer Andrew Gallant <jamslam@gmail.com>    2016-09-11 01:05:36 -0400
commit    9bf7696ec8cacc74baaa4003cdfba0dab65245fd (patch)
tree      cefd7d96af71bf8c5753341ad2a2d5319f58cdbb /benchsuite
parent    cb0f8fd2fa7ed1770fc698ea2ac033e3bc7994d7 (diff)
Initial cut at a benchmark suite for CLI search tools. (0.0.17)
Diffstat (limited to 'benchsuite')
-rwxr-xr-x  benchsuite  918
1 files changed, 918 insertions, 0 deletions
diff --git a/benchsuite b/benchsuite
new file mode 100755
index 00000000..381b57fb
--- /dev/null
+++ b/benchsuite
@@ -0,0 +1,918 @@
+#!/usr/bin/env python3
+
+'''
+benchsuite is a benchmark runner for comparing command line search tools.
+'''
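+
+# Example usage (all of these flags are defined by the argument parser in
+# main() below):
+#
+#   ./benchsuite --download linux          # fetch and build the Linux corpus
+#   ./benchsuite 'linux_literal$'          # run benchmarks whose names match
+#   ./benchsuite --raw results.csv linux   # also dump raw samples to CSV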
+
+import argparse
+import csv
+import os
+import os.path as path
+from multiprocessing import cpu_count
+import re
+import statistics
+import subprocess
+import sys
+import time
+
+# Some constants for identifying the corpora we use to run tests.
+# We establish two very different kinds of corpora: a small number of large
+# files and a large number of small files. These are vastly different use
+# cases not only because of their performance characteristics, but also
+# because of the strategies each tool uses to increase the relevance of the
+# results it returns.
+
+SUBTITLES_DIR = 'subtitles'
+SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
+SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
+SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
+SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
+SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
+SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'
+
+LINUX_DIR = 'linux'
+LINUX_CLONE = 'git://github.com/BurntSushi/linux'
+
+
+def bench_linux_literal_default(suite_dir):
+ '''
+ Benchmark the speed of a literal using *default* settings.
+
+ This benchmark is purposefully unfair as a performance comparison,
+ but it is pedagogically useful for showing how each tool behaves
+ out of the box.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ # N.B. This is a purposefully unfair benchmark for illustrative purposes
+ # of how the default modes for each search tool differ.
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', pat]),
+ mkcmd('ag', ['ag', pat]),
+ # ucg reports the exact same matches as ag and rg even though it
+ # doesn't read gitignore files. Instead, it has a file whitelist
+ # that happens to match up exactly with the gitignores for this search.
+ mkcmd('ucg', ['ucg', pat]),
+ mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}),
+ mkcmd('pt', ['pt', pat]),
+ # sift reports an extra line here for a binary file matched.
+ mkcmd('sift', ['sift', pat]),
+ ])
+
+
+def bench_linux_literal(suite_dir):
+ '''
+ Benchmark the speed of a literal, attempting to be fair.
+
+ This tries to use the minimum set of options available in all tools
+ to test how fast they are. For example, it makes sure there is no
+ case insensitive matching and that line numbers are computed.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd('git grep', [
+ 'git', 'grep', '-I', '-n', pat,
+ ], env={'LC_ALL': 'C'}),
+ mkcmd('pt', ['pt', pat]),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_literal_casei(suite_dir):
+ '''
+ Benchmark the speed of a case insensitive literal search.
+
+ This is like the linux_literal benchmark, except we ask the
+ search tools to do case insensitive search.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
+ ]),
+ mkcmd('ag', ['ag', '-i', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
+ mkcmd('ucg', ['ucg', '-i', pat]),
+ mkcmd('git grep', [
+ 'git', 'grep', '-I', '-n', '-i', pat,
+ ], env={'LC_ALL': 'C'}),
+ # sift yields more matches than it should here. Specifically, it gets
+ # matches in Module.symvers and System.map in the repo root. Both of
+ # those files show up in the repo root's .gitignore file.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
+ ]),
+ ])
+
+
+def bench_linux_re_literal_suffix(suite_dir):
+ '''
+ Benchmark the speed of a literal inside a regex.
+
+ This, for example, inhibits a prefix byte optimization used
+ inside of Go's regex engine (relevant for sift and pt).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = '[A-Z]+_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_word(suite_dir):
+ r'''
+ Benchmark use of the -w ("match word") flag in each tool.
+
+ sift has a lot of trouble with this because the flag forces the
+ search into Go's regexp engine by surrounding the pattern with \b
+ assertions.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'PM_RESUME'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-w', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
+ ]),
+ mkcmd('ag', ['ag', '-s', '-w', pat]),
+ mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', '-w', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_greek(suite_dir):
+ '''
+ Benchmark matching of a Unicode category.
+
+ Only three tools (ripgrep, sift and pt) support this.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\p{Greek}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ # sift tries to search a bunch of PDF files and clutters up the
+ # results, even though --binary-skip is provided. They are excluded
+ # here explicitly, but don't have a measurable impact on performance.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip',
+ '--exclude-files', '.*',
+ '--exclude-files', '*.pdf',
+ pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_greek_casei(suite_dir):
+ '''
+ Benchmark matching of a Unicode category, case insensitively.
+
+ Only ripgrep gets this right (and it's still fast).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\p{Greek}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ # sift tries to search a bunch of PDF files and clutters up the
+ # results, even though --binary-skip is provided. They are excluded
+ # here explicitly, but don't have a measurable impact on performance.
+ mkcmd('sift', [
+ 'sift', '-n', '--binary-skip',
+ '--exclude-files', '.*',
+ '--exclude-files', '*.pdf',
+ pat,
+ ]),
+ ])
+
+
+def bench_linux_unicode_word(suite_dir):
+ r'''
+ Benchmark the Unicode-aware \w character class.
+
+ Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
+ this right. Everything else uses the standard ASCII interpretation
+ of \w.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\wAh'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', pat,
+ ]),
+ mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
+ mkcmd('ag-novcs (no Unicode)', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'en_US.UTF-8'},
+ ),
+ mkcmd(
+ 'git grep (no Unicode)',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift (no Unicode)', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_no_literal(suite_dir):
+ '''
+ Benchmark a regex that defeats all literal optimizations.
+
+ Most search patterns have some kind of literal in them, which
+ typically permits searches to take some shortcuts. Therefore, the
+ applicability of this benchmark is somewhat suspicious, but the
+ suite wouldn't feel complete without it.
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
+ mkcmd('rg-novcs (no Unicode)', [
+ 'rg', '--no-ignore', '-n', '(?-u)' + pat,
+ ]),
+ mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
+ mkcmd('ag-novcs (no Unicode)', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'en_US.UTF-8'},
+ ),
+ mkcmd(
+ 'git grep (no Unicode)',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ mkcmd('sift (no Unicode)', [
+ 'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
+ ]),
+ ])
+
+
+def bench_linux_alternates(suite_dir):
+ '''
+ Benchmark a small alternation of literals.
+
+ sift doesn't make the cut. It's more than 10x slower than the next
+ fastest result. The slowdown is likely because the Go regexp engine
+ doesn't do any literal optimizations for this case (there is no
+ common leading byte).
+ '''
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', pat,
+ ]),
+ mkcmd('ag', ['ag', '-s', pat]),
+ mkcmd('ag-novcs', [
+ 'ag', '--skip-vcs-ignores', '-s', pat,
+ ]),
+ mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ ])
+
+
+def bench_linux_alternates_casei(suite_dir):
+ 'Benchmark a small alternation of literals case insensitively.'
+ require(suite_dir, 'linux')
+ cwd = path.join(suite_dir, LINUX_DIR)
+ pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
+
+ def mkcmd(*args, **kwargs):
+ kwargs['cwd'] = cwd
+ return Command(*args, **kwargs)
+
+ return Benchmark(pattern=pat, commands=[
+ mkcmd('rg', ['rg', '-n', '-i', pat]),
+ mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
+ mkcmd('rg-novcs-mmap', [
+ 'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
+ ]),
+ mkcmd('ag', ['ag', '-i', pat]),
+ mkcmd('ag-novcs', [
+ 'ag', '--skip-vcs-ignores', '-i', pat,
+ ]),
+ mkcmd('ucg', ['ucg', '-i', pat]),
+ mkcmd(
+ 'git grep',
+ ['git', 'grep', '-E', '-I', '-n', '-i', pat],
+ env={'LC_ALL': 'C'},
+ ),
+ ])
+
+
+# BREADCRUMBS(burntsushi): We should benchmark an alternation for `linux` as
+# well.
+
+def bench_sherlock(suite_dir):
+ 'TODO: Fix this and add more single file benchmarks.'
+ require(suite_dir, 'subtitles-en')
+ en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME)
+ pat = 'Sherlock'
+
+ return Benchmark(pattern=pat, commands=[
+ Command('rg', ['rg', pat, en]),
+ Command('grep', ['grep', '-a', pat, en])
+ ])
+
+
+class MissingDependencies(Exception):
+ '''
+ A missing dependency exception.
+
+ This exception occurs when running a benchmark that requires a
+ particular corpus that isn't available.
+
+ :ivar list(str) missing_names:
+ A list of missing dependency names. These names correspond to
+ names that can be used with the --download flag.
+ '''
+ def __init__(self, missing_names):
+ self.missing_names = missing_names
+
+ def __str__(self):
+ return 'MissingDependencies(%s)' % repr(self.missing_names)
+
+
+class Benchmark(object):
+ '''
+ A single benchmark corresponding to a grouping of commands.
+
+ The main purpose of a benchmark is to compare the performance
+ characteristics of a group of commands.
+ '''
+
+ def __init__(self, name=None, pattern=None, commands=None,
+ warmup_count=1, count=3, line_count=True):
+ '''
+ Create a single benchmark.
+
+ A single benchmark is composed of a set of commands that are
+ benchmarked and compared against one another. A benchmark may
+ have multiple commands that use the same search tool (but
+ probably should have something differentiating them).
+
+ The grouping of commands is a purely human driven process.
+
+ By default, the output of every command is sent to /dev/null.
+ Other types of behavior are available via the methods defined
+ on this benchmark.
+
+ :param str name:
+ A human readable string denoting the name of this
+ benchmark.
+ :param str pattern:
+ The pattern that is used in search.
+ :param list(Command) commands:
+ A list of commands to initialize this benchmark with. More
+ commands may be added before running the benchmark.
+ :param int warmup_count:
+ The number of times to run each command before recording
+ samples.
+ :param int count:
+ The number of samples to collect from each command.
+ :param bool line_count:
+ When set, the lines of each search are counted and included
+ in the samples produced.
+ '''
+ self.name = name
+ self.pattern = pattern
+ self.commands = commands or []
+ self.warmup_count = warmup_count
+ self.count = count
+ self.line_count = line_count
+
+ def run(self):
+ '''
+ Runs this benchmark and returns the results.
+
+ :rtype: Result
+ '''
+ result = Result(self)
+ for cmd in self.commands:
+ # Do a warmup first.
+ for _ in range(self.warmup_count):
+ self.run_one(cmd)
+ for _ in range(self.count):
+ result.add(cmd, **self.run_one(cmd))
+ return result
+
+ def run_one(self, cmd):
+ '''
+ Runs the given command exactly once.
+
+ Returns an object that includes the time taken by the command.
+ If this benchmark was configured to count the number of lines
+ returned, then the line count is also returned.
+
+ :param Command cmd: The command to run.
+ :returns:
+ A dict with two fields, duration and line_count.
+ The duration is in fractional seconds and is guaranteed
+ to be available. The line_count is set to None unless
+ line counting is enabled, in which case it is the number
+ of lines in the search output.
+ :rtype: dict
+ '''
+ cmd.kwargs['stderr'] = subprocess.DEVNULL
+ if self.line_count:
+ cmd.kwargs['stdout'] = subprocess.PIPE
+ else:
+ cmd.kwargs['stdout'] = subprocess.DEVNULL
+
+ start = time.time()
+ completed = cmd.run()
+ end = time.time()
+
+ line_count = None
+ if self.line_count:
+ line_count = completed.stdout.count(b'\n')
+ return {
+ 'duration': end - start,
+ 'line_count': line_count,
+ }
+
+
+class Result(object):
+ '''
+ The result of running a benchmark.
+
+ Benchmark results consist of a set of samples, where each sample
+ corresponds to a single run of a single command in the benchmark.
+ Various statistics can be computed from these samples such as mean
+ and standard deviation.
+ '''
+ def __init__(self, benchmark):
+ '''
+ Create a new set of results, initially empty.
+
+ :param Benchmark benchmark:
+ The benchmark that produced these results.
+ '''
+ self.benchmark = benchmark
+ self.samples = []
+
+ def add(self, cmd, duration, line_count=None):
+ '''
+ Add a new sample to this result set.
+
+ :param Command cmd:
+ The command that produced this sample.
+ :param float duration:
+ The duration, in seconds, that the command took to run.
+ :param int line_count:
+ The number of lines in the search output. This is optional.
+ '''
+ self.samples.append({
+ 'cmd': cmd,
+ 'duration': duration,
+ 'line_count': line_count,
+ })
+
+ def fastest_sample(self):
+ '''
+ Returns the fastest recorded sample.
+ '''
+ return min(self.samples, key=lambda s: s['duration'])
+
+ def fastest_cmd(self):
+ '''
+ Returns the fastest command according to its mean duration.
+ '''
+ means = []
+ for cmd in self.benchmark.commands:
+ mean, _ = self.distribution_for(cmd)
+ means.append((cmd, mean))
+ return min(means, key=lambda tup: tup[1])[0]
+
+ def samples_for(self, cmd):
+ 'Returns an iterable of samples for cmd'
+ yield from (s for s in self.samples if s['cmd'].name == cmd.name)
+
+ def line_counts_for(self, cmd):
+ '''
+ Returns the line counts recorded for the given command.
+
+ :returns:
+ A set of the distinct line counts recorded for cmd
+ (samples without a line count are excluded).
+ '''
+ return {s['line_count'] for s in self.samples_for(cmd)
+ if s['line_count'] is not None}
+
+ def distribution_for(self, cmd):
+ '''
+ Returns the distribution (mean +/- std) of the given command.
+
+ :rtype: (float, float)
+ :returns:
+ A tuple containing the mean and standard deviation, in that
+ order.
+ '''
+ mean = statistics.mean(
+ s['duration'] for s in self.samples_for(cmd))
+ stdev = statistics.stdev(
+ s['duration'] for s in self.samples_for(cmd))
+ return mean, stdev
+
+
+class Command(object):
+ def __init__(self, name, cmd, *args, **kwargs):
+ '''
+ Create a new command that is run as part of a benchmark.
+
+ *args and **kwargs are passed directly to ``subprocess.run``.
+ An exception to this is stdin/stdout/stderr. Output
+ redirection is completely controlled by the benchmark harness.
+ Trying to set them here will trigger an assert.
+
+ :param str name:
+ The human readable name of this command. This is
+ particularly useful if the same search tool is used
+ multiple times in the same benchmark with different
+ arguments.
+ :param list(str) cmd:
+ The command to run as a list of arguments (including the
+ command name itself).
+ '''
+ assert 'stdin' not in kwargs
+ assert 'stdout' not in kwargs
+ assert 'stderr' not in kwargs
+ self.name = name
+ self.cmd = cmd
+ self.args = args
+ self.kwargs = kwargs
+
+ def run(self):
+ '''
+ Runs this command and returns its status.
+
+ :rtype: subprocess.CompletedProcess
+ '''
+ return subprocess.run(self.cmd, *self.args, **self.kwargs)
+
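+# A minimal sketch of how the classes above fit together (illustrative only;
+# the real benchmarks are built by the bench_* functions near the top of this
+# file, and the corpus path below is a placeholder):
+#
+#   cmd = Command('rg', ['rg', 'PM_RESUME'], cwd='/path/to/linux')
+#   bench = Benchmark(name='example', pattern='PM_RESUME', commands=[cmd])
+#   result = bench.run()
+#   mean, stdev = result.distribution_for(cmd)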
+
+def eprint(*args, **kwargs):
+ 'Like print, but to stderr.'
+ kwargs['file'] = sys.stderr
+ print(*args, **kwargs)
+
+
+def run_cmd(cmd, *args, **kwargs):
+ '''
+ Print the command to stderr and run it.
+
+ If the command fails, a CalledProcessError is raised (check=True).
+ '''
+ eprint('# %s' % ' '.join(cmd))
+ kwargs['check'] = True
+ return subprocess.run(cmd, *args, **kwargs)
+
+
+def require(suite_dir, *names):
+ '''
+ Declare a dependency on the given names for a benchmark.
+
+ If any dependency doesn't exist, then a MissingDependencies
+ exception is raised.
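+
+ For example, ``require(suite_dir, 'subtitles-en')`` checks for the corpus
+ by calling ``has_subtitles_en(suite_dir)``.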
+ '''
+ errs = []
+ for name in names:
+ fun_name = name.replace('-', '_')
+ if not globals()['has_%s' % fun_name](suite_dir):
+ errs.append(name)
+ if len(errs) > 0:
+ raise MissingDependencies(errs)
+
+
+def download_linux(suite_dir):
+ 'Download and build the Linux kernel.'
+ checkout_dir = path.join(suite_dir, LINUX_DIR)
+ if not os.path.isdir(checkout_dir):
+ # Clone from my fork so that we always get the same corpus *and* still
+ # do a shallow clone. Shallow clones are much, much cheaper than
+ # full clones.
+ run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir])
+ # We want to build the kernel because the process of building it produces
+ # a lot of junk in the repository that a search tool probably shouldn't
+ # touch.
+ if not os.path.exists(path.join(checkout_dir, 'vmlinux')):
+ eprint('# Building Linux kernel...')
+ run_cmd(['make', 'defconfig'], cwd=checkout_dir)
+ run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)
+
+
+def has_linux(suite_dir):
+ 'Returns true if we believe the Linux kernel is built.'
+ checkout_dir = path.join(suite_dir, LINUX_DIR)
+ return path.exists(path.join(checkout_dir, 'vmlinux'))
+
+
+def download_subtitles_en(suite_dir):
+ 'Download and decompress English subtitles.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
+ en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
+
+ if not os.path.isdir(subtitle_dir):
+ os.makedirs(subtitle_dir)
+ if not os.path.exists(en_path):
+ if not os.path.exists(en_path_gz):
+ run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
+ run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir)
+
+
+def has_subtitles_en(suite_dir):
+ 'Returns true if English subtitles have been downloaded.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME))
+
+
+def download_subtitles_ru(suite_dir):
+ 'Download and decompress Russian subtitles.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
+ ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)
+
+ if not os.path.isdir(subtitle_dir):
+ os.makedirs(subtitle_dir)
+ if not os.path.exists(ru_path):
+ if not os.path.exists(ru_path_gz):
+ run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
+ run_cmd(['gunzip', ru_path_gz], cwd=subtitle_dir)
+
+
+def has_subtitles_ru(suite_dir):
+ 'Returns true if Russian subtitles have been downloaded.'
+ subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
+ return path.exists(path.join(subtitle_dir, SUBTITLES_RU_NAME))
+
+
+def download(suite_dir, choices):
+ '''
+ Download choices into suite_dir.
+
+ Specifically, choices specifies a list of corpora to fetch.
+
+ :param str suite_dir:
+ The directory in which to download corpora.
+ :param list(str) choices:
+ A list of corpora to download. Available choices are:
+ all, linux, subtitles-en, subtitles-ru.
+ '''
+ for choice in choices:
+ if choice == 'linux':
+ download_linux(suite_dir)
+ elif choice == 'subtitles-en':
+ download_subtitles_en(suite_dir)
+ elif choice == 'subtitles-ru':
+ download_subtitles_ru(suite_dir)
+ elif choice == 'all':
+ download_linux(suite_dir)
+ download_subtitles_en(suite_dir)
+ download_subtitles_ru(suite_dir)
+ else:
+ eprint('Unrecognized download choice: %s' % choice)
+ sys.exit(1)
+
+
+def collect_benchmarks(suite_dir, filter_pat=None):
+ '''
+ Return an iterable of all runnable benchmarks.
+
+ :param str suite_dir:
+ The directory containing corpora.
+ :param str filter_pat:
+ A single regular expression that is used to filter benchmarks
+ by their name. When not specified, all benchmarks are run.
+ :returns:
+ An iterable over all runnable benchmarks. If a benchmark
+ requires corpora that are missing, then a log message is
+ emitted to stderr and it is not yielded.
+ '''
+ for fun in sorted(globals()):
+ if not fun.startswith('bench_'):
+ continue
+ name = re.sub('^bench_', '', fun)
+ if filter_pat is not None and not re.search(filter_pat, name):
+ continue
+ try:
+ benchmark = globals()[fun](suite_dir)
+ except MissingDependencies as e:
+ eprint(
+ 'missing: %s, skipping benchmark %s (try running with: %s)' % (
+ ', '.join(e.missing_names),
+ name,
+ ' '.join(['--download %s' % n for n in e.missing_names]),
+ ))
+ continue
+ benchmark.name = name
+ yield benchmark
+
+
+def main():
+ p = argparse.ArgumentParser(
+ description='Command line search tool benchmark suite.')
+ p.add_argument(
+ '--dir', metavar='PATH', default=os.getcwd(),
+ help='The directory in which to download data and perform searches.')
+ p.add_argument(
+ '--download', metavar='CORPUS', action='append',
+ choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
+ help='Download and prepare corpus data, then exit without running '
+ 'any benchmarks. Note that this command is intended to be '
+ 'idempotent. WARNING: This downloads over a gigabyte of data, '
+ 'and also includes building the Linux kernel. If "all" is used '
+ 'then the total uncompressed size is around 13 GB.')
+ p.add_argument(
+ '-f', '--force', action='store_true',
+ help='Overwrite existing files if there is a conflict.')
+ p.add_argument(
+ '--list', action='store_true',
+ help='List available benchmarks by name.')
+ p.add_argument(
+ '--raw', metavar='PATH',
+ help='Dump raw data (all samples collected) in CSV format to the '
+ 'file path provided.')
+ p.add_argument(
+ 'bench', metavar='PAT', nargs='?',
+ help='A regex pattern that will only run benchmarks that match.')
+ args = p.parse_args()
+
+ if args.download is not None and len(args.download) > 0:
+ download(args.dir, args.download)
+ sys.exit(0)
+
+ if not path.isdir(args.dir):
+ os.makedirs(args.dir)
+ if args.raw is not None and path.exists(args.raw) and not args.force:
+ eprint('File %s already exists (delete it or use --force)' % args.raw)
+ sys.exit(1)
+ raw_handle, raw_csv_wtr = None, None
+ if args.raw is not None:
+ fields = [
+ 'benchmark', 'warmup_iter', 'iter',
+ 'name', 'command', 'duration', 'lines', 'env',
+ ]
+ raw_handle = open(args.raw, 'w+')
+ raw_csv_wtr = csv.DictWriter(raw_handle, fields)
+ raw_csv_wtr.writeheader()
+
+ benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
+ for i, b in enumerate(benchmarks):
+ result = b.run()
+ fastest_cmd = result.fastest_cmd()
+ fastest_sample = result.fastest_sample()
+ max_name_len = max(len(cmd.name) for cmd in b.commands)
+
+ if i > 0:
+ print()
+ header = '%s (pattern: %s)' % (b.name, b.pattern)
+ print('%s\n%s' % (header, '-' * len(header)))
+ for cmd in b.commands:
+ name = cmd.name
+ mean, stdev = result.distribution_for(cmd)
+ line_counts = result.line_counts_for(cmd)
+ show_fast_cmd, show_line_counts = '', ''
+ if fastest_cmd.name == cmd.name:
+ show_fast_cmd = '*'
+ if fastest_sample['cmd'].name == cmd.name:
+ name += '*'
+ if len(line_counts) > 0:
+ counts = map(str, line_counts)
+ show_line_counts = ' (lines: %s)' % ', '.join(counts)
+ fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
+ print(fmt.format(
+ name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
+ mean=mean, stdev=stdev, lines=show_line_counts))
+ sys.stdout.flush()
+
+ if raw_csv_wtr is not None:
+ for sample in result.samples:
+ cmd, duration = sample['cmd'], sample['duration']
+ env = ' '.join(['%s=%s' % (k, v)
+ for k, v in cmd.kwargs.get('env', {}).items()])
+ raw_csv_wtr.writerow({
+ 'benchmark': b.name,
+ 'warmup_iter': b.warmup_count,
+ 'iter': b.count,
+ 'name': sample['cmd'].name,
+ 'command': ' '.join(cmd.cmd),
+ 'duration': duration,
+ 'lines': sample['line_count'] or '',
+ 'env': env,
+ })
+ raw_handle.flush()
+
+
+if __name__ == '__main__':
+ main()