#!/usr/bin/env python3

'''
benchsuite is a benchmark runner for comparing command line search tools.
'''

import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import shutil
import statistics
import subprocess
import sys
import time

# Some constants for identifying the corpora we use to run tests.
# We establish two very different kinds of corpora: a small number of large
# files and a large number of small files. These are vastly different use
# cases not only because of their performance characteristics, but also the
# strategies used to increase the relevance of results returned.

SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'  # noqa
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'  # noqa

LINUX_DIR = 'linux'
LINUX_CLONE = 'git://github.com/BurntSushi/linux'

# Grep takes locale settings from the environment. There is a *substantial*
# performance impact for enabling Unicode, so we need to handle this
# explicitly in our benchmarks.
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}

# Sift tries really hard to search everything by default. In our code search
# benchmarks, we don't want that.
SIFT = [
    'sift',
    '--binary-skip',
    '--exclude-files', '.*',
    '--exclude-files', '*.pdf',
]


def bench_linux_literal_default(suite_dir):
    '''
    Benchmark the speed of a literal using *default* settings.

    This is a purposefully unfair benchmark for use in performance analysis,
    but it is pedagogically useful to demonstrate how default behaviors
    differ.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', pat]),
        mkcmd('ag', ['ag', pat]),
        # ucg reports the exact same matches as ag and rg even though it
        # doesn't read gitignore files. Instead, it has a file whitelist
        # that happens to match up exactly with the gitignores for this
        # search.
        mkcmd('ucg', ['ucg', pat]),
        mkcmd('pt', ['pt', pat]),
        # sift reports an extra line here for a binary file matched.
        mkcmd('sift', ['sift', pat]),
        # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
        # default, but I'd guess it to be on most desktop systems.
        mkcmd('git grep', [
            'git', 'grep', pat,
        ], env={'LC_ALL': 'en_US.UTF-8'}),
    ])
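
# All of the bench_* functions below follow the same recipe: declare the
# corpora they need with require(), build a list of Command objects (usually
# via a small mkcmd helper that pins the working directory when searching a
# checkout) and wrap them in a Benchmark. collect_benchmarks() discovers
# these functions by their 'bench_' prefix. A minimal sketch, using a
# hypothetical tool named `mytool` in place of a real binary:
#
#     def bench_linux_mytool(suite_dir):
#         require(suite_dir, 'linux')
#         cwd = path.join(suite_dir, LINUX_DIR)
#         pat = 'PM_RESUME'
#         return Benchmark(pattern=pat, commands=[
#             Command('mytool', ['mytool', pat], cwd=cwd),
#         ])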
def bench_linux_literal(suite_dir):
    '''
    Benchmark the speed of a literal, attempting to be fair.

    This tries to use the minimum set of options available in all tools to
    test how fast they are. For example, it makes sure there is no case
    insensitive matching and that line numbers are computed (because some
    tools don't permit disabling line numbers).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', pat]),
        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
        mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
        mkcmd('pt (ignore)', ['pt', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
        mkcmd('git grep (ignore)', [
            'git', 'grep', '-I', '-n', pat,
        ], env={'LC_ALL': 'C'}),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])


def bench_linux_literal_casei(suite_dir):
    '''
    Benchmark the speed of a case insensitive literal search.

    This is like the linux_literal benchmark, except we ask the search
    tools to do case insensitive search.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
        mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
        mkcmd('pt (ignore)', ['pt', '-i', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
        # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8
        # here, since that is certainly what ripgrep is doing, but this is
        # for an ASCII literal, so we should give `git grep` every
        # opportunity to do its best.
        mkcmd('git grep (ignore)', [
            'git', 'grep', '-I', '-n', '-i', pat,
        ], env={'LC_ALL': 'C'}),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])


def bench_linux_re_literal_suffix(suite_dir):
    '''
    Benchmark the speed of a literal inside a regex.

    This, for example, inhibits a prefix byte optimization used inside of
    Go's regex engine (relevant for sift and pt).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = '[A-Z]+_RESUME'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', pat]),
        mkcmd('ag (ignore)', ['ag', '-s', pat]),
        mkcmd('pt (ignore)', ['pt', '-e', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])
def bench_linux_word(suite_dir):
    r'''
    Benchmark use of the -w ("match word") flag in each tool.

    sift has a lot of trouble with this because it forces it into Go's
    regex engine by surrounding the pattern with \b assertions.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'PM_RESUME'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
        mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
        mkcmd('pt (ignore)', ['pt', '-w', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', '-w', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-w', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
    ])


def bench_linux_unicode_greek(suite_dir):
    '''
    Benchmark matching of a Unicode category.

    Only three tools (ripgrep, sift and pt) support this. pt is very slow
    on this benchmark.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\p{Greek}'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', pat]),
        mkcmd('pt', ['pt', '-e', pat]),
        mkcmd('sift', SIFT + ['-n', '--git', pat]),
    ])


def bench_linux_unicode_greek_casei(suite_dir):
    '''
    Benchmark matching of a Unicode category, case insensitively.

    Only ripgrep gets this right (and it's still fast).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\p{Greek}'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        mkcmd('pt', ['pt', '-i', '-e', pat]),
        mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
    ])


def bench_linux_unicode_word(suite_dir):
    r'''
    Benchmark Unicode aware \w character class.

    Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get this
    right. Everything else uses the standard ASCII interpretation of \w.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\wAh'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', pat]),
        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
            'git grep (ignore) (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('rg (whitelist) (ASCII)', [
            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
        mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])
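
# A note on the '(?-u)' prefix used in the rg invocations above and below:
# it is an inline flag in ripgrep's regex syntax that turns Unicode mode
# off for the remainder of the pattern, so classes like \w and \s fall back
# to their ASCII-only definitions. This is what makes the "(ASCII)" rg
# variants comparable to tools that are ASCII-only to begin with.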
def bench_linux_no_literal(suite_dir):
    '''
    Benchmark a regex that defeats all literal optimizations.

    Most search patterns have some kind of literal in them, which typically
    permits searches to take some shortcuts. Therefore, the applicability
    of this benchmark is somewhat suspicious, but the suite wouldn't feel
    complete without it.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', pat]),
        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
            'git grep (ignore) (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('rg (whitelist) (ASCII)', [
            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
        mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])


def bench_linux_alternates(suite_dir):
    '''
    Benchmark a small alternation of literals.

    sift doesn't make the cut. It's more than 10x slower than the next
    fastest result. The slowdown is likely because the Go regexp engine
    doesn't do any literal optimizations for this case (there is no common
    leading byte).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', pat]),
        mkcmd('ag (ignore)', ['ag', '-s', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])


def bench_linux_alternates_casei(suite_dir):
    'Benchmark a small alternation of literals, case insensitively.'
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'

    def mkcmd(*args, **kwargs):
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
        mkcmd('ag (ignore)', ['ag', '-i', pat]),
        mkcmd(
            'git grep (ignore)',
            ['git', 'grep', '-E', '-I', '-n', '-i', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])


def bench_subtitles_en_literal(suite_dir):
    '''
    Benchmark the speed of an ASCII string literal.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = 'Sherlock Holmes'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', pat, en]),
        Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]),
        Command('pt', ['pt', '-N', pat, en]),
        Command('sift', ['sift', pat, en]),
        Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', pat, en]),
        Command('ag (lines)', ['ag', '-s', pat, en]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
        Command('pt (lines)', ['pt', pat, en]),
        Command('sift (lines)', ['sift', '-n', pat, en]),
        Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
    ])
def bench_subtitles_en_literal_casei(suite_dir):
    '''
    Benchmark the speed of an ASCII string literal, case insensitively.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = 'Sherlock Holmes'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-i', pat, en]),
        Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE),
        Command('grep (ASCII)', [
            'grep', '-E', '-ai', pat, en,
        ], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
        Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
    ])


def bench_subtitles_en_literal_word(suite_dir):
    '''
    Benchmark the speed of finding a literal inside word boundaries.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = 'Sherlock Holmes'

    return Benchmark(pattern=pat, commands=[
        Command('rg (ASCII)', [
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
        ]),
        Command('ag (ASCII)', ['ag', '-sw', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-anw', pat, en,
        ], env=GREP_ASCII),
        Command('rg', ['rg', '-nw', pat, en]),
        Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE),
    ])


def bench_subtitles_en_alternate(suite_dir):
    '''
    Benchmark the speed of a set of alternate literals.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = '|'.join([
        'Sherlock Holmes',
        'John Watson',
        'Irene Adler',
        'Inspector Lestrade',
        'Professor Moriarty',
    ])

    return Benchmark(pattern=pat, commands=[
        Command('rg (lines)', ['rg', '-n', pat, en]),
        Command('ag (lines)', ['ag', '-s', pat, en]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (lines)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
        Command('rg', ['rg', pat, en]),
        Command('grep', [
            'grep', '-E', '-a', pat, en,
        ], env=GREP_ASCII),
    ])


def bench_subtitles_en_alternate_casei(suite_dir):
    '''
    Benchmark the speed of a set of alternate literals, case insensitively.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = '|'.join([
        'Sherlock Holmes',
        'John Watson',
        'Irene Adler',
        'Inspector Lestrade',
        'Professor Moriarty',
    ])

    return Benchmark(pattern=pat, commands=[
        Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
        Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-ani', pat, en,
        ], env=GREP_ASCII),
        Command('rg', ['rg', '-n', '-i', pat, en]),
        Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE),
    ])


def bench_subtitles_en_surrounding_words(suite_dir):
    '''
    Benchmark a more complex regex with an inner literal.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = r'\w+\s+Holmes\s+\w+'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-n', pat, en]),
        Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
        Command('ag (ASCII)', ['ag', '-s', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
    ])
def bench_subtitles_en_no_literal(suite_dir):
    '''
    Benchmark the speed of a regex with no literals.

    Note that we don't even try to run grep with Unicode support on this
    one. While it should eventually get the right answer, I killed it
    after it had already been running for two minutes and showed no signs
    of finishing soon.
    '''
    require(suite_dir, 'subtitles-en')
    en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-n', pat, en]),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
        Command('ag (ASCII)', ['ag', '-s', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
    ])


def bench_subtitles_ru_literal(suite_dir):
    '''
    Benchmark the speed of a Unicode-y string literal.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', pat, ru]),
        Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]),
        Command('pt', ['pt', '-N', pat, ru]),
        Command('sift', ['sift', pat, ru]),
        Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', pat, ru]),
        Command('ag (lines)', ['ag', '-s', pat, ru]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
        Command('pt (lines)', ['pt', pat, ru]),
        Command('sift (lines)', ['sift', '-n', pat, ru]),
        Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
    ])


def bench_subtitles_ru_literal_casei(suite_dir):
    '''
    Benchmark the speed of a Unicode-y string case insensitively.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-i', pat, ru]),
        Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE),
        Command('grep (ASCII)', [
            'grep', '-E', '-ai', pat, ru,
        ], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
        Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
    ])


def bench_subtitles_ru_literal_word(suite_dir):
    '''
    Benchmark the speed of finding a literal inside word boundaries.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = 'Шерлок Холмс'  # Sherlock Holmes

    return Benchmark(pattern=pat, commands=[
        Command('rg (ASCII)', [
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
        ]),
        Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-anw', pat, ru,
        ], env=GREP_ASCII),
        Command('rg', ['rg', '-nw', pat, ru]),
        Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
    ])


def bench_subtitles_ru_alternate(suite_dir):
    '''
    Benchmark the speed of a set of alternate literals.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = '|'.join([
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    ])

    return Benchmark(pattern=pat, commands=[
        Command('rg (lines)', ['rg', '-n', pat, ru]),
        Command('ag (lines)', ['ag', '-s', pat, ru]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (lines)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
        Command('rg', ['rg', pat, ru]),
        Command('grep', [
            'grep', '-E', '-a', pat, ru,
        ], env=GREP_ASCII),
    ])
def bench_subtitles_ru_alternate_casei(suite_dir):
    '''
    Benchmark the speed of a set of alternate literals, case insensitively.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = '|'.join([
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    ])

    return Benchmark(pattern=pat, commands=[
        Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-ani', pat, ru,
        ], env=GREP_ASCII),
        Command('rg', ['rg', '-n', '-i', pat, ru]),
        Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
    ])


def bench_subtitles_ru_surrounding_words(suite_dir):
    '''
    Benchmark a more complex regex with an inner literal.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = r'\w+\s+Холмс\s+\w+'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-n', pat, ru]),
        Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
    ])


def bench_subtitles_ru_no_literal(suite_dir):
    '''
    Benchmark the speed of a regex with no literals.

    Note that we don't even try to run grep with Unicode support on this
    one. While it should eventually get the right answer, I killed it
    after it had already been running for two minutes and showed no signs
    of finishing soon.
    '''
    require(suite_dir, 'subtitles-ru')
    ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'

    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', '-n', pat, ru]),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
    ])


class MissingDependencies(Exception):
    '''
    A missing dependency exception.

    This exception occurs when running a benchmark that requires a
    particular corpus that isn't available.

    :ivar list(str) missing_names:
        A list of missing dependency names. These names correspond to
        names that can be used with the --download flag.
    '''
    def __init__(self, missing_names):
        self.missing_names = missing_names

    def __str__(self):
        return 'MissingDependencies(%s)' % repr(self.missing_names)


class MissingCommands(Exception):
    '''
    A missing command exception.

    This exception occurs when running a command in a benchmark where the
    command could not be found on the current system.

    :ivar list(str) missing_names:
        The names of the command binaries that could not be found.
    '''
    def __init__(self, missing_names):
        self.missing_names = sorted(set(missing_names))

    def __str__(self):
        return 'MissingCommands(%s)' % repr(self.missing_names)
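
# Neither exception is fatal on its own: collect_benchmarks() catches both
# MissingDependencies and MissingCommands, emits a "skipping benchmark ..."
# message on stderr and moves on, so one missing corpus or binary does not
# abort the rest of the suite.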
class Benchmark(object):
    '''
    A single benchmark corresponding to a grouping of commands.

    The main purpose of a benchmark is to compare the performance
    characteristics of a group of commands.
    '''

    def __init__(self, name=None, pattern=None, commands=None,
                 warmup_count=1, count=3, line_count=True,
                 allow_missing_commands=False, disabled_cmds=None):
        '''
        Create a single benchmark.

        A single benchmark is composed of a set of commands that are
        benchmarked and compared against one another. A benchmark may have
        multiple commands that use the same search tool (but probably
        should have something differentiating them).

        The grouping of commands is a purely human driven process.

        By default, the output of every command is sent to /dev/null.
        Other types of behavior are available via the methods defined on
        this benchmark.

        :param str name:
            A human readable string denoting the name of this benchmark.
        :param str pattern:
            The pattern that is used in search.
        :param list(Command) commands:
            A list of commands to initialize this benchmark with. More
            commands may be added before running the benchmark.
        :param int warmup_count:
            The number of times to run each command before recording
            samples.
        :param int count:
            The number of samples to collect from each command.
        :param bool line_count:
            When set, the lines of each search are counted and included
            in the samples produced.
        :param bool allow_missing_commands:
            When set, if a command is missing, then the benchmark will
            simply skip it.
        :param list(str) disabled_cmds:
            A list of commands to skip.
        '''
        self.name = name
        self.pattern = pattern
        self.commands = commands or []
        self.warmup_count = warmup_count
        self.count = count
        self.line_count = line_count
        self.allow_missing_commands = allow_missing_commands
        self.disabled_cmds = set(disabled_cmds or [])

    def raise_if_missing(self):
        '''
        Raises a MissingCommands exception if applicable.

        A MissingCommands exception is raised when the following criteria
        are met: 1) allow_missing_commands is False, and 2) at least one
        command in this benchmark could not be found on this system.
        '''
        missing_commands = []
        for c in self.commands:
            if c.binary_name in self.disabled_cmds or c.exists():
                continue
            missing_commands.append(c.binary_name)
        if not self.allow_missing_commands and len(missing_commands) > 0:
            raise MissingCommands(missing_commands)

    def run(self):
        '''
        Runs this benchmark and returns the results.

        :rtype: Result
        :raises:
            MissingCommands if any command doesn't exist. (Unless
            allow_missing_commands is enabled.)
        '''
        self.raise_if_missing()
        result = Result(self)
        for cmd in self.commands:
            if cmd.binary_name in self.disabled_cmds:
                continue
            if self.allow_missing_commands and not cmd.exists():
                # Skip this command if we're OK with it.
                continue
            # Do a warmup first.
            for _ in range(self.warmup_count):
                self.run_one(cmd)
            for _ in range(self.count):
                result.add(cmd, **self.run_one(cmd))
        return result

    def run_one(self, cmd):
        '''
        Runs the given command exactly once.

        Returns an object that includes the time taken by the command. If
        this benchmark was configured to count the number of lines
        returned, then the line count is also returned.

        :param Command cmd: The command to run.
        :returns:
            A dict with two fields, duration and line_count. The duration
            is in seconds, with fractional milliseconds, and is guaranteed
            to be available. The line_count is set to None unless line
            counting is enabled, in which case, it is the number of lines
            in the search output.
        :rtype: dict
        '''
        if not cmd.exists():
            raise MissingCommands([cmd.cmd[0]])
        cmd.kwargs['stderr'] = subprocess.DEVNULL
        if self.line_count:
            cmd.kwargs['stdout'] = subprocess.PIPE
        else:
            cmd.kwargs['stdout'] = subprocess.DEVNULL
        start = time.time()
        completed = cmd.run()
        end = time.time()
        line_count = None
        if self.line_count:
            line_count = completed.stdout.count(b'\n')
        return {
            'duration': end - start,
            'line_count': line_count,
        }
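
# A minimal sketch of driving a Benchmark by hand (illustrative only; the
# suite itself always constructs benchmarks through the bench_* functions
# and collect_benchmarks()). Assumes `grep` is installed and that
# /etc/hosts exists:
#
#     b = Benchmark(name='demo', pattern='localhost', commands=[
#         Command('grep', ['grep', '-an', 'localhost', '/etc/hosts']),
#     ])
#     result = b.run()  # raises MissingCommands if grep can't be found
#     mean, stdev = result.distribution_for(b.commands[0])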
class Result(object):
    '''
    The result of running a benchmark.

    Benchmark results consist of a set of samples, where each sample
    corresponds to a single run of a single command in the benchmark.
    Various statistics can be computed from these samples such as mean
    and standard deviation.
    '''

    def __init__(self, benchmark):
        '''
        Create a new set of results, initially empty.

        :param Benchmark benchmark:
            The benchmark that produced these results.
        '''
        self.benchmark = benchmark
        self.samples = []

    def add(self, cmd, duration, line_count=None):
        '''
        Add a new sample to this result set.

        :param Command cmd:
            The command that produced this sample.
        :param float duration:
            The duration, in seconds, that the command took to run.
        :param int line_count:
            The number of lines in the search output. This is optional.
        '''
        self.samples.append({
            'cmd': cmd,
            'duration': duration,
            'line_count': line_count,
        })

    def fastest_sample(self):
        '''
        Returns the fastest recorded sample.
        '''
        return min(self.samples, key=lambda s: s['duration'])

    def fastest_cmd(self):
        '''
        Returns the command with the fastest mean duration.
        '''
        means = []
        for cmd in self.benchmark.commands:
            mean, _ = self.distribution_for(cmd)
            if mean is None:
                continue
            means.append((cmd, mean))
        return min(means, key=lambda tup: tup[1])[0]

    def samples_for(self, cmd):
        'Returns an iterable of samples for cmd.'
        yield from (s for s in self.samples if s['cmd'].name == cmd.name)

    def line_counts_for(self, cmd):
        '''
        Returns the distinct line counts recorded for the given command.

        :returns:
            A set of the line counts recorded for cmd, excluding samples
            that did not record a line count.
        '''
        return {s['line_count'] for s in self.samples_for(cmd)
                if s['line_count'] is not None}

    def distribution_for(self, cmd):
        '''
        Returns the distribution (mean +/- std) of the given command.

        If there are no samples for this command (i.e., it was skipped),
        then return ``(None, None)``.

        :rtype: (float, float)
        :returns:
            A tuple containing the mean and standard deviation, in that
            order.
        '''
        samples = list(s['duration'] for s in self.samples_for(cmd))
        if len(samples) == 0:
            return None, None
        return statistics.mean(samples), statistics.stdev(samples)


class Command(object):
    def __init__(self, name, cmd, *args, **kwargs):
        '''
        Create a new command that is run as part of a benchmark.

        *args and **kwargs are passed directly to ``subprocess.run``. An
        exception to this is stdin/stdout/stderr. Output redirection is
        completely controlled by the benchmark harness. Trying to set them
        here will trigger an assert.

        :param str name:
            The human readable name of this command. This is particularly
            useful if the same search tool is used multiple times in the
            same benchmark with different arguments.
        :param list(str) cmd:
            The command to run as a list of arguments (including the
            command name itself).
        '''
        assert 'stdin' not in kwargs
        assert 'stdout' not in kwargs
        assert 'stderr' not in kwargs
        self.name = name
        self.cmd = cmd
        self.args = args
        self.kwargs = kwargs

    def exists(self):
        'Returns true if and only if this command exists.'
        return shutil.which(self.binary_name) is not None

    @property
    def binary_name(self):
        'Return the binary name of this command.'
        return self.cmd[0]

    def run(self):
        '''
        Runs this command and returns its status.

        :rtype: subprocess.CompletedProcess
        '''
        return subprocess.run(self.cmd, *self.args, **self.kwargs)


def eprint(*args, **kwargs):
    'Like print, but to stderr.'
    kwargs['file'] = sys.stderr
    print(*args, **kwargs)


def run_cmd(cmd, *args, **kwargs):
    '''
    Print the command to stderr and run it.

    The command is run with ``check=True``, so if it fails, an exception
    is raised.
    '''
    eprint('# %s' % ' '.join(cmd))
    kwargs['check'] = True
    return subprocess.run(cmd, *args, **kwargs)
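
# The corpus plumbing below follows a naming convention: require(suite_dir,
# 'subtitles-en') rewrites the name to 'subtitles_en' and calls
# has_subtitles_en(suite_dir) via globals(). So a new corpus named, say,
# 'foo' (hypothetical) needs a has_foo() and a download_foo(), plus an
# entry in download() and in the --download choices inside main().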
def require(suite_dir, *names):
    '''
    Declare a dependency on the given names for a benchmark.

    If any dependency doesn't exist, then a MissingDependencies exception
    is raised.
    '''
    errs = []
    for name in names:
        fun_name = name.replace('-', '_')
        if not globals()['has_%s' % fun_name](suite_dir):
            errs.append(name)
    if len(errs) > 0:
        raise MissingDependencies(errs)


def download_linux(suite_dir):
    'Download and build the Linux kernel.'
    checkout_dir = path.join(suite_dir, LINUX_DIR)
    if not os.path.isdir(checkout_dir):
        # Clone from my fork so that we always get the same corpus *and*
        # still do a shallow clone. Shallow clones are much much cheaper
        # than full clones.
        run_cmd(['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir])
    # We want to build the kernel because the process of building it
    # produces a lot of junk in the repository that a search tool probably
    # shouldn't touch.
    if not os.path.exists(path.join(checkout_dir, 'vmlinux')):
        eprint('# Building Linux kernel...')
        run_cmd(['make', 'defconfig'], cwd=checkout_dir)
        run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)


def has_linux(suite_dir):
    'Returns true if we believe the Linux kernel is built.'
    checkout_dir = path.join(suite_dir, LINUX_DIR)
    return path.exists(path.join(checkout_dir, 'vmlinux'))


def download_subtitles_en(suite_dir):
    'Download and decompress English subtitles.'
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
    en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
    en_path_sample = path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)

    if not os.path.isdir(subtitle_dir):
        os.makedirs(subtitle_dir)
    if not os.path.exists(en_path):
        if not os.path.exists(en_path_gz):
            run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
        run_cmd(['gunzip', en_path_gz])
    if not os.path.exists(en_path_sample):
        # Get a sample roughly the same size as the Russian corpus so that
        # benchmarks finish in a reasonable time.
        with open(en_path_sample, 'wb+') as f:
            run_cmd(
                ['head', '-n', '32722372', en_path],
                cwd=subtitle_dir, stdout=f)


def has_subtitles_en(suite_dir):
    'Returns true if English subtitles have been downloaded.'
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE))


def download_subtitles_ru(suite_dir):
    'Download and decompress Russian subtitles.'
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
    ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)

    if not os.path.isdir(subtitle_dir):
        os.makedirs(subtitle_dir)
    if not os.path.exists(ru_path):
        if not os.path.exists(ru_path_gz):
            run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
        run_cmd(['gunzip', ru_path_gz])


def has_subtitles_ru(suite_dir):
    'Returns true if Russian subtitles have been downloaded.'
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    return path.exists(path.join(subtitle_dir, SUBTITLES_RU_NAME))
def download(suite_dir, choices):
    '''
    Download choices into suite_dir.

    Specifically, choices specifies a list of corpora to fetch.

    :param str suite_dir:
        The directory in which to download corpora.
    :param list(str) choices:
        A list of corpora to download. Available choices are: all, linux,
        subtitles-en, subtitles-ru.
    '''
    for choice in choices:
        if choice == 'linux':
            download_linux(suite_dir)
        elif choice == 'subtitles-en':
            download_subtitles_en(suite_dir)
        elif choice == 'subtitles-ru':
            download_subtitles_ru(suite_dir)
        elif choice == 'all':
            download_linux(suite_dir)
            download_subtitles_en(suite_dir)
            download_subtitles_ru(suite_dir)
        else:
            eprint('Unrecognized download choice: %s' % choice)
            sys.exit(1)


def collect_benchmarks(suite_dir, filter_pat=None,
                       allow_missing_commands=False,
                       disabled_cmds=None, warmup_iter=1, bench_iter=3):
    '''
    Return an iterable of all runnable benchmarks.

    :param str suite_dir: The directory containing corpora.
    :param str filter_pat:
        A single regular expression that is used to filter benchmarks by
        their name. When not specified, all benchmarks are run.
    :returns:
        An iterable over all runnable benchmarks. If a benchmark requires
        corpora that are missing, then a log message is emitted to stderr
        and it is not yielded.
    '''
    for fun in sorted(globals()):
        if not fun.startswith('bench_'):
            continue
        name = re.sub('^bench_', '', fun)
        if filter_pat is not None and not re.search(filter_pat, name):
            continue
        try:
            benchmark = globals()[fun](suite_dir)
            benchmark.name = name
            benchmark.warmup_count = warmup_iter
            benchmark.count = bench_iter
            benchmark.allow_missing_commands = allow_missing_commands
            benchmark.disabled_cmds = set(disabled_cmds or [])
            benchmark.raise_if_missing()
        except MissingDependencies as e:
            eprint(
                'missing: %s, skipping benchmark %s '
                '(try running with: %s)' % (
                    ', '.join(e.missing_names),
                    name,
                    ' '.join(['--download %s' % n for n in e.missing_names]),
                ))
            continue
        except MissingCommands as e:
            fmt = 'missing commands: %s, skipping benchmark %s ' \
                  '(run with --allow-missing to run incomplete benchmarks)'
            eprint(fmt % (', '.join(e.missing_names), name))
            continue
        yield benchmark
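
# Example invocations (illustrative), assuming this script is saved as
# `benchsuite` and is executable:
#
#     ./benchsuite --download subtitles-ru   # fetch one corpus, then exit
#     ./benchsuite --list                    # print runnable benchmark names
#     ./benchsuite 'linux_literal$'          # run benchmarks matching a regex
#     ./benchsuite --raw raw.csv subtitles   # also dump every sample as CSV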
def main():
    download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru']
    p = argparse.ArgumentParser(
        description='Command line search tool benchmark suite.')
    p.add_argument(
        '--dir', metavar='PATH', default=os.getcwd(),
        help='The directory in which to download data and perform searches.')
    p.add_argument(
        '--download', metavar='CORPUS', action='append',
        choices=download_choices,
        help='Download and prepare corpus data, then exit without running '
             'any benchmarks. Note that this command is intended to be '
             'idempotent. WARNING: This downloads over a gigabyte of data, '
             'and also includes building the Linux kernel. If "all" is '
             'used, then the total uncompressed size is around 13 GB. '
             'Choices: %s' % ', '.join(download_choices))
    p.add_argument(
        '--allow-missing', action='store_true',
        help='Permit benchmarks to run even if some commands are missing.')
    p.add_argument(
        '--disabled',
        help='A list of comma separated commands to skip.')
    p.add_argument(
        '-f', '--force', action='store_true',
        help='Overwrite existing files if there is a conflict.')
    p.add_argument(
        '--list', action='store_true',
        help='List available benchmarks by name.')
    p.add_argument(
        '--raw', metavar='PATH',
        help='Dump raw data (all samples collected) in CSV format to the '
             'file path provided.')
    p.add_argument(
        '--warmup-iter', metavar='INTEGER', type=int, default=1,
        help='The number of iterations to run each command before '
             'recording measurements.')
    p.add_argument(
        '--bench-iter', metavar='INTEGER', type=int, default=3,
        help='The number of iterations to run each command while '
             'recording measurements.')
    p.add_argument(
        'bench', metavar='PAT', nargs='?',
        help='A regex pattern that will only run benchmarks that match.')
    args = p.parse_args()

    if args.list:
        benchmarks = collect_benchmarks(
            args.dir, filter_pat=args.bench,
            allow_missing_commands=args.allow_missing,
            disabled_cmds=(args.disabled or '').split(','),
            warmup_iter=args.warmup_iter, bench_iter=args.bench_iter)
        for b in benchmarks:
            print(b.name)
        sys.exit(0)
    if args.download is not None and len(args.download) > 0:
        download(args.dir, args.download)
        sys.exit(0)

    if not path.isdir(args.dir):
        os.makedirs(args.dir)
    if args.raw is not None and path.exists(args.raw) and not args.force:
        eprint('File %s already exists (delete it or use --force)' % args.raw)
        sys.exit(1)
    raw_handle, raw_csv_wtr = None, None
    if args.raw is not None:
        fields = [
            'benchmark', 'warmup_iter', 'iter', 'name',
            'command', 'duration', 'lines', 'env',
        ]
        raw_handle = open(args.raw, 'w+')
        raw_csv_wtr = csv.DictWriter(raw_handle, fields)
        raw_csv_wtr.writerow({x: x for x in fields})

    benchmarks = collect_benchmarks(
        args.dir, filter_pat=args.bench,
        allow_missing_commands=args.allow_missing,
        disabled_cmds=(args.disabled or '').split(','),
        warmup_iter=args.warmup_iter, bench_iter=args.bench_iter)
    for i, b in enumerate(benchmarks):
        result = b.run()
        fastest_cmd = result.fastest_cmd()
        fastest_sample = result.fastest_sample()
        max_name_len = max(len(cmd.name) for cmd in b.commands)

        if i > 0:
            print()
        header = '%s (pattern: %s)' % (b.name, b.pattern)
        print('%s\n%s' % (header, '-' * len(header)))
        for cmd in b.commands:
            name = cmd.name
            mean, stdev = result.distribution_for(cmd)
            if mean is None:
                # If we couldn't get a distribution for this command,
                # then it was skipped.
                continue
            line_counts = result.line_counts_for(cmd)
            show_fast_cmd, show_line_counts = '', ''
            if fastest_cmd.name == cmd.name:
                show_fast_cmd = '*'
            if fastest_sample['cmd'].name == cmd.name:
                name += '*'
            if len(line_counts) > 0:
                counts = map(str, line_counts)
                show_line_counts = ' (lines: %s)' % ', '.join(counts)
            fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}' \
                  '{lines}{fast_cmd}'
            print(fmt.format(
                name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
                mean=mean, stdev=stdev, lines=show_line_counts))
            sys.stdout.flush()
        if raw_csv_wtr is not None:
            for sample in result.samples:
                cmd, duration = sample['cmd'], sample['duration']
                env = ' '.join(['%s=%s' % (k, v)
                                for k, v in
                                cmd.kwargs.get('env', {}).items()])
                raw_csv_wtr.writerow({
                    'benchmark': b.name,
                    'warmup_iter': b.warmup_count,
                    'iter': b.count,
                    'name': sample['cmd'].name,
                    'command': ' '.join(cmd.cmd),
                    'duration': duration,
                    'lines': sample['line_count'] or '',
                    'env': env,
                })
            raw_handle.flush()


if __name__ == '__main__':
    main()