#!/usr/bin/env python
'''
benchsuite is a benchmark runner for comparing command line search tools.
'''
import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import statistics
import subprocess
import sys
import time
# Constants identifying the corpora the benchmarks run against.
#
# Two very different kinds of corpora are used: a small number of large files
# (the subtitles) and a large number of small files (the Linux source tree).
# These are vastly different use cases, not only because of their performance
# characteristics but also because of the strategies tools use to increase
# the relevance of the results they return.
SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
SUBTITLES_EN_NAME_GZ = SUBTITLES_EN_NAME + '.gz'
# NOTE(review): plain-HTTP download; confirm this mirror is still reachable.
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = SUBTITLES_RU_NAME + '.gz'
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'

LINUX_DIR = 'linux'
# GitHub no longer serves the unauthenticated git:// protocol, so the
# repository has to be cloned over HTTPS.
LINUX_CLONE = 'https://github.com/BurntSushi/linux'

# Grep takes its locale settings from the environment. Enabling Unicode has a
# *substantial* performance impact, so the benchmarks need to choose the
# locale explicitly instead of inheriting whatever the caller has set.
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
def bench_linux_literal_default(suite_dir):
    '''
    Benchmark a literal search with every tool left on its *default* settings.

    The defaults of the tools differ, so this comparison is deliberately
    unfair. It is meant for performance analysis and as an illustration of
    how the default modes differ, not as a ranking of the tools.

    Every command is run from the Linux corpus directory inside suite_dir.
    '''
    require(suite_dir, 'linux')
    pat = 'PM_RESUME'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_linux(*cmd_args, **cmd_kwargs):
        # Pin the working directory of every command to the Linux checkout.
        cmd_kwargs['cwd'] = linux_root
        return Command(*cmd_args, **cmd_kwargs)

    # N.B. Purposefully unfair: no flags are passed, so each tool runs in
    # whatever mode it picks by default.
    commands = [
        in_linux('rg', ['rg', pat]),
        in_linux('ag', ['ag', pat]),
        # ucg does not read gitignore files, yet it reports exactly the same
        # matches as ag and rg: its file whitelist happens to line up with
        # the gitignores for this search.
        in_linux('ucg', ['ucg', pat]),
        # LC_ALL=en_US.UTF-8 is not necessarily the default everywhere, but
        # it is a reasonable guess for most desktop systems.
        in_linux(
            'git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'},
        ),
        in_linux('pt', ['pt', pat]),
        # sift reports one extra line here, for a binary file that matched.
        in_linux('sift', ['sift', pat]),
    ]
    return Benchmark(pattern=pat, commands=commands)
def bench_linux_literal(suite_dir):
    '''
    Benchmark a literal search while trying to treat every tool fairly.

    Each tool is given the minimum set of options, among those available in
    all tools, needed to measure how fast it is. In particular, matching is
    never case insensitive and line numbers are always computed.

    Every command is run from the Linux corpus directory inside suite_dir.
    '''
    require(suite_dir, 'linux')
    pat = 'PM_RESUME'
    linux_root = path.join(suite_dir, LINUX_DIR)

    def in_linux(*cmd_args, **cmd_kwargs):
        # Pin the working directory of every command to the Linux checkout.
        cmd_kwargs['cwd'] = linux_root
        return Command(*cmd_args, **cmd_kwargs)

    git_grep_argv = ['git', 'grep', '-I', '-n', pat]
    sift_argv = [
        'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
    ]
    commands = [
        in_linux('rg', ['rg', '-n', pat]),
        in_linux('rg (mmap)', ['rg', '-n', '--mmap', pat]),
        in_linux('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        in_linux('ag (mmap)', ['ag', '-s', pat]),
        in_linux('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
        in_linux('git grep', git_grep_argv, env={'LC_ALL': 'C'}),
        in_linux('pt', ['pt', pat]),
        in_linux('sift', sift_argv),
    ]
    return Benchmark(pattern=pat, commands=commands)
def bench_linux_literal_casei(suite_dir):
'''
Benchmark the speed of a case insensitive literal search.
This is like the linux_literal benchmark, except we ask the
search tools to do case insensitive search.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('rg (mmap)', ['rg', '-n', '-i', pat]),
mkcmd('rg (whitelist)', [
'rg', '-n', '-i', '--no-ignore', '-tall', pat,
]),
mkcmd('ag (mmap)', ['ag', '-i', pat]),
mkcmd('ucg', ['ucg', '-i', pat]),
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
# since that is certainly what ripgrep is doing, but this is for an
# ASCII literal, so we should give `git grep` all the opportunity to
# do its best.
mkcmd('git grep', [
'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}),
# sift yields more matches than it should here. Specifically, it gets