#!/usr/bin/env python3
'''
benchsuite is a benchmark runner for comparing command line search tools.
'''
import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import shutil
import statistics
import subprocess
import sys
import time
# Identifiers for the corpora that the benchmarks run against.
#
# Two very different kinds of corpora are used on purpose: a small number of
# large files, and a large number of small files. The two cases differ not
# only in their performance characteristics but also in the strategies tools
# use to increase the relevance of the results they return.
SUBTITLES_DIR = 'subtitles'

# English subtitles: full file, sample file, compressed name, download URL.
SUBTITLES_EN_NAME = 'en.txt'
SUBTITLES_EN_NAME_SAMPLE = 'en.sample.txt'
SUBTITLES_EN_NAME_GZ = SUBTITLES_EN_NAME + '.gz'
SUBTITLES_EN_URL = (
    'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/'
    'en.txt.gz'
)

# Russian subtitles: file name, compressed name, download URL.
SUBTITLES_RU_NAME = 'ru.txt'
SUBTITLES_RU_NAME_GZ = SUBTITLES_RU_NAME + '.gz'
SUBTITLES_RU_URL = (
    'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/'
    'ru.txt.gz'
)

# Linux source tree: directory name and the repository it is cloned from.
LINUX_DIR = 'linux'
LINUX_CLONE = 'https://github.com/BurntSushi/linux'

# Grep reads its locale settings from the environment. Enabling Unicode has a
# *substantial* performance impact, so every benchmark states explicitly
# which of these two environments it runs grep under.
GREP_ASCII = dict(LC_ALL='C')
GREP_UNICODE = dict(LC_ALL='en_US.UTF-8')

# By default, sift tries really hard to search everything. That is not what
# we want in the code search benchmarks, so this base command line skips
# binary files and excludes files matching '.*' and '*.pdf'.
SIFT = [
    'sift', '--binary-skip',
    '--exclude-files', '.*',
    '--exclude-files', '*.pdf',
]
def bench_linux_literal_default(suite_dir):
    '''
    Benchmark a literal search with every tool left on its *default* settings.

    The comparison is deliberately unfair and is not meant for performance
    analysis. Its value is pedagogical: it shows how much default behaviors
    differ between tools. For example, ugrep and grep do no smart filtering
    by default, so they will invariably search more files than ripgrep, ag
    or git grep.

    suite_dir is the benchmark suite directory; every command is run with
    its working directory set to the LINUX_DIR sub-directory of it.

    Returns a Benchmark for the literal pattern, holding one Command per
    tool.
    '''
    require(suite_dir, 'linux')

    linux_root = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    # One row per tool: (display name, argv, extra keyword arguments for
    # Command). Commands are built in exactly this order.
    tool_specs = [
        ('rg', ['rg', needle], {}),
        ('ag', ['ag', needle], {}),
        # LC_ALL=en_US.UTF-8 probably isn't necessarily the default
        # everywhere, but it is a reasonable guess for most desktop systems.
        ('git grep', ['git', 'grep', needle], {'env': GREP_UNICODE}),
        ('ugrep', ['ugrep', '-r', needle, './'], {}),
        ('grep', ['grep', '-r', needle, './'], {'env': GREP_UNICODE}),
    ]

    commands = []
    for tool_name, argv, extra in tool_specs:
        # Every command runs from inside the Linux checkout.
        commands.append(Command(tool_name, argv, **extra, cwd=linux_root))

    return Benchmark(pattern=needle, commands=commands)
def bench_linux_literal(suite_dir):
'''
Benchmark the speed of a literal, attempting to be fair.
This tries to use the minimum set of options available in all tools
to test how fast they are. For example, it makes sure there is
no case insensitive matching and that line numbers are computed
(because some tools don't permit disabling line numbers).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME'
def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd