diff options
-rwxr-xr-x | benchsuite/benchsuite | 163 |
1 files changed, 45 insertions, 118 deletions
diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite index 9353cf49..f8cf6ea8 100755 --- a/benchsuite/benchsuite +++ b/benchsuite/benchsuite @@ -71,15 +71,8 @@ def bench_linux_literal_default(suite_dir): return Benchmark(pattern=pat, commands=[ mkcmd('rg', ['rg', pat]), mkcmd('ag', ['ag', pat]), - # ucg reports the exact same matches as ag and rg even though it - # doesn't read gitignore files. Instead, it has a file whitelist - # that happens to match up exactly with the gitignores for this search. - mkcmd('ucg', ['ucg', pat]), # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the # default, but I'd guess it to be on most desktop systems. - mkcmd('pt', ['pt', pat]), - # sift reports an extra line here for a binary file matched. - mkcmd('sift', ['sift', pat]), mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}), ]) @@ -102,16 +95,12 @@ def bench_linux_literal(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', pat]), - mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]), - mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]), - mkcmd('pt (ignore)', ['pt', pat]), - mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]), - mkcmd('git grep (ignore)', [ + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]), + mkcmd('ag (mmap)', ['ag', '-s', pat]), + mkcmd('git grep', [ 'git', 'grep', '-I', '-n', pat, ], env={'LC_ALL': 'C'}), - mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), - mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), ]) @@ -131,31 +120,22 @@ def bench_linux_literal_casei(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), - mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]), - mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]), - mkcmd('pt (ignore)', ['pt', '-i', pat]), - mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]), + mkcmd('rg', ['rg', '-n', '-i', pat]), + mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]), + mkcmd('ag (mmap)', ['ag', '-i', pat]), # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here, # since that is certainly what ripgrep is doing, but this is for an # ASCII literal, so we should give `git grep` all the opportunity to # do its best. - mkcmd('git grep (ignore)', [ + mkcmd('git grep', [ 'git', 'grep', '-I', '-n', '-i', pat, ], env={'LC_ALL': 'C'}), - mkcmd('rg (whitelist)', [ - 'rg', '-n', '-i', '--no-ignore', '-tall', pat, - ]), - mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), ]) def bench_linux_re_literal_suffix(suite_dir): ''' Benchmark the speed of a literal inside a regex. - - This, for example, inhibits a prefix byte optimization used - inside of Go's regex engine (relevant for sift and pt). ''' require(suite_dir, 'linux') cwd = path.join(suite_dir, LINUX_DIR) @@ -166,26 +146,19 @@ def bench_linux_re_literal_suffix(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', pat]), - mkcmd('ag (ignore)', ['ag', '-s', pat]), - mkcmd('pt (ignore)', ['pt', '-e', pat]), - mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]), + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('ag', ['ag', '-s', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), - mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), ]) def bench_linux_word(suite_dir): ''' Benchmark use of the -w ("match word") flag in each tool. - - sift has a lot of trouble with this because it forces it into Go's - regex engine by surrounding the pattern with \b assertions. ''' require(suite_dir, 'linux') cwd = path.join(suite_dir, LINUX_DIR) @@ -196,28 +169,19 @@ def bench_linux_word(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]), - mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]), - mkcmd('pt (ignore)', ['pt', '-w', pat]), - mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]), + mkcmd('rg', ['rg', '-n', '-w', pat]), + mkcmd('ag', ['ag', '-s', '-w', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', '-w', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', [ - 'rg', '-n', '-w', '--no-ignore', '-tall', pat, - ]), - mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]), ]) def bench_linux_unicode_greek(suite_dir): ''' Benchmark matching of a Unicode category. - - Only three tools (ripgrep, sift and pt) support this. We omit - pt because it is too slow. ''' require(suite_dir, 'linux') cwd = path.join(suite_dir, LINUX_DIR) @@ -229,8 +193,6 @@ def bench_linux_unicode_greek(suite_dir): return Benchmark(pattern=pat, commands=[ mkcmd('rg', ['rg', '-n', pat]), - mkcmd('pt', ['pt', '-e', pat]), - mkcmd('sift', SIFT + ['-n', '--git', pat]), ]) @@ -250,8 +212,6 @@ def bench_linux_unicode_greek_casei(suite_dir): return Benchmark(pattern=pat, commands=[ mkcmd('rg', ['rg', '-n', '-i', pat]), - mkcmd('pt', ['pt', '-i', '-e', pat]), - mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]), ]) @@ -272,26 +232,19 @@ def bench_linux_unicode_word(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', pat]), - mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), - mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), - mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]), - mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]), + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]), + mkcmd('ag (ASCII)', ['ag', '-s', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'en_US.UTF-8'}, ), mkcmd( - 'git grep (ignore) (ASCII)', + 'git grep (ASCII)', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), - mkcmd('rg (whitelist) (ASCII)', [ - 'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, - ]), - mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]), ]) @@ -313,26 +266,19 @@ def bench_linux_no_literal(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', pat]), - mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), - mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), - mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]), - mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]), + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]), + mkcmd('ag (ASCII)', ['ag', '-s', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'en_US.UTF-8'}, ), mkcmd( - 'git grep (ignore) (ASCII)', + 'git grep (ASCII)', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), - mkcmd('rg (whitelist) (ASCII)', [ - 'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, - ]), - mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]), ]) @@ -354,15 +300,13 @@ def bench_linux_alternates(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', pat]), - mkcmd('ag (ignore)', ['ag', '-s', pat]), + mkcmd('rg', ['rg', '-n', pat]), + mkcmd('ag', ['ag', '-s', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]), - mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), ]) @@ -377,15 +321,13 @@ def bench_linux_alternates_casei(suite_dir): return Command(*args, **kwargs) return Benchmark(pattern=pat, commands=[ - mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), - mkcmd('ag (ignore)', ['ag', '-i', pat]), + mkcmd('rg', ['rg', '-n', '-i', pat]), + mkcmd('ag', ['ag', '-i', pat]), mkcmd( - 'git grep (ignore)', + 'git grep', ['git', 'grep', '-E', '-I', '-n', '-i', pat], env={'LC_ALL': 'C'}, ), - mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]), - mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), ]) @@ -400,15 +342,10 @@ def bench_subtitles_en_literal(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg', ['rg', pat, en]), Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]), - Command('pt', ['pt', '-N', pat, en]), - Command('sift', ['sift', pat, en]), - Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII), + Command('grep', ['grep', pat, en], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', pat, en]), Command('ag (lines)', ['ag', '-s', pat, en]), - Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), - Command('pt (lines)', ['pt', pat, en]), - Command('sift (lines)', ['sift', '-n', pat, en]), - Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII), + Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII), ]) @@ -428,7 +365,6 @@ def bench_subtitles_en_literal_casei(suite_dir): ], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', '-i', pat, en]), Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]), - Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]), ]) @@ -445,7 +381,6 @@ def bench_subtitles_en_literal_word(suite_dir): 'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en, ]), Command('ag (ASCII)', ['ag', '-sw', pat, en]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', [ 'grep', '-anw', pat, en, ], env=GREP_ASCII), @@ -471,7 +406,6 @@ def bench_subtitles_en_alternate(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg (lines)', ['rg', '-n', pat, en]), Command('ag (lines)', ['ag', '-s', pat, en]), - Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), Command('grep (lines)', [ 'grep', '-E', '-an', pat, en, ], env=GREP_ASCII), @@ -498,7 +432,6 @@ def bench_subtitles_en_alternate_casei(suite_dir): return Benchmark(pattern=pat, commands=[ Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]), - Command('ucg (ASCII)', ['ucg', '-i', pat, en]), Command('grep (ASCII)', [ 'grep', '-E', '-ani', pat, en, ], env=GREP_ASCII), @@ -520,7 +453,6 @@ def bench_subtitles_en_surrounding_words(suite_dir): Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), Command('ag (ASCII)', ['ag', '-s', pat, en]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', [ 'grep', '-E', '-an', pat, en, ], env=GREP_ASCII), @@ -544,7 +476,6 @@ def bench_subtitles_en_no_literal(suite_dir): Command('rg', ['rg', '-n', pat, en]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), Command('ag (ASCII)', ['ag', '-s', pat, en]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', [ 'grep', '-E', '-an', pat, en, ], env=GREP_ASCII), @@ -562,14 +493,9 @@ def bench_subtitles_ru_literal(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg', ['rg', pat, ru]), Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]), - Command('pt', ['pt', '-N', pat, ru]), - Command('sift', ['sift', pat, ru]), Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]), - Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), - Command('pt (lines)', ['pt', pat, ru]), - Command('sift (lines)', ['sift', '-n', pat, ru]), Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII), ]) @@ -590,7 +516,6 @@ def bench_subtitles_ru_literal_casei(suite_dir): ], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', '-i', pat, ru]), Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]), - Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]), ]) @@ -607,7 +532,6 @@ def bench_subtitles_ru_literal_word(suite_dir): 'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru, ]), Command('ag (ASCII)', ['ag', '-sw', pat, ru]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (ASCII)', [ 'grep', '-anw', pat, ru, ], env=GREP_ASCII), @@ -633,7 +557,6 @@ def bench_subtitles_ru_alternate(suite_dir): return Benchmark(pattern=pat, commands=[ Command('rg (lines)', ['rg', '-n', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]), - Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (lines)', [ 'grep', '-E', '-an', pat, ru, ], env=GREP_ASCII), @@ -660,7 +583,6 @@ def bench_subtitles_ru_alternate_casei(suite_dir): return Benchmark(pattern=pat, commands=[ Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]), - Command('ucg (ASCII)', ['ucg', '-i', pat, ru]), Command('grep (ASCII)', [ 'grep', '-E', '-ani', pat, ru, ], env=GREP_ASCII), @@ -681,7 +603,6 @@ def bench_subtitles_ru_surrounding_words(suite_dir): Command('rg', ['rg', '-n', pat, ru]), Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE), Command('ag (ASCII)', ['ag', '-s', pat, ru]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (ASCII)', [ 'grep', '-E', '-an', pat, ru, ], env=GREP_ASCII), @@ -705,7 +626,6 @@ def bench_subtitles_ru_no_literal(suite_dir): Command('rg', ['rg', '-n', pat, ru]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]), - Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (ASCII)', [ 'grep', '-E', '-an', pat, ru, ], env=GREP_ASCII), @@ -758,7 +678,7 @@ class Benchmark(object): def __init__(self, name=None, pattern=None, commands=None, warmup_count=1, count=3, line_count=True, allow_missing_commands=False, - disabled_cmds=None): + disabled_cmds=None, order=0): ''' Create a single benchmark. @@ -794,6 +714,8 @@ class Benchmark(object): will simply skip it. :param list(str) disabled_cmds: A list of commands to skip. + :param int order: + An integer indicating the sequence number of this benchmark. ''' self.name = name self.pattern = pattern @@ -803,6 +725,7 @@ class Benchmark(object): self.line_count = line_count self.allow_missing_commands = allow_missing_commands self.disabled_cmds = set(disabled_cmds or []) + self.order = order def raise_if_missing(self): ''' @@ -1165,19 +1088,22 @@ def collect_benchmarks(suite_dir, filter_pat=None, requires corpora that are missing, then a log message is emitted to stderr and it is not yielded. ''' - for fun in sorted(globals()): - if not fun.startswith('bench_'): + benchmarks = [] + for global_name in globals(): + if not global_name.startswith('bench_'): continue - name = re.sub('^bench_', '', fun) + name = re.sub('^bench_', '', global_name) if filter_pat is not None and not re.search(filter_pat, name): continue try: - benchmark = globals()[fun](suite_dir) + fun = globals()[global_name] + benchmark = fun(suite_dir) benchmark.name = name benchmark.warmup_count = warmup_iter benchmark.count = bench_iter benchmark.allow_missing_commands = allow_missing_commands benchmark.disabled_cmds = disabled_cmds + benchmark.order = fun.__code__.co_firstlineno benchmark.raise_if_missing() except MissingDependencies as e: eprint( @@ -1192,7 +1118,8 @@ def collect_benchmarks(suite_dir, filter_pat=None, '(run with --allow-missing to run incomplete benchmarks)' eprint(fmt % (', '.join(e.missing_names), name)) continue - yield benchmark + benchmarks.append(benchmark) + return sorted(benchmarks, key=lambda b: b.order) def main(): |