benchsuite: remove sift, pt and ucg

None of these tools got particularly popular (except for pt, briefly), and they do not appear to be active projects nowadays. While ucg was fast, sift and pt were excruciatingly slow in a number of cases that required special care in the benchmarks. This also fixes the ordering of benchmark output to reflect the ordering in the source of the benchsuite script.
author: Andrew Gallant <jamslam@gmail.com> 2020-10-14 15:01:15 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2020-10-14 15:16:07 -0400
commit: 5ebb3ad03921921324cf399e0a9c4cf1d874c48e (patch)
tree: bda83bc15d5a1c032daf4fb9b79aacd89acad76b
parent: b0066274cbb36e2cf4a76aded5f8a98d1f79e61a (diff)
1 files changed, 45 insertions, 118 deletions
diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite
index 9353cf49..f8cf6ea8 100755
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@@ -71,15 +71,8 @@ def bench_linux_literal_default(suite_dir):
     return Benchmark(pattern=pat, commands=[
         mkcmd('rg', ['rg', pat]),
         mkcmd('ag', ['ag', pat]),
-        # ucg reports the exact same matches as ag and rg even though it
-        # doesn't read gitignore files. Instead, it has a file whitelist
-        # that happens to match up exactly with the gitignores for this search.
-        mkcmd('ucg', ['ucg', pat]),
         # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
         # default, but I'd guess it to be on most desktop systems.
-        mkcmd('pt', ['pt', pat]),
-        # sift reports an extra line here for a binary file matched.
-        mkcmd('sift', ['sift', pat]),
         mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
     ])
 
@@ -102,16 +95,12 @@ def bench_linux_literal(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
-        mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
-        mkcmd('pt (ignore)', ['pt', pat]),
-        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
-        mkcmd('git grep (ignore)', [
+        mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
+        mkcmd('ag (mmap)', ['ag', '-s', pat]),
+        mkcmd('git grep', [
             'git', 'grep', '-I', '-n', pat,
         ], env={'LC_ALL': 'C'}),
-        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
-        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
     ])
 
 
@@ -131,31 +120,22 @@ def bench_linux_literal_casei(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
-        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
-        mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
-        mkcmd('pt (ignore)', ['pt', '-i', pat]),
-        mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
+        mkcmd('rg', ['rg', '-n', '-i', pat]),
+        mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
+        mkcmd('ag (mmap)', ['ag', '-i', pat]),
         # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
         # since that is certainly what ripgrep is doing, but this is for an
         # ASCII literal, so we should give `git grep` all the opportunity to
         # do its best.
-        mkcmd('git grep (ignore)', [
+        mkcmd('git grep', [
             'git', 'grep', '-I', '-n', '-i', pat,
         ], env={'LC_ALL': 'C'}),
-        mkcmd('rg (whitelist)', [
-            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
-        ]),
-        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
     ])
 
 
 def bench_linux_re_literal_suffix(suite_dir):
     '''
     Benchmark the speed of a literal inside a regex.
-
-    This, for example, inhibits a prefix byte optimization used
-    inside of Go's regex engine (relevant for sift and pt).
     '''
     require(suite_dir, 'linux')
     cwd = path.join(suite_dir, LINUX_DIR)
@@ -166,26 +146,19 @@ def bench_linux_re_literal_suffix(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', pat]),
-        mkcmd('pt (ignore)', ['pt', '-e', pat]),
-        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('ag', ['ag', '-s', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
-        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
     ])
 
 
 def bench_linux_word(suite_dir):
     '''
     Benchmark use of the -w ("match word") flag in each tool.
-
-    sift has a lot of trouble with this because it forces it into Go's
-    regex engine by surrounding the pattern with \b assertions.
     '''
     require(suite_dir, 'linux')
     cwd = path.join(suite_dir, LINUX_DIR)
@@ -196,28 +169,19 @@ def bench_linux_word(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
-        mkcmd('pt (ignore)', ['pt', '-w', pat]),
-        mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
+        mkcmd('rg', ['rg', '-n', '-w', pat]),
+        mkcmd('ag', ['ag', '-s', '-w', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', '-w', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', [
-            'rg', '-n', '-w', '--no-ignore', '-tall', pat,
-        ]),
-        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
     ])
 
 
 def bench_linux_unicode_greek(suite_dir):
     '''
     Benchmark matching of a Unicode category.
-
-    Only three tools (ripgrep, sift and pt) support this. We omit
-    pt because it is too slow.
     '''
     require(suite_dir, 'linux')
     cwd = path.join(suite_dir, LINUX_DIR)
@@ -229,8 +193,6 @@ def bench_linux_unicode_greek(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('pt', ['pt', '-e', pat]),
-        mkcmd('sift', SIFT + ['-n', '--git', pat]),
     ])
 
 
@@ -250,8 +212,6 @@ def bench_linux_unicode_greek_casei(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         mkcmd('rg', ['rg', '-n', '-i', pat]),
-        mkcmd('pt', ['pt', '-i', '-e', pat]),
-        mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
     ])
 
 
@@ -272,26 +232,19 @@ def bench_linux_unicode_word(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
-        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
-        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('ag (ASCII)', ['ag', '-s', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'en_US.UTF-8'},
         ),
         mkcmd(
-            'git grep (ignore) (ASCII)',
+            'git grep (ASCII)',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
-        mkcmd('rg (whitelist) (ASCII)', [
-            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
-        ]),
-        mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
     ])
 
 
@@ -313,26 +266,19 @@ def bench_linux_no_literal(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
-        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
-        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('ag (ASCII)', ['ag', '-s', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'en_US.UTF-8'},
         ),
         mkcmd(
-            'git grep (ignore) (ASCII)',
+            'git grep (ASCII)',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
-        mkcmd('rg (whitelist) (ASCII)', [
-            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
-        ]),
-        mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
     ])
 
 
@@ -354,15 +300,13 @@ def bench_linux_alternates(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('ag', ['ag', '-s', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
-        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
     ])
 
 
@@ -377,15 +321,13 @@ def bench_linux_alternates_casei(suite_dir):
         return Command(*args, **kwargs)
 
     return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
-        mkcmd('ag (ignore)', ['ag', '-i', pat]),
+        mkcmd('rg', ['rg', '-n', '-i', pat]),
+        mkcmd('ag', ['ag', '-i', pat]),
         mkcmd(
-            'git grep (ignore)',
+            'git grep',
             ['git', 'grep', '-E', '-I', '-n', '-i', pat],
             env={'LC_ALL': 'C'},
         ),
-        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
-        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
     ])
 
 
@@ -400,15 +342,10 @@ def bench_subtitles_en_literal(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', pat, en]),
         Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]),
-        Command('pt', ['pt', '-N', pat, en]),
-        Command('sift', ['sift', pat, en]),
-        Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
+        Command('grep', ['grep', pat, en], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', pat, en]),
         Command('ag (lines)', ['ag', '-s', pat, en]),
-        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
-        Command('pt (lines)', ['pt', pat, en]),
-        Command('sift (lines)', ['sift', '-n', pat, en]),
-        Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
+        Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII),
     ])
 
 
@@ -428,7 +365,6 @@ def bench_subtitles_en_literal_casei(suite_dir):
         ], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
         Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
-        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
     ])
 
 
@@ -445,7 +381,6 @@ def bench_subtitles_en_literal_word(suite_dir):
             'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
         ]),
         Command('ag (ASCII)', ['ag', '-sw', pat, en]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
         Command('grep (ASCII)', [
             'grep', '-anw', pat, en,
         ], env=GREP_ASCII),
@@ -471,7 +406,6 @@ def bench_subtitles_en_alternate(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg (lines)', ['rg', '-n', pat, en]),
         Command('ag (lines)', ['ag', '-s', pat, en]),
-        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
         Command('grep (lines)', [
             'grep', '-E', '-an', pat, en,
         ], env=GREP_ASCII),
@@ -498,7 +432,6 @@ def bench_subtitles_en_alternate_casei(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
-        Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
         Command('grep (ASCII)', [
             'grep', '-E', '-ani', pat, en,
         ], env=GREP_ASCII),
@@ -520,7 +453,6 @@ def bench_subtitles_en_surrounding_words(suite_dir):
         Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
         Command('ag (ASCII)', ['ag', '-s', pat, en]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
         Command('grep (ASCII)', [
             'grep', '-E', '-an', pat, en,
         ], env=GREP_ASCII),
@@ -544,7 +476,6 @@ def bench_subtitles_en_no_literal(suite_dir):
         Command('rg', ['rg', '-n', pat, en]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
         Command('ag (ASCII)', ['ag', '-s', pat, en]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
         Command('grep (ASCII)', [
             'grep', '-E', '-an', pat, en,
         ], env=GREP_ASCII),
@@ -562,14 +493,9 @@ def bench_subtitles_ru_literal(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', pat, ru]),
         Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]),
-        Command('pt', ['pt', '-N', pat, ru]),
-        Command('sift', ['sift', pat, ru]),
         Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
-        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
-        Command('pt (lines)', ['pt', pat, ru]),
-        Command('sift (lines)', ['sift', '-n', pat, ru]),
         Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
     ])
 
@@ -590,7 +516,6 @@ def bench_subtitles_ru_literal_casei(suite_dir):
         ], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
         Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
-        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
     ])
 
 
@@ -607,7 +532,6 @@ def bench_subtitles_ru_literal_word(suite_dir):
             'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
         ]),
         Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
         Command('grep (ASCII)', [
             'grep', '-anw', pat, ru,
         ], env=GREP_ASCII),
@@ -633,7 +557,6 @@ def bench_subtitles_ru_alternate(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
-        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
         Command('grep (lines)', [
             'grep', '-E', '-an', pat, ru,
         ], env=GREP_ASCII),
@@ -660,7 +583,6 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
-        Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
         Command('grep (ASCII)', [
             'grep', '-E', '-ani', pat, ru,
         ], env=GREP_ASCII),
@@ -681,7 +603,6 @@ def bench_subtitles_ru_surrounding_words(suite_dir):
         Command('rg', ['rg', '-n', pat, ru]),
         Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
         Command('grep (ASCII)', [
             'grep', '-E', '-an', pat, ru,
         ], env=GREP_ASCII),
@@ -705,7 +626,6 @@ def bench_subtitles_ru_no_literal(suite_dir):
         Command('rg', ['rg', '-n', pat, ru]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
-        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
         Command('grep (ASCII)', [
             'grep', '-E', '-an', pat, ru,
         ], env=GREP_ASCII),
@@ -758,7 +678,7 @@ class Benchmark(object):
     def __init__(self, name=None, pattern=None, commands=None,
                  warmup_count=1, count=3, line_count=True,
                  allow_missing_commands=False,
-                 disabled_cmds=None):
+                 disabled_cmds=None, order=0):
         '''
         Create a single benchmark.
 
@@ -794,6 +714,8 @@ class Benchmark(object):
             will simply skip it.
         :param list(str) disabled_cmds:
             A list of commands to skip.
+        :param int order:
+            An integer indicating the sequence number of this benchmark.
         '''
         self.name = name
         self.pattern = pattern
@@ -803,6 +725,7 @@ class Benchmark(object):
         self.line_count = line_count
         self.allow_missing_commands = allow_missing_commands
         self.disabled_cmds = set(disabled_cmds or [])
+        self.order = order
 
     def raise_if_missing(self):
         '''
@@ -1165,19 +1088,22 @@ def collect_benchmarks(suite_dir, filter_pat=None,
         requires corpora that are missing, then a log message is
         emitted to stderr and it is not yielded.
     '''
-    for fun in sorted(globals()):
-        if not fun.startswith('bench_'):
+    benchmarks = []
+    for global_name in globals():
+        if not global_name.startswith('bench_'):
             continue
-        name = re.sub('^bench_', '', fun)
+        name = re.sub('^bench_', '', global_name)
         if filter_pat is not None and not re.search(filter_pat, name):
             continue
         try:
-            benchmark = globals()[fun](suite_dir)
+            fun = globals()[global_name]
+            benchmark = fun(suite_dir)
             benchmark.name = name
             benchmark.warmup_count = warmup_iter
             benchmark.count = bench_iter
             benchmark.allow_missing_commands = allow_missing_commands
             benchmark.disabled_cmds = disabled_cmds
+            benchmark.order = fun.__code__.co_firstlineno
             benchmark.raise_if_missing()
         except MissingDependencies as e:
             eprint(
@@ -1192,7 +1118,8 @@ def collect_benchmarks(suite_dir, filter_pat=None,
                   '(run with --allow-missing to run incomplete benchmarks)'
             eprint(fmt % (', '.join(e.missing_names), name))
             continue
-        yield benchmark
+        benchmarks.append(benchmark)
+    return sorted(benchmarks, key=lambda b: b.order)
 
 
 def main():
author	Andrew Gallant <jamslam@gmail.com>	2020-10-14 15:01:15 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2020-10-14 15:16:07 -0400
commit	5ebb3ad03921921324cf399e0a9c4cf1d874c48e (patch)
tree	bda83bc15d5a1c032daf4fb9b79aacd89acad76b
parent	b0066274cbb36e2cf4a76aded5f8a98d1f79e61a (diff)