benchsuite: add ugrep commands to benchmarks

author: Andrew Gallant <jamslam@gmail.com> 2020-10-14 17:00:35 -0400
committer: Andrew Gallant <jamslam@gmail.com> 2020-10-14 17:00:35 -0400
commit: de0c24f31c6a1218c1ff78ae8a6920650e226949 (patch)
tree: 9549bf0af3705dd7e23912235ecec3abf72efc30
parent: c55e7af675f392f2829c69bb50a9334d1f7d967a (diff)
1 files changed, 78 insertions, 4 deletions
diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite
index 5a67503d..a70eb540 100755
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@@ -57,8 +57,10 @@ def bench_linux_literal_default(suite_dir):
     Benchmark the speed of a literal using *default* settings.
 
     This is a purposefully unfair benchmark for use in performance
-    analysis, but it is pedagogically useful to demonstrate how
-    default behaviors differ.
+    analysis, but it is pedagogically useful to demonstrate how default
+    behaviors differ. For example, ugrep and grep don't do any smart
+    filtering by default, so they will invariably search more files
+    than ripgrep, ag or git grep.
     '''
     require(suite_dir, 'linux')
     cwd = path.join(suite_dir, LINUX_DIR)
@@ -73,7 +75,9 @@ def bench_linux_literal_default(suite_dir):
         mkcmd('ag', ['ag', pat]),
         # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
         # default, but I'd guess it to be on most desktop systems.
-        mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
+        mkcmd('git grep', ['git', 'grep', pat], env=GREP_UNICODE),
+        mkcmd('ugrep', ['ugrep', '-r', pat, './']),
+        mkcmd('grep', ['grep', '-r', pat, './'], env=GREP_UNICODE),
     ])
 
 
@@ -101,6 +105,10 @@ def bench_linux_literal(suite_dir):
         mkcmd('git grep', [
             'git', 'grep', '-I', '-n', pat,
         ], env={'LC_ALL': 'C'}),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ])
     ])
 
 
@@ -130,6 +138,10 @@ def bench_linux_literal_casei(suite_dir):
         mkcmd('git grep', [
             'git', 'grep', '-I', '-n', '-i', pat,
         ], env={'LC_ALL': 'C'}),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-i', pat, './',
+        ])
     ])
 
 
@@ -153,6 +165,10 @@ def bench_linux_re_literal_suffix(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ])
     ])
 
 
@@ -176,6 +192,10 @@ def bench_linux_word(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', '-w', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-w', pat, './',
+        ])
     ])
 
 
@@ -193,6 +213,10 @@ def bench_linux_unicode_greek(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         mkcmd('rg', ['rg', '-n', pat]),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ])
     ])
 
 
@@ -212,6 +236,10 @@ def bench_linux_unicode_greek_casei(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         mkcmd('rg', ['rg', '-n', '-i', pat]),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-i', pat, './',
+        ])
     ])
 
 
@@ -245,6 +273,14 @@ def bench_linux_unicode_word(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ]),
+        mkcmd('ugrep (ASCII)', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-U', pat, './',
+        ]),
     ])
 
 
@@ -279,6 +315,14 @@ def bench_linux_no_literal(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ]),
+        mkcmd('ugrep (ASCII)', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-U', pat, './',
+        ]),
     ])
 
 
@@ -307,6 +351,10 @@ def bench_linux_alternates(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', pat, './',
+        ])
     ])
 
 
@@ -328,6 +376,10 @@ def bench_linux_alternates_casei(suite_dir):
             ['git', 'grep', '-E', '-I', '-n', '-i', pat],
             env={'LC_ALL': 'C'},
         ),
+        mkcmd('ugrep', [
+            'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
+            '-n', '-i', pat, './',
+        ])
     ])
 
 
@@ -346,6 +398,7 @@ def bench_subtitles_en_literal(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, en]),
         Command('ag (lines)', ['ag', '-s', pat, en]),
         Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII),
+        Command('ugrep (lines)', ['ugrep', '-n', pat, en])
     ])
 
 
@@ -363,6 +416,7 @@ def bench_subtitles_en_literal_casei(suite_dir):
         Command('grep (ASCII)', ['grep', '-E', '-i', pat, en], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
         Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
+        Command('ugrep (lines)', ['ugrep', '-n', '-i', pat, en])
     ])
 
 
@@ -380,6 +434,7 @@ def bench_subtitles_en_literal_word(suite_dir):
         ]),
         Command('ag (ASCII)', ['ag', '-sw', pat, en]),
         Command('grep (ASCII)', ['grep', '-nw', pat, en], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-nw', pat, en]),
         Command('rg', ['rg', '-nw', pat, en]),
         Command('grep', ['grep', '-nw', pat, en], env=GREP_UNICODE),
     ])
@@ -403,6 +458,7 @@ def bench_subtitles_en_alternate(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, en]),
         Command('ag (lines)', ['ag', '-s', pat, en]),
         Command('grep (lines)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
+        Command('ugrep (lines)', ['ugrep', '-n', pat, en]),
         Command('rg', ['rg', pat, en]),
         Command('grep', ['grep', '-E', pat, en], env=GREP_ASCII),
     ])
@@ -427,6 +483,7 @@ def bench_subtitles_en_alternate_casei(suite_dir):
         Command('grep (ASCII)', [
             'grep', '-E', '-ni', pat, en,
         ], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, en]),
         Command('rg', ['rg', '-n', '-i', pat, en]),
         Command('grep', ['grep', '-E', '-ni', pat, en], env=GREP_UNICODE),
     ])
@@ -443,9 +500,11 @@ def bench_subtitles_en_surrounding_words(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, en]),
         Command('grep', ['grep', '-E', '-n', pat, en], env=GREP_UNICODE),
+        Command('ugrep', ['ugrep', '-n', pat, en]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
         Command('ag (ASCII)', ['ag', '-s', pat, en]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en])
     ])
 
 
@@ -464,9 +523,11 @@ def bench_subtitles_en_no_literal(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, en]),
+        Command('ugrep', ['ugrep', '-n', pat, en]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
         Command('ag (ASCII)', ['ag', '-s', pat, en]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en])
     ])
 
 
@@ -485,6 +546,7 @@ def bench_subtitles_ru_literal(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
         Command('grep (lines)', ['grep', '-n', pat, ru], env=GREP_ASCII),
+        Command('ugrep (lines)', ['ugrep', '-n', pat, ru])
     ])
 
 
@@ -502,6 +564,7 @@ def bench_subtitles_ru_literal_casei(suite_dir):
         Command('grep (ASCII)', ['grep', '-E', '-i', pat, ru], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
         Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
+        Command('ugrep (lines) (ASCII)', ['ugrep', '-n', '-i', pat, ru])
     ])
 
 
@@ -515,12 +578,17 @@ def bench_subtitles_ru_literal_word(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('rg (ASCII)', [
-            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
+            # You might think we'd use \b here for word boundaries, but both
+            # GNU grep and ripgrep implement -w with the formulation below.
+            # Since we can't both use Unicode in a pattern and disable Unicode
+            # word boundaries, we just write that formulation out by hand.
+            'rg', '-n', r'(?-u:^|\W)' + pat + r'(?-u:$|\W)', ru,
         ]),
         Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
         Command('grep (ASCII)', [
             'grep', '-nw', pat, ru,
         ], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-nw', pat, ru]),
         Command('rg', ['rg', '-nw', pat, ru]),
         Command('grep', ['grep', '-nw', pat, ru], env=GREP_UNICODE),
     ])
@@ -544,6 +612,7 @@ def bench_subtitles_ru_alternate(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
         Command('grep (lines)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
+        Command('ugrep (lines)', ['ugrep', '-n', pat, ru]),
         Command('rg', ['rg', pat, ru]),
         Command('grep', ['grep', '-E', pat, ru], env=GREP_ASCII),
     ])
@@ -568,6 +637,7 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
         Command('grep (ASCII)', [
             'grep', '-E', '-ni', pat, ru,
         ], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, ru]),
         Command('rg', ['rg', '-n', '-i', pat, ru]),
         Command('grep', ['grep', '-E', '-ni', pat, ru], env=GREP_UNICODE),
     ])
@@ -584,8 +654,10 @@ def bench_subtitles_ru_surrounding_words(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, ru]),
         Command('grep', ['grep', '-E', '-n', pat, ru], env=GREP_UNICODE),
+        Command('ugrep', ['ugrep', '-n', pat, ru]),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]),
     ])
 
 
@@ -604,9 +676,11 @@ def bench_subtitles_ru_no_literal(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, ru]),
+        Command('ugrep', ['ugrep', '-n', pat, ru]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
+        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru])
     ])
author	Andrew Gallant <jamslam@gmail.com>	2020-10-14 17:00:35 -0400
committer	Andrew Gallant <jamslam@gmail.com>	2020-10-14 17:00:35 -0400
commit	de0c24f31c6a1218c1ff78ae8a6920650e226949 (patch)
tree	9549bf0af3705dd7e23912235ecec3abf72efc30
parent	c55e7af675f392f2829c69bb50a9334d1f7d967a (diff)