benchsuite: pass '-a' to ugrep in some cases

It looks like ugrep incorrectly treats a file that is purely valid UTF-8 as a binary file, which in turn effectively renders all of the Russian subtitle benchmarks moot for ugrep. So we pass '-a' to force ugrep to treat the file as text. This technically gives ugrep an edge because it no longer needs to check whether the haystack is binary. In practice, binary detection is usually implemented using highly optimized SIMD routines (e.g., 'memchr'), so it tends not to matter much. We might also consider passing '-a' to all grep commands. But... I think using '-a' is the less common case and we should try to benchmark the common case.
author: Andrew Gallant <jamslam@gmail.com> 2022-12-16 11:21:58 -0500
committer: Andrew Gallant <jamslam@gmail.com> 2022-12-16 11:21:58 -0500
commit: 1be86392e0772fd0dbefcfbe57fa3ff15fc75da1 (patch)
tree: 1dc5d101a54267053e809d178b9ad33e48fa70a4 /benchsuite
parent: 63058453fa7ec562feedca0d599ee59780b2d9e8 (diff)
1 files changed, 20 insertions, 9 deletions
diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite
index c6a87220..8ab233a8 100755
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@@ -544,7 +544,11 @@ def bench_subtitles_ru_literal(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
         Command('grep (lines)', ['grep', '-n', pat, ru], env=GREP_ASCII),
-        Command('ugrep (lines)', ['ugrep', '-n', pat, ru])
+        # ugrep incorrectly identifies this corpus as binary, but it is
+        # entirely valid UTF-8. So we tell ugrep to always treat the corpus
+        # as text even though this technically gives it an edge over other
+        # tools. (It no longer needs to check for binary data.)
+        Command('ugrep (lines)', ['ugrep', '-a', '-n', pat, ru])
     ])
 
 
@@ -562,7 +566,8 @@ def bench_subtitles_ru_literal_casei(suite_dir):
         Command('grep (ASCII)', ['grep', '-E', '-i', pat, ru], env=GREP_ASCII),
         Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
         Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
-        Command('ugrep (lines) (ASCII)', ['ugrep', '-n', '-i', pat, ru])
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (lines) (ASCII)', ['ugrep', '-a', '-n', '-i', pat, ru])
     ])
 
 
@@ -586,7 +591,8 @@ def bench_subtitles_ru_literal_word(suite_dir):
         Command('grep (ASCII)', [
             'grep', '-nw', pat, ru,
         ], env=GREP_ASCII),
-        Command('ugrep (ASCII)', ['ugrep', '-nw', pat, ru]),
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (ASCII)', ['ugrep', '-anw', pat, ru]),
         Command('rg', ['rg', '-nw', pat, ru]),
         Command('grep', ['grep', '-nw', pat, ru], env=GREP_UNICODE),
     ])
@@ -610,7 +616,8 @@ def bench_subtitles_ru_alternate(suite_dir):
         Command('rg (lines)', ['rg', '-n', pat, ru]),
         Command('ag (lines)', ['ag', '-s', pat, ru]),
         Command('grep (lines)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
-        Command('ugrep (lines)', ['ugrep', '-n', pat, ru]),
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (lines)', ['ugrep', '-an', pat, ru]),
         Command('rg', ['rg', pat, ru]),
         Command('grep', ['grep', '-E', pat, ru], env=GREP_ASCII),
     ])
@@ -635,7 +642,8 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
         Command('grep (ASCII)', [
             'grep', '-E', '-ni', pat, ru,
         ], env=GREP_ASCII),
-        Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, ru]),
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (ASCII)', ['ugrep', '-ani', pat, ru]),
         Command('rg', ['rg', '-n', '-i', pat, ru]),
         Command('grep', ['grep', '-E', '-ni', pat, ru], env=GREP_UNICODE),
     ])
@@ -652,10 +660,11 @@ def bench_subtitles_ru_surrounding_words(suite_dir):
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, ru]),
         Command('grep', ['grep', '-E', '-n', pat, ru], env=GREP_UNICODE),
-        Command('ugrep', ['ugrep', '-n', pat, ru]),
+        Command('ugrep', ['ugrep', '-an', pat, ru]),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
-        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]),
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (ASCII)', ['ugrep', '-a', '-n', '-U', pat, ru]),
     ])
 
 
@@ -674,11 +683,13 @@ def bench_subtitles_ru_no_literal(suite_dir):
 
     return Benchmark(pattern=pat, commands=[
         Command('rg', ['rg', '-n', pat, ru]),
-        Command('ugrep', ['ugrep', '-n', pat, ru]),
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep', ['ugrep', '-an', pat, ru]),
         Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
         Command('ag (ASCII)', ['ag', '-s', pat, ru]),
         Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
-        Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru])
+        # See bench_subtitles_ru_literal for why we use '-a' here.
+        Command('ugrep (ASCII)', ['ugrep', '-anU', pat, ru])
     ])
author	Andrew Gallant <jamslam@gmail.com>	2022-12-16 11:21:58 -0500
committer	Andrew Gallant <jamslam@gmail.com>	2022-12-16 11:21:58 -0500
commit	1be86392e0772fd0dbefcfbe57fa3ff15fc75da1 (patch)
tree	1dc5d101a54267053e809d178b9ad33e48fa70a4 /benchsuite
parent	63058453fa7ec562feedca0d599ee59780b2d9e8 (diff)