diff options
author | Andrew Gallant <jamslam@gmail.com> | 2020-10-14 14:17:23 -0400 |
---|---|---|
committer | Andrew Gallant <jamslam@gmail.com> | 2020-10-14 14:17:23 -0400 |
commit | b0066274cbb36e2cf4a76aded5f8a98d1f79e61a (patch) | |
tree | 7186b84d7d59ec7bbe9138636396f1cff5533bfa | |
parent | def993bad1a9275cdc249f04048e5b2065b79f05 (diff) |
benchsuite: update subtitle URLs
Since the English subtitle file actually changed its content, we tweak
the benchmark to use a slightly bigger sample that more closely matches
the file size of the Russian subtitle file.
Also, the BurntSushi/linux repo has been updated and I've confirmed that
it builds on my Linux machine.
Fixes #1257
-rwxr-xr-x | benchsuite/benchsuite | 18 |
1 file changed, 10 insertions, 8 deletions
diff --git a/benchsuite/benchsuite b/benchsuite/benchsuite index b849b454..9353cf49 100755 --- a/benchsuite/benchsuite +++ b/benchsuite/benchsuite @@ -23,13 +23,15 @@ import time # strategies used to increase the relevance of results returned. SUBTITLES_DIR = 'subtitles' -SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en' -SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en' +SUBTITLES_EN_NAME = 'en.txt' +SUBTITLES_EN_NAME_SAMPLE = 'en.sample.txt' SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME -SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa -SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru' +# SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa +SUBTITLES_EN_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz' # noqa +SUBTITLES_RU_NAME = 'ru.txt' SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME -SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa +# SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa +SUBTITLES_RU_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ru.txt.gz' # noqa LINUX_DIR = 'linux' LINUX_CLONE = 'git://github.com/BurntSushi/linux' @@ -255,11 +257,11 @@ def bench_linux_unicode_greek_casei(suite_dir): def bench_linux_unicode_word(suite_dir): ''' - Benchmark Unicode aware \w character class. + Benchmark Unicode aware \\w character class. Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get this right. Everything else uses the standard ASCII interpretation - of \w. + of \\w. ''' require(suite_dir, 'linux') cwd = path.join(suite_dir, LINUX_DIR) @@ -1088,7 +1090,7 @@ def download_subtitles_en(suite_dir): # benchmarks finish in a reasonable time. 
with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f: run_cmd( - ['head', '-n', '32722372', en_path], + ['head', '-n', '55000000', en_path], cwd=subtitle_dir, stdout=f) |