diff options
author | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 20:58:23 -0700 |
---|---|---|
committer | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 21:05:25 -0700 |
commit | da6c82fa307278cdeb2c15f9a31282fb80ac768f (patch) | |
tree | 210443c5c08c2c634916da88009e5be2c72517b1 /benches | |
parent | 307e1973e799ace3303184f40aaf5c205194fb33 (diff) |
Rule out regex as potential improvement
Diffstat (limited to 'benches')
-rw-r--r-- | benches/parsing.rs | 16 |
1 file changed, 16 insertions, 0 deletions
diff --git a/benches/parsing.rs b/benches/parsing.rs index 0bfe44c..ae6ccf0 100644 --- a/benches/parsing.rs +++ b/benches/parsing.rs @@ -3,6 +3,22 @@ use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper}; use std::collections::HashMap; use std::time::Duration; +/// Note: these benchmarks show that replacing question_url_to_id with regex, i.e. +/// ```rust +/// fn question_url_to_id(site_url: &str, input: &str) -> Option<String> { +/// let re: Regex = Regex::new(&format!( +/// "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)", +/// site_url.replace('.', "\\.") +/// )) +/// .unwrap(); +/// Some(re.captures(input)?.name("id")?.as_str().to_owned()) +/// } +/// ``` +/// **greatly** degrades performance (maybe due to the fact that the regex depends on configuration +/// and can't be compiled with lazy_static?). +/// +/// Still, I could try creating a regex that captures the url encoded SE url and question id and +/// multiline regex the entire HTML document. It might be faster than the scraper library? fn bench_parsers(c: &mut Criterion) { let limit: u16 = 10; let mut sites = HashMap::new(); |