diff options
author | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 20:58:23 -0700 |
---|---|---|
committer | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 21:05:25 -0700 |
commit | da6c82fa307278cdeb2c15f9a31282fb80ac768f (patch) | |
tree | 210443c5c08c2c634916da88009e5be2c72517b1 /benches | |
parent | 307e1973e799ace3303184f40aaf5c205194fb33 (diff) |
Rule out regex as potential improvement
Diffstat (limited to 'benches')
-rw-r--r-- | benches/parsing.rs | 16 |
1 file changed, 16 insertions, 0 deletions
diff --git a/benches/parsing.rs b/benches/parsing.rs index 0bfe44c..ae6ccf0 100644 --- a/benches/parsing.rs +++ b/benches/parsing.rs @@ -3,6 +3,22 @@ use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper}; use std::collections::HashMap; use std::time::Duration; +/// Note: these benchmarks show that replacing question_url_to_id with regex, i.e. +/// ```rust +/// fn question_url_to_id(site_url: &str, input: &str) -> Option<String> { +/// let re: Regex = Regex::new(&format!( +/// "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)", +/// site_url.replace('.', "\\.") +/// )) +/// .unwrap(); +/// Some(re.captures(input)?.name("id")?.as_str().to_owned()) +/// } +/// ``` +/// **greatly** degrades performance (maybe due to the fact that the regex depends on configuration +/// and can't be compiled with lazy_static?). +/// +/// Still, I could try creating a regex that captures the url encoded SE url and question id and +/// multiline regex the entire HTML document. It might be faster than the scraper library? fn bench_parsers(c: &mut Criterion) { let limit: u16 = 10; let mut sites = HashMap::new(); |