author    Sam Tay <sam.chong.tay@gmail.com>  2020-06-27 20:58:23 -0700
committer Sam Tay <sam.chong.tay@gmail.com>  2020-06-27 21:05:25 -0700
commit    da6c82fa307278cdeb2c15f9a31282fb80ac768f (patch)
tree      210443c5c08c2c634916da88009e5be2c72517b1
parent    307e1973e799ace3303184f40aaf5c205194fb33 (diff)
Rule out regex as potential improvement
-rw-r--r--  Cargo.lock                    | 22
-rw-r--r--  TODO.md                       | 14
-rw-r--r--  benches/parsing.rs            | 16
-rw-r--r--  src/stackexchange/scraper.rs  |  7
4 files changed, 20 insertions(+), 39 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index cd04639..f2a364d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -16,15 +16,6 @@ dependencies = [
]
[[package]]
-name = "aho-corasick"
-version = "0.7.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
-dependencies = [
- "memchr",
-]
-
-[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1681,10 +1672,7 @@ version = "1.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
dependencies = [
- "aho-corasick",
- "memchr",
"regex-syntax",
- "thread_local",
]
[[package]]
@@ -1993,7 +1981,6 @@ dependencies = [
"phf",
"pulldown-cmark",
"rayon",
- "regex",
"reqwest",
"scraper",
"serde",
@@ -2167,15 +2154,6 @@ dependencies = [
]
[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
- "lazy_static",
-]
-
-[[package]]
name = "time"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/TODO.md b/TODO.md
index 509b646..aff72c7 100644
--- a/TODO.md
+++ b/TODO.md
@@ -2,15 +2,14 @@
### chores
2. Move to github actions ASAP, travis & appveyor are a PITA. See resources below.
-3. Benchmark parsing. Probably way faster to use regex to find question IDs
- within URLs (or *gasp* the entire doc).
4. Refactor layout handling (see TODO on `tui::views::LayoutView::relayout`)
5. Release on AUR & Homebrew
+6. Benchmark markdown parsing: see what I'm gaining by borrowing and querying
+   the entity hash set. If building my own SpannedString source from md output
+   doesn't affect performance, do it! This would rule out a large class of
+   indexing panics coming from cursive (see the sketch after this diff).
### bugs
-0. **Priority** Another parser bug: some links return /questions/tagged/; need to make sure
- we only select digits. Hello regex. (use test/duckduckgo/tagged.html to write
- a new test).
1.
```
so --search-engine google --site stackoverflow --site askubuntu how to stop typing sudo
@@ -19,11 +18,6 @@ results in
```
thread '<unnamed>' panicked at 'assertion failed: value_pos >= source_pos', /home/sam/.cargo/registry/src/github.com-1ecc6299db9ec823/cursive_core-0.1.0/src/utils/span.rs:372:17
```
-So maybe the md parser should just build its own source for
-SpannedString, and own everything...
-2. Searching site:meta.stackexchange.com also returns results from
- math.meta.stackexchange.com; this will lead to requesting question IDs that
- don't exist. Need to improve parser to account for this.
### feature ideas
- Add sort option, e.g. relevance|votes|date
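
For the SpannedString idea in the new TODO item above, here is a minimal sketch of what an owned source could look like, assuming cursive's `StyledString` append API; the `render_owned` helper and its segment input are illustrative, not from this repo:

```rust
use cursive::theme::{Effect, Style};
use cursive::utils::markup::StyledString;

/// Build a StyledString that owns all of its content, so span indices can
/// never point outside the source and trip assertions like the
/// `value_pos >= source_pos` panic quoted above.
fn render_owned(segments: &[(String, Option<Effect>)]) -> StyledString {
    let mut out = StyledString::new();
    for (text, effect) in segments {
        match effect {
            // Each append copies the text into the StyledString's own source.
            Some(e) => out.append_styled(text.as_str(), Style::from(*e)),
            None => out.append_plain(text.as_str()),
        }
    }
    out
}
```

Because every append copies text into the StyledString's own buffer, the spans can never outrun an external markdown source, which is exactly the class of panic the TODO refers to.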
diff --git a/benches/parsing.rs b/benches/parsing.rs
index 0bfe44c..ae6ccf0 100644
--- a/benches/parsing.rs
+++ b/benches/parsing.rs
@@ -3,6 +3,22 @@ use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper};
use std::collections::HashMap;
use std::time::Duration;
+/// Note: these benchmarks show that replacing question_url_to_id with regex, i.e.
+/// ```rust
+/// fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
+///     let re: Regex = Regex::new(&format!(
+///         "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)",
+///         site_url.replace('.', "\\.")
+///     ))
+///     .unwrap();
+///     Some(re.captures(input)?.name("id")?.as_str().to_owned())
+/// }
+/// ```
+/// **greatly** degrades performance (maybe because the regex depends on runtime
+/// configuration and can't be compiled once with lazy_static?).
+///
+/// Still, I could try creating a regex that captures the url-encoded SE url and question id,
+/// and run it in multiline mode over the entire HTML document. It might be faster than the scraper library?
fn bench_parsers(c: &mut Criterion) {
let limit: u16 = 10;
let mut sites = HashMap::new();
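
The doc comment above guesses that the slowdown comes from recompiling the regex on every call, since the pattern depends on runtime site configuration. A hedged sketch of the alternative it implies: cache one compiled `Regex` per `site_url`, so compilation cost is paid once per site rather than once per URL. The `IdExtractor` type is hypothetical, not part of this crate:

```rust
use regex::Regex;
use std::collections::HashMap;

/// Hypothetical per-site cache: each site's pattern is compiled once and
/// reused, instead of on every call as in the benchmarked version above.
struct IdExtractor {
    cache: HashMap<String, Regex>,
}

impl IdExtractor {
    fn new() -> Self {
        IdExtractor { cache: HashMap::new() }
    }

    fn question_url_to_id(&mut self, site_url: &str, input: &str) -> Option<String> {
        let re = self.cache.entry(site_url.to_owned()).or_insert_with(|| {
            // Same pattern as in the doc comment above, with a
            // non-capturing group for the /q/ vs /questions/ variants.
            Regex::new(&format!(
                "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)",
                site_url.replace('.', "\\.")
            ))
            .unwrap()
        });
        Some(re.captures(input)?.name("id")?.as_str().to_owned())
    }
}
```

Whether caching actually closes the gap is exactly what a benchmark like the one above would have to confirm.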
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index 34a25ce..531d3b0 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -169,14 +169,7 @@ fn parse_with_selector(
})
}
-/// For example
-/// ```
-/// let id = "stackoverflow.com";
-/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
-/// assert_eq!(question_url_to_id(site_url, input), Some(String::from("11828270")))
-/// ```
// TODO use strip_prefix once it's stable
-// TODO benchmark this. regex is almost undoubtedly superior here
fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
["/questions/", "/q/"].iter().find_map(|segment| {
let fragment = site_url.trim_end_matches('/').to_owned() + segment;
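
The `/questions/tagged/` bug crossed off in TODO.md above can also be ruled out without regex, by accepting the extracted segment only when it is purely numeric. A minimal sketch following the same segment-matching approach as `question_url_to_id`; the `extract_id` helper is illustrative, not from this commit:

```rust
/// Accept an extracted URL segment as a question ID only if it is purely
/// numeric; `/questions/tagged/...` style links fall through to None.
fn extract_id(candidate: &str) -> Option<String> {
    // Take everything up to the next '/'.
    let id = candidate.split('/').next()?;
    if !id.is_empty() && id.bytes().all(|b| b.is_ascii_digit()) {
        Some(id.to_owned())
    } else {
        None
    }
}
```

For example, `extract_id("11828270/how-do-i-exit-the-vim-editor")` yields `Some("11828270".into())`, while `extract_id("tagged/vim")` yields `None`.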