author    Sam Tay <sam.chong.tay@gmail.com>  2020-06-27 20:58:23 -0700
committer Sam Tay <sam.chong.tay@gmail.com>  2020-06-27 21:05:25 -0700
commit    da6c82fa307278cdeb2c15f9a31282fb80ac768f (patch)
tree      210443c5c08c2c634916da88009e5be2c72517b1
parent    307e1973e799ace3303184f40aaf5c205194fb33 (diff)
Rule out regex as potential improvement
-rw-r--r--  Cargo.lock                    | 22
-rw-r--r--  TODO.md                       | 14
-rw-r--r--  benches/parsing.rs            | 16
-rw-r--r--  src/stackexchange/scraper.rs  |  7
4 files changed, 20 insertions(+), 39 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index cd04639..f2a364d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -16,15 +16,6 @@ dependencies = [
]
[[package]]
-name = "aho-corasick"
-version = "0.7.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
-dependencies = [
- "memchr",
-]
-
-[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1681,10 +1672,7 @@ version = "1.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
dependencies = [
- "aho-corasick",
- "memchr",
"regex-syntax",
- "thread_local",
]
[[package]]
@@ -1993,7 +1981,6 @@ dependencies = [
"phf",
"pulldown-cmark",
"rayon",
- "regex",
"reqwest",
"scraper",
"serde",
@@ -2167,15 +2154,6 @@ dependencies = [
]
[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
- "lazy_static",
-]
-
-[[package]]
name = "time"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/TODO.md b/TODO.md
index 509b646..aff72c7 100644
--- a/TODO.md
+++ b/TODO.md
@@ -2,15 +2,14 @@
### chores
2. Move to github actions ASAP, travis & appveyor are a PITA. See resources below.
-3. Benchmark parsing. Probably way faster to use regex to find question IDs
- within URLs (or *gasp* the entire doc).
4. Refactor layout handling (see TODO on `tui::views::LayoutView::relayout`)
5. Release on AUR & Homebrew
+6. Benchmark markdown parsing: see what I'm gaining by borrowing and querying
+   the entity hash set. If building my own SpannedString source from md output
+   doesn't affect performance, do it! This would rule out a large class of
+   indexing panics coming from cursive (see the sketch after this diff).
### bugs
-0. **Priority** Another parser bug: some links return /questions/tagged/; need to make sure
- we only select digits. Hello regex. (use test/duckduckgo/tagged.html to write
- a new test).
1.
```
so --search-engine google --site stackoverflow --site askubuntu how to stop typing sudo
@@ -19,11 +18,6 @@ results in
```
thread '<unnamed>' panicked at 'assertion failed: value_pos >= source_pos', /home/sam/.cargo/registry/src/github.com-1ecc6299db9ec823/cursive_core-0.1.0/src/utils/span.rs:372:17
```
-So maybe the md parser should just build its own source for
-SpannedString, and own everything...
-2. Searching site:meta.stackexchange.com also returns results from
- math.meta.stackexchange.com; this will lead to requesting question IDs that
- don't exist. Need to improve parser to account for this.
### feature ideas
- Add sort option, e.g. relevance|votes|date
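
For the SpannedString idea in the new TODO item above, here is a minimal sketch of what an owned source could look like, assuming cursive's `StyledString` append API; the `render_owned` helper and its segment input are illustrative, not from this repo:

```rust
use cursive::theme::{Effect, Style};
use cursive::utils::markup::StyledString;

/// Build a StyledString that owns all of its content, so span indices can
/// never point outside the source and trip assertions like the
/// `value_pos >= source_pos` panic quoted above.
fn render_owned(segments: &[(String, Option<Effect>)]) -> StyledString {
    let mut out = StyledString::new();
    for (text, effect) in segments {
        match effect {
            // Each append copies the text into the StyledString's own source.
            Some(e) => out.append_styled(text.as_str(), Style::from(*e)),
            None => out.append_plain(text.as_str()),
        }
    }
    out
}
```

Because every append copies text into the StyledString's own buffer, the spans can never outrun an external markdown source, which is exactly the class of panic the TODO refers to.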
diff --git a/benches/parsing.rs b/benches/parsing.rs
index 0bfe44c..ae6ccf0 100644
--- a/benches/parsing.rs
+++ b/benches/parsing.rs
@@ -3,6 +3,22 @@ use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper};
use std::collections::HashMap;
use std::time::Duration;
+/// Note: these benchmarks show that replacing question_url_to_id with regex, i.e.
+/// ```rust
+/// fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
+///     let re: Regex = Regex::new(&format!(
+///         "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)",
+///         site_url.replace('.', "\\.")
+///     ))
+///     .unwrap();
+///     Some(re.captures(input)?.name("id")?.as_str().to_owned())
+/// }
+/// ```
+/// **greatly** degrades performance (maybe because the regex depends on runtime
+/// configuration and can't be compiled once with lazy_static?).
+///
+/// Still, I could try creating a regex that captures the url-encoded SE url and question id,
+/// and run it in multiline mode over the entire HTML document. It might be faster than the scraper library?
fn bench_parsers(c: &mut Criterion) {
let limit: u16 = 10;
let mut sites = HashMap::new();
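
The doc comment above guesses that the slowdown comes from recompiling the regex on every call, since the pattern depends on runtime site configuration. A hedged sketch of the alternative it implies: cache one compiled `Regex` per `site_url`, so compilation cost is paid once per site rather than once per URL. The `IdExtractor` type is hypothetical, not part of this crate:

```rust
use regex::Regex;
use std::collections::HashMap;

/// Hypothetical per-site cache: each site's pattern is compiled once and
/// reused, instead of on every call as in the benchmarked version above.
struct IdExtractor {
    cache: HashMap<String, Regex>,
}

impl IdExtractor {
    fn new() -> Self {
        IdExtractor { cache: HashMap::new() }
    }

    fn question_url_to_id(&mut self, site_url: &str, input: &str) -> Option<String> {
        let re = self.cache.entry(site_url.to_owned()).or_insert_with(|| {
            // Same pattern as in the doc comment above, with a
            // non-capturing group for the /q/ vs /questions/ variants.
            Regex::new(&format!(
                "[^\\.]{}/(?:q|questions)/(?P<id>\\d+)",
                site_url.replace('.', "\\.")
            ))
            .unwrap()
        });
        Some(re.captures(input)?.name("id")?.as_str().to_owned())
    }
}
```

Whether caching actually closes the gap is exactly what a benchmark like the one above would have to confirm.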
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index 34a25ce..531d3b0 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -169,14 +169,7 @@ fn parse_with_selector(
})
}
-/// For example
-/// ```
-/// let id = "stackoverflow.com";
-/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
-/// assert_eq!(question_url_to_id(site_url, input), Some(String::from("11828270")))
-/// ```
// TODO use strip_prefix once it's stable
-// TODO benchmark this. regex is almost undoubtedly superior here
fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
["/questions/", "/q/"].iter().find_map(|segment| {
let fragment = site_url.trim_end_matches('/').to_owned() + segment;
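
The `/questions/tagged/` bug crossed off in TODO.md above can also be ruled out without regex, by accepting the extracted segment only when it is purely numeric. A minimal sketch following the same segment-matching approach as `question_url_to_id`; the `extract_id` helper is illustrative, not from this commit:

```rust
/// Accept an extracted URL segment as a question ID only if it is purely
/// numeric; `/questions/tagged/...` style links fall through to None.
fn extract_id(candidate: &str) -> Option<String> {
    // Take everything up to the next '/'.
    let id = candidate.split('/').next()?;
    if !id.is_empty() && id.bytes().all(|b| b.is_ascii_digit()) {
        Some(id.to_owned())
    } else {
        None
    }
}
```

For example, `extract_id("11828270/how-do-i-exit-the-vim-editor")` yields `Some("11828270".into())`, while `extract_id("tagged/vim")` yields `None`.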