From da6c82fa307278cdeb2c15f9a31282fb80ac768f Mon Sep 17 00:00:00 2001 From: Sam Tay Date: Sat, 27 Jun 2020 20:58:23 -0700 Subject: Rule out regex as potential improvement --- Cargo.lock | 22 ---------------------- TODO.md | 14 ++++---------- benches/parsing.rs | 16 ++++++++++++++++ src/stackexchange/scraper.rs | 7 ------- 4 files changed, 20 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd04639..f2a364d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,15 +15,6 @@ dependencies = [ "const-random", ] -[[package]] -name = "aho-corasick" -version = "0.7.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" -dependencies = [ - "memchr", -] - [[package]] name = "ansi_term" version = "0.11.0" @@ -1681,10 +1672,7 @@ version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" dependencies = [ - "aho-corasick", - "memchr", "regex-syntax", - "thread_local", ] [[package]] @@ -1993,7 +1981,6 @@ dependencies = [ "phf", "pulldown-cmark", "rayon", - "regex", "reqwest", "scraper", "serde", @@ -2166,15 +2153,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - [[package]] name = "time" version = "0.1.43" diff --git a/TODO.md b/TODO.md index 509b646..aff72c7 100644 --- a/TODO.md +++ b/TODO.md @@ -2,15 +2,14 @@ ### chores 2. Move to github actions ASAP, travis & appveyor are a PITA. See resources below. -3. Benchmark parsing. Probaly way faster to use regex to find question IDs - within URLs (or *gasp* the entire doc). 4. Refactor layout handling (see TODO on `tui::views::LayoutView::relayout`) 5. Release on AUR & Homebrew +6. Benchmark markdown parsing: see what I'm gaining by borrowing and querying + entity hash set. If building my own spannedstring source from md output + doesn't affect performance, do it! This would rule out a large class of + indexing panics coming from cursive. ### bugs -0. **Priority** Another parser bug: some links return /questions/tagged/; need to make sure - we only select digits. Hello regex. (use test/duckduckgo/tagged.html to write - a new test). 1. ``` so --search-engine google --site stackoverflow --site askubuntu how to stop typing sudo @@ -19,11 +18,6 @@ results in ``` thread '' panicked at 'assertion failed: value_pos >= source_pos', /home/sam/.cargo/registry/src/github.com-1ecc6299db9ec823/cursive_core-0.1.0/src/utils/span.rs:372:17 ``` -So maybe the md parser should just build its own source for -SpannedString, and own everything... -2. Searching site:meta.stackexchange.com also returns results from - math.meta.stackexchange.com; this will lead to requesting question IDs that - don't exist. Need to improve parser to account for this. ### feature ideas - Add sort option, e.g. relevance|votes|date diff --git a/benches/parsing.rs b/benches/parsing.rs index 0bfe44c..ae6ccf0 100644 --- a/benches/parsing.rs +++ b/benches/parsing.rs @@ -3,6 +3,22 @@ use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper}; use std::collections::HashMap; use std::time::Duration; +/// Note: these benchmarks show that replacing question_url_to_id with regex, i.e. +/// ```rust +/// fn question_url_to_id(site_url: &str, input: &str) -> Option { +/// let re: Regex = Regex::new(&format!( +/// "[^\\.]{}/(:?q|questions)/(?P\\d+)", +/// site_url.replace('.', "\\.") +/// )) +/// .unwrap(); +/// Some(re.captures(input)?.name("id")?.as_str().to_owned()) +/// } +/// ``` +/// **greatly** degrades peformance (maybe due to the fact that the regex depends on configuration +/// and can't be compiled with lazy_static?). +/// +/// Still, I could try creating a regex that captures the url encoded SE url and question id and +/// multiline regex the entire HTML document. It might be faster than the scraper library? fn bench_parsers(c: &mut Criterion) { let limit: u16 = 10; let mut sites = HashMap::new(); diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs index 34a25ce..531d3b0 100644 --- a/src/stackexchange/scraper.rs +++ b/src/stackexchange/scraper.rs @@ -169,14 +169,7 @@ fn parse_with_selector( }) } -/// For example -/// ``` -/// let id = "stackoverflow.com"; -/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; -/// assert_eq!(question_url_to_id(site_url, input), Some(String::from("11828270"))) -/// ``` // TODO use str_prefix once its stable -// TODO benchmark this. regex is almost undoubtably superior here fn question_url_to_id(site_url: &str, input: &str) -> Option { ["/questions/", "/q/"].iter().find_map(|segment| { let fragment = site_url.trim_end_matches('/').to_owned() + segment; -- cgit v1.2.3