diff options
author | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 20:33:59 -0700 |
---|---|---|
committer | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-27 20:33:59 -0700 |
commit | 307e1973e799ace3303184f40aaf5c205194fb33 (patch) | |
tree | 6a8f853dcf34b5124f7185e8bf3f6941e6937ff0 | |
parent | e2814eeccecfb98bd899d2c69bee23cd57541880 (diff) |
Add benchmarks for HTML parsing
-rw-r--r-- | Cargo.lock | 215 | ||||
-rw-r--r-- | Cargo.toml | 7 | ||||
-rw-r--r-- | benches/parsing.rs | 58 | ||||
-rw-r--r-- | src/lib.rs | 7 | ||||
-rw-r--r-- | src/stackexchange/mod.rs | 3 |
5 files changed, 288 insertions, 2 deletions
@@ -16,6 +16,15 @@ dependencies = [ ] [[package]] +name = "aho-corasick" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86" +dependencies = [ + "memchr", +] + +[[package]] name = "ansi_term" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -108,6 +117,18 @@ dependencies = [ ] [[package]] +name = "bstr" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] name = "bumpalo" version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -126,6 +147,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "130aac562c0dd69c56b3b1cc8ffd2e17be31d0b6c25b61c96b76231aa23e39e1" [[package]] +name = "cast" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0" +dependencies = [ + "rustc_version", +] + +[[package]] name = "cc" version = "1.0.54" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -224,6 +254,41 @@ dependencies = [ ] [[package]] +name = "criterion" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63f696897c88b57f4ffe3c69d8e1a0613c7d0e6c4833363c8560fbde9c47b966" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddeaf7989f00f2e1d871a26a110f3ed713632feac17f65f03ca938c542618b60" +dependencies = [ + "cast", + "itertools", +] + +[[package]] name = "crossbeam" version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -348,6 +413,28 @@ dependencies = [ ] [[package]] +name = "csv" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] name = "cursive" version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -867,6 +954,15 @@ dependencies = [ ] [[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + +[[package]] name = "itoa" version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1220,6 +1316,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" [[package]] +name = "oorandom" +version = "11.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" + +[[package]] name = "openssl" version = "0.10.29" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1407,6 +1509,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677" [[package]] +name = "plotters" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d1685fbe7beba33de0330629da9d955ac75bd54f33d7b79f9a895590124f6bb" +dependencies = [ + "js-sys", + "num-traits", + "wasm-bindgen", + "web-sys", +] + +[[package]] name = "ppv-lite86" version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1562,6 +1676,33 @@ dependencies = [ ] [[package]] +name = "regex" +version = "1.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + +[[package]] +name = "regex-automata" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", +] + +[[package]] +name = "regex-syntax" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8" + +[[package]] name = "remove_dir_all" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1619,12 +1760,30 @@ dependencies = [ ] [[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +dependencies = [ + "semver", +] + +[[package]] name = "ryu" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" [[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] name = "schannel" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1700,6 +1859,21 @@ dependencies = [ ] [[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +dependencies = [ + "semver-parser", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" + +[[package]] name = "serde" version = "1.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1808,6 +1982,7 @@ name = "so" version = "0.3.3" dependencies = [ "clap", + "criterion", "crossterm", "cursive", "directories", @@ -1818,6 +1993,7 @@ dependencies = [ "phf", "pulldown-cmark", "rayon", + "regex", "reqwest", "scraper", "serde", @@ -1991,6 +2167,15 @@ dependencies = [ ] [[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + +[[package]] name = "time" version = "0.1.43" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2001,6 +2186,16 @@ dependencies = [ ] [[package]] +name = "tinytemplate" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d3dc76004a03cec1c5932bca4cdc2e39aaa798e3f82363dd94f9adf6098c12f" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] name = "tokio" version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2161,6 +2356,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" [[package]] +name = "walkdir" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" +dependencies = [ + "same-file", + "winapi 0.3.8", + "winapi-util", +] + +[[package]] name = "want" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -2283,6 +2489,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi 0.3.8", +] + +[[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -15,7 +15,12 @@ edition = "2018" appveyor = { repository = "samtay/so", branch = "master", service = "github" } travis-ci = { repository = "samtay/so", branch = "master" } -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "parsing" +harness = false [dependencies] thiserror = "1.0" diff --git a/benches/parsing.rs b/benches/parsing.rs new file mode 100644 index 0000000..0bfe44c --- /dev/null +++ b/benches/parsing.rs @@ -0,0 +1,58 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper}; +use std::collections::HashMap; +use std::time::Duration; + +fn bench_parsers(c: &mut Criterion) { + let limit: u16 = 10; + let mut sites = HashMap::new(); + sites.insert( + String::from("stackoverflow"), + String::from("stackoverflow.com"), + ); + sites.insert(String::from("askubuntu"), String::from("askubuntu.com")); + + let mut group = c.benchmark_group("Scraping html"); + + group.sample_size(80); + group.measurement_time(Duration::from_secs(10)); + group.throughput(Throughput::Elements(limit as u64)); + + group.bench_with_input( + BenchmarkId::new("Google.parse", "exit-vim"), + include_str!("../test/google/exit-vim.html"), + |b, html| b.iter(|| Google.parse(html, &sites, limit)), + ); + + group.bench_with_input( + BenchmarkId::new("DuckDuckGo.parse", "exit-vim"), + include_str!("../test/duckduckgo/exit-vim.html"), + |b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)), + ); + + let mut sites = HashMap::new(); + sites.insert( + String::from("stackoverflow"), + String::from("stackoverflow.com"), + ); + + group.bench_with_input( + BenchmarkId::new("Google.parse", "/q/"), + include_str!("../test/google/parsing-q.html"), + |b, html| b.iter(|| Google.parse(html, &sites, limit)), + ); + + let mut sites = HashMap::new(); + sites.insert(String::from("meta"), String::from("meta.stackexchange.com")); + + group.bench_with_input( + BenchmarkId::new("DuckDuckGo.parse", "tagged"), + include_str!("../test/duckduckgo/tagged.html"), + |b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)), + ); + + group.finish(); +} + +criterion_group!(benches, bench_parsers); +criterion_main!(benches); diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e9ffa33 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,7 @@ +pub mod cli; +pub mod config; +pub mod error; +pub mod stackexchange; +pub mod term; +pub mod tui; +pub mod utils; diff --git a/src/stackexchange/mod.rs b/src/stackexchange/mod.rs index 2d4feb7..b0e1345 100644 --- a/src/stackexchange/mod.rs +++ b/src/stackexchange/mod.rs @@ -1,7 +1,8 @@ mod api; mod local_storage; -mod scraper; mod search; +// Exposed for benchmarking +pub mod scraper; pub use api::{Answer, Question}; pub use local_storage::LocalStorage; |