summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Sam Tay <sam.chong.tay@gmail.com> 2020-06-27 20:33:59 -0700
committer Sam Tay <sam.chong.tay@gmail.com> 2020-06-27 20:33:59 -0700
commit 307e1973e799ace3303184f40aaf5c205194fb33 (patch)
tree 6a8f853dcf34b5124f7185e8bf3f6941e6937ff0
parent e2814eeccecfb98bd899d2c69bee23cd57541880 (diff)
Add benchmarks for HTML parsing
-rw-r--r-- Cargo.lock 215
-rw-r--r-- Cargo.toml 7
-rw-r--r-- benches/parsing.rs 58
-rw-r--r-- src/lib.rs 7
-rw-r--r-- src/stackexchange/mod.rs 3
5 files changed, 288 insertions, 2 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 371cf62..cd04639 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -16,6 +16,15 @@ dependencies = [
]
[[package]]
+name = "aho-corasick"
+version = "0.7.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
name = "ansi_term"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -108,6 +117,18 @@ dependencies = [
]
[[package]]
+name = "bstr"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931"
+dependencies = [
+ "lazy_static",
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
+[[package]]
name = "bumpalo"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -126,6 +147,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "130aac562c0dd69c56b3b1cc8ffd2e17be31d0b6c25b61c96b76231aa23e39e1"
[[package]]
+name = "cast"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
+dependencies = [
+ "rustc_version",
+]
+
+[[package]]
name = "cc"
version = "1.0.54"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -224,6 +254,41 @@ dependencies = [
]
[[package]]
+name = "criterion"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63f696897c88b57f4ffe3c69d8e1a0613c7d0e6c4833363c8560fbde9c47b966"
+dependencies = [
+ "atty",
+ "cast",
+ "clap",
+ "criterion-plot",
+ "csv",
+ "itertools",
+ "lazy_static",
+ "num-traits",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddeaf7989f00f2e1d871a26a110f3ed713632feac17f65f03ca938c542618b60"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
name = "crossbeam"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -348,6 +413,28 @@ dependencies = [
]
[[package]]
+name = "csv"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279"
+dependencies = [
+ "bstr",
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
name = "cursive"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -867,6 +954,15 @@ dependencies = [
]
[[package]]
+name = "itertools"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
+dependencies = [
+ "either",
+]
+
+[[package]]
name = "itoa"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1220,6 +1316,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d"
[[package]]
+name = "oorandom"
+version = "11.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c"
+
+[[package]]
name = "openssl"
version = "0.10.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1407,6 +1509,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677"
[[package]]
+name = "plotters"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d1685fbe7beba33de0330629da9d955ac75bd54f33d7b79f9a895590124f6bb"
+dependencies = [
+ "js-sys",
+ "num-traits",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
name = "ppv-lite86"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1562,6 +1676,33 @@ dependencies = [
]
[[package]]
+name = "regex"
+version = "1.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+ "thread_local",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
+
+[[package]]
name = "remove_dir_all"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1619,12 +1760,30 @@ dependencies = [
]
[[package]]
+name = "rustc_version"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
+dependencies = [
+ "semver",
+]
+
+[[package]]
name = "ryu"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
name = "schannel"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1700,6 +1859,21 @@ dependencies = [
]
[[package]]
+name = "semver"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
+dependencies = [
+ "semver-parser",
+]
+
+[[package]]
+name = "semver-parser"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
+
+[[package]]
name = "serde"
version = "1.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1808,6 +1982,7 @@ name = "so"
version = "0.3.3"
dependencies = [
"clap",
+ "criterion",
"crossterm",
"cursive",
"directories",
@@ -1818,6 +1993,7 @@ dependencies = [
"phf",
"pulldown-cmark",
"rayon",
+ "regex",
"reqwest",
"scraper",
"serde",
@@ -1991,6 +2167,15 @@ dependencies = [
]
[[package]]
+name = "thread_local"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
name = "time"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2001,6 +2186,16 @@ dependencies = [
]
[[package]]
+name = "tinytemplate"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d3dc76004a03cec1c5932bca4cdc2e39aaa798e3f82363dd94f9adf6098c12f"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
name = "tokio"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2161,6 +2356,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
[[package]]
+name = "walkdir"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d"
+dependencies = [
+ "same-file",
+ "winapi 0.3.8",
+ "winapi-util",
+]
+
+[[package]]
name = "want"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2283,6 +2489,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
+name = "winapi-util"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
+dependencies = [
+ "winapi 0.3.8",
+]
+
+[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 6f8b245..b2180c3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,7 +15,12 @@ edition = "2018"
appveyor = { repository = "samtay/so", branch = "master", service = "github" }
travis-ci = { repository = "samtay/so", branch = "master" }
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+[dev-dependencies]
+criterion = "0.3"
+
+[[bench]]
+name = "parsing"
+harness = false
[dependencies]
thiserror = "1.0"
diff --git a/benches/parsing.rs b/benches/parsing.rs
new file mode 100644
index 0000000..0bfe44c
--- /dev/null
+++ b/benches/parsing.rs
@@ -0,0 +1,58 @@
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
+use so::stackexchange::scraper::{DuckDuckGo, Google, Scraper};
+use std::collections::HashMap;
+use std::time::Duration;
+
+fn bench_parsers(c: &mut Criterion) {
+ let limit: u16 = 10;
+ let mut sites = HashMap::new();
+ sites.insert(
+ String::from("stackoverflow"),
+ String::from("stackoverflow.com"),
+ );
+ sites.insert(String::from("askubuntu"), String::from("askubuntu.com"));
+
+ let mut group = c.benchmark_group("Scraping html");
+
+ group.sample_size(80);
+ group.measurement_time(Duration::from_secs(10));
+ group.throughput(Throughput::Elements(limit as u64));
+
+ group.bench_with_input(
+ BenchmarkId::new("Google.parse", "exit-vim"),
+ include_str!("../test/google/exit-vim.html"),
+ |b, html| b.iter(|| Google.parse(html, &sites, limit)),
+ );
+
+ group.bench_with_input(
+ BenchmarkId::new("DuckDuckGo.parse", "exit-vim"),
+ include_str!("../test/duckduckgo/exit-vim.html"),
+ |b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)),
+ );
+
+ let mut sites = HashMap::new();
+ sites.insert(
+ String::from("stackoverflow"),
+ String::from("stackoverflow.com"),
+ );
+
+ group.bench_with_input(
+ BenchmarkId::new("Google.parse", "/q/"),
+ include_str!("../test/google/parsing-q.html"),
+ |b, html| b.iter(|| Google.parse(html, &sites, limit)),
+ );
+
+ let mut sites = HashMap::new();
+ sites.insert(String::from("meta"), String::from("meta.stackexchange.com"));
+
+ group.bench_with_input(
+ BenchmarkId::new("DuckDuckGo.parse", "tagged"),
+ include_str!("../test/duckduckgo/tagged.html"),
+ |b, html| b.iter(|| DuckDuckGo.parse(html, &sites, limit)),
+ );
+
+ group.finish();
+}
+
+criterion_group!(benches, bench_parsers);
+criterion_main!(benches);
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..e9ffa33
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,7 @@
+pub mod cli;
+pub mod config;
+pub mod error;
+pub mod stackexchange;
+pub mod term;
+pub mod tui;
+pub mod utils;
diff --git a/src/stackexchange/mod.rs b/src/stackexchange/mod.rs
index 2d4feb7..b0e1345 100644
--- a/src/stackexchange/mod.rs
+++ b/src/stackexchange/mod.rs
@@ -1,7 +1,8 @@
mod api;
mod local_storage;
-mod scraper;
mod search;
+// Exposed for benchmarking
+pub mod scraper;
pub use api::{Answer, Question};
pub use local_storage::LocalStorage;