From 0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba Mon Sep 17 00:00:00 2001 From: Sam Tay Date: Mon, 22 Jun 2020 21:59:20 -0700 Subject: Add duckduckgo search engine --- Cargo.lock | 311 ++++++++- Cargo.toml | 5 +- TODO.md | 51 +- roadmap.md | 1 + src/cli.rs | 26 +- src/config.rs | 13 +- src/error.rs | 4 +- src/main.rs | 27 +- src/stackexchange.rs | 540 ++++++++++---- src/tui/app.rs | 54 -- src/tui/markdown.rs | 6 +- test/bad-user-agent.html | 1 + test/exit-vim.html | 1745 ++++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 2549 insertions(+), 235 deletions(-) create mode 100644 test/bad-user-agent.html create mode 100644 test/exit-vim.html diff --git a/Cargo.lock b/Cargo.lock index 3d8f148..e87755a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -113,6 +113,12 @@ version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + [[package]] name = "bytes" version = "0.5.4" @@ -314,6 +320,33 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "matches", + "phf", + "proc-macro2", + "quote", + "smallvec", + "syn", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "cursive" version = "0.15.0" @@ -331,8 +364,10 @@ dependencies = [ "log", "maplit", "ncurses", + "pancurses", "signal-hook", "term_size", + "termion", "unicode-segmentation", "unicode-width", ] @@ -395,6 +430,17 @@ dependencies = [ "syn", ] +[[package]] +name = "derive_more" +version = "0.99.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc655351f820d774679da6cdc23355a93de496867d8203496675162e17b1d671" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "directories" version = "2.0.2" @@ -423,6 +469,21 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4358a9e11b9a09cf52383b451b49a169e8d797b68aa02301ff586d70d9661ea3" +[[package]] +name = "dtoa-short" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" + [[package]] name = "either" version = "1.5.3" @@ -530,6 +591,16 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" +[[package]] +name = "futf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.5" @@ -625,6 +696,24 
@@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.1.14" @@ -664,6 +753,20 @@ dependencies = [ "libc", ] +[[package]] +name = "html5ever" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "http" version = "0.2.1" @@ -824,12 +927,35 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "maplit" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "markup5ever" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" +dependencies = [ + "log", + "phf", + "phf_codegen", + "serde", + "serde_derive", + "serde_json", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matches" version = "0.1.8" @@ -995,6 +1121,18 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "num" version = "0.2.1" @@ -1069,6 +1207,12 @@ dependencies = [ "libc", ] +[[package]] +name = "numtoa" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" + [[package]] name = "once_cell" version = "1.4.0" @@ -1117,6 +1261,19 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "pancurses" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3058bc37c433096b2ac7afef1c5cdfae49ede0a4ffec3dfc1df1df0959d0ff0" +dependencies = [ + "libc", + "log", + "ncurses", + "pdcurses-sys", + "winreg 0.5.1", +] + [[package]] name = "parking_lot" version = "0.10.2" @@ -1141,6 +1298,16 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "pdcurses-sys" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084dd22796ff60f1225d4eb6329f33afaf4c85419d51d440ab6b8c6f4529166b" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1158,6 +1325,16 @@ dependencies = [ "proc-macro-hack", ] +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + [[package]] name = "phf_generator" version = "0.8.0" @@ -1235,6 +1412,12 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro-hack" version = "0.5.16" @@ -1358,6 +1541,15 @@ version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +dependencies = [ + "redox_syscall", +] + [[package]] name = "redox_users" version = "0.3.4" @@ -1411,7 +1603,7 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "winreg", + "winreg 0.7.0", ] [[package]] @@ -1448,6 +1640,22 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scraper" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "matches", + "selectors", + "smallvec", + "tendril", +] + [[package]] name = "security-framework" version = "0.4.4" @@ -1471,6 +1679,26 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "matches", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", + "thin-slice", +] + [[package]] name = "serde" version = "1.0.111" @@ -1526,6 +1754,16 @@ dependencies = [ "yaml-rust", ] +[[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + [[package]] name = "signal-hook" version = "0.1.15" @@ -1576,10 +1814,12 @@ dependencies = [ "futures", "lazy_static", "minimad", + "percent-encoding", "phf", "pulldown-cmark", "rayon", "reqwest", + "scraper", "serde", "serde_json", "serde_yaml", @@ -1607,6 +1847,31 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" +[[package]] +name = "string_cache" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a" +dependencies = [ + "lazy_static", + "new_debug_unreachable", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "strsim" version = "0.8.0" @@ -1644,6 +1909,17 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "tendril" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "term_size" version = "0.3.2" @@ -1667,6 +1943,18 @@ dependencies = [ "thiserror", ] +[[package]] +name = "termion" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c22cec9d8978d906be5ac94bceb5a010d885c626c4c8855721a4dbd20e3ac905" +dependencies = [ + "libc", + "numtoa", + "redox_syscall", + "redox_termios", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -1676,6 +1964,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + [[package]] name = "thiserror" version = "1.0.19" @@ -1842,6 +2136,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "utf-8" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + [[package]] name = "vcpkg" version = "0.2.9" @@ -1988,6 +2288,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "winreg" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27a759395c1195c4cc5cda607ef6f8f6498f64e78f7900f5de0a127a424704a" +dependencies = [ + "winapi 0.3.8", +] + [[package]] name = "winreg" version = "0.7.0" diff --git a/Cargo.toml b/Cargo.toml index 25d5708..6443c85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,10 +15,13 @@ serde_json = "1.0" serde_yaml = "0.8" reqwest = { version = "0.10", features = ["gzip", "json"] } -futures = "0.3" tokio = { version = "0.2", features = ["full"] } +futures = "0.3" rayon = "1.3" +percent-encoding = "2.1" +scraper = "0.12" + lazy_static = "1.4" minimad = "0.6" termimad = "0.8" diff --git a/TODO.md b/TODO.md index 056472f..6b53cb2 100644 --- a/TODO.md +++ b/TODO.md @@ -1,11 +1,22 @@ # TODO ### v0.3.0 -1. Duckduck go search ftw, e.g. -``` -(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux -``` -etc. +1. Keep relevance ordering !!! + +### v0.3.1 +1. Much of the code can be reused for google: + * parsing href after `"url="` (similar to uddg) + * formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux` + So make a `Scraper` trait and implement it for DDG & Google. Then + `stackexchange` can just code against `Scraper` and choose based on + `--search-engine | -e' argument` +2. Maybe reorganize to + - stackexchange + - api + - scraper + + + ### Endless future improvements for the TUI 1. Init with smaller layout depending on initial screen size. @@ -20,23 +31,29 @@ etc. ### resources for later -#### async -1. start with [this](http://patshaughnessy.net/2020/1/20/downloading-100000-files-using-async-rust) but also see the following gist and thread through the below links to make sure its actually async.. -0. 
breakdown of futures+reqwest [here](https://stackoverflow.com/questions/51044467/how-can-i-perform-parallel-asynchronous-http-get-requests-with-reqwest) -0. general concurrency in rust [info](https://blog.yoshuawuyts.com/streams-concurrency/) -0. [Intro to async rust](http://jamesmcm.github.io/blog/2020/05/06/a-practical-introduction-to-async-programming-in-rust/) -1. Async API calls [tokio](https://stackoverflow.com/a/57770687) -2. Parallel calls against multiple sites [vid](https://www.youtube.com/watch?v=O-LagKc0MPA) -0. OR JUST THREADS [see here](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) - #### scraping 6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html)) +```python +# if necessary, choose one of these to mimic browswer request +USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', + ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' + 'Chrome/19.0.1084.46 Safari/536.5'), + ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' + 'Safari/536.5'), ) + +# checks for search engine blocks +BLOCK_INDICATORS = ( + 'form id="captcha-form"', + 'This page appears when Google automatically detects requests coming from your computer ' + 'network which appear to be in violation of the Terms of Service' +) +``` + #### distribution 1. oh game over [dawg](https://github.com/japaric/trust) -2. also, use [feature flags]() to select backend. Only use crossterm on Windows - since it is rather jumpy... - #### ideas 5. Add sort option, e.g. relevance|votes|date diff --git a/roadmap.md b/roadmap.md index efe6dfe..b5d8d36 100644 --- a/roadmap.md +++ b/roadmap.md @@ -33,3 +33,4 @@ [ ] add duckduckgo logo to readme [ ] per platform package mgmt [ ] more testing +[ ] maybe add google engine too. but fuck google. 
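
The v0.3.1 item in TODO.md above proposes a `Scraper` trait implemented by both DuckDuckGo and Google (parsing hrefs after `"url="` for Google, `uddg` for DuckDuckGo), with the backend chosen by a `--search-engine | -e` argument, so that the `stackexchange` layer only codes against the trait. The sketch below is a minimal illustration of that idea; the type names, the dispatch function, and the unencoded URL formatting are assumptions for illustration, not code from this patch.

```rust
use std::collections::HashMap;

/// Question ids grouped by SE site code, e.g. {"stackoverflow": ["11828270"]}.
type ScrapedIds = HashMap<String, Vec<String>>;

/// A search-engine backend. DuckDuckGo and Google would both implement this,
/// and `StackExchange` would only ever talk to the trait object.
trait Scraper {
    /// Full results-page URL for `query`, restricted to the given SE site URLs.
    fn search_url(&self, query: &str, site_urls: &[String]) -> String;
    /// Scrape question ids (in relevance order) out of the results HTML.
    fn parse_question_ids(&self, html: &str, limit: usize) -> ScrapedIds;
}

struct DuckDuckGo;

impl Scraper for DuckDuckGo {
    fn search_url(&self, query: &str, site_urls: &[String]) -> String {
        let restriction = site_urls
            .iter()
            .map(|s| format!("site:{}", s))
            .collect::<Vec<_>>()
            .join(" OR ");
        // The real implementation should percent-encode, e.g. via Url::parse_with_params.
        format!("https://duckduckgo.com/?q=({}) {}&kz=-1&kh=-1", restriction, query)
    }

    fn parse_question_ids(&self, _html: &str, _limit: usize) -> ScrapedIds {
        // Would reuse the `a.result__a` selector and `uddg=` decoding added in
        // src/stackexchange.rs; a Google impl would instead pull hrefs after "url=".
        ScrapedIds::new()
    }
}

/// Hypothetical dispatch on the proposed `--search-engine | -e` flag.
fn scraper_for(engine: &str) -> Box<dyn Scraper> {
    match engine {
        // "google" => Box::new(Google),
        _ => Box::new(DuckDuckGo),
    }
}
```

With such a trait in place, the `--duckduckgo`/`--no-duckduckgo` plumbing added in cli.rs and config.rs below could collapse into a single engine setting.
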
diff --git a/src/cli.rs b/src/cli.rs index 7d946d0..7abde66 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -58,7 +58,7 @@ pub fn get_opts() -> Result { .takes_value(true) .default_value(limit) .validator(|s| s.parse::().map_err(|e| e.to_string()).map(|_| ())) - .help("Question limit per site query"), + .help("Question limit"), ) .arg( Arg::with_name("lucky") @@ -69,7 +69,8 @@ pub fn get_opts() -> Result { Arg::with_name("no-lucky") .long("no-lucky") .help("Disable lucky") - .conflicts_with("lucky"), + .conflicts_with("lucky") + .hidden(!config.lucky), ) .arg( Arg::with_name("query") @@ -77,12 +78,32 @@ pub fn get_opts() -> Result { .index(1) .required_unless_one(&["list-sites", "update-sites", "set-api-key"]), ) + .arg( + Arg::with_name("duckduckgo") + .long("duckduckgo") + .help("Use DuckDuckGo as a search engine"), + ) + .arg( + Arg::with_name("no-duckduckgo") + .long("no-duckduckgo") + .help("Disable duckduckgo") + .conflicts_with("duckduckgo") + .hidden(!config.duckduckgo), + ) .get_matches(); let lucky = match (matches.is_present("lucky"), matches.is_present("no-lucky")) { (true, _) => true, (_, true) => false, _ => config.lucky, }; + let duckduckgo = match ( + matches.is_present("duckduckgo"), + matches.is_present("no-duckduckgo"), + ) { + (true, _) => true, + (_, true) => false, + _ => config.duckduckgo, + }; Ok(Opts { list_sites: matches.is_present("list-sites"), update_sites: matches.is_present("update-sites"), @@ -105,6 +126,7 @@ pub fn get_opts() -> Result { .map(String::from) .or(config.api_key), lucky, + duckduckgo, }, }) } diff --git a/src/config.rs b/src/config.rs index c86e0ad..79cbd74 100644 --- a/src/config.rs +++ b/src/config.rs @@ -13,6 +13,7 @@ pub struct Config { pub limit: u16, pub lucky: bool, pub sites: Vec, + pub duckduckgo: bool, } // TODO make a friender config file, like the colors.toml below @@ -23,6 +24,7 @@ impl Default for Config { limit: 20, lucky: true, sites: vec![String::from("stackoverflow")], + duckduckgo: true, } } } @@ -33,13 +35,22 @@ pub fn user_config() -> Result { let dir = project.config_dir(); fs::create_dir_all(&dir)?; let filename = config_file_name()?; + match utils::open_file(&filename)? 
{ None => { let def = Config::default(); write_config(&def)?; Ok(def) } - Some(file) => serde_yaml::from_reader(file).map_err(|_| Error::MalformedFile(filename)), + Some(file) => serde_yaml::from_reader(file) + .map_err(|_| Error::MalformedFile(filename.clone())) + .and_then(|cfg: Config| { + if cfg.sites.is_empty() { + Err(Error::MalformedFile(filename)) + } else { + Ok(cfg) + } + }), } } diff --git a/src/error.rs b/src/error.rs index d104594..53ba23c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -24,10 +24,10 @@ pub enum Error { Permissions(PermissionType, PathBuf), #[error("{0}")] StackExchange(String), + #[error("{0}")] + ScrapingError(String), #[error("Couldn't find a suitable project directory; is your OS supported?")] ProjectDir, - #[error("Empty sites file in cache")] - EmptySites, #[error("Sorry, couldn't find any answers for your query")] NoResults, } diff --git a/src/main.rs b/src/main.rs index f753cd1..afd6c21 100644 --- a/src/main.rs +++ b/src/main.rs @@ -35,13 +35,7 @@ fn main() -> Result<()> { }) .or_else(|e: Error| { // Handle errors - print_error(&e.to_string())?; - match e { - Error::EmptySites => { - print_notice!(skin, "This can likely be fixed by `so --update-sites`.") - } - _ => Ok(()), - } + print_error(&e.to_string()) }) } @@ -52,23 +46,19 @@ async fn run(skin: &mut MadSkin) -> Result>>> { let config = opts.config; let sites = &config.sites; let lucky = config.lucky; - let mut ls = LocalStorage::new()?; + + let ls = LocalStorage::new(opts.update_sites).await?; if let Some(key) = opts.set_api_key { config::set_api_key(key)?; } - if opts.update_sites { - ls.update_sites().await?; - } - if opts.list_sites { - let sites = ls.sites().await?; let mut md = String::new(); md.push_str("|:-:|:-:|\n"); md.push_str("|Site Code|Site URL|\n"); md.push_str("|-:|:-|\n"); - for s in sites.iter() { + for s in ls.sites.iter() { md.push_str(&format!("|{}|{}\n", s.api_site_parameter, s.site_url)); } md.push_str("|-\n"); @@ -76,7 +66,7 @@ async fn run(skin: &mut MadSkin) -> Result>>> { return Ok(None); } - if let Some(site) = ls.find_invalid_site(sites).await? { + if let Some(site) = ls.find_invalid_site(sites).await { print_error!(skin, "$0 is not a valid StackExchange site.\n\n", site)?; // TODO should only use inline for single lines; use termimad::text stuff print_notice!( @@ -92,20 +82,19 @@ async fn run(skin: &mut MadSkin) -> Result>>> { } if let Some(q) = opts.query { - let se = StackExchange::new(config, q); + let mut se = StackExchange::new(config, ls, q); if lucky { - // TODO this needs preprocessing; all the more reason to do it at SE level let md = se.search_lucky().await?; skin.print_text(&md); skin.print_text("\nPress **[SPACE]** to see more results, or any other key to exit"); // Kick off the rest of the search in the background - let qs = task::spawn(async move { se.search().await }); + let qs = task::spawn(async move { se.search_md().await }); if !utils::wait_for_char(' ')? 
{ return Ok(None); } return Ok(Some(qs.await.unwrap()?)); } else { - return Ok(Some(se.search().await?)); + return Ok(Some(se.search_md().await?)); } } Ok(None) diff --git a/src/stackexchange.rs b/src/stackexchange.rs index 1d4789a..2939c29 100644 --- a/src/stackexchange.rs +++ b/src/stackexchange.rs @@ -1,8 +1,13 @@ use futures::stream::StreamExt; +use percent_encoding::percent_decode_str; use rayon::prelude::*; +use reqwest::header; use reqwest::Client; use reqwest::Url; +use scraper::html::Html; +use scraper::selector::Selector; use serde::{Deserialize, Serialize}; +use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fs; use std::path::PathBuf; @@ -13,7 +18,11 @@ use crate::tui::markdown; use crate::tui::markdown::Markdown; use crate::utils; +/// DuckDuckGo URL +const DUCKDUCKGO_URL: &str = "https://duckduckgo.com"; + /// StackExchange API v2.2 URL +// TODO why not https? const SE_API_URL: &str = "http://api.stackexchange.com"; const SE_API_VERSION: &str = "2.2"; @@ -28,6 +37,11 @@ const SE_SITES_PAGESIZE: u16 = 10000; /// Limit on concurrent requests (gets passed to `buffer_unordered`) const CONCURRENT_REQUESTS_LIMIT: usize = 8; +/// Mock user agent to get real DuckDuckGo results +// TODO copy other user agents and use random one each time +const USER_AGENT: &str = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0"; + /// This structure allows interacting with parts of the StackExchange /// API, using the `Config` struct to determine certain API settings and options. // TODO should my se structs have &str instead of String? @@ -35,13 +49,13 @@ const CONCURRENT_REQUESTS_LIMIT: usize = 8; pub struct StackExchange { client: Client, config: Config, + sites: HashMap, query: String, } /// This structure allows interacting with locally cached StackExchange metadata. pub struct LocalStorage { - sites: Option>, - filename: PathBuf, + pub sites: Vec, } #[derive(Deserialize, Serialize, Debug)] @@ -84,24 +98,36 @@ struct ResponseWrapper { } impl StackExchange { - pub fn new(config: Config, query: String) -> Self { + pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self { let client = Client::new(); StackExchange { client, + sites: local_storage.get_urls(&config.sites), config, query, } } - /// Search query at stack exchange and get the top answer body + /// Search query and get the top answer body /// - /// For now, use only the first configured site, since, parodoxically, sites - /// with the worst results will finish executing first, since there's less - /// data to retrieve. - pub async fn search_lucky(&self) -> Result { - Ok(self - .search_advanced_site(self.config.sites.iter().next().unwrap(), 1) - .await? + /// For StackExchange engine, use only the first configured site, + /// since, parodoxically, sites with the worst results will finish + /// executing first, because there's less data to retrieve. + /// + /// Needs mut because it temporarily changes self.config + pub async fn search_lucky(&mut self) -> Result { + let original_config = self.config.clone(); + // Temp set lucky config + self.config.limit = 1; + if !self.config.duckduckgo { + self.config.sites.truncate(1); + } + // Run search with temp config + let result = self.search().await; + // Reset config + self.config = original_config; + + Ok(result? .into_iter() .next() .ok_or(Error::NoResults)? 
@@ -112,19 +138,71 @@ impl StackExchange { .body) } - /// Search query at stack exchange and get a list of relevant questions - pub async fn search(&self) -> Result>> { - self.search_advanced(self.config.limit).await + /// Search and parse to Markdown for TUI + pub async fn search_md(&self) -> Result>> { + Ok(parse_markdown(self.search().await?)) + } + + /// Search query and get a list of relevant questions + pub async fn search(&self) -> Result>> { + if self.config.duckduckgo { + self.search_duckduck_go().await + } else { + // TODO after duckduck go finished, refactor to _not_ thread this limit, its unnecessary + self.se_search_advanced(self.config.limit).await + } } - /// Parallel searches against the search/advanced endpoint across all configured sites - async fn search_advanced(&self, limit: u16) -> Result>> { + /// Search query at duckduckgo and then fetch the resulting questions from SE. + async fn search_duckduck_go(&self) -> Result>> { + let url = duckduckgo_url(&self.query, self.sites.values()); + let html = self + .client + .get(url) + .header(header::USER_AGENT, USER_AGENT) + .send() + .await? + .text() + .await?; + let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?; + self.se_questions(ids).await + } + + /// Parallel searches against the SE question endpoint across the sites in `ids`. + // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions + async fn se_questions( + &self, + ids: HashMap>, + ) -> Result>> { + futures::stream::iter(ids) + .map(|(site, ids)| { + let clone = self.clone(); + tokio::spawn(async move { + let clone = &clone; + clone.se_questions_site(&site, ids).await + }) + }) + .buffer_unordered(CONCURRENT_REQUESTS_LIMIT) + .collect::>() + .await + .into_iter() + .map(|r| r.map_err(Error::from).and_then(|x| x)) + .collect::>>>() + .map(|v| { + let qs: Vec> = v.into_iter().flatten().collect(); + // TODO sort by original ordering ! + qs + }) + } + + /// Parallel searches against the SE search/advanced endpoint across all configured sites + async fn se_search_advanced(&self, limit: u16) -> Result>> { futures::stream::iter(self.config.sites.clone()) .map(|site| { let clone = self.clone(); tokio::spawn(async move { let clone = &clone; - clone.search_advanced_site(&site, limit).await + clone.se_search_advanced_site(&site, limit).await }) }) .buffer_unordered(CONCURRENT_REQUESTS_LIMIT) @@ -138,18 +216,45 @@ impl StackExchange { if self.config.sites.len() > 1 { qs.sort_unstable_by_key(|q| -q.score); } - Self::parse_markdown(qs) + qs }) } - /// Search against the site's search/advanced endpoint with a given query. + /// Search against the SE site's /questions/{ids} endpoint. + /// Filters out questions with no answers. + async fn se_questions_site( + &self, + site: &str, + ids: Vec, + ) -> Result>> { + let total = ids.len().to_string(); + let endpoint = format!("questions/{ids}", ids = ids.join(";")); + let qs = self + .client + .get(stackexchange_url(&endpoint)) + .header("Accepts", "application/json") + .query(&self.get_default_se_opts()) + .query(&[("site", site), ("pagesize", &total), ("page", "1")]) + .send() + .await? + .json::>>() + .await? + .items; + Ok(Self::preprocess(qs)) + } + + /// Search against the SE site's /search/advanced endpoint with a given query. /// Only fetches questions that have at least one answer. 
- async fn search_advanced_site(&self, site: &str, limit: u16) -> Result>> { + async fn se_search_advanced_site( + &self, + site: &str, + limit: u16, + ) -> Result>> { let qs = self .client .get(stackexchange_url("search/advanced")) .header("Accepts", "application/json") - .query(&self.get_default_opts()) + .query(&self.get_default_se_opts()) .query(&[ ("q", self.query.as_str()), ("pagesize", &limit.to_string()), @@ -167,7 +272,7 @@ impl StackExchange { Ok(Self::preprocess(qs)) } - fn get_default_opts(&self) -> HashMap<&str, &str> { + fn get_default_se_opts(&self) -> HashMap<&str, &str> { let mut params = HashMap::new(); params.insert("filter", SE_FILTER); if let Some(key) = &self.config.api_key { @@ -178,155 +283,146 @@ impl StackExchange { /// Sorts answers by score /// Preprocess SE markdown to "cmark" markdown (or something closer to it) + /// This markdown preprocess _always_ happens. fn preprocess(qs: Vec>) -> Vec> { - qs.par_iter() + qs.into_par_iter() .map(|q| { - let Question { - id, - score, - title, - answers, - body, - } = q; - answers.to_vec().par_sort_unstable_by_key(|a| -a.score); + let mut answers = q.answers; + answers.par_sort_unstable_by_key(|a| -a.score); let answers = answers - .par_iter() + .into_par_iter() .map(|a| Answer { body: markdown::preprocess(a.body.clone()), - ..*a + ..a }) .collect(); Question { answers, - body: markdown::preprocess(body.to_string()), - id: *id, - score: *score, - title: title.to_string(), + body: markdown::preprocess(q.body), + ..q } }) .collect::>() } +} - /// Parse all markdown fields - fn parse_markdown(qs: Vec>) -> Vec> { - qs.par_iter() - .map(|q| { - let Question { - id, - score, - title, - answers, - body, - } = q; - let body = markdown::parse(body); - let answers = answers - .par_iter() - .map(|a| { - let Answer { - id, - score, - is_accepted, - body, - } = a; - let body = markdown::parse(body); - Answer { - body, - id: *id, - score: *score, - is_accepted: *is_accepted, - } - }) - .collect::>(); - Question { - body, - answers, - id: *id, - score: *score, - title: title.to_string(), - } - }) - .collect::>() - } +/// Parse all markdown fields +/// This only happens for content going into the cursive TUI (not lucky prompt) +fn parse_markdown(qs: Vec>) -> Vec> { + qs.into_par_iter() + .map(|q| { + let body = markdown::parse(q.body); + let answers = q + .answers + .into_par_iter() + .map(|a| { + let body = markdown::parse(a.body); + Answer { + body, + id: a.id, + score: a.score, + is_accepted: a.is_accepted, + } + }) + .collect::>(); + Question { + body, + answers, + id: q.id, + score: q.score, + title: q.title, + } + }) + .collect::>() } impl LocalStorage { - pub fn new() -> Result { - let project = project_dir()?; - let dir = project.cache_dir(); - fs::create_dir_all(&dir)?; - Ok(LocalStorage { - sites: None, - filename: dir.join("sites.json"), - }) + fn fetch_local_sites(filename: &PathBuf) -> Result>> { + if let Some(file) = utils::open_file(filename)? { + return serde_json::from_reader(file) + .map_err(|_| Error::MalformedFile(filename.clone())); + } + Ok(None) } - // TODO inform user if we are downloading - pub async fn sites(&mut self) -> Result<&Vec> { - if self.sites.is_none() && !self.fetch_local_sites()? { - self.fetch_remote_sites().await?; - } - match &self.sites { - Some(sites) if sites.is_empty() => Err(Error::EmptySites), - Some(sites) => Ok(sites), - None => panic!("Code failure in site listing retrieval"), + // TODO decide whether or not I should give LocalStorage an api key.. 
+ async fn fetch_remote_sites() -> Result> { + let se_sites = Client::new() + .get(stackexchange_url("sites")) + .header("Accepts", "application/json") + .query(&[ + ("pagesize", SE_SITES_PAGESIZE.to_string()), + ("page", "1".to_string()), + ]) + .send() + .await? + .json::>() + .await? + .items; + Ok(se_sites + .into_par_iter() + .map(|site| { + let site_url = site.site_url.trim_start_matches("https://").to_string(); + Site { site_url, ..site } + }) + .collect()) + } + + fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> { + let file = utils::create_file(filename)?; + serde_json::to_writer(file, sites)?; + Ok(()) + } + + async fn init_sites(filename: &PathBuf, update: bool) -> Result> { + if !update { + if let Some(sites) = Self::fetch_local_sites(filename)? { + return Ok(sites); + } } + let sites = Self::fetch_remote_sites().await?; + Self::store_local_sites(filename, &sites)?; + Ok(sites) } - pub async fn update_sites(&mut self) -> Result<()> { - self.fetch_remote_sites().await + pub async fn new(update: bool) -> Result { + let project = project_dir()?; + let dir = project.cache_dir(); + fs::create_dir_all(&dir)?; + let sites_filename = dir.join("sites.json"); + let sites = Self::init_sites(&sites_filename, update).await?; + Ok(LocalStorage { sites }) } // TODO is this HM worth it? Probably only will ever have < 10 site codes to search... + // TODO store this as Option on self if other methods use it... pub async fn find_invalid_site<'a, 'b>( - &'b mut self, + &'b self, site_codes: &'a [String], - ) -> Result> { + ) -> Option<&'a String> { let hm: HashMap<&str, ()> = self - .sites() - .await? + .sites .iter() .map(|site| (site.api_site_parameter.as_str(), ())) .collect(); - Ok(site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))) + site_codes.iter().find(|s| !hm.contains_key(&s.as_str())) } - fn fetch_local_sites(&mut self) -> Result { - match utils::open_file(&self.filename)? { - Some(file) => { - self.sites = serde_json::from_reader(file) - .map_err(|_| Error::MalformedFile(self.filename.clone()))?; - Ok(true) - } - None => Ok(false), - } - } - - // TODO decide whether or not I should give LocalStorage an api key.. - async fn fetch_remote_sites(&mut self) -> Result<()> { - self.sites = Some( - Client::new() - .get(stackexchange_url("sites")) - .header("Accepts", "application/json") - .query(&[ - ("pagesize", SE_SITES_PAGESIZE.to_string()), - ("page", "1".to_string()), - ]) - .send() - .await? - .json::>() - .await? - .items, - ); - self.store_local_sites() - } - - fn store_local_sites(&self) -> Result<()> { - let file = utils::create_file(&self.filename)?; - Ok(serde_json::to_writer(file, &self.sites)?) 
+ pub fn get_urls(&self, site_codes: &[String]) -> HashMap { + self.sites + .iter() + .filter_map(move |site| { + let _ = site_codes + .iter() + .find(|&sc| *sc == site.api_site_parameter)?; + Some((site.api_site_parameter.to_owned(), site.site_url.to_owned())) + }) + .collect() } } -/// Creates stackexchange API url given endpoint; can technically panic +/// Creates stackexchange API url given endpoint +// TODO lazy static this url parse fn stackexchange_url(path: &str) -> Url { let mut url = Url::parse(SE_API_URL).unwrap(); url.path_segments_mut() @@ -336,6 +432,108 @@ fn stackexchange_url(path: &str) -> Url { url } +/// Creates duckduckgo search url given sites and query +/// See https://duckduckgo.com/params for more info +fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url +where + I: IntoIterator, +{ + let mut q = String::new(); + // Restrict to sites + q.push('('); + q.push_str( + sites + .into_iter() + .map(|site| String::from("site:") + site) + .collect::>() + .join(" OR ") + .as_str(), + ); + q.push_str(") "); + // Search terms + q.push_str( + query + .trim_end_matches('?') + .split_whitespace() + .collect::>() + .join(" ") + .as_str(), + ); + Url::parse_with_params( + DUCKDUCKGO_URL, + &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")], + ) + .unwrap() +} + +/// Parse (site, question_id) pairs out of duckduckgo search results html +/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order ! +/// maybe this is as simple as a HashMap {qid: ordinal} +fn parse_questions_from_ddg_html<'a>( + html: &'a str, + sites: &'a HashMap, + limit: u16, +) -> Result>> { + let fragment = Html::parse_document(html); + let anchors = Selector::parse("a.result__a").unwrap(); + let mut qids: HashMap> = HashMap::new(); + let mut count = 0; + for anchor in fragment.select(&anchors) { + let url = anchor + .value() + .attr("href") + .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string())) + .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?; + sites + .iter() + .find_map(|(site_code, site_url)| { + let id = question_url_to_id(site_url, &url)?; + match qids.entry(site_code.to_owned()) { + Entry::Occupied(mut o) => o.get_mut().push(id), + Entry::Vacant(o) => { + o.insert(vec![id]); + } + } + count += 1; + Some(()) + }) + .ok_or_else(|| { + Error::ScrapingError( + "Duckduckgo returned results outside of SE network".to_string(), + ) + })?; + if count >= limit as usize { + break; + } + } + // It doesn't seem possible for DDG to return no results, so assume this is + // a bad user agent + if count == 0 { + Err(Error::ScrapingError(String::from( + "DuckDuckGo blocked this request", + ))) + } else { + Ok(qids) + } +} + +/// For example +/// ``` +/// let id = "stackoverflow.com"; +/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; +/// assert_eq!(question_url_to_id(site_url, input), "11828270") +/// ``` +fn question_url_to_id(site_url: &str, input: &str) -> Option { + // TODO use str_prefix once its stable + let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/"; + let ix = input.find(&fragment)? 
+ fragment.len(); + let input = &input[ix..]; + let end = input.find('/')?; + Some(input[0..end].to_string()) +} + +// TODO figure out a query that returns no results so that I can test it and differentiate it from +// a blocked request #[cfg(test)] mod tests { use super::*; @@ -346,4 +544,76 @@ mod tests { "http://api.stackexchange.com/2.2/some/endpoint" ) } + + #[test] + fn test_duckduckgo_url() { + let q = "how do I exit vim?"; + let sites = vec![ + String::from("stackoverflow.com"), + String::from("unix.stackexchange.com"), + ]; + assert_eq!( + duckduckgo_url(q, &sites).as_str(), + String::from( + "https://duckduckgo.com/\ + ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\ + +how+do+I+exit+vim&kz=-1&kh=-1" + ) + ) + } + + #[test] + fn test_duckduckgo_response() { + // TODO make sure results are either 1) answers 2) failed connection 3) blocked + } + + #[test] + fn test_duckduckgo_parser() { + let html = include_str!("../test/exit-vim.html"); + let sites = vec![ + ("stackoverflow", "stackoverflow.com"), + ("askubuntu", "askubuntu.com"), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect::>(); + let mut expected_question_ids = HashMap::new(); + expected_question_ids.insert( + "stackoverflow".to_string(), + vec!["11828270".to_string(), "9171356".to_string()], + ); + expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]); + assert_eq!( + parse_questions_from_ddg_html(html, &sites, 3).unwrap(), + expected_question_ids + ); + } + + #[test] + fn test_duckduckgo_blocker() -> Result<(), String> { + let html = include_str!("../test/bad-user-agent.html"); + let mut sites = HashMap::new(); + sites.insert( + String::from("stackoverflow"), + String::from("stackoverflow.com"), + ); + + match parse_questions_from_ddg_html(html, &sites, 2) { + Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => { + Ok(()) + } + _ => Err(String::from("Failed to detect DuckDuckGo blocker")), + } + } + + #[test] + fn test_question_url_to_id() { + let site_url = "stackoverflow.com"; + let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; + assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270"); + + let site_url = "stackoverflow.com"; + let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line"; + assert_eq!(question_url_to_id(site_url, input), None); + } } diff --git a/src/tui/app.rs b/src/tui/app.rs index ad1a1ea..06ac090 100644 --- a/src/tui/app.rs +++ b/src/tui/app.rs @@ -129,57 +129,3 @@ fn pretty_score(score: i32) -> StyledString { // TODO see cursive/examples/src/bin/select_test.rs for how to test the interface! // maybe see if we can conditionally run when --nocapture is passed? -#[cfg(test)] -mod tests { - use super::*; - use crate::stackexchange::{Answer, Question}; - #[test] - fn test_app() { - let ans_body = r#" -Also try the iter: -1. asdf -2. asfd -0. asdfa sfsdf - -but - - cargo build --example stderr - -and then you run it with - - cd "$(target/debug/examples/stderr)" - cd `(target/debug/examples/stderr)` - -what the application prints on stdout is used as argument to `cd`. - -Try it out. 
- -Hit any key to quit this screen: - -* **1** will print `..` -* **2** will print `/` -* **3** will print `~` -* or anything else to print this text (so that you may copy-paste) -"#; - let qs = vec![Question { - id: 42, - score: 323, - title: "How do I exit Vim?".to_string(), - body: "yo this be my problem dawg but don't say **do** `this`".to_string(), - answers: vec![ - Answer { - id: 422, - score: -4, - body: ans_body.to_string(), - is_accepted: false, - }, - Answer { - id: 423, - score: 23, - body: "this is a *good* answer tho".to_string(), - is_accepted: true, - }, - ], - }]; - } -} diff --git a/src/tui/markdown.rs b/src/tui/markdown.rs index 0330696..1046aeb 100644 --- a/src/tui/markdown.rs +++ b/src/tui/markdown.rs @@ -42,6 +42,7 @@ pub fn preprocess(input: String) -> String { } /// Preview markdown of the given length +/// **Note**: Assumes preprocessing has taken place pub fn preview(width: usize, input: &StyledString) -> StyledString { let mut w = 0; let mut new_spans = Vec::new(); @@ -509,8 +510,8 @@ and tasks #[test] fn test_from_cow_panic() { let input = " -I'm on a Mac running OS X v10.6 (Snow Leopard). I have Mercurial 1.1 installed.\r\n\r\nAfter I hit Esc to exit insert mode I can't figure out how to save and quit. Hitting Ctrl + C shows me instructions that say typing \"quit\" will write and quit, but it doesn't seem to work.\r\n\r\n\r\n\r\n"; - let parsed = parse(input); +I'm on a Mac running OS X v10.6 (Snow Leopard). I have Mercurial 1.1 installed.\r\n\r\nAfter I hit Esc to exit insert mode I can't figure out how to save and quit. Hitting Ctrl + C shows me instructions that say typing \"quit\" will write and quit, but it doesn't seem to work.\r\n\r\n\r\n\r\n".to_string(); + let parsed = parse(preprocess(input)); let spans: Vec<_> = parsed.spans().into_iter().collect(); let expected_spans = &[ Span { @@ -630,7 +631,6 @@ I'm on a Mac running OS X v10.6 (Snow Leopard). I have Mercurial }, ]; - assert_eq!(spans, expected_spans); for (span, expected_span) in spans.iter().zip(expected_spans.iter()) { assert_eq!(span, expected_span); } diff --git a/test/bad-user-agent.html b/test/bad-user-agent.html new file mode 100644 index 0000000..89c4aaa --- /dev/null +++ b/test/bad-user-agent.html @@ -0,0 +1 @@ +(site:stackoverflow.com) how do I exit nvim at DuckDuckGoIgnore this box please.
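
The test/bad-user-agent.html fixture above captures the page DuckDuckGo serves when it refuses a request, and the TODO near the bottom of src/stackexchange.rs notes that a blocked request currently cannot be told apart from a query with genuinely zero results. One way to close that gap, in the spirit of the `BLOCK_INDICATORS` tuple kept in TODO.md, is to check the raw HTML for a marker that only appears on the block page before falling back to the zero-results heuristic. This is only a sketch: the marker string is lifted from the fixture and is an assumption about what the live block page contains, and the plain `Result<_, String>` stands in for the crate's error type.

```rust
/// Substrings assumed to appear only on DuckDuckGo's "request blocked" page;
/// "Ignore this box please." is taken from test/bad-user-agent.html.
const DDG_BLOCK_INDICATORS: &[&str] = &["Ignore this box please."];

fn ddg_blocked(html: &str) -> bool {
    DDG_BLOCK_INDICATORS
        .iter()
        .any(|marker| html.contains(marker))
}

/// Sketch of how `parse_questions_from_ddg_html` could separate the three cases:
/// blocked, genuinely empty, and ok. Error strings mirror the messages used in
/// this patch (`Error::ScrapingError` / `Error::NoResults`).
fn classify_results(html: &str, parsed_count: usize) -> Result<usize, String> {
    if ddg_blocked(html) {
        Err("DuckDuckGo blocked this request".to_string())
    } else if parsed_count == 0 {
        Err("Sorry, couldn't find any answers for your query".to_string())
    } else {
        Ok(parsed_count)
    }
}
```
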
diff --git a/test/exit-vim.html b/test/exit-vim.html
new file mode 100644
index 0000000..a4f7a4a
--- /dev/null
+++ b/test/exit-vim.html
@@ -0,0 +1,1745 @@
+(site:https://stackoverflow.com OR site:https://askubuntu.com) how do I exit nvim at DuckDuckGo
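
Two notes in this patch ("Keep relevance ordering !!!" in TODO.md, and the "TODO sort by original ordering !" plus "maybe this is as simple as a HashMap {qid: ordinal}" comments in src/stackexchange.rs) describe the same gap: bucketing scraped ids into a per-site `HashMap` loses DuckDuckGo's ranking, and the per-site API responses come back in arbitrary order. The sketch below illustrates the suggested ordinal-map fix; the stripped-down `Question` stand-in and function name are assumptions for illustration only.

```rust
use std::collections::HashMap;

/// Stripped-down stand-in for the crate's Question<S>; only the id matters here.
struct Question {
    id: u32,
}

/// Restore the scrape-time (relevance) ordering after the per-site API calls
/// return. `scraped_ids` is the flat list of question ids in the order they
/// appeared on the DuckDuckGo results page.
fn sort_by_relevance(mut questions: Vec<Question>, scraped_ids: &[u32]) -> Vec<Question> {
    let ordinal: HashMap<u32, usize> = scraped_ids
        .iter()
        .enumerate()
        .map(|(i, &id)| (id, i))
        .collect();
    // Anything the scraper never saw sorts to the end.
    questions.sort_by_key(|q| *ordinal.get(&q.id).unwrap_or(&usize::MAX));
    questions
}
```
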