diff options
author | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-22 21:59:20 -0700 |
---|---|---|
committer | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-23 19:22:49 -0700 |
commit | 0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (patch) | |
tree | 6b90e68fd3db4e8b8c6334882ec2872f12402109 | |
parent | fdc4092d0276259c47a14cf2cc52c933fec633e4 (diff) |
Add duckduckgo search engine
-rw-r--r-- | Cargo.lock | 311 | ||||
-rw-r--r-- | Cargo.toml | 5 | ||||
-rw-r--r-- | TODO.md | 51 | ||||
-rw-r--r-- | roadmap.md | 1 | ||||
-rw-r--r-- | src/cli.rs | 26 | ||||
-rw-r--r-- | src/config.rs | 13 | ||||
-rw-r--r-- | src/error.rs | 4 | ||||
-rw-r--r-- | src/main.rs | 27 | ||||
-rw-r--r-- | src/stackexchange.rs | 540 | ||||
-rw-r--r-- | src/tui/app.rs | 54 | ||||
-rw-r--r-- | src/tui/markdown.rs | 6 | ||||
-rw-r--r-- | test/bad-user-agent.html | 1 | ||||
-rw-r--r-- | test/exit-vim.html | 1745 |
13 files changed, 2549 insertions, 235 deletions
@@ -114,6 +114,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" [[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + +[[package]] name = "bytes" version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -315,6 +321,33 @@ dependencies = [ ] [[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "matches", + "phf", + "proc-macro2", + "quote", + "smallvec", + "syn", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e" +dependencies = [ + "quote", + "syn", +] + +[[package]] name = "cursive" version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -331,8 +364,10 @@ dependencies = [ "log", "maplit", "ncurses", + "pancurses", "signal-hook", "term_size", + "termion", "unicode-segmentation", "unicode-width", ] @@ -396,6 +431,17 @@ dependencies = [ ] [[package]] +name = "derive_more" +version = "0.99.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc655351f820d774679da6cdc23355a93de496867d8203496675162e17b1d671" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] name = "directories" version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -424,6 +470,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4358a9e11b9a09cf52383b451b49a169e8d797b68aa02301ff586d70d9661ea3" [[package]] +name = "dtoa-short" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2" +dependencies = [ + "dtoa", +] + +[[package]] +name = "ego-tree" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" + +[[package]] name = "either" version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -531,6 +592,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" [[package]] +name = "futf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] name = "futures" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -626,6 +697,24 @@ dependencies = [ ] [[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + +[[package]] name = "getrandom" version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -665,6 +754,20 @@ dependencies = [ ] [[package]] +name = "html5ever" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + +[[package]] name = "http" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -825,12 +928,35 @@ dependencies = [ ] [[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] name = "maplit" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" [[package]] +name = "markup5ever" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab" +dependencies = [ + "log", + "phf", + "phf_codegen", + "serde", + "serde_derive", + "serde_json", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] name = "matches" version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -996,6 +1122,18 @@ dependencies = [ ] [[package]] +name = "new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + +[[package]] name = "num" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1070,6 +1208,12 @@ dependencies = [ ] [[package]] +name = "numtoa" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" + +[[package]] name = "once_cell" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1118,6 +1262,19 @@ dependencies = [ ] [[package]] +name = "pancurses" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3058bc37c433096b2ac7afef1c5cdfae49ede0a4ffec3dfc1df1df0959d0ff0" +dependencies = [ + "libc", + "log", + "ncurses", + "pdcurses-sys", + "winreg 0.5.1", +] + +[[package]] name = "parking_lot" version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1142,6 +1299,16 @@ dependencies = [ ] [[package]] +name = "pdcurses-sys" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084dd22796ff60f1225d4eb6329f33afaf4c85419d51d440ab6b8c6f4529166b" +dependencies = [ + "cc", + "libc", +] + +[[package]] name = "percent-encoding" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1159,6 +1326,16 @@ dependencies = [ ] [[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] name = "phf_generator" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1236,6 +1413,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" [[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] name = "proc-macro-hack" version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1359,6 +1542,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" [[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +dependencies = [ + "redox_syscall", +] + +[[package]] name = "redox_users" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1411,7 +1603,7 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "winreg", + "winreg 0.7.0", ] [[package]] @@ -1449,6 +1641,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] +name = "scraper" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12" +dependencies = [ + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "matches", + "selectors", + "smallvec", + "tendril", +] + +[[package]] name = "security-framework" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1472,6 +1680,26 @@ dependencies = [ ] [[package]] +name = "selectors" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "matches", + "phf", + "phf_codegen", + "precomputed-hash", + "servo_arc", + "smallvec", + "thin-slice", +] + +[[package]] name = "serde" version = "1.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1527,6 +1755,16 @@ dependencies = [ ] [[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + +[[package]] name = "signal-hook" version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1576,10 +1814,12 @@ dependencies = [ "futures", "lazy_static", "minimad", + "percent-encoding", "phf", "pulldown-cmark", "rayon", "reqwest", + "scraper", "serde", "serde_json", "serde_yaml", @@ -1608,6 +1848,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8" [[package]] +name = "string_cache" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a" +dependencies = [ + "lazy_static", + "new_debug_unreachable", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] name = "strsim" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1645,6 +1910,17 @@ dependencies = [ ] [[package]] +name = "tendril" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] name = "term_size" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1668,6 +1944,18 @@ dependencies = [ ] [[package]] +name = "termion" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c22cec9d8978d906be5ac94bceb5a010d885c626c4c8855721a4dbd20e3ac905" +dependencies = [ + "libc", + "numtoa", + "redox_syscall", + "redox_termios", +] + +[[package]] name = "textwrap" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1677,6 +1965,12 @@ dependencies = [ ] [[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + +[[package]] name = "thiserror" version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1843,6 +2137,12 @@ dependencies = [ ] [[package]] +name = "utf-8" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + +[[package]] name = "vcpkg" version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1990,6 +2290,15 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "winreg" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a27a759395c1195c4cc5cda607ef6f8f6498f64e78f7900f5de0a127a424704a" +dependencies = [ + "winapi 0.3.8", +] + +[[package]] +name = "winreg" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" @@ -15,10 +15,13 @@ serde_json = "1.0" serde_yaml = "0.8" reqwest = { version = "0.10", features = ["gzip", "json"] } -futures = "0.3" tokio = { version = "0.2", features = ["full"] } +futures = "0.3" rayon = "1.3" +percent-encoding = "2.1" +scraper = "0.12" + lazy_static = "1.4" minimad = "0.6" termimad = "0.8" @@ -1,11 +1,22 @@ # TODO ### v0.3.0 -1. Duckduck go search ftw, e.g. -``` -(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux -``` -etc. +1. Keep relevance ordering !!! + +### v0.3.1 +1. Much of the code can be reused for google: + * parsing href after `"url="` (similar to uddg) + * formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux` + So make a `Scraper` trait and implement it for DDG & Google. Then + `stackexchange` can just code against `Scraper` and choose based on + `--search-engine | -e' argument` +2. Maybe reorganize to + - stackexchange + - api + - scraper + + + ### Endless future improvements for the TUI 1. Init with smaller layout depending on initial screen size. @@ -20,23 +31,29 @@ etc. ### resources for later -#### async -1. start with [this](http://patshaughnessy.net/2020/1/20/downloading-100000-files-using-async-rust) but also see the following gist and thread through the below links to make sure its actually async.. -0. breakdown of futures+reqwest [here](https://stackoverflow.com/questions/51044467/how-can-i-perform-parallel-asynchronous-http-get-requests-with-reqwest) -0. general concurrency in rust [info](https://blog.yoshuawuyts.com/streams-concurrency/) -0. [Intro to async rust](http://jamesmcm.github.io/blog/2020/05/06/a-practical-introduction-to-async-programming-in-rust/) -1. Async API calls [tokio](https://stackoverflow.com/a/57770687) -2. Parallel calls against multiple sites [vid](https://www.youtube.com/watch?v=O-LagKc0MPA) -0. OR JUST THREADS [see here](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html) - #### scraping 6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html)) +```python +# if necessary, choose one of these to mimic browswer request +USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', + ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) ' + 'Chrome/19.0.1084.46 Safari/536.5'), + ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' + 'Safari/536.5'), ) + +# checks for search engine blocks +BLOCK_INDICATORS = ( + 'form id="captcha-form"', + 'This page appears when Google automatically detects requests coming from your computer ' + 'network which appear to be in violation of the <a href="//www.google.com/policies/terms/">Terms of Service' +) +``` + #### distribution 1. oh game over [dawg](https://github.com/japaric/trust) -2. also, use [feature flags]() to select backend. Only use crossterm on Windows - since it is rather jumpy... - #### ideas 5. Add sort option, e.g. relevance|votes|date @@ -33,3 +33,4 @@ [ ] add duckduckgo logo to readme [ ] per platform package mgmt [ ] more testing +[ ] maybe add google engine too. but fuck google. @@ -58,7 +58,7 @@ pub fn get_opts() -> Result<Opts> { .takes_value(true) .default_value(limit) .validator(|s| s.parse::<u32>().map_err(|e| e.to_string()).map(|_| ())) - .help("Question limit per site query"), + .help("Question limit"), ) .arg( Arg::with_name("lucky") @@ -69,7 +69,8 @@ pub fn get_opts() -> Result<Opts> { Arg::with_name("no-lucky") .long("no-lucky") .help("Disable lucky") - .conflicts_with("lucky"), + .conflicts_with("lucky") + .hidden(!config.lucky), ) .arg( Arg::with_name("query") @@ -77,12 +78,32 @@ pub fn get_opts() -> Result<Opts> { .index(1) .required_unless_one(&["list-sites", "update-sites", "set-api-key"]), ) + .arg( + Arg::with_name("duckduckgo") + .long("duckduckgo") + .help("Use DuckDuckGo as a search engine"), + ) + .arg( + Arg::with_name("no-duckduckgo") + .long("no-duckduckgo") + .help("Disable duckduckgo") + .conflicts_with("duckduckgo") + .hidden(!config.duckduckgo), + ) .get_matches(); let lucky = match (matches.is_present("lucky"), matches.is_present("no-lucky")) { (true, _) => true, (_, true) => false, _ => config.lucky, }; + let duckduckgo = match ( + matches.is_present("duckduckgo"), + matches.is_present("no-duckduckgo"), + ) { + (true, _) => true, + (_, true) => false, + _ => config.duckduckgo, + }; Ok(Opts { list_sites: matches.is_present("list-sites"), update_sites: matches.is_present("update-sites"), @@ -105,6 +126,7 @@ pub fn get_opts() -> Result<Opts> { .map(String::from) .or(config.api_key), lucky, + duckduckgo, }, }) } diff --git a/src/config.rs b/src/config.rs index c86e0ad..79cbd74 100644 --- a/src/config.rs +++ b/src/config.rs @@ -13,6 +13,7 @@ pub struct Config { pub limit: u16, pub lucky: bool, pub sites: Vec<String>, + pub duckduckgo: bool, } // TODO make a friender config file, like the colors.toml below @@ -23,6 +24,7 @@ impl Default for Config { limit: 20, lucky: true, sites: vec![String::from("stackoverflow")], + duckduckgo: true, } } } @@ -33,13 +35,22 @@ pub fn user_config() -> Result<Config> { let dir = project.config_dir(); fs::create_dir_all(&dir)?; let filename = config_file_name()?; + match utils::open_file(&filename)? { None => { let def = Config::default(); write_config(&def)?; Ok(def) } - Some(file) => serde_yaml::from_reader(file).map_err(|_| Error::MalformedFile(filename)), + Some(file) => serde_yaml::from_reader(file) + .map_err(|_| Error::MalformedFile(filename.clone())) + .and_then(|cfg: Config| { + if cfg.sites.is_empty() { + Err(Error::MalformedFile(filename)) + } else { + Ok(cfg) + } + }), } } diff --git a/src/error.rs b/src/error.rs index d104594..53ba23c 100644 --- a/src/error.rs +++ b/src/error.rs @@ -24,10 +24,10 @@ pub enum Error { Permissions(PermissionType, PathBuf), #[error("{0}")] StackExchange(String), + #[error("{0}")] + ScrapingError(String), #[error("Couldn't find a suitable project directory; is your OS supported?")] ProjectDir, - #[error("Empty sites file in cache")] - EmptySites, #[error("Sorry, couldn't find any answers for your query")] NoResults, } diff --git a/src/main.rs b/src/main.rs index f753cd1..afd6c21 100644 --- a/src/main.rs +++ b/src/main.rs @@ -35,13 +35,7 @@ fn main() -> Result<()> { }) .or_else(|e: Error| { // Handle errors - print_error(&e.to_string())?; - match e { - Error::EmptySites => { - print_notice!(skin, "This can likely be fixed by `so --update-sites`.") - } - _ => Ok(()), - } + print_error(&e.to_string()) }) } @@ -52,23 +46,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> { let config = opts.config; let sites = &config.sites; let lucky = config.lucky; - let mut ls = LocalStorage::new()?; + + let ls = LocalStorage::new(opts.update_sites).await?; if let Some(key) = opts.set_api_key { config::set_api_key(key)?; } - if opts.update_sites { - ls.update_sites().await?; - } - if opts.list_sites { - let sites = ls.sites().await?; let mut md = String::new(); md.push_str("|:-:|:-:|\n"); md.push_str("|Site Code|Site URL|\n"); md.push_str("|-:|:-|\n"); - for s in sites.iter() { + for s in ls.sites.iter() { md.push_str(&format!("|{}|{}\n", s.api_site_parameter, s.site_url)); } md.push_str("|-\n"); @@ -76,7 +66,7 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> { return Ok(None); } - if let Some(site) = ls.find_invalid_site(sites).await? { + if let Some(site) = ls.find_invalid_site(sites).await { print_error!(skin, "$0 is not a valid StackExchange site.\n\n", site)?; // TODO should only use inline for single lines; use termimad::text stuff print_notice!( @@ -92,20 +82,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> { } if let Some(q) = opts.query { - let se = StackExchange::new(config, q); + let mut se = StackExchange::new(config, ls, q); if lucky { - // TODO this needs preprocessing; all the more reason to do it at SE level let md = se.search_lucky().await?; skin.print_text(&md); skin.print_text("\nPress **[SPACE]** to see more results, or any other key to exit"); // Kick off the rest of the search in the background - let qs = task::spawn(async move { se.search().await }); + let qs = task::spawn(async move { se.search_md().await }); if !utils::wait_for_char(' ')? { return Ok(None); } return Ok(Some(qs.await.unwrap()?)); } else { - return Ok(Some(se.search().await?)); + return Ok(Some(se.search_md().await?)); } } Ok(None) diff --git a/src/stackexchange.rs b/src/stackexchange.rs index 1d4789a..2939c29 100644 --- a/src/stackexchange.rs +++ b/src/stackexchange.rs @@ -1,8 +1,13 @@ use futures::stream::StreamExt; +use percent_encoding::percent_decode_str; use rayon::prelude::*; +use reqwest::header; use reqwest::Client; use reqwest::Url; +use scraper::html::Html; +use scraper::selector::Selector; use serde::{Deserialize, Serialize}; +use std::collections::hash_map::Entry; use std::collections::HashMap; use std::fs; use std::path::PathBuf; @@ -13,7 +18,11 @@ use crate::tui::markdown; use crate::tui::markdown::Markdown; use crate::utils; +/// DuckDuckGo URL +const DUCKDUCKGO_URL: &str = "https://duckduckgo.com"; + /// StackExchange API v2.2 URL +// TODO why not https? const SE_API_URL: &str = "http://api.stackexchange.com"; const SE_API_VERSION: &str = "2.2"; @@ -28,6 +37,11 @@ const SE_SITES_PAGESIZE: u16 = 10000; /// Limit on concurrent requests (gets passed to `buffer_unordered`) const CONCURRENT_REQUESTS_LIMIT: usize = 8; +/// Mock user agent to get real DuckDuckGo results +// TODO copy other user agents and use random one each time +const USER_AGENT: &str = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0"; + /// This structure allows interacting with parts of the StackExchange /// API, using the `Config` struct to determine certain API settings and options. // TODO should my se structs have &str instead of String? @@ -35,13 +49,13 @@ const CONCURRENT_REQUESTS_LIMIT: usize = 8; pub struct StackExchange { client: Client, config: Config, + sites: HashMap<String, String>, query: String, } /// This structure allows interacting with locally cached StackExchange metadata. pub struct LocalStorage { - sites: Option<Vec<Site>>, - filename: PathBuf, + pub sites: Vec<Site>, } #[derive(Deserialize, Serialize, Debug)] @@ -84,24 +98,36 @@ struct ResponseWrapper<T> { } impl StackExchange { - pub fn new(config: Config, query: String) -> Self { + pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self { let client = Client::new(); StackExchange { client, + sites: local_storage.get_urls(&config.sites), config, query, } } - /// Search query at stack exchange and get the top answer body + /// Search query and get the top answer body /// - /// For now, use only the first configured site, since, parodoxically, sites - /// with the worst results will finish executing first, since there's less - /// data to retrieve. - pub async fn search_lucky(&self) -> Result<String> { - Ok(self - .search_advanced_site(self.config.sites.iter().next().unwrap(), 1) - .await? + /// For StackExchange engine, use only the first configured site, + /// since, parodoxically, sites with the worst results will finish + /// executing first, because there's less data to retrieve. + /// + /// Needs mut because it temporarily changes self.config + pub async fn search_lucky(&mut self) -> Result<String> { + let original_config = self.config.clone(); + // Temp set lucky config + self.config.limit = 1; + if !self.config.duckduckgo { + self.config.sites.truncate(1); + } + // Run search with temp config + let result = self.search().await; + // Reset config + self.config = original_config; + + Ok(result? .into_iter() .next() .ok_or(Error::NoResults)? @@ -112,19 +138,71 @@ impl StackExchange { .body) } - /// Search query at stack exchange and get a list of relevant questions - pub async fn search(&self) -> Result<Vec<Question<Markdown>>> { - self.search_advanced(self.config.limit).await + /// Search and parse to Markdown for TUI + pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> { + Ok(parse_markdown(self.search().await?)) + } + + /// Search query and get a list of relevant questions + pub async fn search(&self) -> Result<Vec<Question<String>>> { + if self.config.duckduckgo { + self.search_duckduck_go().await + } else { + // TODO after duckduck go finished, refactor to _not_ thread this limit, its unnecessary + self.se_search_advanced(self.config.limit).await + } } - /// Parallel searches against the search/advanced endpoint across all configured sites - async fn search_advanced(&self, limit: u16) -> Result<Vec<Question<Markdown>>> { + /// Search query at duckduckgo and then fetch the resulting questions from SE. + async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> { + let url = duckduckgo_url(&self.query, self.sites.values()); + let html = self + .client + .get(url) + .header(header::USER_AGENT, USER_AGENT) + .send() + .await? + .text() + .await?; + let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?; + self.se_questions(ids).await + } + + /// Parallel searches against the SE question endpoint across the sites in `ids`. + // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions + async fn se_questions( + &self, + ids: HashMap<String, Vec<String>>, + ) -> Result<Vec<Question<String>>> { + futures::stream::iter(ids) + .map(|(site, ids)| { + let clone = self.clone(); + tokio::spawn(async move { + let clone = &clone; + clone.se_questions_site(&site, ids).await + }) + }) + .buffer_unordered(CONCURRENT_REQUESTS_LIMIT) + .collect::<Vec<_>>() + .await + .into_iter() + .map(|r| r.map_err(Error::from).and_then(|x| x)) + .collect::<Result<Vec<Vec<_>>>>() + .map(|v| { + let qs: Vec<Question<String>> = v.into_iter().flatten().collect(); + // TODO sort by original ordering ! + qs + }) + } + + /// Parallel searches against the SE search/advanced endpoint across all configured sites |