summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSam Tay <sam.chong.tay@gmail.com>2020-06-22 21:59:20 -0700
committerSam Tay <sam.chong.tay@gmail.com>2020-06-23 19:22:49 -0700
commit0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (patch)
tree6b90e68fd3db4e8b8c6334882ec2872f12402109
parentfdc4092d0276259c47a14cf2cc52c933fec633e4 (diff)
Add duckduckgo search engine
-rw-r--r--Cargo.lock311
-rw-r--r--Cargo.toml5
-rw-r--r--TODO.md51
-rw-r--r--roadmap.md1
-rw-r--r--src/cli.rs26
-rw-r--r--src/config.rs13
-rw-r--r--src/error.rs4
-rw-r--r--src/main.rs27
-rw-r--r--src/stackexchange.rs540
-rw-r--r--src/tui/app.rs54
-rw-r--r--src/tui/markdown.rs6
-rw-r--r--test/bad-user-agent.html1
-rw-r--r--test/exit-vim.html1745
13 files changed, 2549 insertions, 235 deletions
diff --git a/Cargo.lock b/Cargo.lock
index 3d8f148..e87755a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -114,6 +114,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820"
[[package]]
+name = "byteorder"
+version = "1.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
+
+[[package]]
name = "bytes"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -315,6 +321,33 @@ dependencies = [
]
[[package]]
+name = "cssparser"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
+dependencies = [
+ "cssparser-macros",
+ "dtoa-short",
+ "itoa",
+ "matches",
+ "phf",
+ "proc-macro2",
+ "quote",
+ "smallvec",
+ "syn",
+]
+
+[[package]]
+name = "cssparser-macros"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfae75de57f2b2e85e8768c3ea840fd159c8f33e2b6522c7835b7abac81be16e"
+dependencies = [
+ "quote",
+ "syn",
+]
+
+[[package]]
name = "cursive"
version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -331,8 +364,10 @@ dependencies = [
"log",
"maplit",
"ncurses",
+ "pancurses",
"signal-hook",
"term_size",
+ "termion",
"unicode-segmentation",
"unicode-width",
]
@@ -396,6 +431,17 @@ dependencies = [
]
[[package]]
+name = "derive_more"
+version = "0.99.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc655351f820d774679da6cdc23355a93de496867d8203496675162e17b1d671"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
name = "directories"
version = "2.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -424,6 +470,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4358a9e11b9a09cf52383b451b49a169e8d797b68aa02301ff586d70d9661ea3"
[[package]]
+name = "dtoa-short"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59020b8513b76630c49d918c33db9f4c91638e7d3404a28084083b87e33f76f2"
+dependencies = [
+ "dtoa",
+]
+
+[[package]]
+name = "ego-tree"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
+
+[[package]]
name = "either"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -531,6 +592,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
[[package]]
+name = "futf"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c9c1ce3fa9336301af935ab852c437817d14cd33690446569392e65170aac3b"
+dependencies = [
+ "mac",
+ "new_debug_unreachable",
+]
+
+[[package]]
name = "futures"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -626,6 +697,24 @@ dependencies = [
]
[[package]]
+name = "fxhash"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "getopts"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
+dependencies = [
+ "unicode-width",
+]
+
+[[package]]
name = "getrandom"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -665,6 +754,20 @@ dependencies = [
]
[[package]]
+name = "html5ever"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
+dependencies = [
+ "log",
+ "mac",
+ "markup5ever",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
name = "http"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -825,12 +928,35 @@ dependencies = [
]
[[package]]
+name = "mac"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
+
+[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
+name = "markup5ever"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
+dependencies = [
+ "log",
+ "phf",
+ "phf_codegen",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "string_cache",
+ "string_cache_codegen",
+ "tendril",
+]
+
+[[package]]
name = "matches"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -996,6 +1122,18 @@ dependencies = [
]
[[package]]
+name = "new_debug_unreachable"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
+
+[[package]]
+name = "nodrop"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
+
+[[package]]
name = "num"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1070,6 +1208,12 @@ dependencies = [
]
[[package]]
+name = "numtoa"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef"
+
+[[package]]
name = "once_cell"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1118,6 +1262,19 @@ dependencies = [
]
[[package]]
+name = "pancurses"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3058bc37c433096b2ac7afef1c5cdfae49ede0a4ffec3dfc1df1df0959d0ff0"
+dependencies = [
+ "libc",
+ "log",
+ "ncurses",
+ "pdcurses-sys",
+ "winreg 0.5.1",
+]
+
+[[package]]
name = "parking_lot"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1142,6 +1299,16 @@ dependencies = [
]
[[package]]
+name = "pdcurses-sys"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "084dd22796ff60f1225d4eb6329f33afaf4c85419d51d440ab6b8c6f4529166b"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
name = "percent-encoding"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1159,6 +1326,16 @@ dependencies = [
]
[[package]]
+name = "phf_codegen"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+]
+
+[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1236,6 +1413,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea"
[[package]]
+name = "precomputed-hash"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
+
+[[package]]
name = "proc-macro-hack"
version = "0.5.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1359,6 +1542,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
[[package]]
+name = "redox_termios"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
+dependencies = [
+ "redox_syscall",
+]
+
+[[package]]
name = "redox_users"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1411,7 +1603,7 @@ dependencies = [
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
- "winreg",
+ "winreg 0.7.0",
]
[[package]]
@@ -1449,6 +1641,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
+name = "scraper"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12"
+dependencies = [
+ "cssparser",
+ "ego-tree",
+ "getopts",
+ "html5ever",
+ "matches",
+ "selectors",
+ "smallvec",
+ "tendril",
+]
+
+[[package]]
name = "security-framework"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1472,6 +1680,26 @@ dependencies = [
]
[[package]]
+name = "selectors"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
+dependencies = [
+ "bitflags",
+ "cssparser",
+ "derive_more",
+ "fxhash",
+ "log",
+ "matches",
+ "phf",
+ "phf_codegen",
+ "precomputed-hash",
+ "servo_arc",
+ "smallvec",
+ "thin-slice",
+]
+
+[[package]]
name = "serde"
version = "1.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1527,6 +1755,16 @@ dependencies = [
]
[[package]]
+name = "servo_arc"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
+dependencies = [
+ "nodrop",
+ "stable_deref_trait",
+]
+
+[[package]]
name = "signal-hook"
version = "0.1.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1576,10 +1814,12 @@ dependencies = [
"futures",
"lazy_static",
"minimad",
+ "percent-encoding",
"phf",
"pulldown-cmark",
"rayon",
"reqwest",
+ "scraper",
"serde",
"serde_json",
"serde_yaml",
@@ -1608,6 +1848,31 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8"
[[package]]
+name = "string_cache"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2940c75beb4e3bf3a494cef919a747a2cb81e52571e212bfbd185074add7208a"
+dependencies = [
+ "lazy_static",
+ "new_debug_unreachable",
+ "phf_shared",
+ "precomputed-hash",
+ "serde",
+]
+
+[[package]]
+name = "string_cache_codegen"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1645,6 +1910,17 @@ dependencies = [
]
[[package]]
+name = "tendril"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "707feda9f2582d5d680d733e38755547a3e8fb471e7ba11452ecfd9ce93a5d3b"
+dependencies = [
+ "futf",
+ "mac",
+ "utf-8",
+]
+
+[[package]]
name = "term_size"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1668,6 +1944,18 @@ dependencies = [
]
[[package]]
+name = "termion"
+version = "1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c22cec9d8978d906be5ac94bceb5a010d885c626c4c8855721a4dbd20e3ac905"
+dependencies = [
+ "libc",
+ "numtoa",
+ "redox_syscall",
+ "redox_termios",
+]
+
+[[package]]
name = "textwrap"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1677,6 +1965,12 @@ dependencies = [
]
[[package]]
+name = "thin-slice"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
+
+[[package]]
name = "thiserror"
version = "1.0.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1843,6 +2137,12 @@ dependencies = [
]
[[package]]
+name = "utf-8"
+version = "0.7.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"
+
+[[package]]
name = "vcpkg"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1990,6 +2290,15 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "winreg"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a27a759395c1195c4cc5cda607ef6f8f6498f64e78f7900f5de0a127a424704a"
+dependencies = [
+ "winapi 0.3.8",
+]
+
+[[package]]
+name = "winreg"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69"
diff --git a/Cargo.toml b/Cargo.toml
index 25d5708..6443c85 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,10 +15,13 @@ serde_json = "1.0"
serde_yaml = "0.8"
reqwest = { version = "0.10", features = ["gzip", "json"] }
-futures = "0.3"
tokio = { version = "0.2", features = ["full"] }
+futures = "0.3"
rayon = "1.3"
+percent-encoding = "2.1"
+scraper = "0.12"
+
lazy_static = "1.4"
minimad = "0.6"
termimad = "0.8"
diff --git a/TODO.md b/TODO.md
index 056472f..6b53cb2 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,11 +1,22 @@
# TODO
### v0.3.0
-1. Duckduck go search ftw, e.g.
-```
-(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux
-```
-etc.
+1. Keep relevance ordering !!!
+
+### v0.3.1
+1. Much of the code can be reused for google:
+ * parsing href after `"url="` (similar to uddg)
+ * formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux`
+ So make a `Scraper` trait and implement it for DDG & Google. Then
+ `stackexchange` can just code against `Scraper` and choose based on
+ `--search-engine | -e` argument
+2. Maybe reorganize to
+ - stackexchange
+ - api
+ - scraper
+
+
+
### Endless future improvements for the TUI
1. Init with smaller layout depending on initial screen size.
@@ -20,23 +31,29 @@ etc.
### resources for later
-#### async
-1. start with [this](http://patshaughnessy.net/2020/1/20/downloading-100000-files-using-async-rust) but also see the following gist and thread through the below links to make sure its actually async..
-0. breakdown of futures+reqwest [here](https://stackoverflow.com/questions/51044467/how-can-i-perform-parallel-asynchronous-http-get-requests-with-reqwest)
-0. general concurrency in rust [info](https://blog.yoshuawuyts.com/streams-concurrency/)
-0. [Intro to async rust](http://jamesmcm.github.io/blog/2020/05/06/a-practical-introduction-to-async-programming-in-rust/)
-1. Async API calls [tokio](https://stackoverflow.com/a/57770687)
-2. Parallel calls against multiple sites [vid](https://www.youtube.com/watch?v=O-LagKc0MPA)
-0. OR JUST THREADS [see here](https://rust-lang.github.io/async-book/01_getting_started/02_why_async.html)
-
#### scraping
6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html))
+```python
+# if necessary, choose one of these to mimic browser request
+USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
+ 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
+ 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
+ ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
+ 'Chrome/19.0.1084.46 Safari/536.5'),
+ ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
+ 'Safari/536.5'), )
+
+# checks for search engine blocks
+BLOCK_INDICATORS = (
+ 'form id="captcha-form"',
+ 'This page appears when Google automatically detects requests coming from your computer '
+ 'network which appear to be in violation of the <a href="//www.google.com/policies/terms/">Terms of Service'
+)
+```
+
#### distribution
1. oh game over [dawg](https://github.com/japaric/trust)
-2. also, use [feature flags]() to select backend. Only use crossterm on Windows
- since it is rather jumpy...
-
#### ideas
5. Add sort option, e.g. relevance|votes|date
diff --git a/roadmap.md b/roadmap.md
index efe6dfe..b5d8d36 100644
--- a/roadmap.md
+++ b/roadmap.md
@@ -33,3 +33,4 @@
[ ] add duckduckgo logo to readme
[ ] per platform package mgmt
[ ] more testing
+[ ] maybe add google engine too. but fuck google.
diff --git a/src/cli.rs b/src/cli.rs
index 7d946d0..7abde66 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -58,7 +58,7 @@ pub fn get_opts() -> Result<Opts> {
.takes_value(true)
.default_value(limit)
.validator(|s| s.parse::<u32>().map_err(|e| e.to_string()).map(|_| ()))
- .help("Question limit per site query"),
+ .help("Question limit"),
)
.arg(
Arg::with_name("lucky")
@@ -69,7 +69,8 @@ pub fn get_opts() -> Result<Opts> {
Arg::with_name("no-lucky")
.long("no-lucky")
.help("Disable lucky")
- .conflicts_with("lucky"),
+ .conflicts_with("lucky")
+ .hidden(!config.lucky),
)
.arg(
Arg::with_name("query")
@@ -77,12 +78,32 @@ pub fn get_opts() -> Result<Opts> {
.index(1)
.required_unless_one(&["list-sites", "update-sites", "set-api-key"]),
)
+ .arg(
+ Arg::with_name("duckduckgo")
+ .long("duckduckgo")
+ .help("Use DuckDuckGo as a search engine"),
+ )
+ .arg(
+ Arg::with_name("no-duckduckgo")
+ .long("no-duckduckgo")
+ .help("Disable duckduckgo")
+ .conflicts_with("duckduckgo")
+ .hidden(!config.duckduckgo),
+ )
.get_matches();
let lucky = match (matches.is_present("lucky"), matches.is_present("no-lucky")) {
(true, _) => true,
(_, true) => false,
_ => config.lucky,
};
+ let duckduckgo = match (
+ matches.is_present("duckduckgo"),
+ matches.is_present("no-duckduckgo"),
+ ) {
+ (true, _) => true,
+ (_, true) => false,
+ _ => config.duckduckgo,
+ };
Ok(Opts {
list_sites: matches.is_present("list-sites"),
update_sites: matches.is_present("update-sites"),
@@ -105,6 +126,7 @@ pub fn get_opts() -> Result<Opts> {
.map(String::from)
.or(config.api_key),
lucky,
+ duckduckgo,
},
})
}
diff --git a/src/config.rs b/src/config.rs
index c86e0ad..79cbd74 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -13,6 +13,7 @@ pub struct Config {
pub limit: u16,
pub lucky: bool,
pub sites: Vec<String>,
+ pub duckduckgo: bool,
}
// TODO make a friender config file, like the colors.toml below
@@ -23,6 +24,7 @@ impl Default for Config {
limit: 20,
lucky: true,
sites: vec![String::from("stackoverflow")],
+ duckduckgo: true,
}
}
}
@@ -33,13 +35,22 @@ pub fn user_config() -> Result<Config> {
let dir = project.config_dir();
fs::create_dir_all(&dir)?;
let filename = config_file_name()?;
+
match utils::open_file(&filename)? {
None => {
let def = Config::default();
write_config(&def)?;
Ok(def)
}
- Some(file) => serde_yaml::from_reader(file).map_err(|_| Error::MalformedFile(filename)),
+ Some(file) => serde_yaml::from_reader(file)
+ .map_err(|_| Error::MalformedFile(filename.clone()))
+ .and_then(|cfg: Config| {
+ if cfg.sites.is_empty() {
+ Err(Error::MalformedFile(filename))
+ } else {
+ Ok(cfg)
+ }
+ }),
}
}
diff --git a/src/error.rs b/src/error.rs
index d104594..53ba23c 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -24,10 +24,10 @@ pub enum Error {
Permissions(PermissionType, PathBuf),
#[error("{0}")]
StackExchange(String),
+ #[error("{0}")]
+ ScrapingError(String),
#[error("Couldn't find a suitable project directory; is your OS supported?")]
ProjectDir,
- #[error("Empty sites file in cache")]
- EmptySites,
#[error("Sorry, couldn't find any answers for your query")]
NoResults,
}
diff --git a/src/main.rs b/src/main.rs
index f753cd1..afd6c21 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -35,13 +35,7 @@ fn main() -> Result<()> {
})
.or_else(|e: Error| {
// Handle errors
- print_error(&e.to_string())?;
- match e {
- Error::EmptySites => {
- print_notice!(skin, "This can likely be fixed by `so --update-sites`.")
- }
- _ => Ok(()),
- }
+ print_error(&e.to_string())
})
}
@@ -52,23 +46,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
let config = opts.config;
let sites = &config.sites;
let lucky = config.lucky;
- let mut ls = LocalStorage::new()?;
+
+ let ls = LocalStorage::new(opts.update_sites).await?;
if let Some(key) = opts.set_api_key {
config::set_api_key(key)?;
}
- if opts.update_sites {
- ls.update_sites().await?;
- }
-
if opts.list_sites {
- let sites = ls.sites().await?;
let mut md = String::new();
md.push_str("|:-:|:-:|\n");
md.push_str("|Site Code|Site URL|\n");
md.push_str("|-:|:-|\n");
- for s in sites.iter() {
+ for s in ls.sites.iter() {
md.push_str(&format!("|{}|{}\n", s.api_site_parameter, s.site_url));
}
md.push_str("|-\n");
@@ -76,7 +66,7 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
return Ok(None);
}
- if let Some(site) = ls.find_invalid_site(sites).await? {
+ if let Some(site) = ls.find_invalid_site(sites).await {
print_error!(skin, "$0 is not a valid StackExchange site.\n\n", site)?;
// TODO should only use inline for single lines; use termimad::text stuff
print_notice!(
@@ -92,20 +82,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
}
if let Some(q) = opts.query {
- let se = StackExchange::new(config, q);
+ let mut se = StackExchange::new(config, ls, q);
if lucky {
- // TODO this needs preprocessing; all the more reason to do it at SE level
let md = se.search_lucky().await?;
skin.print_text(&md);
skin.print_text("\nPress **[SPACE]** to see more results, or any other key to exit");
// Kick off the rest of the search in the background
- let qs = task::spawn(async move { se.search().await });
+ let qs = task::spawn(async move { se.search_md().await });
if !utils::wait_for_char(' ')? {
return Ok(None);
}
return Ok(Some(qs.await.unwrap()?));
} else {
- return Ok(Some(se.search().await?));
+ return Ok(Some(se.search_md().await?));
}
}
Ok(None)
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 1d4789a..2939c29 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -1,8 +1,13 @@
use futures::stream::StreamExt;
+use percent_encoding::percent_decode_str;
use rayon::prelude::*;
+use reqwest::header;
use reqwest::Client;
use reqwest::Url;
+use scraper::html::Html;
+use scraper::selector::Selector;
use serde::{Deserialize, Serialize};
+use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fs;
use std::path::PathBuf;
@@ -13,7 +18,11 @@ use crate::tui::markdown;
use crate::tui::markdown::Markdown;
use crate::utils;
+/// DuckDuckGo URL
+const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+
/// StackExchange API v2.2 URL
+// TODO why not https?
const SE_API_URL: &str = "http://api.stackexchange.com";
const SE_API_VERSION: &str = "2.2";
@@ -28,6 +37,11 @@ const SE_SITES_PAGESIZE: u16 = 10000;
/// Limit on concurrent requests (gets passed to `buffer_unordered`)
const CONCURRENT_REQUESTS_LIMIT: usize = 8;
+/// Mock user agent to get real DuckDuckGo results
+// TODO copy other user agents and use random one each time
+const USER_AGENT: &str =
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";
+
/// This structure allows interacting with parts of the StackExchange
/// API, using the `Config` struct to determine certain API settings and options.
// TODO should my se structs have &str instead of String?
@@ -35,13 +49,13 @@ const CONCURRENT_REQUESTS_LIMIT: usize = 8;
pub struct StackExchange {
client: Client,
config: Config,
+ sites: HashMap<String, String>,
query: String,
}
/// This structure allows interacting with locally cached StackExchange metadata.
pub struct LocalStorage {
- sites: Option<Vec<Site>>,
- filename: PathBuf,
+ pub sites: Vec<Site>,
}
#[derive(Deserialize, Serialize, Debug)]
@@ -84,24 +98,36 @@ struct ResponseWrapper<T> {
}
impl StackExchange {
- pub fn new(config: Config, query: String) -> Self {
+ pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
let client = Client::new();
StackExchange {
client,
+ sites: local_storage.get_urls(&config.sites),
config,
query,
}
}
- /// Search query at stack exchange and get the top answer body
+ /// Search query and get the top answer body
///
- /// For now, use only the first configured site, since, parodoxically, sites
- /// with the worst results will finish executing first, since there's less
- /// data to retrieve.
- pub async fn search_lucky(&self) -> Result<String> {
- Ok(self
- .search_advanced_site(self.config.sites.iter().next().unwrap(), 1)
- .await?
+ /// For StackExchange engine, use only the first configured site,
+ /// since, paradoxically, sites with the worst results will finish
+ /// executing first, because there's less data to retrieve.
+ ///
+ /// Needs mut because it temporarily changes self.config
+ pub async fn search_lucky(&mut self) -> Result<String> {
+ let original_config = self.config.clone();
+ // Temp set lucky config
+ self.config.limit = 1;
+ if !self.config.duckduckgo {
+ self.config.sites.truncate(1);
+ }
+ // Run search with temp config
+ let result = self.search().await;
+ // Reset config
+ self.config = original_config;
+
+ Ok(result?
.into_iter()
.next()
.ok_or(Error::NoResults)?
@@ -112,19 +138,71 @@ impl StackExchange {
.body)
}
- /// Search query at stack exchange and get a list of relevant questions
- pub async fn search(&self) -> Result<Vec<Question<Markdown>>> {
- self.search_advanced(self.config.limit).await
+ /// Search and parse to Markdown for TUI
+ pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> {
+ Ok(parse_markdown(self.search().await?))
+ }
+
+ /// Search query and get a list of relevant questions
+ pub async fn search(&self) -> Result<Vec<Question<String>>> {
+ if self.config.duckduckgo {
+ self.search_duckduck_go().await
+ } else {
+ // TODO after DuckDuckGo is finished, refactor to _not_ thread this limit; it's unnecessary
+ self.se_search_advanced(self.config.limit).await
+ }
}
- /// Parallel searches against the search/advanced endpoint across all configured sites
- async fn search_advanced(&self, limit: u16) -> Result<Vec<Question<Markdown>>> {
+ /// Search query at duckduckgo and then fetch the resulting questions from SE.
+ async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> {
+ let url = duckduckgo_url(&self.query, self.sites.values());
+ let html = self
+ .client
+ .get(url)
+ .header(header::USER_AGENT, USER_AGENT)
+ .send()
+ .await?
+ .text()
+ .await?;
+ let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+ self.se_questions(ids).await
+ }
+
+ /// Parallel searches against the SE question endpoint across the sites in `ids`.
+ // TODO I'm sure there is a way to DRY the se_questions & se_search_advanced functions
+ async fn se_questions(
+ &self,
+ ids: HashMap<String, Vec<String>>,
+ ) -> Result<Vec<Question<String>>> {
+ futures::stream::iter(ids)
+ .map(|(site, ids)| {
+ let clone = self.clone();
+ tokio::spawn(async move {
+ let clone = &clone;
+ clone.se_questions_site(&site, ids).await
+ })
+ })
+ .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
+ .collect::<Vec<_>>()
+ .await
+ .into_iter()
+ .map(|r| r.map_err(Error::from).and_then(|x| x))
+ .collect::<Result<Vec<Vec<_>>>>()
+ .map(|v| {
+ let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+ // TODO sort by original ordering !
+ qs
+ })
+ }
+
+ /// Parallel searches against the SE search/advanced endpoint across all configured sites
+ async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> {</