author	Sam Tay <sam.chong.tay@gmail.com>	2020-06-22 21:59:20 -0700
committer	Sam Tay <sam.chong.tay@gmail.com>	2020-06-23 19:22:49 -0700
commit	0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (patch)
tree	6b90e68fd3db4e8b8c6334882ec2872f12402109 /src/stackexchange.rs
parent	fdc4092d0276259c47a14cf2cc52c933fec633e4 (diff)
Add duckduckgo search engine
Diffstat (limited to 'src/stackexchange.rs')
-rw-r--r--	src/stackexchange.rs	540
1 file changed, 405 insertions(+), 135 deletions(-)
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 1d4789a..2939c29 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -1,8 +1,13 @@
use futures::stream::StreamExt;
+use percent_encoding::percent_decode_str;
use rayon::prelude::*;
+use reqwest::header;
use reqwest::Client;
use reqwest::Url;
+use scraper::html::Html;
+use scraper::selector::Selector;
use serde::{Deserialize, Serialize};
+use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fs;
use std::path::PathBuf;
@@ -13,7 +18,11 @@ use crate::tui::markdown;
use crate::tui::markdown::Markdown;
use crate::utils;
+/// DuckDuckGo URL
+const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+
/// StackExchange API v2.2 URL
+// TODO why not https?
const SE_API_URL: &str = "http://api.stackexchange.com";
const SE_API_VERSION: &str = "2.2";
@@ -28,6 +37,11 @@ const SE_SITES_PAGESIZE: u16 = 10000;
/// Limit on concurrent requests (gets passed to `buffer_unordered`)
const CONCURRENT_REQUESTS_LIMIT: usize = 8;
+/// Mock user agent to get real DuckDuckGo results
+// TODO copy other user agents and use a random one each time
+const USER_AGENT: &str =
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";
+
/// This structure allows interacting with parts of the StackExchange
/// API, using the `Config` struct to determine certain API settings and options.
// TODO should my se structs have &str instead of String?
@@ -35,13 +49,13 @@ const CONCURRENT_REQUESTS_LIMIT: usize = 8;
pub struct StackExchange {
client: Client,
config: Config,
+ sites: HashMap<String, String>,
query: String,
}
/// This structure allows interacting with locally cached StackExchange metadata.
pub struct LocalStorage {
- sites: Option<Vec<Site>>,
- filename: PathBuf,
+ pub sites: Vec<Site>,
}
#[derive(Deserialize, Serialize, Debug)]
@@ -84,24 +98,36 @@ struct ResponseWrapper<T> {
}
impl StackExchange {
- pub fn new(config: Config, query: String) -> Self {
+ pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
let client = Client::new();
StackExchange {
client,
+ sites: local_storage.get_urls(&config.sites),
config,
query,
}
}
- /// Search query at stack exchange and get the top answer body
+ /// Search query and get the top answer body
///
- /// For now, use only the first configured site, since, parodoxically, sites
- /// with the worst results will finish executing first, since there's less
- /// data to retrieve.
- pub async fn search_lucky(&self) -> Result<String> {
- Ok(self
- .search_advanced_site(self.config.sites.iter().next().unwrap(), 1)
- .await?
+ /// For StackExchange engine, use only the first configured site,
+ /// since, paradoxically, sites with the worst results will finish
+ /// executing first, because there's less data to retrieve.
+ ///
+ /// Needs mut because it temporarily changes self.config
+ pub async fn search_lucky(&mut self) -> Result<String> {
+ let original_config = self.config.clone();
+ // Temp set lucky config
+ self.config.limit = 1;
+ if !self.config.duckduckgo {
+ self.config.sites.truncate(1);
+ }
+ // Run search with temp config
+ let result = self.search().await;
+ // Reset config
+ self.config = original_config;
+
+ Ok(result?
.into_iter()
.next()
.ok_or(Error::NoResults)?
@@ -112,19 +138,71 @@ impl StackExchange {
.body)
}
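A usage sketch of the new lucky path (hypothetical caller, not part of the diff; `config` is assumed to come from the app's own Config loading):

    // Hypothetical caller of the reworked API.
    let local_storage = LocalStorage::new(false).await?;
    let mut se = StackExchange::new(config, local_storage, String::from("how do I exit vim"));
    // `&mut self`: search_lucky temporarily rewrites self.config, then restores it.
    let top_answer = se.search_lucky().await?;
    println!("{}", top_answer);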
- /// Search query at stack exchange and get a list of relevant questions
- pub async fn search(&self) -> Result<Vec<Question<Markdown>>> {
- self.search_advanced(self.config.limit).await
+ /// Search and parse to Markdown for TUI
+ pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> {
+ Ok(parse_markdown(self.search().await?))
+ }
+
+ /// Search query and get a list of relevant questions
+ pub async fn search(&self) -> Result<Vec<Question<String>>> {
+ if self.config.duckduckgo {
+ self.search_duckduck_go().await
+ } else {
+ // TODO after duckduckgo is finished, refactor to _not_ thread this limit; it's unnecessary
+ self.se_search_advanced(self.config.limit).await
+ }
}
- /// Parallel searches against the search/advanced endpoint across all configured sites
- async fn search_advanced(&self, limit: u16) -> Result<Vec<Question<Markdown>>> {
+ /// Search query at duckduckgo and then fetch the resulting questions from SE.
+ async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> {
+ let url = duckduckgo_url(&self.query, self.sites.values());
+ let html = self
+ .client
+ .get(url)
+ .header(header::USER_AGENT, USER_AGENT)
+ .send()
+ .await?
+ .text()
+ .await?;
+ let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+ self.se_questions(ids).await
+ }
+
+ /// Parallel searches against the SE question endpoint across the sites in `ids`.
+ // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
+ async fn se_questions(
+ &self,
+ ids: HashMap<String, Vec<String>>,
+ ) -> Result<Vec<Question<String>>> {
+ futures::stream::iter(ids)
+ .map(|(site, ids)| {
+ let clone = self.clone();
+ tokio::spawn(async move {
+ let clone = &clone;
+ clone.se_questions_site(&site, ids).await
+ })
+ })
+ .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
+ .collect::<Vec<_>>()
+ .await
+ .into_iter()
+ .map(|r| r.map_err(Error::from).and_then(|x| x))
+ .collect::<Result<Vec<Vec<_>>>>()
+ .map(|v| {
+ let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+ // TODO sort by original ordering! (one possible approach is sketched after this function)
+ qs
+ })
+ }
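One possible shape for the ordering TODO above, following the {qid: ordinal} idea in the scraper comment later in this file; a self-contained sketch (hypothetical, with `u32` standing in for the real question id type):

    use std::collections::HashMap;

    fn restore_relevance_order(ids_in_ddg_order: &[u32], mut fetched: Vec<u32>) -> Vec<u32> {
        // Map each question id to its position in the original DDG results.
        let ordinals: HashMap<u32, usize> = ids_in_ddg_order
            .iter()
            .enumerate()
            .map(|(ix, &id)| (id, ix))
            .collect();
        // Ids that DDG never returned sink to the end.
        fetched.sort_unstable_by_key(|id| ordinals.get(id).copied().unwrap_or(usize::MAX));
        fetched
    }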
+
+ /// Parallel searches against the SE search/advanced endpoint across all configured sites
+ async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> {
futures::stream::iter(self.config.sites.clone())
.map(|site| {
let clone = self.clone();
tokio::spawn(async move {
let clone = &clone;
- clone.search_advanced_site(&site, limit).await
+ clone.se_search_advanced_site(&site, limit).await
})
})
.buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
@@ -138,18 +216,45 @@ impl StackExchange {
if self.config.sites.len() > 1 {
qs.sort_unstable_by_key(|q| -q.score);
}
- Self::parse_markdown(qs)
+ qs
})
}
- /// Search against the site's search/advanced endpoint with a given query.
+ /// Search against the SE site's /questions/{ids} endpoint.
+ /// Filters out questions with no answers.
+ async fn se_questions_site(
+ &self,
+ site: &str,
+ ids: Vec<String>,
+ ) -> Result<Vec<Question<String>>> {
+ let total = ids.len().to_string();
+ let endpoint = format!("questions/{ids}", ids = ids.join(";"));
+ let qs = self
+ .client
+ .get(stackexchange_url(&endpoint))
+ .header("Accepts", "application/json")
+ .query(&self.get_default_se_opts())
+ .query(&[("site", site), ("pagesize", &total), ("page", "1")])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Question<String>>>()
+ .await?
+ .items;
+ Ok(Self::preprocess(qs))
+ }
+
+ /// Search against the SE site's /search/advanced endpoint with a given query.
/// Only fetches questions that have at least one answer.
- async fn search_advanced_site(&self, site: &str, limit: u16) -> Result<Vec<Question<String>>> {
+ async fn se_search_advanced_site(
+ &self,
+ site: &str,
+ limit: u16,
+ ) -> Result<Vec<Question<String>>> {
let qs = self
.client
.get(stackexchange_url("search/advanced"))
.header("Accepts", "application/json")
- .query(&self.get_default_opts())
+ .query(&self.get_default_se_opts())
.query(&[
("q", self.query.as_str()),
("pagesize", &limit.to_string()),
@@ -167,7 +272,7 @@ impl StackExchange {
Ok(Self::preprocess(qs))
}
- fn get_default_opts(&self) -> HashMap<&str, &str> {
+ fn get_default_se_opts(&self) -> HashMap<&str, &str> {
let mut params = HashMap::new();
params.insert("filter", SE_FILTER);
if let Some(key) = &self.config.api_key {
@@ -178,155 +283,146 @@ impl StackExchange {
/// Sorts answers by score
/// Preprocess SE markdown to "cmark" markdown (or something closer to it)
+ /// This markdown preprocessing _always_ happens.
fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
- qs.par_iter()
+ qs.into_par_iter()
.map(|q| {
- let Question {
- id,
- score,
- title,
- answers,
- body,
- } = q;
- answers.to_vec().par_sort_unstable_by_key(|a| -a.score);
+ let mut answers = q.answers;
+ answers.par_sort_unstable_by_key(|a| -a.score);
let answers = answers
- .par_iter()
+ .into_par_iter()
.map(|a| Answer {
body: markdown::preprocess(a.body.clone()),
- ..*a
+ ..a
})
.collect();
Question {
answers,
- body: markdown::preprocess(body.to_string()),
- id: *id,
- score: *score,
- title: title.to_string(),
+ body: markdown::preprocess(q.body),
+ ..q
}
})
.collect::<Vec<_>>()
}
+}
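The switch from `par_iter` to `into_par_iter` above is what eliminates the field-by-field copying the removed code needed: owning iteration lets each closure consume its item instead of cloning it. In miniature (a sketch independent of this file's types):

    use rayon::prelude::*;

    let bodies = vec![String::from("# one"), String::from("# two")];
    // Owning parallel iteration: each `s` is moved in, so no clone is required.
    let processed: Vec<String> = bodies.into_par_iter().map(|s| s.to_uppercase()).collect();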
- /// Parse all markdown fields
- fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
- qs.par_iter()
- .map(|q| {
- let Question {
- id,
- score,
- title,
- answers,
- body,
- } = q;
- let body = markdown::parse(body);
- let answers = answers
- .par_iter()
- .map(|a| {
- let Answer {
- id,
- score,
- is_accepted,
- body,
- } = a;
- let body = markdown::parse(body);
- Answer {
- body,
- id: *id,
- score: *score,
- is_accepted: *is_accepted,
- }
- })
- .collect::<Vec<_>>();
- Question {
- body,
- answers,
- id: *id,
- score: *score,
- title: title.to_string(),
- }
- })
- .collect::<Vec<_>>()
- }
+/// Parse all markdown fields
+/// This only happens for content going into the cursive TUI (not the lucky prompt)
+fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
+ qs.into_par_iter()
+ .map(|q| {
+ let body = markdown::parse(q.body);
+ let answers = q
+ .answers
+ .into_par_iter()
+ .map(|a| {
+ let body = markdown::parse(a.body);
+ Answer {
+ body,
+ id: a.id,
+ score: a.score,
+ is_accepted: a.is_accepted,
+ }
+ })
+ .collect::<Vec<_>>();
+ Question {
+ body,
+ answers,
+ id: q.id,
+ score: q.score,
+ title: q.title,
+ }
+ })
+ .collect::<Vec<_>>()
}
impl LocalStorage {
- pub fn new() -> Result<Self> {
- let project = project_dir()?;
- let dir = project.cache_dir();
- fs::create_dir_all(&dir)?;
- Ok(LocalStorage {
- sites: None,
- filename: dir.join("sites.json"),
- })
+ fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
+ if let Some(file) = utils::open_file(filename)? {
+ return serde_json::from_reader(file)
+ .map_err(|_| Error::MalformedFile(filename.clone()));
+ }
+ Ok(None)
}
- // TODO inform user if we are downloading
- pub async fn sites(&mut self) -> Result<&Vec<Site>> {
- if self.sites.is_none() && !self.fetch_local_sites()? {
- self.fetch_remote_sites().await?;
- }
- match &self.sites {
- Some(sites) if sites.is_empty() => Err(Error::EmptySites),
- Some(sites) => Ok(sites),
- None => panic!("Code failure in site listing retrieval"),
+ // TODO decide whether or not I should give LocalStorage an api key..
+ async fn fetch_remote_sites() -> Result<Vec<Site>> {
+ let se_sites = Client::new()
+ .get(stackexchange_url("sites"))
+ .header("Accepts", "application/json")
+ .query(&[
+ ("pagesize", SE_SITES_PAGESIZE.to_string()),
+ ("page", "1".to_string()),
+ ])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Site>>()
+ .await?
+ .items;
+ Ok(se_sites
+ .into_par_iter()
+ .map(|site| {
+ let site_url = site.site_url.trim_start_matches("https://").to_string();
+ Site { site_url, ..site }
+ })
+ .collect())
+ }
+
+ fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
+ let file = utils::create_file(filename)?;
+ serde_json::to_writer(file, sites)?;
+ Ok(())
+ }
+
+ async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
+ if !update {
+ if let Some(sites) = Self::fetch_local_sites(filename)? {
+ return Ok(sites);
+ }
}
+ let sites = Self::fetch_remote_sites().await?;
+ Self::store_local_sites(filename, &sites)?;
+ Ok(sites)
}
- pub async fn update_sites(&mut self) -> Result<()> {
- self.fetch_remote_sites().await
+ pub async fn new(update: bool) -> Result<Self> {
+ let project = project_dir()?;
+ let dir = project.cache_dir();
+ fs::create_dir_all(&dir)?;
+ let sites_filename = dir.join("sites.json");
+ let sites = Self::init_sites(&sites_filename, update).await?;
+ Ok(LocalStorage { sites })
}
// TODO is this HM worth it? Probably only will ever have < 10 site codes to search...
+ // TODO store this as Option<HM> on self if other methods use it...
pub async fn find_invalid_site<'a, 'b>(
- &'b mut self,
+ &'b self,
site_codes: &'a [String],
- ) -> Result<Option<&'a String>> {
+ ) -> Option<&'a String> {
let hm: HashMap<&str, ()> = self
- .sites()
- .await?
+ .sites
.iter()
.map(|site| (site.api_site_parameter.as_str(), ()))
.collect();
- Ok(site_codes.iter().find(|s| !hm.contains_key(&s.as_str())))
+ site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
}
- fn fetch_local_sites(&mut self) -> Result<bool> {
- match utils::open_file(&self.filename)? {
- Some(file) => {
- self.sites = serde_json::from_reader(file)
- .map_err(|_| Error::MalformedFile(self.filename.clone()))?;
- Ok(true)
- }
- None => Ok(false),
- }
- }
-
- // TODO decide whether or not I should give LocalStorage an api key..
- async fn fetch_remote_sites(&mut self) -> Result<()> {
- self.sites = Some(
- Client::new()
- .get(stackexchange_url("sites"))
- .header("Accepts", "application/json")
- .query(&[
- ("pagesize", SE_SITES_PAGESIZE.to_string()),
- ("page", "1".to_string()),
- ])
- .send()
- .await?
- .json::<ResponseWrapper<Site>>()
- .await?
- .items,
- );
- self.store_local_sites()
- }
-
- fn store_local_sites(&self) -> Result<()> {
- let file = utils::create_file(&self.filename)?;
- Ok(serde_json::to_writer(file, &self.sites)?)
+ pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
+ self.sites
+ .iter()
+ .filter_map(move |site| {
+ let _ = site_codes
+ .iter()
+ .find(|&sc| *sc == site.api_site_parameter)?;
+ Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
+ })
+ .collect()
}
}
-/// Creates stackexchange API url given endpoint; can technically panic
+/// Creates stackexchange API url given endpoint
+// TODO lazy static this url parse
fn stackexchange_url(path: &str) -> Url {
let mut url = Url::parse(SE_API_URL).unwrap();
url.path_segments_mut()
@@ -336,6 +432,108 @@ fn stackexchange_url(path: &str) -> Url {
url
}
+/// Creates duckduckgo search url given sites and query
+/// See https://duckduckgo.com/params for more info
+fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
+where
+ I: IntoIterator<Item = &'a String>,
+{
+ let mut q = String::new();
+ // Restrict to sites
+ q.push('(');
+ q.push_str(
+ sites
+ .into_iter()
+ .map(|site| String::from("site:") + site)
+ .collect::<Vec<_>>()
+ .join(" OR ")
+ .as_str(),
+ );
+ q.push_str(") ");
+ // Search terms
+ q.push_str(
+ query
+ .trim_end_matches('?')
+ .split_whitespace()
+ .collect::<Vec<_>>()
+ .join(" ")
+ .as_str(),
+ );
+ Url::parse_with_params(
+ DUCKDUCKGO_URL,
+ &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
+ )
+ .unwrap()
+}
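Concretely, mirroring the unit test at the bottom of this file, the builder turns a query plus two sites into a single encoded `q` parameter:

    let sites = vec![
        String::from("stackoverflow.com"),
        String::from("unix.stackexchange.com"),
    ];
    // q before encoding: (site:stackoverflow.com OR site:unix.stackexchange.com) how do I exit vim
    let url = duckduckgo_url("how do I exit vim?", &sites);
    assert_eq!(
        url.as_str(),
        "https://duckduckgo.com/?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29+how+do+I+exit+vim&kz=-1&kh=-1"
    );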
+
+/// Parse (site, question_id) pairs out of duckduckgo search results html
+/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order!
+/// maybe this is as simple as a HashMap {qid: ordinal}
+fn parse_questions_from_ddg_html<'a>(
+ html: &'a str,
+ sites: &'a HashMap<String, String>,
+ limit: u16,
+) -> Result<HashMap<String, Vec<String>>> {
+ let fragment = Html::parse_document(html);
+ let anchors = Selector::parse("a.result__a").unwrap();
+ let mut qids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut count = 0;
+ for anchor in fragment.select(&anchors) {
+ let url = anchor
+ .value()
+ .attr("href")
+ .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+ .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+ sites
+ .iter()
+ .find_map(|(site_code, site_url)| {
+ let id = question_url_to_id(site_url, &url)?;
+ match qids.entry(site_code.to_owned()) {
+ Entry::Occupied(mut o) => o.get_mut().push(id),
+ Entry::Vacant(o) => {
+ o.insert(vec![id]);
+ }
+ }
+ count += 1;
+ Some(())
+ })
+ .ok_or_else(|| {
+ Error::ScrapingError(
+ "Duckduckgo returned results outside of SE network".to_string(),
+ )
+ })?;
+ if count >= limit as usize {
+ break;
+ }
+ }
+ // It doesn't seem possible for DDG to return no results, so assume this is
+ // a bad user agent
+ if count == 0 {
+ Err(Error::ScrapingError(String::from(
+ "DuckDuckGo blocked this request",
+ )))
+ } else {
+ Ok(qids)
+ }
+}
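For reference, the markup shape this parser expects from DuckDuckGo's HTML results, abridged; an assumption inferred from the `a.result__a` selector and the test fixtures, not a verbatim response:

    let mut sites = HashMap::new();
    sites.insert(String::from("stackoverflow"), String::from("stackoverflow.com"));
    let html = r#"<a class="result__a" href="/l/?kh=-1&uddg=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F11828270%2Fhow-do-i-exit-the-vim-editor">How do I exit the Vim editor?</a>"#;
    // The href percent-decodes to .../questions/11828270/..., from which
    // question_url_to_id extracts "11828270".
    let ids = parse_questions_from_ddg_html(html, &sites, 1)?;
    assert_eq!(ids["stackoverflow"], vec![String::from("11828270")]);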
+
+/// For example
+/// ```
+/// let site_url = "stackoverflow.com";
+/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
+/// assert_eq!(question_url_to_id(site_url, input), Some(String::from("11828270")))
+/// ```
+fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
+ // TODO use strip_prefix once it's stable
+ let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/";
+ let ix = input.find(&fragment)? + fragment.len();
+ let input = &input[ix..];
+ let end = input.find('/')?;
+ Some(input[0..end].to_string())
+}
+
+// TODO figure out a query that returns no results so that I can test it and differentiate it from
+// a blocked request
#[cfg(test)]
mod tests {
use super::*;
@@ -346,4 +544,76 @@ mod tests {
"http://api.stackexchange.com/2.2/some/endpoint"
)
}
+
+ #[test]
+ fn test_duckduckgo_url() {
+ let q = "how do I exit vim?";
+ let sites = vec![
+ String::from("stackoverflow.com"),
+ String::from("unix.stackexchange.com"),
+ ];
+ assert_eq!(
+ duckduckgo_url(q, &sites).as_str(),
+ String::from(
+ "https://duckduckgo.com/\
+ ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
+ +how+do+I+exit+vim&kz=-1&kh=-1"
+ )
+ )
+ }
+
+ #[test]
+ fn test_duckduckgo_response() {
+ // TODO make sure results are either 1) answers 2) failed connection 3) blocked
+ }
+
+ #[test]
+ fn test_duckduckgo_parser() {
+ let html = include_str!("../test/exit-vim.html");
+ let sites = vec![
+ ("stackoverflow", "stackoverflow.com"),
+ ("askubuntu", "askubuntu.com"),
+ ]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v.to_string()))
+ .collect::<HashMap<String, String>>();
+ let mut expected_question_ids = HashMap::new();
+ expected_question_ids.insert(
+ "stackoverflow".to_string(),
+ vec!["11828270".to_string(), "9171356".to_string()],
+ );
+ expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]);
+ assert_eq!(
+ parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
+ expected_question_ids
+ );
+ }
+
+ #[test]
+ fn test_duckduckgo_blocker() -> Result<(), String> {
+ let html = include_str!("../test/bad-user-agent.html");
+ let mut sites = HashMap::new();
+ sites.insert(
+ String::from("stackoverflow"),
+ String::from("stackoverflow.com"),
+ );
+
+ match parse_questions_from_ddg_html(html, &sites, 2) {
+ Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request" => {
+ Ok(())
+ }
+ _ => Err(String::from("Failed to detect DuckDuckGo blocker")),
+ }
+ }
+
+ #[test]
+ fn test_question_url_to_id() {
+ let site_url = "stackoverflow.com";
+ let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
+ assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270");
+
+ let site_url = "stackoverflow.com";
+ let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line";
+ assert_eq!(question_url_to_id(site_url, input), None);
+ }
}