author | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-22 21:59:20 -0700
---|---|---
committer | Sam Tay <sam.chong.tay@gmail.com> | 2020-06-23 19:22:49 -0700
commit | 0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba | (patch)
tree | 6b90e68fd3db4e8b8c6334882ec2872f12402109 | /src/stackexchange.rs
parent | fdc4092d0276259c47a14cf2cc52c933fec633e4 | (diff)
Add duckduckgo search engine
Diffstat (limited to 'src/stackexchange.rs')
-rw-r--r-- | src/stackexchange.rs | 540
1 file changed, 405 insertions, 135 deletions
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 1d4789a..2939c29 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -1,8 +1,13 @@
 use futures::stream::StreamExt;
+use percent_encoding::percent_decode_str;
 use rayon::prelude::*;
+use reqwest::header;
 use reqwest::Client;
 use reqwest::Url;
+use scraper::html::Html;
+use scraper::selector::Selector;
 use serde::{Deserialize, Serialize};
+use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::fs;
 use std::path::PathBuf;
@@ -13,7 +18,11 @@ use crate::tui::markdown;
 use crate::tui::markdown::Markdown;
 use crate::utils;
 
+/// DuckDuckGo URL
+const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+
 /// StackExchange API v2.2 URL
+// TODO why not https?
 const SE_API_URL: &str = "http://api.stackexchange.com";
 const SE_API_VERSION: &str = "2.2";
 
@@ -28,6 +37,11 @@ const SE_SITES_PAGESIZE: u16 = 10000;
 /// Limit on concurrent requests (gets passed to `buffer_unordered`)
 const CONCURRENT_REQUESTS_LIMIT: usize = 8;
 
+/// Mock user agent to get real DuckDuckGo results
+// TODO copy other user agents and use a random one each time
+const USER_AGENT: &str =
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";
+
 /// This structure allows interacting with parts of the StackExchange
 /// API, using the `Config` struct to determine certain API settings and options.
 // TODO should my se structs have &str instead of String?
@@ -35,13 +49,13 @@ const CONCURRENT_REQUESTS_LIMIT: usize = 8;
 pub struct StackExchange {
     client: Client,
     config: Config,
+    sites: HashMap<String, String>,
     query: String,
 }
 
 /// This structure allows interacting with locally cached StackExchange metadata.
 pub struct LocalStorage {
-    sites: Option<Vec<Site>>,
-    filename: PathBuf,
+    pub sites: Vec<Site>,
 }
 
 #[derive(Deserialize, Serialize, Debug)]
@@ -84,24 +98,36 @@ struct ResponseWrapper<T> {
 }
 
 impl StackExchange {
-    pub fn new(config: Config, query: String) -> Self {
+    pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
         let client = Client::new();
         StackExchange {
             client,
+            sites: local_storage.get_urls(&config.sites),
             config,
             query,
         }
     }
 
-    /// Search query at stack exchange and get the top answer body
+    /// Search query and get the top answer body
     ///
-    /// For now, use only the first configured site, since, parodoxically, sites
-    /// with the worst results will finish executing first, since there's less
-    /// data to retrieve.
-    pub async fn search_lucky(&self) -> Result<String> {
-        Ok(self
-            .search_advanced_site(self.config.sites.iter().next().unwrap(), 1)
-            .await?
+    /// For the StackExchange engine, use only the first configured site,
+    /// since, paradoxically, sites with the worst results will finish
+    /// executing first, because there's less data to retrieve.
+    ///
+    /// Needs mut because it temporarily changes self.config
+    pub async fn search_lucky(&mut self) -> Result<String> {
+        let original_config = self.config.clone();
+        // Temp set lucky config
+        self.config.limit = 1;
+        if !self.config.duckduckgo {
+            self.config.sites.truncate(1);
+        }
+        // Run search with temp config
+        let result = self.search().await;
+        // Reset config
+        self.config = original_config;
+
+        Ok(result?
            .into_iter()
            .next()
            .ok_or(Error::NoResults)?
@@ -112,19 +138,71 @@ impl StackExchange {
             .body)
     }
 
-    /// Search query at stack exchange and get a list of relevant questions
-    pub async fn search(&self) -> Result<Vec<Question<Markdown>>> {
-        self.search_advanced(self.config.limit).await
+    /// Search and parse to Markdown for TUI
+    pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> {
+        Ok(parse_markdown(self.search().await?))
+    }
+
+    /// Search query and get a list of relevant questions
+    pub async fn search(&self) -> Result<Vec<Question<String>>> {
+        if self.config.duckduckgo {
+            self.search_duckduck_go().await
+        } else {
+            // TODO after duckduckgo is finished, refactor to _not_ thread this limit; it's unnecessary
+            self.se_search_advanced(self.config.limit).await
+        }
     }
 
-    /// Parallel searches against the search/advanced endpoint across all configured sites
-    async fn search_advanced(&self, limit: u16) -> Result<Vec<Question<Markdown>>> {
+    /// Search query at duckduckgo and then fetch the resulting questions from SE.
+    async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> {
+        let url = duckduckgo_url(&self.query, self.sites.values());
+        let html = self
+            .client
+            .get(url)
+            .header(header::USER_AGENT, USER_AGENT)
+            .send()
+            .await?
+            .text()
+            .await?;
+        let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+        self.se_questions(ids).await
+    }
+
+    /// Parallel searches against the SE question endpoint across the sites in `ids`.
+    // TODO I'm sure there is a way to DRY the se_questions & se_search_advanced functions
+    async fn se_questions(
+        &self,
+        ids: HashMap<String, Vec<String>>,
+    ) -> Result<Vec<Question<String>>> {
+        futures::stream::iter(ids)
+            .map(|(site, ids)| {
+                let clone = self.clone();
+                tokio::spawn(async move {
+                    let clone = &clone;
+                    clone.se_questions_site(&site, ids).await
+                })
+            })
+            .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
+            .collect::<Vec<_>>()
+            .await
+            .into_iter()
+            .map(|r| r.map_err(Error::from).and_then(|x| x))
+            .collect::<Result<Vec<Vec<_>>>>()
+            .map(|v| {
+                let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+                // TODO sort by original ordering !
+                qs
+            })
+    }
+
+    /// Parallel searches against the SE search/advanced endpoint across all configured sites
+    async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> {
         futures::stream::iter(self.config.sites.clone())
             .map(|site| {
                 let clone = self.clone();
                 tokio::spawn(async move {
                     let clone = &clone;
-                    clone.search_advanced_site(&site, limit).await
+                    clone.se_search_advanced_site(&site, limit).await
                 })
            })
            .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
@@ -138,18 +216,45 @@ impl StackExchange {
             if self.config.sites.len() > 1 {
                 qs.sort_unstable_by_key(|q| -q.score);
             }
-            Self::parse_markdown(qs)
+            qs
         })
     }
 
-    /// Search against the site's search/advanced endpoint with a given query.
+    /// Search against the SE site's /questions/{ids} endpoint.
+    /// Filters out questions with no answers.
+    async fn se_questions_site(
+        &self,
+        site: &str,
+        ids: Vec<String>,
+    ) -> Result<Vec<Question<String>>> {
+        let total = ids.len().to_string();
+        let endpoint = format!("questions/{ids}", ids = ids.join(";"));
+        let qs = self
+            .client
+            .get(stackexchange_url(&endpoint))
+            .header("Accepts", "application/json")
+            .query(&self.get_default_se_opts())
+            .query(&[("site", site), ("pagesize", &total), ("page", "1")])
+            .send()
+            .await?
+            .json::<ResponseWrapper<Question<String>>>()
+            .await?
+            .items;
+        Ok(Self::preprocess(qs))
+    }
+
+    /// Search against the SE site's /search/advanced endpoint with a given query.
     /// Only fetches questions that have at least one answer.
-    async fn search_advanced_site(&self, site: &str, limit: u16) -> Result<Vec<Question<String>>> {
+    async fn se_search_advanced_site(
+        &self,
+        site: &str,
+        limit: u16,
+    ) -> Result<Vec<Question<String>>> {
         let qs = self
             .client
             .get(stackexchange_url("search/advanced"))
             .header("Accepts", "application/json")
-            .query(&self.get_default_opts())
+            .query(&self.get_default_se_opts())
             .query(&[
                 ("q", self.query.as_str()),
                 ("pagesize", &limit.to_string()),
@@ -167,7 +272,7 @@ impl StackExchange {
         Ok(Self::preprocess(qs))
     }
 
-    fn get_default_opts(&self) -> HashMap<&str, &str> {
+    fn get_default_se_opts(&self) -> HashMap<&str, &str> {
         let mut params = HashMap::new();
         params.insert("filter", SE_FILTER);
         if let Some(key) = &self.config.api_key {
@@ -178,155 +283,146 @@ impl StackExchange {
 
     /// Sorts answers by score
     /// Preprocess SE markdown to "cmark" markdown (or something closer to it)
+    /// This markdown preprocess _always_ happens.
     fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
-        qs.par_iter()
+        qs.into_par_iter()
             .map(|q| {
-                let Question {
-                    id,
-                    score,
-                    title,
-                    answers,
-                    body,
-                } = q;
-                answers.to_vec().par_sort_unstable_by_key(|a| -a.score);
+                let mut answers = q.answers;
+                answers.par_sort_unstable_by_key(|a| -a.score);
                 let answers = answers
-                    .par_iter()
+                    .into_par_iter()
                     .map(|a| Answer {
                         body: markdown::preprocess(a.body.clone()),
-                        ..*a
+                        ..a
                     })
                     .collect();
                 Question {
                     answers,
-                    body: markdown::preprocess(body.to_string()),
-                    id: *id,
-                    score: *score,
-                    title: title.to_string(),
+                    body: markdown::preprocess(q.body),
+                    ..q
                 }
             })
             .collect::<Vec<_>>()
     }
+}
 
-    /// Parse all markdown fields
-    fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
-        qs.par_iter()
-            .map(|q| {
-                let Question {
-                    id,
-                    score,
-                    title,
-                    answers,
-                    body,
-                } = q;
-                let body = markdown::parse(body);
-                let answers = answers
-                    .par_iter()
-                    .map(|a| {
-                        let Answer {
-                            id,
-                            score,
-                            is_accepted,
-                            body,
-                        } = a;
-                        let body = markdown::parse(body);
-                        Answer {
-                            body,
-                            id: *id,
-                            score: *score,
-                            is_accepted: *is_accepted,
-                        }
-                    })
-                    .collect::<Vec<_>>();
-                Question {
-                    body,
-                    answers,
-                    id: *id,
-                    score: *score,
-                    title: title.to_string(),
-                }
-            })
-            .collect::<Vec<_>>()
-    }
+/// Parse all markdown fields
+/// This only happens for content going into the cursive TUI (not the lucky prompt)
+fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
+    qs.into_par_iter()
+        .map(|q| {
+            let body = markdown::parse(q.body);
+            let answers = q
+                .answers
+                .into_par_iter()
+                .map(|a| {
+                    let body = markdown::parse(a.body);
+                    Answer {
+                        body,
+                        id: a.id,
+                        score: a.score,
+                        is_accepted: a.is_accepted,
+                    }
+                })
+                .collect::<Vec<_>>();
+            Question {
+                body,
+                answers,
+                id: q.id,
+                score: q.score,
+                title: q.title,
+            }
+        })
+        .collect::<Vec<_>>()
 }
 
 impl LocalStorage {
-    pub fn new() -> Result<Self> {
-        let project = project_dir()?;
-        let dir = project.cache_dir();
-        fs::create_dir_all(&dir)?;
-        Ok(LocalStorage {
-            sites: None,
-            filename: dir.join("sites.json"),
-        })
+    fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
+        if let Some(file) = utils::open_file(filename)? {
+            return serde_json::from_reader(file)
+                .map_err(|_| Error::MalformedFile(filename.clone()));
+        }
+        Ok(None)
     }
 
-    // TODO inform user if we are downloading
-    pub async fn sites(&mut self) -> Result<&Vec<Site>> {
-        if self.sites.is_none() && !self.fetch_local_sites()? {
-            self.fetch_remote_sites().await?;
-        }
-        match &self.sites {
-            Some(sites) if sites.is_empty() => Err(Error::EmptySites),
-            Some(sites) => Ok(sites),
-            None => panic!("Code failure in site listing retrieval"),
+    // TODO decide whether or not I should give LocalStorage an api key..
+    async fn fetch_remote_sites() -> Result<Vec<Site>> {
+        let se_sites = Client::new()
+            .get(stackexchange_url("sites"))
+            .header("Accepts", "application/json")
+            .query(&[
+                ("pagesize", SE_SITES_PAGESIZE.to_string()),
+                ("page", "1".to_string()),
+            ])
+            .send()
+            .await?
+            .json::<ResponseWrapper<Site>>()
+            .await?
+            .items;
+        Ok(se_sites
+            .into_par_iter()
+            .map(|site| {
+                let site_url = site.site_url.trim_start_matches("https://").to_string();
+                Site { site_url, ..site }
+            })
+            .collect())
+    }
+
+    fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
+        let file = utils::create_file(filename)?;
+        serde_json::to_writer(file, sites)?;
+        Ok(())
+    }
+
+    async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
+        if !update {
+            if let Some(sites) = Self::fetch_local_sites(filename)? {
+                return Ok(sites);
+            }
         }
+        let sites = Self::fetch_remote_sites().await?;
+        Self::store_local_sites(filename, &sites)?;
+        Ok(sites)
     }
 
-    pub async fn update_sites(&mut self) -> Result<()> {
-        self.fetch_remote_sites().await
+    pub async fn new(update: bool) -> Result<Self> {
+        let project = project_dir()?;
+        let dir = project.cache_dir();
+        fs::create_dir_all(&dir)?;
+        let sites_filename = dir.join("sites.json");
+        let sites = Self::init_sites(&sites_filename, update).await?;
+        Ok(LocalStorage { sites })
     }
 
     // TODO is this HM worth it? Probably only will ever have < 10 site codes to search...
+    // TODO store this as Option<HM> on self if other methods use it...
     pub async fn find_invalid_site<'a, 'b>(
-        &'b mut self,
+        &'b self,
         site_codes: &'a [String],
-    ) -> Result<Option<&'a String>> {
+    ) -> Option<&'a String> {
         let hm: HashMap<&str, ()> = self
-            .sites()
-            .await?
+            .sites
            .iter()
            .map(|site| (site.api_site_parameter.as_str(), ()))
            .collect();
-        Ok(site_codes.iter().find(|s| !hm.contains_key(&s.as_str())))
+        site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
    }

-    fn fetch_local_sites(&mut self) -> Result<bool> {
-        match utils::open_file(&self.filename)? {
-            Some(file) => {
-                self.sites = serde_json::from_reader(file)
-                    .map_err(|_| Error::MalformedFile(self.filename.clone()))?;
-                Ok(true)
-            }
-            None => Ok(false),
-        }
-    }
-
-    // TODO decide whether or not I should give LocalStorage an api key..
-    async fn fetch_remote_sites(&mut self) -> Result<()> {
-        self.sites = Some(
-            Client::new()
-                .get(stackexchange_url("sites"))
-                .header("Accepts", "application/json")
-                .query(&[
-                    ("pagesize", SE_SITES_PAGESIZE.to_string()),
-                    ("page", "1".to_string()),
-                ])
-                .send()
-                .await?
-                .json::<ResponseWrapper<Site>>()
-                .await?
-                .items,
-        );
-        self.store_local_sites()
-    }
-
-    fn store_local_sites(&self) -> Result<()> {
-        let file = utils::create_file(&self.filename)?;
-        Ok(serde_json::to_writer(file, &self.sites)?)
+    pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
+        self.sites
+            .iter()
+            .filter_map(move |site| {
+                let _ = site_codes
+                    .iter()
+                    .find(|&sc| *sc == site.api_site_parameter)?;
+                Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
+            })
+            .collect()
     }
 }
 
-/// Creates stackexchange API url given endpoint; can technically panic
+/// Creates stackexchange API url given endpoint
+// TODO lazy static this url parse
 fn stackexchange_url(path: &str) -> Url {
     let mut url = Url::parse(SE_API_URL).unwrap();
     url.path_segments_mut()
@@ -336,6 +432,108 @@ fn stackexchange_url(path: &str) -> Url {
     url
 }
 
+/// Creates duckduckgo search url given sites and query
+/// See https://duckduckgo.com/params for more info
+fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
+where
+    I: IntoIterator<Item = &'a String>,
+{
+    let mut q = String::new();
+    // Restrict to sites
+    q.push('(');
+    q.push_str(
+        sites
+            .into_iter()
+            .map(|site| String::from("site:") + site)
+            .collect::<Vec<_>>()
+            .join(" OR ")
+            .as_str(),
+    );
+    q.push_str(") ");
+    // Search terms
+    q.push_str(
+        query
+            .trim_end_matches('?')
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ")
+            .as_str(),
+    );
+    Url::parse_with_params(
+        DUCKDUCKGO_URL,
+        &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
+    )
+    .unwrap()
+}
+
+/// Parse (site, question_id) pairs out of duckduckgo search results html
+/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order !
+/// maybe this is as simple as a HashMap {qid: ordinal}
+fn parse_questions_from_ddg_html<'a>(
+    html: &'a str,
+    sites: &'a HashMap<String, String>,
+    limit: u16,
+) -> Result<HashMap<String, Vec<String>>> {
+    let fragment = Html::parse_document(html);
+    let anchors = Selector::parse("a.result__a").unwrap();
+    let mut qids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut count = 0;
+    for anchor in fragment.select(&anchors) {
+        let url = anchor
+            .value()
+            .attr("href")
+            .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+            .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+        sites
+            .iter()
+            .find_map(|(site_code, site_url)| {
+                let id = question_url_to_id(site_url, &url)?;
+                match qids.entry(site_code.to_owned()) {
+                    Entry::Occupied(mut o) => o.get_mut().push(id),
+                    Entry::Vacant(o) => {
+                        o.insert(vec![id]);
+                    }
+                }
+                count += 1;
+                Some(())
+            })
+            .ok_or_else(|| {
+                Error::ScrapingError(
+                    "Duckduckgo returned results outside of SE network".to_string(),
+                )
+            })?;
+        if count >= limit as usize {
+            break;
+        }
+    }
+    // It doesn't seem possible for DDG to return no results, so assume this is
+    // a bad user agent
+    if count == 0 {
+        Err(Error::ScrapingError(String::from(
+            "DuckDuckGo blocked this request",
+        )))
+    } else {
+        Ok(qids)
+    }
+}
+
+/// For example
+/// ```
+/// let site_url = "stackoverflow.com";
+/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
+/// assert_eq!(question_url_to_id(site_url, input), "11828270")
+/// ```
+fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
+    // TODO use strip_prefix once it's stable
+    let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/";
+    let ix = input.find(&fragment)? + fragment.len();
+    let input = &input[ix..];
+    let end = input.find('/')?;
+    Some(input[0..end].to_string())
+}
+
+// TODO figure out a query that returns no results so that I can test it and differentiate it from
+// a blocked request
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -346,4 +544,76 @@
             "http://api.stackexchange.com/2.2/some/endpoint"
         )
     }
+
+    #[test]
+    fn test_duckduckgo_url() {
+        let q = "how do I exit vim?";
+        let sites = vec![
+            String::from("stackoverflow.com"),
+            String::from("unix.stackexchange.com"),
+        ];
+        assert_eq!(
+            duckduckgo_url(q, &sites).as_str(),
+            String::from(
+                "https://duckduckgo.com/\
+                 ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
+                 +how+do+I+exit+vim&kz=-1&kh=-1"
+            )
+        )
+    }
+
+    #[test]
+    fn test_duckduckgo_response() {
+        // TODO make sure results are either 1) answers 2) failed connection 3) blocked
+    }
+
+    #[test]
+    fn test_duckduckgo_parser() {
+        let html = include_str!("../test/exit-vim.html");
+        let sites = vec![
+            ("stackoverflow", "stackoverflow.com"),
+            ("askubuntu", "askubuntu.com"),
+        ]
+        .into_iter()
+        .map(|(k, v)| (k.to_string(), v.to_string()))
+        .collect::<HashMap<String, String>>();
+        let mut expected_question_ids = HashMap::new();
+        expected_question_ids.insert(
+            "stackoverflow".to_string(),
+            vec!["11828270".to_string(), "9171356".to_string()],
+        );
+        expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]);
+        assert_eq!(
+            parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
+            expected_question_ids
+        );
+    }
+
+    #[test]
+    fn test_duckduckgo_blocker() -> Result<(), String> {
+        let html = include_str!("../test/bad-user-agent.html");
+        let mut sites = HashMap::new();
+        sites.insert(
+            String::from("stackoverflow"),
+            String::from("stackoverflow.com"),
+        );
+
+        match parse_questions_from_ddg_html(html, &sites, 2) {
+            Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => {
+                Ok(())
+            }
+            _ => Err(String::from("Failed to detect DuckDuckGo blocker")),
+        }
+    }
+
+    #[test]
+    fn test_question_url_to_id() {
+        let site_url = "stackoverflow.com";
+        let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
+        assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270");
+
+        let site_url = "stackoverflow.com";
+        let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line";
+        assert_eq!(question_url_to_id(site_url, input), None);
+    }
 }