diff options
Diffstat (limited to 'src/stackexchange.rs')
-rw-r--r-- | src/stackexchange.rs | 648 |
1 files changed, 0 insertions, 648 deletions
diff --git a/src/stackexchange.rs b/src/stackexchange.rs deleted file mode 100644 index a5f59e9..0000000 --- a/src/stackexchange.rs +++ /dev/null @@ -1,648 +0,0 @@ -use futures::stream::StreamExt; -use percent_encoding::percent_decode_str; -use rayon::prelude::*; -use reqwest::header; -use reqwest::Client; -use reqwest::Url; -use scraper::html::Html; -use scraper::selector::Selector; -use serde::{Deserialize, Serialize}; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::fs; -use std::path::PathBuf; - -use crate::config::{project_dir, Config}; -use crate::error::{Error, Result}; -use crate::tui::markdown; -use crate::tui::markdown::Markdown; -use crate::utils; - -/// DuckDuckGo URL -const DUCKDUCKGO_URL: &str = "https://duckduckgo.com"; - -/// StackExchange API v2.2 URL -// TODO why not https? -const SE_API_URL: &str = "http://api.stackexchange.com"; -const SE_API_VERSION: &str = "2.2"; - -/// Filter generated to include only the fields needed to populate -/// the structs below. Go here to make new filters: -/// [create filter](https://api.stackexchange.com/docs/create-filter). -const SE_FILTER: &str = ".DND5X2VHHUH8HyJzpjo)5NvdHI3w6auG"; - -/// Pagesize when fetching all SE sites. Should be good for many years... -const SE_SITES_PAGESIZE: u16 = 10000; - -/// Limit on concurrent requests (gets passed to `buffer_unordered`) -const CONCURRENT_REQUESTS_LIMIT: usize = 8; - -/// Mock user agent to get real DuckDuckGo results -// TODO copy other user agents and use random one each time -const USER_AGENT: &str = - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0"; - -/// This structure allows interacting with parts of the StackExchange -/// API, using the `Config` struct to determine certain API settings and options. -#[derive(Clone)] -pub struct StackExchange { - client: Client, - config: Config, - sites: HashMap<String, String>, - query: String, -} - -/// This structure allows interacting with locally cached StackExchange metadata. -pub struct LocalStorage { - pub sites: Vec<Site>, -} - -#[derive(Deserialize, Serialize, Debug)] -pub struct Site { - pub api_site_parameter: String, - pub site_url: String, -} - -/// Represents a StackExchange answer with a custom selection of fields from -/// the [StackExchange docs](https://api.stackexchange.com/docs/types/answer) -#[derive(Clone, Deserialize, Debug)] -pub struct Answer<S> { - #[serde(rename = "answer_id")] - pub id: u32, - pub score: i32, - #[serde(rename = "body_markdown")] - pub body: S, - pub is_accepted: bool, -} - -/// Represents a StackExchange question with a custom selection of fields from -/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question) -// TODO container over answers should be generic iterator -#[derive(Clone, Deserialize, Debug)] -pub struct Question<S> { - #[serde(rename = "question_id")] - pub id: u32, - pub score: i32, - pub answers: Vec<Answer<S>>, - pub title: String, - #[serde(rename = "body_markdown")] - pub body: S, -} - -/// Internal struct that represents the boilerplate response wrapper from SE API. -#[derive(Deserialize, Debug)] -struct ResponseWrapper<T> { - items: Vec<T>, -} - -// Iss question_id unique across all sites? If not, then this edge case is -// unaccounted for when sorting. -// -// If this is ever an issue, it wouldn't be too hard to account for this; just -// keep track of site in the `ordering` field and also return site from the -// spawned per-site tasks. -#[derive(Debug, PartialEq)] -struct ScrapedData { - /// Mapping of site code to question ids - question_ids: HashMap<String, Vec<String>>, - /// Mapping of question_id to its ordinal place in search results - ordering: HashMap<String, usize>, -} - -impl StackExchange { - pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self { - let client = Client::new(); - StackExchange { - client, - sites: local_storage.get_urls(&config.sites), - config, - query, - } - } - - /// Search query and get the top answer body - /// - /// For StackExchange engine, use only the first configured site, - /// since, parodoxically, sites with the worst results will finish - /// executing first, because there's less data to retrieve. - /// - /// Needs mut because it temporarily changes self.config - pub async fn search_lucky(&mut self) -> Result<String> { - let original_config = self.config.clone(); - // Temp set lucky config - self.config.limit = 1; - if !self.config.duckduckgo { - self.config.sites.truncate(1); - } - // Run search with temp config - let result = self.search().await; - // Reset config - self.config = original_config; - - Ok(result? - .into_iter() - .next() - .ok_or(Error::NoResults)? - .answers - .into_iter() - .next() - .ok_or_else(|| Error::StackExchange(String::from("Received question with no answers")))? - .body) - } - - /// Search and parse to Markdown for TUI - pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> { - Ok(parse_markdown(self.search().await?)) - } - - /// Search query and get a list of relevant questions - pub async fn search(&self) -> Result<Vec<Question<String>>> { - if self.config.duckduckgo { - self.search_duckduck_go().await - } else { - // TODO after duckduck go finished, refactor to _not_ thread this limit, its unnecessary - self.se_search_advanced(self.config.limit).await - } - } - - /// Search query at duckduckgo and then fetch the resulting questions from SE. - async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> { - let url = duckduckgo_url(&self.query, self.sites.values()); - let html = self - .client - .get(url) - .header(header::USER_AGENT, USER_AGENT) - .send() - .await? - .text() - .await?; - let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?; - self.se_questions(data).await - } - - /// Parallel searches against the SE question endpoint across the sites in `ids`. - // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions - async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> { - let ScrapedData { - question_ids, - ordering, - } = data; - futures::stream::iter(question_ids) - .map(|(site, ids)| { - let clone = self.clone(); - tokio::spawn(async move { - let clone = &clone; - clone.se_questions_site(&site, ids).await - }) - }) - .buffer_unordered(CONCURRENT_REQUESTS_LIMIT) - .collect::<Vec<_>>() - .await - .into_iter() - .map(|r| r.map_err(Error::from).and_then(|x| x)) - .collect::<Result<Vec<Vec<_>>>>() - .map(|v| { - let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect(); - qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap()); - qs - }) - } - - /// Parallel searches against the SE search/advanced endpoint across all configured sites - async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> { - futures::stream::iter(self.config.sites.clone()) - .map(|site| { - let clone = self.clone(); - tokio::spawn(async move { - let clone = &clone; - clone.se_search_advanced_site(&site, limit).await - }) - }) - .buffer_unordered(CONCURRENT_REQUESTS_LIMIT) - .collect::<Vec<_>>() - .await - .into_iter() - .map(|r| r.map_err(Error::from).and_then(|x| x)) - .collect::<Result<Vec<Vec<_>>>>() - .map(|v| { - let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect(); - if self.config.sites.len() > 1 { - qs.sort_unstable_by_key(|q| -q.score); - } - qs - }) - } - - /// Search against the SE site's /questions/{ids} endpoint. - /// Filters out questions with no answers. - async fn se_questions_site( - &self, - site: &str, - ids: Vec<String>, - ) -> Result<Vec<Question<String>>> { - let total = ids.len().to_string(); - let endpoint = format!("questions/{ids}", ids = ids.join(";")); - let qs = self - .client - .get(stackexchange_url(&endpoint)) - .header("Accepts", "application/json") - .query(&self.get_default_se_opts()) - .query(&[("site", site), ("pagesize", &total), ("page", "1")]) - .send() - .await? - .json::<ResponseWrapper<Question<String>>>() - .await? - .items; - Ok(Self::preprocess(qs)) - } - - /// Search against the SE site's /search/advanced endpoint with a given query. - /// Only fetches questions that have at least one answer. - async fn se_search_advanced_site( - &self, - site: &str, - limit: u16, - ) -> Result<Vec<Question<String>>> { - let qs = self - .client - .get(stackexchange_url("search/advanced")) - .header("Accepts", "application/json") - .query(&self.get_default_se_opts()) - .query(&[ - ("q", self.query.as_str()), - ("pagesize", &limit.to_string()), - ("site", site), - ("page", "1"), - ("answers", "1"), - ("order", "desc"), - ("sort", "relevance"), - ]) - .send() - .await? - .json::<ResponseWrapper<Question<String>>>() - .await? - .items; - Ok(Self::preprocess(qs)) - } - - fn get_default_se_opts(&self) -> HashMap<&str, &str> { - let mut params = HashMap::new(); - params.insert("filter", SE_FILTER); - if let Some(key) = &self.config.api_key { - params.insert("key", &key); - } - params - } - - /// Sorts answers by score - /// Preprocess SE markdown to "cmark" markdown (or something closer to it) - /// This markdown preprocess _always_ happens. - fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> { - qs.into_par_iter() - .map(|q| { - let mut answers = q.answers; - answers.par_sort_unstable_by_key(|a| -a.score); - let answers = answers - .into_par_iter() - .map(|a| Answer { - body: markdown::preprocess(a.body.clone()), - ..a - }) - .collect(); - Question { - answers, - body: markdown::preprocess(q.body), - ..q - } - }) - .collect::<Vec<_>>() - } -} - -/// Parse all markdown fields -/// This only happens for content going into the cursive TUI (not lucky prompt) -fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> { - qs.into_par_iter() - .map(|q| { - let body = markdown::parse(q.body); - let answers = q - .answers - .into_par_iter() - .map(|a| { - let body = markdown::parse(a.body); - Answer { - body, - id: a.id, - score: a.score, - is_accepted: a.is_accepted, - } - }) - .collect::<Vec<_>>(); - Question { - body, - answers, - id: q.id, - score: q.score, - title: q.title, - } - }) - .collect::<Vec<_>>() -} - -impl LocalStorage { - fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> { - if let Some(file) = utils::open_file(filename)? { - return serde_json::from_reader(file) - .map_err(|_| Error::MalformedFile(filename.clone())); - } - Ok(None) - } - - // TODO decide whether or not I should give LocalStorage an api key.. - async fn fetch_remote_sites() -> Result<Vec<Site>> { - let se_sites = Client::new() - .get(stackexchange_url("sites")) - .header("Accepts", "application/json") - .query(&[ - ("pagesize", SE_SITES_PAGESIZE.to_string()), - ("page", "1".to_string()), - ]) - .send() - .await? - .json::<ResponseWrapper<Site>>() - .await? - .items; - Ok(se_sites - .into_par_iter() - .map(|site| { - let site_url = site.site_url.trim_start_matches("https://").to_string(); - Site { site_url, ..site } - }) - .collect()) - } - - fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> { - let file = utils::create_file(filename)?; - serde_json::to_writer(file, sites)?; - Ok(()) - } - - async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> { - if !update { - if let Some(sites) = Self::fetch_local_sites(filename)? { - return Ok(sites); - } - } - let sites = Self::fetch_remote_sites().await?; - Self::store_local_sites(filename, &sites)?; - Ok(sites) - } - - pub async fn new(update: bool) -> Result<Self> { - let project = project_dir()?; - let dir = project.cache_dir(); - fs::create_dir_all(&dir)?; - let sites_filename = dir.join("sites.json"); - let sites = Self::init_sites(&sites_filename, update).await?; - Ok(LocalStorage { sites }) - } - - // TODO is this HM worth it? Probably only will ever have < 10 site codes to search... - // maybe store this as Option<HM> on self if other methods use it... - pub async fn find_invalid_site<'a, 'b>( - &'b self, - site_codes: &'a [String], - ) -> Option<&'a String> { - let hm: HashMap<&str, ()> = self - .sites - .iter() - .map(|site| (site.api_site_parameter.as_str(), ())) - .collect(); - site_codes.iter().find(|s| !hm.contains_key(&s.as_str())) - } - - pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> { - self.sites - .iter() - .filter_map(move |site| { - let _ = site_codes - .iter() - .find(|&sc| *sc == site.api_site_parameter)?; - Some((site.api_site_parameter.to_owned(), site.site_url.to_owned())) - }) - .collect() - } -} - -/// Creates stackexchange API url given endpoint -// TODO lazy static this url parse -fn stackexchange_url(path: &str) -> Url { - let mut url = Url::parse(SE_API_URL).unwrap(); - url.path_segments_mut() - .unwrap() - .push(SE_API_VERSION) - .extend(path.split('/')); - url -} - -/// Creates duckduckgo search url given sites and query -/// See https://duckduckgo.com/params for more info -fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url -where - I: IntoIterator<Item = &'a String>, -{ - let mut q = String::new(); - // Restrict to sites - q.push('('); - q.push_str( - sites - .into_iter() - .map(|site| String::from("site:") + site) - .collect::<Vec<_>>() - .join(" OR ") - .as_str(), - ); - q.push_str(") "); - // Search terms - q.push_str( - query - .trim_end_matches('?') - .split_whitespace() - .collect::<Vec<_>>() - .join(" ") - .as_str(), - ); - Url::parse_with_params( - DUCKDUCKGO_URL, - &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")], - ) - .unwrap() -} - -/// Parse (site, question_id) pairs out of duckduckgo search results html -// TODO Benchmark this. It would likely be faster to use regex on the decoded url. -fn parse_questions_from_ddg_html<'a>( - html: &'a str, - sites: &'a HashMap<String, String>, - limit: u16, -) -> Result<ScrapedData> { - let fragment = Html::parse_document(html); - let anchors = Selector::parse("a.result__a").unwrap(); - let mut question_ids: HashMap<String, Vec<String>> = HashMap::new(); - let mut ordering: HashMap<String, usize> = HashMap::new(); - let mut count = 0; - for anchor in fragment.select(&anchors) { - let url = anchor - .value() - .attr("href") - .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string())) - .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?; - sites - .iter() - .find_map(|(site_code, site_url)| { - let id = question_url_to_id(site_url, &url)?; - ordering.insert(id.to_owned(), count); - match question_ids.entry(site_code.to_owned()) { - Entry::Occupied(mut o) => o.get_mut().push(id), - Entry::Vacant(o) => { - o.insert(vec![id]); - } - } - count += 1; - Some(()) - }) - .ok_or_else(|| { - Error::ScrapingError( - "Duckduckgo returned results outside of SE network".to_string(), - ) - })?; - if count >= limit as usize { - break; - } - } - // It doesn't seem possible for DDG to return no results, so assume this is - // a bad user agent - if count == 0 { - Err(Error::ScrapingError(String::from( - "DuckDuckGo blocked this request", - ))) - } else { - Ok(ScrapedData { - question_ids, - ordering, - }) - } -} - -/// For example -/// ``` -/// let id = "stackoverflow.com"; -/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; -/// assert_eq!(question_url_to_id(site_url, input), "11828270") -/// ``` -fn question_url_to_id(site_url: &str, input: &str) -> Option<String> { - // TODO use str_prefix once its stable - let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/"; - let ix = input.find(&fragment)? + fragment.len(); - let input = &input[ix..]; - let end = input.find('/')?; - Some(input[0..end].to_string()) -} - -// TODO find a query that returns no results so that I can test it and -// differentiate it from a blocked request -#[cfg(test)] -mod tests { - use super::*; - #[test] - fn test_stackexchange_url() { - assert_eq!( - stackexchange_url("some/endpoint").as_str(), - "http://api.stackexchange.com/2.2/some/endpoint" - ) - } - - #[test] - fn test_duckduckgo_url() { - let q = "how do I exit vim?"; - let sites = vec![ - String::from("stackoverflow.com"), - String::from("unix.stackexchange.com"), - ]; - assert_eq!( - duckduckgo_url(q, &sites).as_str(), - String::from( - "https://duckduckgo.com/\ - ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\ - +how+do+I+exit+vim&kz=-1&kh=-1" - ) - ) - } - - #[test] - fn test_duckduckgo_response() { - // TODO make sure results are either 1) answers 2) failed connection 3) blocked - } - - #[test] - fn test_duckduckgo_parser() { - let html = include_str!("../test/exit-vim.html"); - let sites = vec![ - ("stackoverflow", "stackoverflow.com"), - ("askubuntu", "askubuntu.com"), - ] - .into_iter() - .map(|(k, v)| (k.to_string(), v.to_string())) - .collect::<HashMap<String, String>>(); - let expected_scraped_data = ScrapedData { - question_ids: vec![ - ("stackoverflow", vec!["11828270", "9171356"]), - ("askubuntu", vec!["24406"]), - ] - .into_iter() - .map(|(k, v)| { - ( - k.to_string(), - v.into_iter().map(|s| s.to_string()).collect(), - ) - }) - .collect(), - ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)] - .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect(), - }; - assert_eq!( - parse_questions_from_ddg_html(html, &sites, 3).unwrap(), - expected_scraped_data - ); - } - - #[test] - fn test_duckduckgo_blocker() -> Result<(), String> { - let html = include_str!("../test/bad-user-agent.html"); - let mut sites = HashMap::new(); - sites.insert( - String::from("stackoverflow"), - String::from("stackoverflow.com"), - ); - - match parse_questions_from_ddg_html(html, &sites, 2) { - Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => { - Ok(()) - } - _ => Err(String::from("Failed to detect DuckDuckGo blocker")), - } - } - - #[test] - fn test_question_url_to_id() { - let site_url = "stackoverflow.com"; - let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; - assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270"); - - let site_url = "stackoverflow.com"; - let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line"; - assert_eq!(question_url_to_id(site_url, input), None); - } -} |