author    Sam Tay <sam.chong.tay@gmail.com>    2020-06-23 23:07:35 -0700
committer Sam Tay <sam.chong.tay@gmail.com>    2020-06-24 01:35:50 -0700
commit    9d1e601554a982c2932e2161b153104d4cc14424 (patch)
tree      0e27ef0ad56a2760f0d7f87e02027505b32dd8a3
parent    74bda95681c253eea5010417dc74e8569010f7f9 (diff)
Reorganize code
The StackExchange struct was getting really bloated. This separates it into smaller modules and structs.
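For reference, a plausible sketch of the new src/stackexchange/mod.rs (its 8 lines are not shown in this diff; the re-exports are inferred from the `use stackexchange::{LocalStorage, Search}` import and the `Question<Markdown>` return type in main.rs):

    mod api;
    mod local_storage;
    mod scraper;
    mod search;

    pub use api::{Answer, Question};
    pub use local_storage::LocalStorage;
    pub use search::Search;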
-rw-r--r--  TODO.md                               1
-rw-r--r--  src/main.rs                           4
-rw-r--r--  src/stackexchange.rs                648
-rw-r--r--  src/stackexchange/api.rs            201
-rw-r--r--  src/stackexchange/local_storage.rs   76
-rw-r--r--  src/stackexchange/mod.rs              8
-rw-r--r--  src/stackexchange/scraper.rs        251
-rw-r--r--  src/stackexchange/search.rs         204
8 files changed, 743 insertions(+), 650 deletions(-)
diff --git a/TODO.md b/TODO.md
index 4a97764..083d492 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,6 +1,7 @@
# TODO
### v0.3.1
+0. Refactor the enum/struct for search engines
1. Much of the code can be reused for google:
* parsing href after `"url="` (similar to uddg)
* formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux`
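A hypothetical sketch of that href parsing (the function name is an assumption; only the "url=" convention comes from the TODO above, and percent_decode_str is already a dependency of this crate):

    fn parse_google_href(href: &str) -> Option<String> {
        // Hypothetical: mirrors the existing uddg handling for DuckDuckGo results.
        let start = href.find("url=")? + "url=".len();
        let rest = &href[start..];
        let end = rest.find('&').unwrap_or(rest.len());
        Some(percent_decode_str(&rest[..end]).decode_utf8_lossy().into_owned())
    }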
diff --git a/src/main.rs b/src/main.rs
index afd6c21..ac176fc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,7 +12,7 @@ use crossterm::style::Color;
use error::{Error, Result};
use lazy_static::lazy_static;
use minimad::mad_inline;
-use stackexchange::{LocalStorage, StackExchange};
+use stackexchange::{LocalStorage, Search};
use term::mk_print_error;
use termimad::{CompoundStyle, MadSkin};
use tokio::runtime::Runtime;
@@ -82,7 +82,7 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
}
if let Some(q) = opts.query {
- let mut se = StackExchange::new(config, ls, q);
+ let mut se = Search::new(config, ls, q);
if lucky {
let md = se.search_lucky().await?;
skin.print_text(&md);
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
deleted file mode 100644
index a5f59e9..0000000
--- a/src/stackexchange.rs
+++ /dev/null
@@ -1,648 +0,0 @@
-use futures::stream::StreamExt;
-use percent_encoding::percent_decode_str;
-use rayon::prelude::*;
-use reqwest::header;
-use reqwest::Client;
-use reqwest::Url;
-use scraper::html::Html;
-use scraper::selector::Selector;
-use serde::{Deserialize, Serialize};
-use std::collections::hash_map::Entry;
-use std::collections::HashMap;
-use std::fs;
-use std::path::PathBuf;
-
-use crate::config::{project_dir, Config};
-use crate::error::{Error, Result};
-use crate::tui::markdown;
-use crate::tui::markdown::Markdown;
-use crate::utils;
-
-/// DuckDuckGo URL
-const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
-
-/// StackExchange API v2.2 URL
-// TODO why not https?
-const SE_API_URL: &str = "http://api.stackexchange.com";
-const SE_API_VERSION: &str = "2.2";
-
-/// Filter generated to include only the fields needed to populate
-/// the structs below. Go here to make new filters:
-/// [create filter](https://api.stackexchange.com/docs/create-filter).
-const SE_FILTER: &str = ".DND5X2VHHUH8HyJzpjo)5NvdHI3w6auG";
-
-/// Pagesize when fetching all SE sites. Should be good for many years...
-const SE_SITES_PAGESIZE: u16 = 10000;
-
-/// Limit on concurrent requests (gets passed to `buffer_unordered`)
-const CONCURRENT_REQUESTS_LIMIT: usize = 8;
-
-/// Mock user agent to get real DuckDuckGo results
-// TODO copy other user agents and use a random one each time
-const USER_AGENT: &str =
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";
-
-/// This structure allows interacting with parts of the StackExchange
-/// API, using the `Config` struct to determine certain API settings and options.
-#[derive(Clone)]
-pub struct StackExchange {
- client: Client,
- config: Config,
- sites: HashMap<String, String>,
- query: String,
-}
-
-/// This structure allows interacting with locally cached StackExchange metadata.
-pub struct LocalStorage {
- pub sites: Vec<Site>,
-}
-
-#[derive(Deserialize, Serialize, Debug)]
-pub struct Site {
- pub api_site_parameter: String,
- pub site_url: String,
-}
-
-/// Represents a StackExchange answer with a custom selection of fields from
-/// the [StackExchange docs](https://api.stackexchange.com/docs/types/answer)
-#[derive(Clone, Deserialize, Debug)]
-pub struct Answer<S> {
- #[serde(rename = "answer_id")]
- pub id: u32,
- pub score: i32,
- #[serde(rename = "body_markdown")]
- pub body: S,
- pub is_accepted: bool,
-}
-
-/// Represents a StackExchange question with a custom selection of fields from
-/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
-// TODO container over answers should be generic iterator
-#[derive(Clone, Deserialize, Debug)]
-pub struct Question<S> {
- #[serde(rename = "question_id")]
- pub id: u32,
- pub score: i32,
- pub answers: Vec<Answer<S>>,
- pub title: String,
- #[serde(rename = "body_markdown")]
- pub body: S,
-}
-
-/// Internal struct that represents the boilerplate response wrapper from SE API.
-#[derive(Deserialize, Debug)]
-struct ResponseWrapper<T> {
- items: Vec<T>,
-}
-
-// Is question_id unique across all sites? If not, then this edge case is
-// unaccounted for when sorting.
-//
-// If this is ever an issue, it wouldn't be too hard to account for this; just
-// keep track of site in the `ordering` field and also return site from the
-// spawned per-site tasks.
-#[derive(Debug, PartialEq)]
-struct ScrapedData {
- /// Mapping of site code to question ids
- question_ids: HashMap<String, Vec<String>>,
- /// Mapping of question_id to its ordinal place in search results
- ordering: HashMap<String, usize>,
-}
-
-impl StackExchange {
- pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
- let client = Client::new();
- StackExchange {
- client,
- sites: local_storage.get_urls(&config.sites),
- config,
- query,
- }
- }
-
- /// Search query and get the top answer body
- ///
- /// For the StackExchange engine, use only the first configured site,
- /// since, paradoxically, sites with the worst results will finish
- /// executing first, because there's less data to retrieve.
- ///
- /// Needs mut because it temporarily changes self.config
- pub async fn search_lucky(&mut self) -> Result<String> {
- let original_config = self.config.clone();
- // Temp set lucky config
- self.config.limit = 1;
- if !self.config.duckduckgo {
- self.config.sites.truncate(1);
- }
- // Run search with temp config
- let result = self.search().await;
- // Reset config
- self.config = original_config;
-
- Ok(result?
- .into_iter()
- .next()
- .ok_or(Error::NoResults)?
- .answers
- .into_iter()
- .next()
- .ok_or_else(|| Error::StackExchange(String::from("Received question with no answers")))?
- .body)
- }
-
- /// Search and parse to Markdown for TUI
- pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> {
- Ok(parse_markdown(self.search().await?))
- }
-
- /// Search query and get a list of relevant questions
- pub async fn search(&self) -> Result<Vec<Question<String>>> {
- if self.config.duckduckgo {
- self.search_duckduck_go().await
- } else {
- // TODO after duckduckgo is finished, refactor to _not_ thread this limit; it's unnecessary
- self.se_search_advanced(self.config.limit).await
- }
- }
-
- /// Search query at duckduckgo and then fetch the resulting questions from SE.
- async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> {
- let url = duckduckgo_url(&self.query, self.sites.values());
- let html = self
- .client
- .get(url)
- .header(header::USER_AGENT, USER_AGENT)
- .send()
- .await?
- .text()
- .await?;
- let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
- self.se_questions(data).await
- }
-
- /// Parallel searches against the SE question endpoint across the sites in `data`.
- // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
- async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> {
- let ScrapedData {
- question_ids,
- ordering,
- } = data;
- futures::stream::iter(question_ids)
- .map(|(site, ids)| {
- let clone = self.clone();
- tokio::spawn(async move {
- let clone = &clone;
- clone.se_questions_site(&site, ids).await
- })
- })
- .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
- .collect::<Vec<_>>()
- .await
- .into_iter()
- .map(|r| r.map_err(Error::from).and_then(|x| x))
- .collect::<Result<Vec<Vec<_>>>>()
- .map(|v| {
- let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
- qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap());
- qs
- })
- }
-
- /// Parallel searches against the SE search/advanced endpoint across all configured sites
- async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> {
- futures::stream::iter(self.config.sites.clone())
- .map(|site| {
- let clone = self.clone();
- tokio::spawn(async move {
- let clone = &clone;
- clone.se_search_advanced_site(&site, limit).await
- })
- })
- .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
- .collect::<Vec<_>>()
- .await
- .into_iter()
- .map(|r| r.map_err(Error::from).and_then(|x| x))
- .collect::<Result<Vec<Vec<_>>>>()
- .map(|v| {
- let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
- if self.config.sites.len() > 1 {
- qs.sort_unstable_by_key(|q| -q.score);
- }
- qs
- })
- }
-
- /// Search against the SE site's /questions/{ids} endpoint.
- /// Filters out questions with no answers.
- async fn se_questions_site(
- &self,
- site: &str,
- ids: Vec<String>,
- ) -> Result<Vec<Question<String>>> {
- let total = ids.len().to_string();
- let endpoint = format!("questions/{ids}", ids = ids.join(";"));
- let qs = self
- .client
- .get(stackexchange_url(&endpoint))
- .header("Accepts", "application/json")
- .query(&self.get_default_se_opts())
- .query(&[("site", site), ("pagesize", &total), ("page", "1")])
- .send()
- .await?
- .json::<ResponseWrapper<Question<String>>>()
- .await?
- .items;
- Ok(Self::preprocess(qs))
- }
-
- /// Search against the SE site's /search/advanced endpoint with a given query.
- /// Only fetches questions that have at least one answer.
- async fn se_search_advanced_site(
- &self,
- site: &str,
- limit: u16,
- ) -> Result<Vec<Question<String>>> {
- let qs = self
- .client
- .get(stackexchange_url("search/advanced"))
- .header("Accepts", "application/json")
- .query(&self.get_default_se_opts())
- .query(&[
- ("q", self.query.as_str()),
- ("pagesize", &limit.to_string()),
- ("site", site),
- ("page", "1"),
- ("answers", "1"),
- ("order", "desc"),
- ("sort", "relevance"),
- ])
- .send()
- .await?
- .json::<ResponseWrapper<Question<String>>>()
- .await?
- .items;
- Ok(Self::preprocess(qs))
- }
-
- fn get_default_se_opts(&self) -> HashMap<&str, &str> {
- let mut params = HashMap::new();
- params.insert("filter", SE_FILTER);
- if let Some(key) = &self.config.api_key {
- params.insert("key", &key);
- }
- params
- }
-
- /// Sorts answers by score.
- /// Preprocesses SE markdown into "cmark" markdown (or something closer to it).
- /// This markdown preprocessing _always_ happens.
- fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
- qs.into_par_iter()
- .map(|q| {
- let mut answers = q.answers;
- answers.par_sort_unstable_by_key(|a| -a.score);
- let answers = answers
- .into_par_iter()
- .map(|a| Answer {
- body: markdown::preprocess(a.body.clone()),
- ..a
- })
- .collect();
- Question {
- answers,
- body: markdown::preprocess(q.body),
- ..q
- }
- })
- .collect::<Vec<_>>()
- }
-}
-
-/// Parse all markdown fields
-/// This only happens for content going into the cursive TUI (not lucky prompt)
-fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
- qs.into_par_iter()
- .map(|q| {
- let body = markdown::parse(q.body);
- let answers = q
- .answers
- .into_par_iter()
- .map(|a| {
- let body = markdown::parse(a.body);
- Answer {
- body,
- id: a.id,
- score: a.score,
- is_accepted: a.is_accepted,
- }
- })
- .collect::<Vec<_>>();
- Question {
- body,
- answers,
- id: q.id,
- score: q.score,
- title: q.title,
- }
- })
- .collect::<Vec<_>>()
-}
-
-impl LocalStorage {
- fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
- if let Some(file) = utils::open_file(filename)? {
- return serde_json::from_reader(file)
- .map_err(|_| Error::MalformedFile(filename.clone()));
- }
- Ok(None)
- }
-
- // TODO decide whether or not I should give LocalStorage an api key..
- async fn fetch_remote_sites() -> Result<Vec<Site>> {
- let se_sites = Client::new()
- .get(stackexchange_url("sites"))
- .header("Accepts", "application/json")
- .query(&[
- ("pagesize", SE_SITES_PAGESIZE.to_string()),
- ("page", "1".to_string()),
- ])
- .send()
- .await?
- .json::<ResponseWrapper<Site>>()
- .await?
- .items;
- Ok(se_sites
- .into_par_iter()
- .map(|site| {
- let site_url = site.site_url.trim_start_matches("https://").to_string();
- Site { site_url, ..site }
- })
- .collect())
- }
-
- fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
- let file = utils::create_file(filename)?;
- serde_json::to_writer(file, sites)?;
- Ok(())
- }
-
- async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
- if !update {
- if let Some(sites) = Self::fetch_local_sites(filename)? {
- return Ok(sites);
- }
- }
- let sites = Self::fetch_remote_sites().await?;
- Self::store_local_sites(filename, &sites)?;
- Ok(sites)
- }
-
- pub async fn new(update: bool) -> Result<Self> {
- let project = project_dir()?;
- let dir = project.cache_dir();
- fs::create_dir_all(&dir)?;
- let sites_filename = dir.join("sites.json");
- let sites = Self::init_sites(&sites_filename, update).await?;
- Ok(LocalStorage { sites })
- }
-
- // TODO is this HM worth it? Probably will only ever have < 10 site codes to search...
- // maybe store this as Option<HM> on self if other methods use it...
- pub async fn find_invalid_site<'a, 'b>(
- &'b self,
- site_codes: &'a [String],
- ) -> Option<&'a String> {
- let hm: HashMap<&str, ()> = self
- .sites
- .iter()
- .map(|site| (site.api_site_parameter.as_str(), ()))
- .collect();
- site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
- }
-
- pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
- self.sites
- .iter()
- .filter_map(move |site| {
- let _ = site_codes
- .iter()
- .find(|&sc| *sc == site.api_site_parameter)?;
- Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
- })
- .collect()
- }
-}
-
-/// Creates stackexchange API url given endpoint
-// TODO lazy static this url parse
-fn stackexchange_url(path: &str) -> Url {
- let mut url = Url::parse(SE_API_URL).unwrap();
- url.path_segments_mut()
- .unwrap()
- .push(SE_API_VERSION)
- .extend(path.split('/'));
- url
-}
-
-/// Creates duckduckgo search url given sites and query
-/// See https://duckduckgo.com/params for more info
-fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
-where
- I: IntoIterator<Item = &'a String>,
-{
- let mut q = String::new();
- // Restrict to sites
- q.push('(');
- q.push_str(
- sites
- .into_iter()
- .map(|site| String::from("site:") + site)
- .collect::<Vec<_>>()
- .join(" OR ")
- .as_str(),
- );
- q.push_str(") ");
- // Search terms
- q.push_str(
- query
- .trim_end_matches('?')
- .split_whitespace()
- .collect::<Vec<_>>()
- .join(" ")
- .as_str(),
- );
- Url::parse_with_params(
- DUCKDUCKGO_URL,
- &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
- )
- .unwrap()
-}
-
-/// Parse (site, question_id) pairs out of duckduckgo search results html
-// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-fn parse_questions_from_ddg_html<'a>(
- html: &'a str,
- sites: &'a HashMap<String, String>,
- limit: u16,
-) -> Result<ScrapedData> {
- let fragment = Html::parse_document(html);
- let anchors = Selector::parse("a.result__a").unwrap();
- let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
- let mut ordering: HashMap<String, usize> = HashMap::new();
- let mut count = 0;
- for anchor in fragment.select(&anchors) {
- let url = anchor
- .value()
- .attr("href")
- .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
- .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
- sites
- .iter()
- .find_map(|(site_code, site_url)| {
- let id = question_url_to_id(site_url, &url)?;
- ordering.insert(id.to_owned(), count);
- match question_ids.entry(site_code.to_owned()) {
- Entry::Occupied(mut o) => o.get_mut().push(id),
- Entry::Vacant(o) => {
- o.insert(vec![id]);
- }
- }
- count += 1;
- Some(())
- })
- .ok_or_else(|| {
- Error::ScrapingError(
- "Duckduckgo returned results outside of SE network".to_string(),
- )
- })?;
- if count >= limit as usize {
- break;
- }
- }
- // It doesn't seem possible for DDG to return no results, so assume this is
- // a bad user agent
- if count == 0 {
- Err(Error::ScrapingError(String::from(
- "DuckDuckGo blocked this request",
- )))
- } else {
- Ok(ScrapedData {
- question_ids,
- ordering,
- })
- }
-}
-
-/// For example
-/// ```
-/// let site_url = "stackoverflow.com";
-/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
-/// assert_eq!(question_url_to_id(site_url, input), Some("11828270".to_string()))
-/// ```
-fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
- // TODO use strip_prefix once it's stable
- let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/";
- let ix = input.find(&fragment)? + fragment.len();
- let input = &input[ix..];
- let end = input.find('/')?;
- Some(input[0..end].to_string())
-}
-
-// TODO find a query that returns no results so that I can test it and
-// differentiate it from a blocked request
-#[cfg(test)]
-mod tests {
- use super::*;
- #[test]
- fn test_stackexchange_url() {
- assert_eq!(
- stackexchange_url("some/endpoint").as_str(),
- "http://api.stackexchange.com/2.2/some/endpoint"
- )
- }
-
- #[test]
- fn test_duckduckgo_url() {
- let q = "how do I exit vim?";
- let sites = vec![
- String::from("stackoverflow.com"),
- String::from("unix.stackexchange.com"),
- ];
- assert_eq!(
- duckduckgo_url(q, &sites).as_str(),
- String::from(
- "https://duckduckgo.com/\
- ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
- +how+do+I+exit+vim&kz=-1&kh=-1"
- )
- )
- }
-
- #[test]
- fn test_duckduckgo_response() {
- // TODO make sure results are either 1) answers 2) failed connection 3) blocked
- }
-
- #[test]
- fn test_duckduckgo_parser() {
- let html = include_str!("../test/exit-vim.html");
- let sites = vec![
- ("stackoverflow", "stackoverflow.com"),
- ("askubuntu", "askubuntu.com"),
- ]
- .into_iter()
- .map(|(k, v)| (k.to_string(), v.to_string()))
- .collect::<HashMap<String, String>>();
- let expected_scraped_data = ScrapedData {
- question_ids: vec![
- ("stackoverflow", vec!["11828270", "9171356"]),
- ("askubuntu", vec!["24406"]),
- ]
- .into_iter()
- .map(|(k, v)| {
- (
- k.to_string(),
- v.into_iter().map(|s| s.to_string()).collect(),
- )
- })
- .collect(),
- ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
- .into_iter()
- .map(|(k, v)| (k.to_string(), v))
- .collect(),
- };
- assert_eq!(
- parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
- expected_scraped_data
- );
- }
-
- #[test]
- fn test_duckduckgo_blocker() -> Result<(), String> {
- let html = include_str!("../test/bad-user-agent.html");
- let mut sites = HashMap::new();
- sites.insert(
- String::from("stackoverflow"),
- String::from("stackoverflow.com"),
- );
-
- match parse_questions_from_ddg_html(html, &sites, 2) {
- Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request" => {
- Ok(())
- }
- _ => Err(String::from("Failed to detect DuckDuckGo blocker")),
- }
- }
-
- #[test]
- fn test_question_url_to_id() {
- let site_url = "stackoverflow.com";
- let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
- assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270");
-
- let site_url = "stackoverflow.com";
- let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line";
- assert_eq!(question_url_to_id(site_url, input), None);
- }
-}
diff --git a/src/stackexchange/api.rs b/src/stackexchange/api.rs
new file mode 100644
index 0000000..ff94de2
--- /dev/null
+++ b/src/stackexchange/api.rs
@@ -0,0 +1,201 @@
+use rayon::prelude::*;
+use reqwest::header;
+use reqwest::Client;
+use reqwest::Url;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+use crate::error::Result;
+use crate::tui::markdown;
+
+/// StackExchange API v2.2 URL
+// TODO why not https?
+const SE_API_URL: &str = "http://api.stackexchange.com";
+const SE_API_VERSION: &str = "2.2";
+
+/// Filter generated to include only the fields needed to populate
+/// the structs below. Go here to make new filters:
+/// [create filter](https://api.stackexchange.com/docs/create-filter).
+const SE_FILTER: &str = ".DND5X2VHHUH8HyJzpjo)5NvdHI3w6auG";
+
+/// Pagesize when fetching all SE sites. Should be good for many years...
+const SE_SITES_PAGESIZE: u16 = 10000;
+
+/// Represents a StackExchange answer with a custom selection of fields from
+/// the [StackExchange docs](https://api.stackexchange.com/docs/types/answer)
+#[derive(Clone, Deserialize, Debug)]
+pub struct Answer<S> {
+ #[serde(rename = "answer_id")]
+ pub id: u32,
+ pub score: i32,
+ #[serde(rename = "body_markdown")]
+ pub body: S,
+ pub is_accepted: bool,
+}
+
+/// Represents a StackExchange question with a custom selection of fields from
+/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
+// TODO container over answers should be generic iterator
+#[derive(Clone, Deserialize, Debug)]
+pub struct Question<S> {
+ #[serde(rename = "question_id")]
+ pub id: u32,
+ pub score: i32,
+ pub answers: Vec<Answer<S>>,
+ pub title: String,
+ #[serde(rename = "body_markdown")]
+ pub body: S,
+}
+
+/// Internal struct that represents the boilerplate response wrapper from SE API.
+#[derive(Deserialize, Debug)]
+struct ResponseWrapper<T> {
+ items: Vec<T>,
+}
+
+#[derive(Deserialize, Serialize, Debug)]
+pub struct Site {
+ pub api_site_parameter: String,
+ pub site_url: String,
+}
+
+#[derive(Clone)]
+pub struct Api {
+ client: Client,
+ api_key: Option<String>,
+}
+
+impl Api {
+ pub fn new(api_key: Option<String>) -> Self {
+ // TODO can lazy_static this above
+ let mut headers = header::HeaderMap::new();
+ headers.insert(
+ header::ACCEPT,
+ header::HeaderValue::from_static("application/json"),
+ );
+ let client = Client::builder().default_headers(headers).build().unwrap();
+ Api { client, api_key }
+ }
+
+ /// Search against the SE site's /questions/{ids} endpoint.
+ /// Filters out questions with no answers.
+ pub async fn questions(&self, site: &str, ids: Vec<String>) -> Result<Vec<Question<String>>> {
+ let total = ids.len().to_string();
+ let endpoint = format!("questions/{ids}", ids = ids.join(";"));
+ let qs = self
+ .client
+ .get(stackexchange_url(&endpoint))
+ .query(&self.get_default_se_opts())
+ .query(&[("site", site), ("pagesize", &total)])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Question<String>>>()
+ .await?
+ .items;
+ Ok(Self::preprocess(qs))
+ }
+
+ /// Search against the SE site's /search/advanced endpoint with a given query.
+ /// Only fetches questions that have at least one answer.
+ pub async fn search_advanced(
+ &self,
+ query: &str,
+ site: &str,
+ limit: u16,
+ ) -> Result<Vec<Question<String>>> {
+ let qs = self
+ .client
+ .get(stackexchange_url("search/advanced"))
+ .query(&self.get_default_se_opts())
+ .query(&[
+ ("q", query),
+ ("pagesize", &limit.to_string()),
+ ("site", site),
+ ("answers", "1"),
+ ("order", "desc"),
+ ("sort", "relevance"),
+ ])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Question<String>>>()
+ .await?
+ .items;
+ Ok(Self::preprocess(qs))
+ }
+
+ pub async fn sites(&self) -> Result<Vec<Site>> {
+ let sites = self
+ .client
+ .get(stackexchange_url("sites"))
+ .query(&[("pagesize", SE_SITES_PAGESIZE.to_string())])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Site>>()
+ .await?
+ .items;
+ Ok(sites
+ .into_par_iter()
+ .map(|site| {
+ let site_url = site.site_url.trim_start_matches("https://").to_string();
+ Site { site_url, ..site }
+ })
+ .collect())
+ }
+
+ fn get_default_se_opts(&self) -> HashMap<&str, &str> {
+ let mut params = HashMap::new();
+ params.insert("filter", SE_FILTER);
+ params.insert("page", "1");
+ if let Some(key) = &self.api_key {
+ params.insert("key", &key);
+ }
+ params
+ }
+
+ /// Sorts answers by score.
+ /// Preprocesses SE markdown into "cmark" markdown (or something closer to it).
+ /// This markdown preprocessing _always_ happens.
+ fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
+ qs.into_par_iter()
+ .map(|q| {
+ let mut answers = q.answers;
+ answers.par_sort_unstable_by_key(|a| -a.score);
+ let answers = answers
+ .into_par_iter()
+ .map(|a| Answer {
+ body: markdown::preprocess(a.body.clone()),
+ ..a
+ })
+ .collect();
+ Question {
+ answers,
+ body: markdown::preprocess(q.body),
+ ..q
+ }
+ })
+ .collect::<Vec<_>>()
+ }
+}
+
+/// Creates stackexchange API url given endpoint
+// TODO lazy static this url parse
+fn stackexchange_url(path: &str) -> Url {
+ let mut url = Url::parse(SE_API_URL).unwrap();
+ url.path_segments_mut()
+ .unwrap()
+ .push(SE_API_VERSION)
+ .extend(path.split('/'));
+ url
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ #[test]
+ fn test_stackexchange_url() {
+ assert_eq!(
+ stackexchange_url("some/endpoint").as_str(),
+ "http://api.stackexchange.com/2.2/some/endpoint"
+ )
+ }
+}
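For orientation, a minimal usage sketch of the new Api (assumes an async context and the crate's error::Result; the query, site, and limit values are illustrative):

    let api = Api::new(None); // pass Some(key) for a registered SE API key
    let questions = api
        .search_advanced("how do I exit vim", "stackoverflow", 5)
        .await?;
    for q in questions {
        println!("{} ({})", q.title, q.score);
    }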
diff --git a/src/stackexchange/local_storage.rs b/src/stackexchange/local_storage.rs
new file mode 100644
index 0000000..8d009f8
--- /dev/null
+++ b/src/stackexchange/local_storage.rs
@@ -0,0 +1,76 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::PathBuf;
+
+use crate::config::project_dir;
+use crate::error::{Error, Result};
+use crate::utils;
+
+use super::api::{Api, Site};
+
+/// This structure allows interacting with locally cached StackExchange metadata.
+pub struct LocalStorage {
+ pub sites: Vec<Site>,
+}
+
+impl LocalStorage {
+ fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
+ if let Some(file) = utils::open_file(filename)? {
+ return serde_json::from_reader(file)
+ .map_err(|_| Error::MalformedFile(filename.clone()));
+ }
+ Ok(None)
+ }
+
+ fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
+ let file = utils::create_file(filename)?;
+ serde_json::to_writer(file, sites)?;
+ Ok(())
+ }
+
+ async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
+ if !update {
+ if let Some(sites) = Self::fetch_local_sites(filename)? {
+ return Ok(sites);
+ }
+ }
+ let sites = Api::new(None).sites().await?;
+ Self::store_local_sites(filename, &sites)?;
+ Ok(sites)
+ }
+
+ pub async fn new(update: bool) -> Result<Self> {
+ let project = project_dir()?;
+ let dir = project.cache_dir();
+ fs::create_dir_all(&dir)?;
+ let sites_filename = dir.join("sites.json");
+ let sites = Self::init_sites(&sites_filename, update).await?;
+ Ok(LocalStorage { sites })
+ }
+
+ // TODO is this HM worth it? Probably will only ever have < 10 site codes to search...
+ // maybe store this as Option<HM> on self if other methods use it...
+ pub async fn find_invalid_site<'a, 'b>(
+ &'b self,
+ site_codes: &'a [String],
+ ) -> Option<&'a String> {
+ let hm: HashMap<&str, ()> = self
+ .sites
+ .iter()
+ .map(|site| (site.api_site_parameter.as_str(), ()))
+ .collect();
+ site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
+ }
+
+ pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
+ self.sites
+ .iter()
+ .filter_map(move |site| {
+ let _ = site_codes
+ .iter()
+ .find(|&sc| *sc == site.api_site_parameter)?;
+ Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
+ })
+ .collect()
+ }
+}
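And a minimal usage sketch of LocalStorage (async context assumed; the site_codes value is illustrative):

    let ls = LocalStorage::new(false).await?; // false: reuse cached sites.json if present
    let site_codes = vec![String::from("stackoverflow")];
    if let Some(bad) = ls.find_invalid_site(&site_codes).await {
        eprintln!("invalid site code: {}", bad);
    }
    let urls = ls.get_urls(&site_codes); // maps site code -> site URL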