author    Sam Tay <sam.chong.tay@gmail.com>    2020-06-23 23:07:35 -0700
committer Sam Tay <sam.chong.tay@gmail.com>    2020-06-24 01:35:50 -0700
commit    9d1e601554a982c2932e2161b153104d4cc14424 (patch)
tree      0e27ef0ad56a2760f0d7f87e02027505b32dd8a3
parent    74bda95681c253eea5010417dc74e8569010f7f9 (diff)
Reorganize code
The StackExchange struct was getting really bloated. This separates it into smaller modules and structs.
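For reference, a plausible sketch of the new src/stackexchange/mod.rs (its 8 lines are not shown in this diff; the re-exports are inferred from the `use stackexchange::{LocalStorage, Search}` import and the `Question<Markdown>` return type in main.rs):

    mod api;
    mod local_storage;
    mod scraper;
    mod search;

    pub use api::{Answer, Question};
    pub use local_storage::LocalStorage;
    pub use search::Search;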
-rw-r--r--  TODO.md                               1
-rw-r--r--  src/main.rs                           4
-rw-r--r--  src/stackexchange.rs                648
-rw-r--r--  src/stackexchange/api.rs            201
-rw-r--r--  src/stackexchange/local_storage.rs   76
-rw-r--r--  src/stackexchange/mod.rs              8
-rw-r--r--  src/stackexchange/scraper.rs        251
-rw-r--r--  src/stackexchange/search.rs         204
8 files changed, 743 insertions(+), 650 deletions(-)
diff --git a/TODO.md b/TODO.md
index 4a97764..083d492 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,6 +1,7 @@
# TODO
### v0.3.1
+0. Refactor the enum/struct for search engines
1. Much of the code can be reused for google:
* parsing href after `"url="` (similar to uddg)
* formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux`
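A hypothetical sketch of that href parsing (the function name is an assumption; only the "url=" convention comes from the TODO above, and percent_decode_str is already a dependency of this crate):

    fn parse_google_href(href: &str) -> Option<String> {
        // Hypothetical: mirrors the existing uddg handling for DuckDuckGo results.
        let start = href.find("url=")? + "url=".len();
        let rest = &href[start..];
        let end = rest.find('&').unwrap_or(rest.len());
        Some(percent_decode_str(&rest[..end]).decode_utf8_lossy().into_owned())
    }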
diff --git a/src/main.rs b/src/main.rs
index afd6c21..ac176fc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,7 +12,7 @@ use crossterm::style::Color;
use error::{Error, Result};
use lazy_static::lazy_static;
use minimad::mad_inline;
-use stackexchange::{LocalStorage, StackExchange};
+use stackexchange::{LocalStorage, Search};
use term::mk_print_error;
use termimad::{CompoundStyle, MadSkin};
use tokio::runtime::Runtime;
@@ -82,7 +82,7 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
}
if let Some(q) = opts.query {
- let mut se = StackExchange::new(config, ls, q);
+ let mut se = Search::new(config, ls, q);
if lucky {
let md = se.search_lucky().await?;
skin.print_text(&md);
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
deleted file mode 100644
index a5f59e9..0000000
--- a/src/stackexchange.rs
+++ /dev/null
@@ -1,648 +0,0 @@
-use futures::stream::StreamExt;
-use percent_encoding::percent_decode_str;
-use rayon::prelude::*;
-use reqwest::header;
-use reqwest::Client;
-use reqwest::Url;
-use scraper::html::Html;
-use scraper::selector::Selector;
-use serde::{Deserialize, Serialize};
-use std::collections::hash_map::Entry;
-use std::collections::HashMap;
-use std::fs;
-use std::path::PathBuf;
-
-use crate::config::{project_dir, Config};
-use crate::error::{Error, Result};
-use crate::tui::markdown;
-use crate::tui::markdown::Markdown;
-use crate::utils;
-
-/// DuckDuckGo URL
-const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
-
-/// StackExchange API v2.2 URL
-// TODO why not https?
-const SE_API_URL: &str = "http://api.stackexchange.com";
-const SE_API_VERSION: &str = "2.2";
-
-/// Filter generated to include only the fields needed to populate
-/// the structs below. Go here to make new filters:
-/// [create filter](https://api.stackexchange.com/docs/create-filter).
-const SE_FILTER: &str = ".DND5X2VHHUH8HyJzpjo)5NvdHI3w6auG";
-
-/// Pagesize when fetching all SE sites. Should be good for many years...
-const SE_SITES_PAGESIZE: u16 = 10000;
-
-/// Limit on concurrent requests (gets passed to `buffer_unordered`)
-const CONCURRENT_REQUESTS_LIMIT: usize = 8;
-
-/// Mock user agent to get real DuckDuckGo results
-// TODO copy other user agents and use a random one each time
-const USER_AGENT: &str =
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0";
-
-/// This structure allows interacting with parts of the StackExchange
-/// API, using the `Config` struct to determine certain API settings and options.
-#[derive(Clone)]
-pub struct StackExchange {
- client: Client,
- config: Config,
- sites: HashMap<String, String>,
- query: String,
-}
-
-/// This structure allows interacting with locally cached StackExchange metadata.
-pub struct LocalStorage {
- pub sites: Vec<Site>,
-}
-
-#[derive(Deserialize, Serialize, Debug)]
-pub struct Site {
- pub api_site_parameter: String,
- pub site_url: String,
-}
-
-/// Represents a StackExchange answer with a custom selection of fields from
-/// the [StackExchange docs](https://api.stackexchange.com/docs/types/answer)
-#[derive(Clone, Deserialize, Debug)]
-pub struct Answer<S> {
- #[serde(rename = "answer_id")]
- pub id: u32,
- pub score: i32,
- #[serde(rename = "body_markdown")]
- pub body: S,
- pub is_accepted: bool,
-}
-
-/// Represents a StackExchange question with a custom selection of fields from
-/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
-// TODO container over answers should be generic iterator
-#[derive(Clone, Deserialize, Debug)]
-pub struct Question<S> {
- #[serde(rename = "question_id")]
- pub id: u32,
- pub score: i32,
- pub answers: Vec<Answer<S>>,
- pub title: String,
- #[serde(rename = "body_markdown")]
- pub body: S,
-}
-
-/// Internal struct that represents the boilerplate response wrapper from SE API.
-#[derive(Deserialize, Debug)]
-struct ResponseWrapper<T> {
- items: Vec<T>,
-}
-
-// Is question_id unique across all sites? If not, then this edge case is
-// unaccounted for when sorting.
-//
-// If this is ever an issue, it wouldn't be too hard to account for this; just
-// keep track of site in the `ordering` field and also return site from the
-// spawned per-site tasks.
-#[derive(Debug, PartialEq)]
-struct ScrapedData {
- /// Mapping of site code to question ids
- question_ids: HashMap<String, Vec<String>>,
- /// Mapping of question_id to its ordinal place in search results
- ordering: HashMap<String, usize>,
-}
-
-impl StackExchange {
- pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
- let client = Client::new();
- StackExchange {
- client,
- sites: local_storage.get_urls(&config.sites),
- config,
- query,
- }
- }
-
- /// Search query and get the top answer body
- ///
- /// For the StackExchange engine, use only the first configured site,
- /// since, paradoxically, sites with the worst results will finish
- /// executing first, because there's less data to retrieve.
- ///
- /// Needs mut because it temporarily changes self.config
- pub async fn search_lucky(&mut self) -> Result<String> {
- let original_config = self.config.clone();
- // Temp set lucky config
- self.config.limit = 1;
- if !self.config.duckduckgo {
- self.config.sites.truncate(1);
- }
- // Run search with temp config
- let result = self.search().await;
- // Reset config
- self.config = original_config;
-
- Ok(result?
- .into_iter()
- .next()
- .ok_or(Error::NoResults)?
- .answers
- .into_iter()
- .next()
- .ok_or_else(|| Error::StackExchange(String::from("Received question with no answers")))?
- .body)
- }
-
- /// Search and parse to Markdown for TUI
- pub async fn search_md(&self) -> Result<Vec<Question<Markdown>>> {
- Ok(parse_markdown(self.search().await?))
- }
-
- /// Search query and get a list of relevant questions
- pub async fn search(&self) -> Result<Vec<Question<String>>> {
- if self.config.duckduckgo {
- self.search_duckduck_go().await
- } else {
- // TODO after duckduckgo is finished, refactor to _not_ thread this limit; it's unnecessary
- self.se_search_advanced(self.config.limit).await
- }
- }
-
- /// Search query at duckduckgo and then fetch the resulting questions from SE.
- async fn search_duckduck_go(&self) -> Result<Vec<Question<String>>> {
- let url = duckduckgo_url(&self.query, self.sites.values());
- let html = self
- .client
- .get(url)
- .header(header::USER_AGENT, USER_AGENT)
- .send()
- .await?
- .text()
- .await?;
- let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
- self.se_questions(data).await
- }
-
- /// Parallel searches against the SE question endpoint across the sites in `data`.
- // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
- async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> {
- let ScrapedData {
- question_ids,
- ordering,
- } = data;
- futures::stream::iter(question_ids)
- .map(|(site, ids)| {
- let clone = self.clone();
- tokio::spawn(async move {
- let clone = &clone;
- clone.se_questions_site(&site, ids).await
- })
- })
- .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
- .collect::<Vec<_>>()
- .await
- .into_iter()
- .map(|r| r.map_err(Error::from).and_then(|x| x))
- .collect::<Result<Vec<Vec<_>>>>()
- .map(|v| {
- let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
- qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap());
- qs
- })
- }
-
- /// Parallel searches against the SE search/advanced endpoint across all configured sites
- async fn se_search_advanced(&self, limit: u16) -> Result<Vec<Question<String>>> {
- futures::stream::iter(self.config.sites.clone())
- .map(|site| {
- let clone = self.clone();
- tokio::spawn(async move {
- let clone = &clone;
- clone.se_search_advanced_site(&site, limit).await
- })
- })
- .buffer_unordered(CONCURRENT_REQUESTS_LIMIT)
- .collect::<Vec<_>>()
- .await
- .into_iter()
- .map(|r| r.map_err(Error::from).and_then(|x| x))
- .collect::<Result<Vec<Vec<_>>>>()
- .map(|v| {
- let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
- if self.config.sites.len() > 1 {
- qs.sort_unstable_by_key(|q| -q.score);
- }
- qs
- })
- }
-
- /// Search against the SE site's /questions/{ids} endpoint.
- /// Filters out questions with no answers.
- async fn se_questions_site(
- &self,
- site: &str,
- ids: Vec<String>,
- ) -> Result<Vec<Question<String>>> {
- let total = ids.len().to_string();
- let endpoint = format!("questions/{ids}", ids = ids.join(";"));
- let qs = self
- .client
- .get(stackexchange_url(&endpoint))
- .header("Accepts", "application/json")
- .query(&self.get_default_se_opts())
- .query(&[("site", site), ("pagesize", &total), ("page", "1")])
- .send()
- .await?
- .json::<ResponseWrapper<Question<String>>>()
- .await?
- .items;
- Ok(Self::preprocess(qs))
- }
-
- /// Search against the SE site's /search/advanced endpoint with a given query.
- /// Only fetches questions that have at least one answer.
- async fn se_search_advanced_site(
- &self,
- site: &str,
- limit: u16,
- ) -> Result<Vec<Question<String>>> {
- let qs = self
- .client
- .get(stackexchange_url("search/advanced"))
- .header("Accepts", "application/json")
- .query(&self.get_default_se_opts())
- .query(&[
- ("q", self.query.as_str()),
- ("pagesize", &limit.to_string()),
- ("site", site),
- ("page", "1"),
- ("answers", "1"),
- ("order", "desc"),
- ("sort", "relevance"),
- ])
- .send()
- .await?
- .json::<ResponseWrapper<Question<String>>>()
- .await?
- .items;
- Ok(Self::preprocess(qs))
- }
-
- fn get_default_se_opts(&self) -> HashMap<&str, &str> {
- let mut params = HashMap::new();
- params.insert("filter", SE_FILTER);
- if let Some(key) = &self.config.api_key {
- params.insert("key", &key);
- }
- params
- }
-
- /// Sorts answers by score.
- /// Preprocesses SE markdown into "cmark" markdown (or something closer to it).
- /// This markdown preprocessing _always_ happens.
- fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
- qs.into_par_iter()
- .map(|q| {
- let mut answers = q.answers;
- answers.par_sort_unstable_by_key(|a| -a.score);
- let answers = answers
- .into_par_iter()
- .map(|a| Answer {
- body: markdown::preprocess(a.body.clone()),
- ..a
- })
- .collect();
- Question {
- answers,
- body: markdown::preprocess(q.body),
- ..q
- }
- })
- .collect::<Vec<_>>()
- }
-}
-
-/// Parse all markdown fields
-/// This only happens for content going into the cursive TUI (not lucky prompt)
-fn parse_markdown(qs: Vec<Question<String>>) -> Vec<Question<Markdown>> {
- qs.into_par_iter()
- .map(|q| {
- let body = markdown::parse(q.body);
- let answers = q
- .answers
- .into_par_iter()
- .map(|a| {
- let body = markdown::parse(a.body);
- Answer {
- body,
- id: a.id,
- score: a.score,
- is_accepted: a.is_accepted,
- }
- })
- .collect::<Vec<_>>();
- Question {
- body,
- answers,
- id: q.id,
- score: q.score,
- title: q.title,
- }
- })
- .collect::<Vec<_>>()
-}
-
-impl LocalStorage {
- fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
- if let Some(file) = utils::open_file(filename)? {
- return serde_json::from_reader(file)
- .map_err(|_| Error::MalformedFile(filename.clone()));
- }
- Ok(None)
- }
-
- // TODO decide whether or not I should give LocalStorage an api key..
- async fn fetch_remote_sites() -> Result<Vec<Site>> {
- let se_sites = Client::new()
- .get(stackexchange_url("sites"))
- .header("Accepts", "application/json")
- .query(&[
- ("pagesize", SE_SITES_PAGESIZE.to_string()),
- ("page", "1".to_string()),
- ])
- .send()
- .await?
- .json::<ResponseWrapper<Site>>()
- .await?
- .items;
- Ok(se_sites
- .into_par_iter()
- .map(|site| {
- let site_url = site.site_url.trim_start_matches("https://").to_string();
- Site { site_url, ..site }
- })
- .collect())
- }
-
- fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
- let file = utils::create_file(filename)?;
- serde_json::to_writer(file, sites)?;
- Ok(())
- }
-
- async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
- if !update {
- if let Some(sites) = Self::fetch_local_sites(filename)? {
- return Ok(sites);
- }
- }
- let sites = Self::fetch_remote_sites().await?;
- Self::store_local_sites(filename, &sites)?;
- Ok(sites)
- }
-
- pub async fn new(update: bool) -> Result<Self> {
- let project = project_dir()?;
- let dir = project.cache_dir();
- fs::create_dir_all(&dir)?;
- let sites_filename = dir.join("sites.json");
- let sites = Self::init_sites(&sites_filename, update).await?;
- Ok(LocalStorage { sites })
- }
-
- // TODO is this HM worth it? Probably will only ever have < 10 site codes to search...
- // maybe store this as Option<HM> on self if other methods use it...
- pub async fn find_invalid_site<'a, 'b>(
- &'b self,
- site_codes: &'a [String],
- ) -> Option<&'a String> {
- let hm: HashMap<&str, ()> = self
- .sites
- .iter()
- .map(|site| (site.api_site_parameter.as_str(), ()))
- .collect();
- site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
- }
-
- pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
- self.sites
- .iter()
- .filter_map(move |site| {
- let _ = site_codes
- .iter()
- .find(|&sc| *sc == site.api_site_parameter)?;
- Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
- })
- .collect()
- }
-}
-
-/// Creates stackexchange API url given endpoint
-// TODO lazy static this url parse
-fn stackexchange_url(path: &str) -> Url {
- let mut url = Url::parse(SE_API_URL).unwrap();
- url.path_segments_mut()
- .unwrap()
- .push(SE_API_VERSION)
- .extend(path.split('/'));
- url
-}
-
-/// Creates duckduckgo search url given sites and query
-/// See https://duckduckgo.com/params for more info
-fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
-where
- I: IntoIterator<Item = &'a String>,
-{
- let mut q = String::new();
- // Restrict to sites
- q.push('(');
- q.push_str(
- sites
- .into_iter()
- .map(|site| String::from("site:") + site)
- .collect::<Vec<_>>()
- .join(" OR ")
- .as_str(),
- );
- q.push_str(") ");
- // Search terms
- q.push_str(
- query
- .trim_end_matches('?')
- .split_whitespace()
- .collect::<Vec<_>>()
- .join(" ")
- .as_str(),
- );
- Url::parse_with_params(
- DUCKDUCKGO_URL,
- &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
- )
- .unwrap()
-}
-
-/// Parse (site, question_id) pairs out of duckduckgo search results html
-// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-fn parse_questions_from_ddg_html<'a>(
- html: &'a str,
- sites: &'a HashMap<String, String>,
- limit: u16,
-) -> Result<ScrapedData> {
- let fragment = Html::parse_document(html);
- let anchors = Selector::parse("a.result__a").unwrap();
- let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
- let mut ordering: HashMap<String, usize> = HashMap::new();
- let mut count = 0;
- for anchor in fragment.select(&anchors) {
- let url = anchor
- .value()
- .attr("href")
- .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
- .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
- sites
- .iter()
- .find_map(|(site_code, site_url)| {
- let id = question_url_to_id(site_url, &url)?;
- ordering.insert(id.to_owned(), count);
- match question_ids.entry(site_code.to_owned()) {
- Entry::Occupied(mut o) => o.get_mut().push(id),
- Entry::Vacant(o) => {
- o.insert(vec![id]);
- }
- }
- count += 1;
- Some(())
- })
- .ok_or_else(|| {
- Error::ScrapingError(
- "Duckduckgo returned results outside of SE network".to_string(),
- )
- })?;
- if count >= limit as usize {
- break;
- }
- }
- // It doesn't seem possible for DDG to return no results, so assume this is
- // a bad user agent
- if count == 0 {
- Err(Error::ScrapingError(String::from(
- "DuckDuckGo blocked this request",
- )))
- } else {
- Ok(ScrapedData {
- question_ids,
- ordering,
- })
- }
-}
-
-/// For example
-/// ```
-/// let site_url = "stackoverflow.com";
-/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
-/// assert_eq!(question_url_to_id(site_url, input), Some("11828270".to_string()))
-/// ```
-fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
- // TODO use strip_prefix once it's stable
- let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/";
- let ix = input.find(&fragment)? + fragment.len();
- let input = &input[ix..];
- let end = input.find('/')?;
- Some(input[0..end].to_string())
-}
-
-// TODO find a query that returns no results so that I can test it and
-// differentiate it from a blocked request
-#[cfg(test)]
-mod tests {
- use super::*;
- #[test]
- fn test_stackexchange_url() {
- assert_eq!(
- stackexchange_url("some/endpoint").as_str(),
- "http://api.stackexchange.com/2.2/some/endpoint"
- )
- }
-
- #[test]
- fn test_duckduckgo_url() {
- let q = "how do I exit vim?";
- let sites = vec![
- String::from("stackoverflow.com"),
- String::from("unix.stackexchange.com"),
- ];
- assert_eq!(
- duckduckgo_url(q, &sites).as_str(),
- String::from(
- "https://duckduckgo.com/\
- ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
- +how+do+I+exit+vim&kz=-1&kh=-1"
- )
- )
- }
-
- #[test]
- fn test_duckduckgo_response() {
- // TODO make sure results are either 1) answers 2) failed connection 3) blocked
- }
-
- #[test]
- fn test_duckduckgo_parser() {
- let html = include_str!("../test/exit-vim.html");
- let sites = vec![
- ("stackoverflow", "stackoverflow.com"),
- ("askubuntu", "askubuntu.com"),
- ]
- .into_iter()
- .map(|(k, v)| (k.to_string(), v.to_string()))
- .collect::<HashMap<String, String>>();
- let expected_scraped_data = ScrapedData {
- question_ids: vec![
- ("stackoverflow", vec!["11828270", "9171356"]),
- ("askubuntu", vec!["24406"]),
- ]
- .into_iter()
- .map(|(k, v)| {
- (
- k.to_string(),
- v.into_iter().map(|s| s.to_string()).collect(),
- )
- })
- .collect(),
- ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
- .into_iter()
- .map(|(k, v)| (k.to_string(), v))
- .collect(),
- };
- assert_eq!(
- parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
- expected_scraped_data
- );
- }
-
- #[test]
- fn test_duckduckgo_blocker() -> Result<(), String> {
- let html = include_str!("../test/bad-user-agent.html");
- let mut sites = HashMap::new();
- sites.insert(
- String::from("stackoverflow"),
- String::from("stackoverflow.com"),
- );
-
- match parse_questions_from_ddg_html(html, &sites, 2) {
- Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request" => {
- Ok(())
- }
- _ => Err(String::from("Failed to detect DuckDuckGo blocker")),
- }
- }
-
- #[test]
- fn test_question_url_to_id() {
- let site_url = "stackoverflow.com";
- let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
- assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270");
-
- let site_url = "stackoverflow.com";
- let input = "/l/?kh=-1&uddg=https://askubuntu.com/questions/24406/how-to-close-vim-from-the-command-line";
- assert_eq!(question_url_to_id(site_url, input), None);
- }
-}
diff --git a/src/stackexchange/api.rs b/src/stackexchange/api.rs
new file mode 100644
index 0000000..ff94de2
--- /dev/null
+++ b/src/stackexchange/api.rs
@@ -0,0 +1,201 @@
+use rayon::prelude::*;
+use reqwest::header;
+use reqwest::Client;
+use reqwest::Url;
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+
+use crate::error::Result;
+use crate::tui::markdown;
+
+/// StackExchange API v2.2 URL
+// TODO why not https?
+const SE_API_URL: &str = "http://api.stackexchange.com";
+const SE_API_VERSION: &str = "2.2";
+
+/// Filter generated to include only the fields needed to populate
+/// the structs below. Go here to make new filters:
+/// [create filter](https://api.stackexchange.com/docs/create-filter).
+const SE_FILTER: &str = ".DND5X2VHHUH8HyJzpjo)5NvdHI3w6auG";
+
+/// Pagesize when fetching all SE sites. Should be good for many years...
+const SE_SITES_PAGESIZE: u16 = 10000;
+
+/// Represents a StackExchange answer with a custom selection of fields from
+/// the [StackExchange docs](https://api.stackexchange.com/docs/types/answer)
+#[derive(Clone, Deserialize, Debug)]
+pub struct Answer<S> {
+ #[serde(rename = "answer_id")]
+ pub id: u32,
+ pub score: i32,
+ #[serde(rename = "body_markdown")]
+ pub body: S,
+ pub is_accepted: bool,
+}
+
+/// Represents a StackExchange question with a custom selection of fields from
+/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
+// TODO container over answers should be generic iterator
+#[derive(Clone, Deserialize, Debug)]
+pub struct Question<S> {
+ #[serde(rename = "question_id")]
+ pub id: u32,
+ pub score: i32,
+ pub answers: Vec<Answer<S>>,
+ pub title: String,
+ #[serde(rename = "body_markdown")]
+ pub body: S,
+}
+
+/// Internal struct that represents the boilerplate response wrapper from SE API.
+#[derive(Deserialize, Debug)]
+struct ResponseWrapper<T> {
+ items: Vec<T>,
+}
+
+#[derive(Deserialize, Serialize, Debug)]
+pub struct Site {
+ pub api_site_parameter: String,
+ pub site_url: String,
+}
+
+#[derive(Clone)]
+pub struct Api {
+ client: Client,
+ api_key: Option<String>,
+}
+
+impl Api {
+ pub fn new(api_key: Option<String>) -> Self {
+ // TODO can lazy_static this above
+ let mut headers = header::HeaderMap::new();
+ headers.insert(
+ header::ACCEPT,
+ header::HeaderValue::from_static("application/json"),
+ );
+ let client = Client::builder().default_headers(headers).build().unwrap();
+ Api { client, api_key }
+ }
+
+ /// Search against the SE site's /questions/{ids} endpoint.
+ /// Filters out questions with no answers.
+ pub async fn questions(&self, site: &str, ids: Vec<String>) -> Result<Vec<Question<String>>> {
+ let total = ids.len().to_string();
+ let endpoint = format!("questions/{ids}", ids = ids.join(";"));
+ let qs = self
+ .client
+ .get(stackexchange_url(&endpoint))
+ .query(&self.get_default_se_opts())
+ .query(&[("site", site), ("pagesize", &total)])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Question<String>>>()
+ .await?
+ .items;
+ Ok(Self::preprocess(qs))
+ }
+
+ /// Search against the SE site's /search/advanced endpoint with a given query.
+ /// Only fetches questions that have at least one answer.
+ pub async fn search_advanced(
+ &self,
+ query: &str,
+ site: &str,
+ limit: u16,
+ ) -> Result<Vec<Question<String>>> {
+ let qs = self
+ .client
+ .get(stackexchange_url("search/advanced"))
+ .query(&self.get_default_se_opts())
+ .query(&[
+ ("q", query),
+ ("pagesize", &limit.to_string()),
+ ("site", site),
+ ("answers", "1"),
+ ("order", "desc"),
+ ("sort", "relevance"),
+ ])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Question<String>>>()
+ .await?
+ .items;
+ Ok(Self::preprocess(qs))
+ }
+
+ pub async fn sites(&self) -> Result<Vec<Site>> {
+ let sites = self
+ .client
+ .get(stackexchange_url("sites"))
+ .query(&[("pagesize", SE_SITES_PAGESIZE.to_string())])
+ .send()
+ .await?
+ .json::<ResponseWrapper<Site>>()
+ .await?
+ .items;
+ Ok(sites
+ .into_par_iter()
+ .map(|site| {
+ let site_url = site.site_url.trim_start_matches("https://").to_string();
+ Site { site_url, ..site }
+ })
+ .collect())
+ }
+
+ fn get_default_se_opts(&self) -> HashMap<&str, &str> {
+ let mut params = HashMap::new();
+ params.insert("filter", SE_FILTER);
+ params.insert("page", "1");
+ if let Some(key) = &self.api_key {
+ params.insert("key", &key);
+ }
+ params
+ }
+
+ /// Sorts answers by score.
+ /// Preprocesses SE markdown into "cmark" markdown (or something closer to it).
+ /// This markdown preprocessing _always_ happens.
+ fn preprocess(qs: Vec<Question<String>>) -> Vec<Question<String>> {
+ qs.into_par_iter()
+ .map(|q| {
+ let mut answers = q.answers;
+ answers.par_sort_unstable_by_key(|a| -a.score);
+ let answers = answers
+ .into_par_iter()
+ .map(|a| Answer {
+ body: markdown::preprocess(a.body.clone()),
+ ..a
+ })
+ .collect();
+ Question {
+ answers,
+ body: markdown::preprocess(q.body),
+ ..q
+ }
+ })
+ .collect::<Vec<_>>()
+ }
+}
+
+/// Creates stackexchange API url given endpoint
+// TODO lazy static this url parse
+fn stackexchange_url(path: &str) -> Url {
+ let mut url = Url::parse(SE_API_URL).unwrap();
+ url.path_segments_mut()
+ .unwrap()
+ .push(SE_API_VERSION)
+ .extend(path.split('/'));
+ url
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ #[test]
+ fn test_stackexchange_url() {
+ assert_eq!(
+ stackexchange_url("some/endpoint").as_str(),
+ "http://api.stackexchange.com/2.2/some/endpoint"
+ )
+ }
+}
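For orientation, a minimal usage sketch of the new Api (assumes an async context and the crate's error::Result; the query, site, and limit values are illustrative):

    let api = Api::new(None); // pass Some(key) for a registered SE API key
    let questions = api
        .search_advanced("how do I exit vim", "stackoverflow", 5)
        .await?;
    for q in questions {
        println!("{} ({})", q.title, q.score);
    }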
diff --git a/src/stackexchange/local_storage.rs b/src/stackexchange/local_storage.rs
new file mode 100644
index 0000000..8d009f8
--- /dev/null
+++ b/src/stackexchange/local_storage.rs
@@ -0,0 +1,76 @@
+use std::collections::HashMap;
+use std::fs;
+use std::path::PathBuf;
+
+use crate::config::project_dir;
+use crate::error::{Error, Result};
+use crate::utils;
+
+use super::api::{Api, Site};
+
+/// This structure allows interacting with locally cached StackExchange metadata.
+pub struct LocalStorage {
+ pub sites: Vec<Site>,
+}
+
+impl LocalStorage {
+ fn fetch_local_sites(filename: &PathBuf) -> Result<Option<Vec<Site>>> {
+ if let Some(file) = utils::open_file(filename)? {
+ return serde_json::from_reader(file)
+ .map_err(|_| Error::MalformedFile(filename.clone()));
+ }
+ Ok(None)
+ }
+
+ fn store_local_sites(filename: &PathBuf, sites: &[Site]) -> Result<()> {
+ let file = utils::create_file(filename)?;
+ serde_json::to_writer(file, sites)?;
+ Ok(())
+ }
+
+ async fn init_sites(filename: &PathBuf, update: bool) -> Result<Vec<Site>> {
+ if !update {
+ if let Some(sites) = Self::fetch_local_sites(filename)? {
+ return Ok(sites);
+ }
+ }
+ let sites = Api::new(None).sites().await?;
+ Self::store_local_sites(filename, &sites)?;
+ Ok(sites)
+ }
+
+ pub async fn new(update: bool) -> Result<Self> {
+ let project = project_dir()?;
+ let dir = project.cache_dir();
+ fs::create_dir_all(&dir)?;
+ let sites_filename = dir.join("sites.json");
+ let sites = Self::init_sites(&sites_filename, update).await?;
+ Ok(LocalStorage { sites })
+ }
+
+ // TODO is this HM worth it? Probably will only ever have < 10 site codes to search...
+ // maybe store this as Option<HM> on self if other methods use it...
+ pub async fn find_invalid_site<'a, 'b>(
+ &'b self,
+ site_codes: &'a [String],
+ ) -> Option<&'a String> {
+ let hm: HashMap<&str, ()> = self
+ .sites
+ .iter()
+ .map(|site| (site.api_site_parameter.as_str(), ()))
+ .collect();
+ site_codes.iter().find(|s| !hm.contains_key(&s.as_str()))
+ }
+
+ pub fn get_urls(&self, site_codes: &[String]) -> HashMap<String, String> {
+ self.sites
+ .iter()
+ .filter_map(move |site| {
+ let _ = site_codes
+ .iter()
+ .find(|&sc| *sc == site.api_site_parameter)?;
+ Some((site.api_site_parameter.to_owned(), site.site_url.to_owned()))
+ })
+ .collect()
+ }
+}
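And a minimal usage sketch of LocalStorage (async context assumed; the site_codes value is illustrative):

    let ls = LocalStorage::new(false).await?; // false: reuse cached sites.json if present
    let site_codes = vec![String::from("stackoverflow")];
    if let Some(bad) = ls.find_invalid_site(&site_codes).await {
        eprintln!("invalid site code: {}", bad);
    }
    let urls = ls.get_urls(&site_codes); // maps site code -> site URL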