Refactor search engine types

Still not sure exactly what the idiomatic representation is here
author: Sam Tay <sam.chong.tay@gmail.com> 2020-06-24 02:36:26 -0700
committer: Sam Tay <sam.chong.tay@gmail.com> 2020-06-24 02:39:39 -0700
commit: 95f429041ee505497f36530e1895c2ea3554d37b (patch)
tree: 93ae0d2fdbc85dc58884d75981cafe2d33640edb
parent: caa03bb164e40827e0d5f3f35522ac3cabc1e348 (diff)
4 files changed, 98 insertions, 97 deletions
diff --git a/TODO.md b/TODO.md
index 083d492..0e6be2a 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,17 +1,12 @@
 # TODO
 
 ### v0.3.1
-0. Refactor the enum/struct for search engines
 1. Much of the code can be reused for google:
     * parsing href after `"url="` (similar to uddg)
     * formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux`
   So make a `Scraper` trait and implement it for DDG & Google. Then
   `stackexchange` can just code against `Scraper` and choose based on
   `--search-engine | -e' argument`
-2. Maybe reorganize to
-   - stackexchange
-     - api
-     - scraper
 
 ### Endless future improvements for the TUI
 1. Init with smaller layout depending on initial screen size.
diff --git a/src/main.rs b/src/main.rs
index ac176fc..e023d30 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,19 +82,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
     }
 
     if let Some(q) = opts.query {
-        let mut se = Search::new(config, ls, q);
+        let mut search = Search::new(config, ls, q);
         if lucky {
-            let md = se.search_lucky().await?;
+            let md = search.search_lucky().await?;
             skin.print_text(&md);
             skin.print_text("\nPress **[SPACE]** to see more results, or any other key to exit");
             // Kick off the rest of the search in the background
-            let qs = task::spawn(async move { se.search_md().await });
+            let qs = task::spawn(async move { search.search_md().await });
             if !utils::wait_for_char(' ')? {
                 return Ok(None);
             }
             return Ok(Some(qs.await.unwrap()?));
         } else {
-            return Ok(Some(se.search_md().await?));
+            return Ok(Some(search.search_md().await?));
         }
     }
     Ok(None)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index 53ac08b..a0d29ab 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -10,7 +10,6 @@ use crate::error::{Error, Result};
 /// DuckDuckGo URL
 const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
 
-// TODO Should there be separate Unit-type structs for each one? With separate implementations?
 pub enum SearchEngine {
     DuckDuckGo,
 }
@@ -29,6 +28,7 @@ pub struct ScrapedData {
     pub ordering: HashMap<String, usize>,
 }
 
+// TODO add this type system limitation to blog post
 pub trait Scraper {
     /// Parse data from search results html
     fn parse(&self, html: &str, sites: &HashMap<String, String>, limit: u16)
@@ -48,7 +48,7 @@ impl Scraper for SearchEngine {
         limit: u16,
     ) -> Result<ScrapedData> {
         match &self {
-            SearchEngine::DuckDuckGo => parse_duckduckgo(html, sites, limit),
+            Self::DuckDuckGo => DuckDuckGo.parse(html, sites, limit),
         }
     }
     fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
@@ -56,64 +56,103 @@ impl Scraper for SearchEngine {
         I: IntoIterator<Item = &'a String>,
     {
         match &self {
-            SearchEngine::DuckDuckGo => duckduckgo_url(query, sites),
+            Self::DuckDuckGo => DuckDuckGo.get_url(query, sites),
         }
     }
 }
 
-/// Parse (site, question_id) pairs out of duckduckgo search results html
-// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-// TODO pull out parts that are composable across different engines
-fn parse_duckduckgo<'a>(
-    html: &'a str,
-    sites: &'a HashMap<String, String>,
-    limit: u16,
-) -> Result<ScrapedData> {
-    let fragment = Html::parse_document(html);
-    let anchors = Selector::parse("a.result__a").unwrap();
-    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
-    let mut ordering: HashMap<String, usize> = HashMap::new();
-    let mut count = 0;
-    for anchor in fragment.select(&anchors) {
-        let url = anchor
-            .value()
-            .attr("href")
-            .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
-            .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
-        sites
-            .iter()
-            .find_map(|(site_code, site_url)| {
-                let id = question_url_to_id(site_url, &url)?;
-                ordering.insert(id.to_owned(), count);
-                match question_ids.entry(site_code.to_owned()) {
-                    Entry::Occupied(mut o) => o.get_mut().push(id),
-                    Entry::Vacant(o) => {
-                        o.insert(vec![id]);
+struct DuckDuckGo;
+
+impl Scraper for DuckDuckGo {
+    /// Parse (site, question_id) pairs out of duckduckgo search results html
+    // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+    // TODO pull out parts that are composable across different engines
+    fn parse(
+        &self,
+        html: &str,
+        sites: &HashMap<String, String>,
+        limit: u16,
+    ) -> Result<ScrapedData> {
+        let fragment = Html::parse_document(html);
+        let anchors = Selector::parse("a.result__a").unwrap();
+        let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+        let mut ordering: HashMap<String, usize> = HashMap::new();
+        let mut count = 0;
+        for anchor in fragment.select(&anchors) {
+            let url = anchor
+                .value()
+                .attr("href")
+                .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+                .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+            sites
+                .iter()
+                .find_map(|(site_code, site_url)| {
+                    let id = question_url_to_id(site_url, &url)?;
+                    ordering.insert(id.to_owned(), count);
+                    match question_ids.entry(site_code.to_owned()) {
+                        Entry::Occupied(mut o) => o.get_mut().push(id),
+                        Entry::Vacant(o) => {
+                            o.insert(vec![id]);
+                        }
                     }
-                }
-                count += 1;
-                Some(())
+                    count += 1;
+                    Some(())
+                })
+                .ok_or_else(|| {
+                    Error::ScrapingError(
+                        "Duckduckgo returned results outside of SE network".to_string(),
+                    )
+                })?;
+            if count >= limit as usize {
+                break;
+            }
+        }
+        // It doesn't seem possible for DDG to return no results, so assume this is
+        // a bad user agent
+        if count == 0 {
+            Err(Error::ScrapingError(String::from(
+                "DuckDuckGo blocked this request",
+            )))
+        } else {
+            Ok(ScrapedData {
+                question_ids,
+                ordering,
             })
-            .ok_or_else(|| {
-                Error::ScrapingError(
-                    "Duckduckgo returned results outside of SE network".to_string(),
-                )
-            })?;
-        if count >= limit as usize {
-            break;
         }
     }
-    // It doesn't seem possible for DDG to return no results, so assume this is
-    // a bad user agent
-    if count == 0 {
-        Err(Error::ScrapingError(String::from(
-            "DuckDuckGo blocked this request",
-        )))
-    } else {
-        Ok(ScrapedData {
-            question_ids,
-            ordering,
-        })
+
+    /// Creates duckduckgo search url given sites and query
+    /// See https://duckduckgo.com/params for more info
+    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+    where
+        I: IntoIterator<Item = &'a String>,
+    {
+        let mut q = String::new();
+        //  Restrict to sites
+        q.push('(');
+        q.push_str(
+            sites
+                .into_iter()
+                .map(|site| String::from("site:") + site)
+                .collect::<Vec<_>>()
+                .join(" OR ")
+                .as_str(),
+        );
+        q.push_str(") ");
+        //  Search terms
+        q.push_str(
+            query
+                .trim_end_matches('?')
+                .split_whitespace()
+                .collect::<Vec<_>>()
+                .join(" ")
+                .as_str(),
+        );
+        Url::parse_with_params(
+            DUCKDUCKGO_URL,
+            &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
+        )
+        .unwrap()
     }
 }
 
@@ -132,40 +171,6 @@ fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
     Some(input[0..end].to_string())
 }
 
-/// Creates duckduckgo search url given sites and query
-/// See https://duckduckgo.com/params for more info
-fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
-where
-    I: IntoIterator<Item = &'a String>,
-{
-    let mut q = String::new();
-    //  Restrict to sites
-    q.push('(');
-    q.push_str(
-        sites
-            .into_iter()
-            .map(|site| String::from("site:") + site)
-            .collect::<Vec<_>>()
-            .join(" OR ")
-            .as_str(),
-    );
-    q.push_str(") ");
-    //  Search terms
-    q.push_str(
-        query
-            .trim_end_matches('?')
-            .split_whitespace()
-            .collect::<Vec<_>>()
-            .join(" ")
-            .as_str(),
-    );
-    Url::parse_with_params(
-        DUCKDUCKGO_URL,
-        &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
-    )
-    .unwrap()
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -178,7 +183,7 @@ mod tests {
             String::from("unix.stackexchange.com"),
         ];
         assert_eq!(
-            duckduckgo_url(q, &sites).as_str(),
+            DuckDuckGo.get_url(q, &sites).as_str(),
             String::from(
                 "https://duckduckgo.com/\
                 ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs
index 8e40cb4..ed89e15 100644
--- a/src/stackexchange/search.rs
+++ b/src/stackexchange/search.rs
@@ -23,6 +23,7 @@ const USER_AGENT: &str =
 
 /// This structure provides methods to search queries and get StackExchange
 /// questions/answers in return.
+// TODO this really needs a better name...
 #[derive(Clone)]
 pub struct Search {
     api: Api,
@@ -89,7 +90,7 @@ impl Search {
     }
 
     /// Search query at duckduckgo and then fetch the resulting questions from SE.
-    async fn search_by_engine(&self, search_engine: SearchEngine) -> Result<Vec<Question<String>>> {
+    async fn search_by_engine(&self, search_engine: impl Scraper) -> Result<Vec<Question<String>>> {
         let url = search_engine.get_url(&self.query, self.sites.values());
         let html = Client::new()
             .get(url)
author	Sam Tay <sam.chong.tay@gmail.com>	2020-06-24 02:36:26 -0700
committer	Sam Tay <sam.chong.tay@gmail.com>	2020-06-24 02:39:39 -0700
commit	95f429041ee505497f36530e1895c2ea3554d37b (patch)
tree	93ae0d2fdbc85dc58884d75981cafe2d33640edb
parent	caa03bb164e40827e0d5f3f35522ac3cabc1e348 (diff)