From 5625602e711ceab71bdace19c239c1972fc6ac4d Mon Sep 17 00:00:00 2001
From: Sam Tay
Date: Wed, 24 Jun 2020 13:09:03 -0700
Subject: Add google search engine

---
 src/cli.rs                   |   2 +-
 src/config.rs                |   3 +-
 src/stackexchange/scraper.rs | 228 +++++++++++++++++++++++++++++--------------
 src/stackexchange/search.rs  |   3 +-
 4 files changed, 162 insertions(+), 74 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index 715d62e..1892066 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -90,7 +90,7 @@ pub fn get_opts() -> Result<Opts> {
                 .takes_value(true)
                 .default_value(engine)
                 .value_name("engine")
-                .possible_values(&["duckduckgo", "stackexchange"])
+                .possible_values(&["duckduckgo", "google", "stackexchange"])
                 .help("Use specified search engine")
                 .next_line_help(true),
         )
diff --git a/src/config.rs b/src/config.rs
index 3102a87..0154cda 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -12,7 +12,7 @@ use crate::utils;
 #[serde(rename_all = "lowercase")] // TODO test this
 pub enum SearchEngine {
     DuckDuckGo,
-    //Google,
+    Google,
     StackExchange,
 }
 
@@ -30,6 +30,7 @@ impl fmt::Display for SearchEngine {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let s = match &self {
             SearchEngine::DuckDuckGo => "duckduckgo",
+            SearchEngine::Google => "google",
             SearchEngine::StackExchange => "stackexchange",
         };
         write!(f, "{}", s)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index e6376fa..2f62eb6 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -9,6 +9,7 @@ use crate::error::{Error, Result};
 
 /// DuckDuckGo URL
 const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+const GOOGLE_URL: &str = "https://google.com/search";
 
 // Is question_id unique across all sites? If not, then this edge case is
 // unaccounted for when sorting.
@@ -40,60 +41,23 @@ pub struct DuckDuckGo;
 
 impl Scraper for DuckDuckGo {
     /// Parse (site, question_id) pairs out of duckduckgo search results html
-    // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-    // TODO pull out parts that are composable across different engines
     fn parse(
         &self,
         html: &str,
         sites: &HashMap<String, String>,
         limit: u16,
     ) -> Result<ScrapedData> {
-        let fragment = Html::parse_document(html);
         let anchors = Selector::parse("a.result__a").unwrap();
-        let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
-        let mut ordering: HashMap<String, usize> = HashMap::new();
-        let mut count = 0;
-        for anchor in fragment.select(&anchors) {
-            let url = anchor
-                .value()
-                .attr("href")
-                .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
-                .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
-            sites
-                .iter()
-                .find_map(|(site_code, site_url)| {
-                    let id = question_url_to_id(site_url, &url)?;
-                    ordering.insert(id.to_owned(), count);
-                    match question_ids.entry(site_code.to_owned()) {
-                        Entry::Occupied(mut o) => o.get_mut().push(id),
-                        Entry::Vacant(o) => {
-                            o.insert(vec![id]);
-                        }
-                    }
-                    count += 1;
-                    Some(())
-                })
-                .ok_or_else(|| {
-                    Error::ScrapingError(
-                        "Duckduckgo returned results outside of SE network".to_string(),
-                    )
-                })?;
-            if count >= limit as usize {
-                break;
+        parse_with_selector(anchors, html, sites, limit).and_then(|sd| {
+            // DDG seems to never have empty results, so assume this is blocked
+            if sd.question_ids.is_empty() {
+                Err(Error::ScrapingError(String::from(
+                    "DuckDuckGo blocked this request",
+                )))
+            } else {
+                Ok(sd)
             }
-        }
-        // It doesn't seem possible for DDG to return no results, so assume this is
-        // a bad user agent
-        if count == 0 {
-            Err(Error::ScrapingError(String::from(
-                "DuckDuckGo blocked this request",
-            )))
-        } else {
-            Ok(ScrapedData {
-                question_ids,
-                ordering,
-            })
-        }
+        })
     }
 
     /// Creates duckduckgo search url given sites and query
@@ -102,27 +66,7 @@ impl Scraper for DuckDuckGo {
     where
         I: IntoIterator<Item = &'a str>,
     {
-        let mut q = String::new();
-        // Restrict to sites
-        q.push('(');
-        q.push_str(
-            sites
-                .into_iter()
-                .map(|site| String::from("site:") + site)
-                .collect::<Vec<_>>()
-                .join(" OR ")
-                .as_str(),
-        );
-        q.push_str(") ");
-        // Search terms
-        q.push_str(
-            query
-                .trim_end_matches('?')
-                .split_whitespace()
-                .collect::<Vec<_>>()
-                .join(" ")
-                .as_str(),
-        );
+        let q = make_query_arg(query, sites);
         Url::parse_with_params(
             DUCKDUCKGO_URL,
             &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
         )
         .unwrap()
     }
 }
@@ -131,6 +75,107 @@
 
+pub struct Google;
+
+impl Scraper for Google {
+    /// Parse SE data out of google search results html
+    fn parse(
+        &self,
+        html: &str,
+        sites: &HashMap<String, String>,
+        limit: u16,
+    ) -> Result<ScrapedData> {
+        let anchors = Selector::parse("div.r > a").unwrap();
+        // TODO detect no results
+        // TODO detect blocked request
+        parse_with_selector(anchors, html, sites, limit)
+    }
+
+    /// Creates google search url given sites and query
+    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+    where
+        I: IntoIterator<Item = &'a str>,
+    {
+        let q = make_query_arg(query, sites);
+        Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap()
+    }
+}
+
+fn make_query_arg<'a, I>(query: &str, sites: I) -> String
+where
+    I: IntoIterator<Item = &'a str>,
+{
+    let mut q = String::new();
+    // Restrict to sites
+    q.push('(');
+    q.push_str(
+        sites
+            .into_iter()
+            .map(|site| String::from("site:") + site)
+            .collect::<Vec<_>>()
+            .join(" OR ")
+            .as_str(),
+    );
+    q.push_str(") ");
+    // Search terms
+    q.push_str(
+        query
+            .trim_end_matches('?')
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ")
+            .as_str(),
+    );
+    q
+}
+
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+fn parse_with_selector(
+    anchors: Selector,
+    html: &str,
+    sites: &HashMap<String, String>,
+    limit: u16,
+) -> Result<ScrapedData> {
+    let fragment = Html::parse_document(html);
+    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut ordering: HashMap<String, usize> = HashMap::new();
+    let mut count = 0;
+    for anchor in fragment.select(&anchors) {
+        let url = anchor
+            .value()
+            .attr("href")
+            .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+            .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+        sites
+            .iter()
+            .find_map(|(site_code, site_url)| {
+                let id = question_url_to_id(site_url, &url)?;
+                ordering.insert(id.to_owned(), count);
+                match question_ids.entry(site_code.to_owned()) {
+                    Entry::Occupied(mut o) => o.get_mut().push(id),
+                    Entry::Vacant(o) => {
+                        o.insert(vec![id]);
+                    }
+                }
+                count += 1;
+                Some(())
+            })
+            .ok_or_else(|| {
+                Error::ScrapingError(
+                    "Search engine returned results outside of SE network".to_string(),
+                )
+            })?;
+        if count >= limit as usize {
+            break;
+        }
+    }
+    Ok(ScrapedData {
+        question_ids,
+        ordering,
+    })
+}
+
 /// For example
 /// ```
 /// let id = "stackoverflow.com";
@@ -169,7 +214,7 @@ mod tests {
 
     #[test]
     fn test_duckduckgo_parser() {
-        let html = include_str!("../../test/exit-vim.html");
+        let html = include_str!("../../test/duckduckgo/exit-vim.html");
         let sites = vec![
             ("stackoverflow", "stackoverflow.com"),
             ("askubuntu", "askubuntu.com"),
@@ -196,21 +241,55 @@
             .collect(),
         };
         assert_eq!(
-            SearchEngine::DuckDuckGo.parse(html, &sites, 3).unwrap(),
+            DuckDuckGo.parse(html, &sites, 3).unwrap(),
+            expected_scraped_data
+        );
+    }
+
+    #[test]
+    fn test_google_parser() {
+        let html = include_str!("../../test/google/exit-vim.html");
+        let sites = vec![
+            ("stackoverflow", "stackoverflow.com"),
+            ("askubuntu", "askubuntu.com"),
+        ]
+        .into_iter()
+        .map(|(k, v)| (k.to_string(), v.to_string()))
+        .collect::<HashMap<String, String>>();
+        let expected_scraped_data = ScrapedData {
+            question_ids: vec![
+                ("stackoverflow", vec!["11828270", "25919461"]),
+                ("askubuntu", vec!["24406"]),
+            ]
+            .into_iter()
+            .map(|(k, v)| {
+                (
+                    k.to_string(),
+                    v.into_iter().map(|s| s.to_string()).collect(),
+                )
+            })
+            .collect(),
+            ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)]
+                .into_iter()
+                .map(|(k, v)| (k.to_string(), v))
+                .collect(),
+        };
+        assert_eq!(
+            Google.parse(html, &sites, 3).unwrap(),
             expected_scraped_data
         );
     }
 
     #[test]
     fn test_duckduckgo_blocker() -> Result<(), String> {
-        let html = include_str!("../../test/bad-user-agent.html");
+        let html = include_str!("../../test/duckduckgo/bad-user-agent.html");
         let mut sites = HashMap::new();
         sites.insert(
             String::from("stackoverflow"),
             String::from("stackoverflow.com"),
         );
-        match SearchEngine::DuckDuckGo.parse(html, &sites, 2) {
+        match DuckDuckGo.parse(html, &sites, 2) {
             Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => {
                 Ok(())
             }
@@ -218,6 +297,13 @@
         }
     }
 
+    #[test]
+    // TODO Get a blocked request html
+    // note: this may only be possible at search.rs level (with non-200 code)
+    fn test_google_blocker() -> Result<(), String> {
+        Ok(())
+    }
+
     #[test]
     fn test_question_url_to_id() {
         let site_url = "stackoverflow.com";
diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs
index 530b665..acfbcc7 100644
--- a/src/stackexchange/search.rs
+++ b/src/stackexchange/search.rs
@@ -11,7 +11,7 @@ use crate::tui::markdown::Markdown;
 
 use super::api::{Answer, Api, Question};
 use super::local_storage::LocalStorage;
-use super::scraper::{DuckDuckGo, ScrapedData, Scraper};
+use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper};
 
 /// Limit on concurrent requests (gets passed to `buffer_unordered`)
 const CONCURRENT_REQUESTS_LIMIT: usize = 8;
@@ -84,6 +84,7 @@ impl Search {
     pub async fn search(&self) -> Result<Vec<Question<Markdown>>> {
         match self.config.search_engine {
             SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await,
+            SearchEngine::Google => self.search_by_scraper(Google).await,
             SearchEngine::StackExchange => self.parallel_search_advanced().await,
         }
     }
-- 
cgit v1.2.3