Diffstat (limited to 'src/stackexchange/scraper.rs')
-rw-r--r--  src/stackexchange/scraper.rs  228
1 file changed, 157 insertions(+), 71 deletions(-)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index e6376fa..2f62eb6 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -9,6 +9,7 @@ use crate::error::{Error, Result};
/// DuckDuckGo URL
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+const GOOGLE_URL: &str = "https://google.com/search";
// Is question_id unique across all sites? If not, then this edge case is
// unaccounted for when sorting.
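For orientation: the `Scraper` trait itself is defined elsewhere in this file and never appears in these hunks. The following is a reconstruction inferred from the two impl blocks below, so the exact signatures in the repository may differ.

```rust
use std::collections::HashMap;
use url::Url;
use crate::error::Result;

/// Inferred shape of the Scraper trait (sketch, not part of the patch).
/// ScrapedData is the struct built by parse_with_selector below, holding
/// question_ids (site code -> question ids) and ordering (id -> rank).
pub trait Scraper {
    /// Extract (site, question_id) data from a search results page.
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData>;

    /// Build the search URL for `query`, restricted to the given sites.
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>;
}
```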
@@ -40,60 +41,23 @@ pub struct DuckDuckGo;
impl Scraper for DuckDuckGo {
/// Parse (site, question_id) pairs out of duckduckgo search results html
- // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
- // TODO pull out parts that are composable across different engines
fn parse(
&self,
html: &str,
sites: &HashMap<String, String>,
limit: u16,
) -> Result<ScrapedData> {
- let fragment = Html::parse_document(html);
let anchors = Selector::parse("a.result__a").unwrap();
- let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
- let mut ordering: HashMap<String, usize> = HashMap::new();
- let mut count = 0;
- for anchor in fragment.select(&anchors) {
- let url = anchor
- .value()
- .attr("href")
- .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
- .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
- sites
- .iter()
- .find_map(|(site_code, site_url)| {
- let id = question_url_to_id(site_url, &url)?;
- ordering.insert(id.to_owned(), count);
- match question_ids.entry(site_code.to_owned()) {
- Entry::Occupied(mut o) => o.get_mut().push(id),
- Entry::Vacant(o) => {
- o.insert(vec![id]);
- }
- }
- count += 1;
- Some(())
- })
- .ok_or_else(|| {
- Error::ScrapingError(
- "Duckduckgo returned results outside of SE network".to_string(),
- )
- })?;
- if count >= limit as usize {
- break;
+ parse_with_selector(anchors, html, sites, limit).and_then(|sd| {
+ // DDG never seems to return empty results, so assume the request was blocked
+ if sd.question_ids.is_empty() {
+ Err(Error::ScrapingError(String::from(
+ "DuckDuckGo blocked this request",
+ )))
+ } else {
+ Ok(sd)
}
- }
- // It doesn't seem possible for DDG to return no results, so assume this is
- // a bad user agent
- if count == 0 {
- Err(Error::ScrapingError(String::from(
- "DuckDuckGo blocked this request",
- )))
- } else {
- Ok(ScrapedData {
- question_ids,
- ordering,
- })
- }
+ })
}
/// Creates duckduckgo search url given sites and query
@@ -102,27 +66,7 @@ impl Scraper for DuckDuckGo {
where
I: IntoIterator<Item = &'a String>,
{
- let mut q = String::new();
- // Restrict to sites
- q.push('(');
- q.push_str(
- sites
- .into_iter()
- .map(|site| String::from("site:") + site)
- .collect::<Vec<_>>()
- .join(" OR ")
- .as_str(),
- );
- q.push_str(") ");
- // Search terms
- q.push_str(
- query
- .trim_end_matches('?')
- .split_whitespace()
- .collect::<Vec<_>>()
- .join(" ")
- .as_str(),
- );
+ let q = make_query_arg(query, sites);
Url::parse_with_params(
DUCKDUCKGO_URL,
&[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
@@ -131,6 +75,107 @@ impl Scraper for DuckDuckGo {
}
}
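As a usage sketch (illustrative, not part of the patch), this is roughly what the DuckDuckGo `get_url` yields; note that in the real call site the sites come from a HashMap, so the OR order inside the query is unspecified.

```rust
let sites = vec![
    String::from("stackoverflow.com"),
    String::from("askubuntu.com"),
];
let url = DuckDuckGo.get_url("how do I exit vim?", &sites);
// q carries "(site:stackoverflow.com OR site:askubuntu.com) how do I exit vim"
// (percent-encoded by Url::parse_with_params); kz=-1 and kh=-1 are DDG
// settings parameters, per https://duckduckgo.com/params.
assert_eq!(url.host_str(), Some("duckduckgo.com"));
```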
+pub struct Google;
+
+impl Scraper for Google {
+ /// Parse SE data out of google search results html
+ fn parse(
+ &self,
+ html: &str,
+ sites: &HashMap<String, String>,
+ limit: u16,
+ ) -> Result<ScrapedData> {
+ let anchors = Selector::parse("div.r > a").unwrap();
+ // TODO detect no results
+ // TODO detect blocked request
+ parse_with_selector(anchors, html, sites, limit)
+ }
+
+ /// Creates google search url given sites and query
+ fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+ where
+ I: IntoIterator<Item = &'a String>,
+ {
+ let q = make_query_arg(query, sites);
+ Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap()
+ }
+}
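The same sketch for Google (again illustrative):

```rust
let sites = vec![String::from("stackoverflow.com")];
let url = Google.get_url("how do I exit vim?", &sites);
// GOOGLE_URL already carries the /search path; only q is appended here,
// unlike the DDG variant, which also sets kz/kh.
assert_eq!(url.host_str(), Some("google.com"));
assert_eq!(url.path(), "/search");
```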
+
+fn make_query_arg<'a, I>(query: &str, sites: I) -> String
+where
+ I: IntoIterator<Item = &'a String>,
+{
+ let mut q = String::new();
+ // Restrict to sites
+ q.push('(');
+ q.push_str(
+ sites
+ .into_iter()
+ .map(|site| String::from("site:") + site)
+ .collect::<Vec<_>>()
+ .join(" OR ")
+ .as_str(),
+ );
+ q.push_str(") ");
+ // Search terms
+ q.push_str(
+ query
+ .trim_end_matches('?')
+ .split_whitespace()
+ .collect::<Vec<_>>()
+ .join(" ")
+ .as_str(),
+ );
+ q
+}
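A quick illustration of what `make_query_arg` produces: the trailing `?` is stripped and interior whitespace collapsed before the terms are appended to the `site:` restriction (values below are illustrative, not from the test fixtures).

```rust
let sites = vec![
    String::from("stackoverflow.com"),
    String::from("askubuntu.com"),
];
let q = make_query_arg("how do I exit  vim?", &sites);
assert_eq!(
    q,
    "(site:stackoverflow.com OR site:askubuntu.com) how do I exit vim"
);
```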
+
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+fn parse_with_selector(
+ anchors: Selector,
+ html: &str,
+ sites: &HashMap<String, String>,
+ limit: u16,
+) -> Result<ScrapedData> {
+ let fragment = Html::parse_document(html);
+ let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut ordering: HashMap<String, usize> = HashMap::new();
+ let mut count = 0;
+ for anchor in fragment.select(&anchors) {
+ let url = anchor
+ .value()
+ .attr("href")
+ .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+ .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+ sites
+ .iter()
+ .find_map(|(site_code, site_url)| {
+ let id = question_url_to_id(site_url, &url)?;
+ ordering.insert(id.to_owned(), count);
+ match question_ids.entry(site_code.to_owned()) {
+ Entry::Occupied(mut o) => o.get_mut().push(id),
+ Entry::Vacant(o) => {
+ o.insert(vec![id]);
+ }
+ }
+ count += 1;
+ Some(())
+ })
+ .ok_or_else(|| {
+ Error::ScrapingError(
+ "Search engine returned results outside of SE network".to_string(),
+ )
+ })?;
+ if count >= limit as usize {
+ break;
+ }
+ }
+ Ok(ScrapedData {
+ question_ids,
+ ordering,
+ })
+}
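The TODO above suggests running a regex over the decoded URL instead of walking the DOM per site. Purely as a hypothetical sketch of that alternative (the `regex` crate, helper name, and capture layout are assumptions, not part of this patch):

```rust
use regex::Regex;

// Hypothetical: pull the question id out of an already-decoded result URL,
// e.g. "https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor".
// For the speedup the TODO is after, the Regex should be compiled once per
// site (e.g. cached in a HashMap) rather than rebuilt per anchor as here.
fn question_id_via_regex(site_url: &str, url: &str) -> Option<String> {
    let pattern = format!(r"{}/questions/(\d+)", regex::escape(site_url));
    let re = Regex::new(&pattern).ok()?;
    re.captures(url)?.get(1).map(|m| m.as_str().to_owned())
}
```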
+
/// For example
/// ```
/// let site_url = "stackoverflow.com";
@@ -169,7 +214,7 @@ mod tests {
#[test]
fn test_duckduckgo_parser() {
- let html = include_str!("../../test/exit-vim.html");
+ let html = include_str!("../../test/duckduckgo/exit-vim.html");
let sites = vec![
("stackoverflow", "stackoverflow.com"),
("askubuntu", "askubuntu.com"),
@@ -196,21 +241,55 @@ mod tests {
.collect(),
};
assert_eq!(
- SearchEngine::DuckDuckGo.parse(html, &sites, 3).unwrap(),
+ DuckDuckGo.parse(html, &sites, 3).unwrap(),
+ expected_scraped_data
+ );
+ }
+
+ #[test]
+ fn test_google_parser() {
+ let html = include_str!("../../test/google/exit-vim.html");
+ let sites = vec![
+ ("stackoverflow", "stackoverflow.com"),
+ ("askubuntu", "askubuntu.com"),
+ ]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v.to_string()))
+ .collect::<HashMap<String, String>>();
+ let expected_scraped_data = ScrapedData {
+ question_ids: vec![
+ ("stackoverflow", vec!["11828270", "25919461"]),
+ ("askubuntu", vec!["24406"]),
+ ]
+ .into_iter()
+ .map(|(k, v)| {
+ (
+ k.to_string(),
+ v.into_iter().map(|s| s.to_string()).collect(),
+ )
+ })
+ .collect(),
+ ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v))
+ .collect(),
+ };
+ assert_eq!(
+ Google.parse(html, &sites, 3).unwrap(),
expected_scraped_data
);
}
#[test]
fn test_duckduckgo_blocker() -> Result<(), String> {
- let html = include_str!("../../test/bad-user-agent.html");
+ let html = include_str!("../../test/duckduckgo/bad-user-agent.html");
let mut sites = HashMap::new();
sites.insert(
String::from("stackoverflow"),
String::from("stackoverflow.com"),
);
- match SearchEngine::DuckDuckGo.parse(html, &sites, 2) {
+ match DuckDuckGo.parse(html, &sites, 2) {
Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => {
Ok(())
}
@@ -219,6 +298,13 @@ mod tests {
}
#[test]
+ // TODO Get a blocked request html
+ // note: this may only be possible at search.rs level (with non-200 code)
+ fn test_google_blocker() -> Result<(), String> {
+ Ok(())
+ }
+
+ #[test]
fn test_question_url_to_id() {
let site_url = "stackoverflow.com";
let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";