From 5625602e711ceab71bdace19c239c1972fc6ac4d Mon Sep 17 00:00:00 2001
From: Sam Tay
Date: Wed, 24 Jun 2020 13:09:03 -0700
Subject: Add google search engine

---
 src/cli.rs                   |   2 +-
 src/config.rs                |   3 +-
 src/stackexchange/scraper.rs | 228 +++++++++++++++++++++++++++++--------------
 src/stackexchange/search.rs  |   3 +-
 4 files changed, 162 insertions(+), 74 deletions(-)

diff --git a/src/cli.rs b/src/cli.rs
index 715d62e..1892066 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -90,7 +90,7 @@ pub fn get_opts() -> Result<Opts> {
                 .takes_value(true)
                 .default_value(engine)
                 .value_name("engine")
-                .possible_values(&["duckduckgo", "stackexchange"])
+                .possible_values(&["duckduckgo", "google", "stackexchange"])
                 .help("Use specified search engine")
                 .next_line_help(true),
         )
diff --git a/src/config.rs b/src/config.rs
index 3102a87..0154cda 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -12,7 +12,7 @@ use crate::utils;
 #[serde(rename_all = "lowercase")] // TODO test this
 pub enum SearchEngine {
     DuckDuckGo,
-    //Google,
+    Google,
     StackExchange,
 }
 
@@ -30,6 +30,7 @@ impl fmt::Display for SearchEngine {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         let s = match &self {
             SearchEngine::DuckDuckGo => "duckduckgo",
+            SearchEngine::Google => "google",
             SearchEngine::StackExchange => "stackexchange",
         };
         write!(f, "{}", s)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index e6376fa..2f62eb6 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -9,6 +9,7 @@ use crate::error::{Error, Result};
 
 /// DuckDuckGo URL
 const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+const GOOGLE_URL: &str = "https://google.com/search";
 
 // Is question_id unique across all sites? If not, then this edge case is
 // unaccounted for when sorting.
@@ -40,60 +41,23 @@ pub struct DuckDuckGo;
 
 impl Scraper for DuckDuckGo {
     /// Parse (site, question_id) pairs out of duckduckgo search results html
-    // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-    // TODO pull out parts that are composable across different engines
     fn parse(
         &self,
         html: &str,
         sites: &HashMap<String, String>,
         limit: u16,
     ) -> Result<ScrapedData> {
-        let fragment = Html::parse_document(html);
         let anchors = Selector::parse("a.result__a").unwrap();
-        let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
-        let mut ordering: HashMap<String, usize> = HashMap::new();
-        let mut count = 0;
-        for anchor in fragment.select(&anchors) {
-            let url = anchor
-                .value()
-                .attr("href")
-                .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
-                .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
-            sites
-                .iter()
-                .find_map(|(site_code, site_url)| {
-                    let id = question_url_to_id(site_url, &url)?;
-                    ordering.insert(id.to_owned(), count);
-                    match question_ids.entry(site_code.to_owned()) {
-                        Entry::Occupied(mut o) => o.get_mut().push(id),
-                        Entry::Vacant(o) => {
-                            o.insert(vec![id]);
-                        }
-                    }
-                    count += 1;
-                    Some(())
-                })
-                .ok_or_else(|| {
-                    Error::ScrapingError(
-                        "Duckduckgo returned results outside of SE network".to_string(),
-                    )
-                })?;
-            if count >= limit as usize {
-                break;
+        parse_with_selector(anchors, html, sites, limit).and_then(|sd| {
+            // DDG seems to never have empty results, so assume this is blocked
+            if sd.question_ids.is_empty() {
+                Err(Error::ScrapingError(String::from(
+                    "DuckDuckGo blocked this request",
+                )))
+            } else {
+                Ok(sd)
             }
-        }
-        // It doesn't seem possible for DDG to return no results, so assume this is
-        // a bad user agent
-        if count == 0 {
-            Err(Error::ScrapingError(String::from(
-                "DuckDuckGo blocked this request",
-            )))
-        } else {
-            Ok(ScrapedData {
-                question_ids,
-                ordering,
-            })
-        }
+        })
     }
 
     /// Creates duckduckgo search url given sites and query
@@ -102,27 +66,7 @@ impl Scraper for DuckDuckGo {
     where
         I: IntoIterator<Item = &'a str>,
     {
-        let mut q = String::new();
-        // Restrict to sites
-        q.push('(');
-        q.push_str(
-            sites
-                .into_iter()
-                .map(|site| String::from("site:") + site)
-                .collect::<Vec<_>>()
-                .join(" OR ")
-                .as_str(),
-        );
-        q.push_str(") ");
-        // Search terms
-        q.push_str(
-            query
-                .trim_end_matches('?')
-                .split_whitespace()
-                .collect::<Vec<_>>()
-                .join(" ")
-                .as_str(),
-        );
+        let q = make_query_arg(query, sites);
         Url::parse_with_params(
             DUCKDUCKGO_URL,
             &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
         )
         .unwrap()
     }
 }
@@ -131,6 +75,107 @@
 
+pub struct Google;
+
+impl Scraper for Google {
+    /// Parse SE data out of google search results html
+    fn parse(
+        &self,
+        html: &str,
+        sites: &HashMap<String, String>,
+        limit: u16,
+    ) -> Result<ScrapedData> {
+        let anchors = Selector::parse("div.r > a").unwrap();
+        // TODO detect no results
+        // TODO detect blocked request
+        parse_with_selector(anchors, html, sites, limit)
+    }
+
+    /// Creates google search url given sites and query
+    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+    where
+        I: IntoIterator<Item = &'a str>,
+    {
+        let q = make_query_arg(query, sites);
+        Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap()
+    }
+}
+
+fn make_query_arg<'a, I>(query: &str, sites: I) -> String
+where
+    I: IntoIterator<Item = &'a str>,
+{
+    let mut q = String::new();
+    // Restrict to sites
+    q.push('(');
+    q.push_str(
+        sites
+            .into_iter()
+            .map(|site| String::from("site:") + site)
+            .collect::<Vec<_>>()
+            .join(" OR ")
+            .as_str(),
+    );
+    q.push_str(") ");
+    // Search terms
+    q.push_str(
+        query
+            .trim_end_matches('?')
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ")
+            .as_str(),
+    );
+    q
+}
+
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+fn parse_with_selector(
+    anchors: Selector,
+    html: &str,
+    sites: &HashMap<String, String>,
+    limit: u16,
+) -> Result<ScrapedData> {
+    let fragment = Html::parse_document(html);
+    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut ordering: HashMap<String, usize> = HashMap::new();
+    let mut count = 0;
+    for anchor in fragment.select(&anchors) {
+        let url = anchor
+            .value()
+            .attr("href")
+            .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+            .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+        sites
+            .iter()
+            .find_map(|(site_code, site_url)| {
+                let id = question_url_to_id(site_url, &url)?;
+                ordering.insert(id.to_owned(), count);
+                match question_ids.entry(site_code.to_owned()) {
+                    Entry::Occupied(mut o) => o.get_mut().push(id),
+                    Entry::Vacant(o) => {
+                        o.insert(vec![id]);
+                    }
+                }
+                count += 1;
+                Some(())
+            })
+            .ok_or_else(|| {
+                Error::ScrapingError(
+                    "Search engine returned results outside of SE network".to_string(),
+                )
+            })?;
+        if count >= limit as usize {
+            break;
+        }
+    }
+    Ok(ScrapedData {
+        question_ids,
+        ordering,
+    })
+}
+
 /// For example
 /// ```
 /// let id = "stackoverflow.com";
@@ -169,7 +214,7 @@ mod tests {
 
     #[test]
     fn test_duckduckgo_parser() {
-        let html = include_str!("../../test/exit-vim.html");
+        let html = include_str!("../../test/duckduckgo/exit-vim.html");
         let sites = vec![
             ("stackoverflow", "stackoverflow.com"),
             ("askubuntu", "askubuntu.com"),
@@ -196,21 +241,55 @@
             .collect(),
         };
         assert_eq!(
-            SearchEngine::DuckDuckGo.parse(html, &sites, 3).unwrap(),
+            DuckDuckGo.parse(html, &sites, 3).unwrap(),
+            expected_scraped_data
+        );
+    }
+
+    #[test]
+    fn test_google_parser() {
+        let html = include_str!("../../test/google/exit-vim.html");
+        let sites = vec![
+            ("stackoverflow", "stackoverflow.com"),
+            ("askubuntu", "askubuntu.com"),
+        ]
+        .into_iter()
+        .map(|(k, v)| (k.to_string(), v.to_string()))
+        .collect::<HashMap<String, String>>();
+        let expected_scraped_data = ScrapedData {
+            question_ids: vec![
+                ("stackoverflow", vec!["11828270", "25919461"]),
+                ("askubuntu", vec!["24406"]),
+            ]
+            .into_iter()
+            .map(|(k, v)| {
+                (
+                    k.to_string(),
+                    v.into_iter().map(|s| s.to_string()).collect(),
+                )
+            })
+            .collect(),
+            ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)]
+                .into_iter()
+                .map(|(k, v)| (k.to_string(), v))
+                .collect(),
+        };
+        assert_eq!(
+            Google.parse(html, &sites, 3).unwrap(),
             expected_scraped_data
         );
     }
 
     #[test]
     fn test_duckduckgo_blocker() -> Result<(), String> {
-        let html = include_str!("../../test/bad-user-agent.html");
+        let html = include_str!("../../test/duckduckgo/bad-user-agent.html");
         let mut sites = HashMap::new();
         sites.insert(
             String::from("stackoverflow"),
             String::from("stackoverflow.com"),
         );
-        match SearchEngine::DuckDuckGo.parse(html, &sites, 2) {
+        match DuckDuckGo.parse(html, &sites, 2) {
             Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => {
                 Ok(())
             }
@@ -218,6 +297,13 @@
         }
     }
 
+    #[test]
+    // TODO Get a blocked request html
+    // note: this may only be possible at search.rs level (with non-200 code)
+    fn test_google_blocker() -> Result<(), String> {
+        Ok(())
+    }
+
     #[test]
     fn test_question_url_to_id() {
         let site_url = "stackoverflow.com";
diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs
index 530b665..acfbcc7 100644
--- a/src/stackexchange/search.rs
+++ b/src/stackexchange/search.rs
@@ -11,7 +11,7 @@ use crate::tui::markdown::Markdown;
 
 use super::api::{Answer, Api, Question};
 use super::local_storage::LocalStorage;
-use super::scraper::{DuckDuckGo, ScrapedData, Scraper};
+use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper};
 
 /// Limit on concurrent requests (gets passed to `buffer_unordered`)
 const CONCURRENT_REQUESTS_LIMIT: usize = 8;
@@ -84,6 +84,7 @@ impl Search {
     pub async fn search(&self) -> Result<Vec<Question<Markdown>>> {
         match self.config.search_engine {
             SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await,
+            SearchEngine::Google => self.search_by_scraper(Google).await,
             SearchEngine::StackExchange => self.parallel_search_advanced().await,
         }
     }
-- 
cgit v1.2.3