From 5625602e711ceab71bdace19c239c1972fc6ac4d Mon Sep 17 00:00:00 2001 From: Sam Tay Date: Wed, 24 Jun 2020 13:09:03 -0700 Subject: Add google search engine --- TODO.md | 15 +- roadmap.md | 6 +- src/cli.rs | 2 +- src/config.rs | 3 +- src/stackexchange/scraper.rs | 228 +++-- src/stackexchange/search.rs | 3 +- test/bad-user-agent.html | 1 - test/duckduckgo/bad-user-agent.html | 1 + test/duckduckgo/exit-vim.html | 1745 +++++++++++++++++++++++++++++++++++ test/exit-vim.html | 1745 ----------------------------------- test/google/exit-vim.html | 201 ++++ 11 files changed, 2115 insertions(+), 1835 deletions(-) delete mode 100644 test/bad-user-agent.html create mode 100644 test/duckduckgo/bad-user-agent.html create mode 100644 test/duckduckgo/exit-vim.html delete mode 100644 test/exit-vim.html create mode 100644 test/google/exit-vim.html diff --git a/TODO.md b/TODO.md index 0e6be2a..992f2d6 100644 --- a/TODO.md +++ b/TODO.md @@ -1,18 +1,9 @@ # TODO -### v0.3.1 -1. Much of the code can be reused for google: - * parsing href after `"url="` (similar to uddg) - * formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux` - So make a `Scraper` trait and implement it for DDG & Google. Then - `stackexchange` can just code against `Scraper` and choose based on - `--search-engine | -e' argument` - ### Endless future improvements for the TUI -1. Init with smaller layout depending on initial screen size. -2. Maybe cli `--auto-resize` option. 3. Small text at bottom with '?' to bring up key mapping dialog -4. Clean up! remove dupe between ListView and MdView by making a common trait +1. Init with smaller layout depending on initial screen size. +2. Maybe cli `--auto-resize` option that changes layouts at breakpoints. 5. Maybe **[ESC]** cycles layout in the opposite direction? And stops at BothColumns? 6. 
Allow cycling through themes, either found in `~/.config/so/colors/*.toml` @@ -23,7 +14,7 @@ #### scraping ```python -# if necessary, choose one of these to mimic browser request +# if necessary, cycle through these to mimic browser request USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', diff --git a/roadmap.md b/roadmap.md index ec52411..f79b693 100644 --- a/roadmap.md +++ b/roadmap.md @@ -24,16 +24,16 @@ [x] Add duckduckgo scraper ### v0.3.1 -[ ] Add google scraper +[x] Add google scraper ### at some point +[ ] look up how to add `debug!` macros; will help troubleshooting blocked requests [ ] use trust to distrubute app binaries [ ] ask SE forums if I should bundle my api-key? (if so use an env var macro) [ ] allow new queries from TUI, e.g. hit `/` for a prompt [ ] or `/` searches current q/a -[ ] clean up error.rs and term.rs ; only keep whats actually ergonomic +[ ] clean up dingleberries in error.rs and term.rs ; only keep whats actually ergonomic [ ] ask legal@stackoverflow.com for permission to logo stackoverflow/stackexchange in readme [ ] add duckduckgo logo to readme [ ] per platform package mgmt [ ] more testing -[ ] maybe add google engine too. but fuck google. 
diff --git a/src/cli.rs b/src/cli.rs index 715d62e..1892066 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -90,7 +90,7 @@ pub fn get_opts() -> Result { .takes_value(true) .default_value(engine) .value_name("engine") - .possible_values(&["duckduckgo", "stackexchange"]) + .possible_values(&["duckduckgo", "google", "stackexchange"]) .help("Use specified search engine") .next_line_help(true), ) diff --git a/src/config.rs b/src/config.rs index 3102a87..0154cda 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,7 +12,7 @@ use crate::utils; #[serde(rename_all = "lowercase")] // TODO test this pub enum SearchEngine { DuckDuckGo, - //Google, + Google, StackExchange, } @@ -30,6 +30,7 @@ impl fmt::Display for SearchEngine { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let s = match &self { SearchEngine::DuckDuckGo => "duckduckgo", + SearchEngine::Google => "google", SearchEngine::StackExchange => "stackexchange", }; write!(f, "{}", s) diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs index e6376fa..2f62eb6 100644 --- a/src/stackexchange/scraper.rs +++ b/src/stackexchange/scraper.rs @@ -9,6 +9,7 @@ use crate::error::{Error, Result}; /// DuckDuckGo URL const DUCKDUCKGO_URL: &str = "https://duckduckgo.com"; +const GOOGLE_URL: &str = "https://google.com/search"; // Is question_id unique across all sites? If not, then this edge case is // unaccounted for when sorting. @@ -40,60 +41,23 @@ pub struct DuckDuckGo; impl Scraper for DuckDuckGo { /// Parse (site, question_id) pairs out of duckduckgo search results html - // TODO Benchmark this. It would likely be faster to use regex on the decoded url. 
- // TODO pull out parts that are composable across different engines fn parse( &self, html: &str, sites: &HashMap, limit: u16, ) -> Result { - let fragment = Html::parse_document(html); let anchors = Selector::parse("a.result__a").unwrap(); - let mut question_ids: HashMap> = HashMap::new(); - let mut ordering: HashMap = HashMap::new(); - let mut count = 0; - for anchor in fragment.select(&anchors) { - let url = anchor - .value() - .attr("href") - .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string())) - .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?; - sites - .iter() - .find_map(|(site_code, site_url)| { - let id = question_url_to_id(site_url, &url)?; - ordering.insert(id.to_owned(), count); - match question_ids.entry(site_code.to_owned()) { - Entry::Occupied(mut o) => o.get_mut().push(id), - Entry::Vacant(o) => { - o.insert(vec![id]); - } - } - count += 1; - Some(()) - }) - .ok_or_else(|| { - Error::ScrapingError( - "Duckduckgo returned results outside of SE network".to_string(), - ) - })?; - if count >= limit as usize { - break; + parse_with_selector(anchors, html, sites, limit).and_then(|sd| { + // DDG seems to never have empty results, so assume this is blocked + if sd.question_ids.is_empty() { + Err(Error::ScrapingError(String::from( + "DuckDuckGo blocked this request", + ))) + } else { + Ok(sd) } - } - // It doesn't seem possible for DDG to return no results, so assume this is - // a bad user agent - if count == 0 { - Err(Error::ScrapingError(String::from( - "DuckDuckGo blocked this request", - ))) - } else { - Ok(ScrapedData { - question_ids, - ordering, - }) - } + }) } /// Creates duckduckgo search url given sites and query @@ -102,27 +66,7 @@ impl Scraper for DuckDuckGo { where I: IntoIterator, { - let mut q = String::new(); - // Restrict to sites - q.push('('); - q.push_str( - sites - .into_iter() - .map(|site| String::from("site:") + site) - .collect::>() - .join(" OR ") - .as_str(), - ); - q.push_str(") "); - 
// Search terms - q.push_str( - query - .trim_end_matches('?') - .split_whitespace() - .collect::>() - .join(" ") - .as_str(), - ); + let q = make_query_arg(query, sites); Url::parse_with_params( DUCKDUCKGO_URL, &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")], @@ -131,6 +75,107 @@ impl Scraper for DuckDuckGo { } } +pub struct Google; + +impl Scraper for Google { + /// Parse SE data out of google search results html + fn parse( + &self, + html: &str, + sites: &HashMap, + limit: u16, + ) -> Result { + let anchors = Selector::parse("div.r > a").unwrap(); + // TODO detect no results + // TODO detect blocked request + parse_with_selector(anchors, html, sites, limit) + } + + /// Creates google search url given sites and query + /// Only the `q` (query) parameter is set on the url + fn get_url<'a, I>(&self, query: &str, sites: I) -> Url + where + I: IntoIterator, + { + let q = make_query_arg(query, sites); + Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap() + } +} + +fn make_query_arg<'a, I>(query: &str, sites: I) -> String +where + I: IntoIterator, +{ + let mut q = String::new(); + // Restrict to sites + q.push('('); + q.push_str( + sites + .into_iter() + .map(|site| String::from("site:") + site) + .collect::>() + .join(" OR ") + .as_str(), + ); + q.push_str(") "); + // Search terms + q.push_str( + query + .trim_end_matches('?') + .split_whitespace() + .collect::>() + .join(" ") + .as_str(), + ); + q +} + +// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+fn parse_with_selector( + anchors: Selector, + html: &str, + sites: &HashMap, + limit: u16, +) -> Result { + let fragment = Html::parse_document(html); + let mut question_ids: HashMap> = HashMap::new(); + let mut ordering: HashMap = HashMap::new(); + let mut count = 0; + for anchor in fragment.select(&anchors) { + let url = anchor + .value() + .attr("href") + .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string())) + .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?; + sites + .iter() + .find_map(|(site_code, site_url)| { + let id = question_url_to_id(site_url, &url)?; + ordering.insert(id.to_owned(), count); + match question_ids.entry(site_code.to_owned()) { + Entry::Occupied(mut o) => o.get_mut().push(id), + Entry::Vacant(o) => { + o.insert(vec![id]); + } + } + count += 1; + Some(()) + }) + .ok_or_else(|| { + Error::ScrapingError( + "Search engine returned results outside of SE network".to_string(), + ) + })?; + if count >= limit as usize { + break; + } + } + Ok(ScrapedData { + question_ids, + ordering, + }) +} + /// For example /// ``` /// let id = "stackoverflow.com"; @@ -169,7 +214,7 @@ mod tests { #[test] fn test_duckduckgo_parser() { - let html = include_str!("../../test/exit-vim.html"); + let html = include_str!("../../test/duckduckgo/exit-vim.html"); let sites = vec![ ("stackoverflow", "stackoverflow.com"), ("askubuntu", "askubuntu.com"), @@ -196,21 +241,55 @@ mod tests { .collect(), }; assert_eq!( - SearchEngine::DuckDuckGo.parse(html, &sites, 3).unwrap(), + DuckDuckGo.parse(html, &sites, 3).unwrap(), + expected_scraped_data + ); + } + + #[test] + fn test_google_parser() { + let html = include_str!("../../test/google/exit-vim.html"); + let sites = vec![ + ("stackoverflow", "stackoverflow.com"), + ("askubuntu", "askubuntu.com"), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect::>(); + let expected_scraped_data = ScrapedData { + question_ids: vec![ + ("stackoverflow", vec!["11828270", 
"25919461"]), + ("askubuntu", vec!["24406"]), + ] + .into_iter() + .map(|(k, v)| { + ( + k.to_string(), + v.into_iter().map(|s| s.to_string()).collect(), + ) + }) + .collect(), + ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)] + .into_iter() + .map(|(k, v)| (k.to_string(), v)) + .collect(), + }; + assert_eq!( + Google.parse(html, &sites, 3).unwrap(), expected_scraped_data ); } #[test] fn test_duckduckgo_blocker() -> Result<(), String> { - let html = include_str!("../../test/bad-user-agent.html"); + let html = include_str!("../../test/duckduckgo/bad-user-agent.html"); let mut sites = HashMap::new(); sites.insert( String::from("stackoverflow"), String::from("stackoverflow.com"), ); - match SearchEngine::DuckDuckGo.parse(html, &sites, 2) { + match DuckDuckGo.parse(html, &sites, 2) { Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => { Ok(()) } @@ -218,6 +297,13 @@ mod tests { } } + #[test] + // TODO Get a blocked request html + // note: this may only be possible at search.rs level (with non-200 code) + fn test_google_blocker() -> Result<(), String> { + Ok(()) + } + #[test] fn test_question_url_to_id() { let site_url = "stackoverflow.com"; diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs index 530b665..acfbcc7 100644 --- a/src/stackexchange/search.rs +++ b/src/stackexchange/search.rs @@ -11,7 +11,7 @@ use crate::tui::markdown::Markdown; use super::api::{Answer, Api, Question}; use super::local_storage::LocalStorage; -use super::scraper::{DuckDuckGo, ScrapedData, Scraper}; +use super::scraper::{DuckDuckGo, Google, ScrapedData, Scraper}; /// Limit on concurrent requests (gets passed to `buffer_unordered`) const CONCURRENT_REQUESTS_LIMIT: usize = 8; @@ -84,6 +84,7 @@ impl Search { pub async fn search(&self) -> Result>> { match self.config.search_engine { SearchEngine::DuckDuckGo => self.search_by_scraper(DuckDuckGo).await, + SearchEngine::Google => self.search_by_scraper(Google).await, 
SearchEngine::StackExchange => self.parallel_search_advanced().await, } } diff --git a/test/bad-user-agent.html b/test/bad-user-agent.html deleted file mode 100644 index 89c4aaa..0000000 --- a/test/bad-user-agent.html +++ /dev/null @@ -1 +0,0 @@ -(site:stackoverflow.com) how do I exit nvim at DuckDuckGoIgnore this box please.
diff --git a/test/duckduckgo/bad-user-agent.html b/test/duckduckgo/bad-user-agent.html new file mode 100644 index 0000000..89c4aaa --- /dev/null +++ b/test/duckduckgo/bad-user-agent.html @@ -0,0 +1 @@ +(site:stackoverflow.com) how do I exit nvim at DuckDuckGoIgnore this box please.
diff --git a/test/duckduckgo/exit-vim.html b/test/duckduckgo/exit-vim.html new file mode 100644 index 0000000..a4f7a4a --- /dev/null +++ b/test/duckduckgo/exit-vim.html @@ -0,0 +1,1745 @@ + + + + + + + + + + + + + (site:https://stackoverflow.com OR site:https://askubuntu.com) how do I exit nvim at DuckDuckGo + + + + + + + + + + +
+ +
+ +
+
+ + + + + + + + + +
+
+