use percent_encoding::percent_decode_str;
use reqwest::Url;
use scraper::html::Html;
use scraper::selector::Selector;
use std::collections::hash_map::Entry;
use std::collections::HashMap;

use crate::error::{Error, Result};

/// DuckDuckGo URL
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
const GOOGLE_URL: &str = "https://google.com/search";

// Is question_id unique across all sites? If not, then this edge case is
// unaccounted for when sorting.
//
// If this is ever an issue, it wouldn't be too hard to account for this; just
// keep track of site in the `ordering` field and also return site from the
// spawned per-site tasks.
#[derive(Debug, PartialEq)]
pub struct ScrapedData {
    /// Mapping of site code to question ids
    pub question_ids: HashMap<String, Vec<String>>,
    /// Mapping of question_id to its ordinal place in search results
    pub ordering: HashMap<String, usize>,
}

// TODO add this type system limitation to blog post
pub trait Scraper {
    /// Parse data from search results html
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData>;

    /// Get the url to search query restricted to sites
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>;
}

pub struct DuckDuckGo;

impl Scraper for DuckDuckGo {
    /// Parse (site, question_id) pairs out of duckduckgo search results html
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData> {
        let anchors = Selector::parse("a.result__a").unwrap();
        parse_with_selector(anchors, html, sites, limit).and_then(|sd| {
            // DDG seems to never have empty results, so assume this is blocked
            if sd.question_ids.is_empty() {
                Err(Error::Scraping(String::from(
                    "DuckDuckGo blocked this request",
                )))
            } else {
                Ok(sd)
            }
        })
    }

    /// Creates duckduckgo search url given sites and query
    /// See https://duckduckgo.com/params for more info
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>,
    {
        let q = make_query_arg(query, sites);
        Url::parse_with_params(
            DUCKDUCKGO_URL,
            &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
        )
        .unwrap()
    }
}

pub struct Google;

impl Scraper for Google {
    /// Parse SE data out of google search results html
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData> {
        let anchors = Selector::parse("a").unwrap();
        parse_with_selector(anchors, html, sites, limit)
    }

    /// Creates google search url given sites and query
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>,
    {
        let q = make_query_arg(query, sites);
        Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap()
    }
}

fn make_query_arg<'a, I>(query: &str, sites: I) -> String
where
    I: IntoIterator<Item = &'a String>,
{
    let mut q = String::new();

    // Restrict to sites, e.g. "(site:stackoverflow.com OR site:unix.stackexchange.com) "
    q.push('(');
    q.push_str(
        sites
            .into_iter()
            .map(|site| String::from("site:") + site)
            .collect::<Vec<_>>()
            .join(" OR ")
            .as_str(),
    );
    q.push_str(") ");

    // Search terms, with trailing '?' and extra whitespace stripped
    q.push_str(
        query
            .trim_end_matches('?')
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ")
            .as_str(),
    );
    q
}

fn parse_with_selector(
    anchors: Selector,
    html: &str,
    sites: &HashMap<String, String>,
    limit: u16,
) -> Result<ScrapedData> {
    let fragment = Html::parse_document(html);
    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
    let mut ordering: HashMap<String, usize> = HashMap::new();
    let mut count = 0;
    for anchor in fragment.select(&anchors) {
        if let Some(url) = anchor
            .value()
            .attr("href")
            .map(|href| percent_decode_str(href).decode_utf8_lossy())
        {
            // Record the first site (if any) whose question URL matches this anchor
            sites.iter().find_map(|(site_code, site_url)| {
                let id = question_url_to_id(site_url, &url)?;
                ordering.insert(id.to_owned(), count);
                match question_ids.entry(site_code.to_owned()) {
                    Entry::Occupied(mut o) => o.get_mut().push(id),
                    Entry::Vacant(o) => {
                        o.insert(vec![id]);
                    }
                }
                count += 1;
                Some(())
            });
        }
        if count >= limit as usize {
            break;
        }
    }
    Ok(ScrapedData {
        question_ids,
        ordering,
    })
}

// TODO use str::strip_prefix once it's stable
fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
    ["/questions/", "/q/"].iter().find_map(|segment| {
        let fragment = site_url.trim_end_matches('/').to_owned() + segment;
        let mut ix = input.find(&fragment)?;
        // Rule out subdomain matches, e.g. math.meta.stackexchange.com when
        // looking for meta.stackexchange.com
        if ix > 0 && input.chars().nth(ix - 1) == Some('.') {
            return None;
        }
        ix += fragment.len();
        let input = &input[ix..];
        let id = if let Some(end) = input.find('/') {
            input[0..end].to_string()
        } else {
            input[0..].to_string()
        };
        if id.chars().all(|c| c.is_digit(10)) {
            Some(id)
        } else {
            None
        }
    })
}

// TODO Get blocked google request html
// TODO Get google no results html
//      note: this may only be possible at search.rs level (with non-200 code)
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duckduckgo_url() {
        let q = "how do I exit vim?";
        let sites = vec![
            String::from("stackoverflow.com"),
            String::from("unix.stackexchange.com"),
        ];
        assert_eq!(
            DuckDuckGo.get_url(q, &sites).as_str(),
            String::from(
                "https://duckduckgo.com/\
                 ?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
                 +how+do+I+exit+vim&kz=-1&kh=-1"
            )
        )
    }

    #[test]
    fn test_duckduckgo_parser() {
        let html = include_str!("../../test/duckduckgo/exit-vim.html");
        let sites = vec![
            ("stackoverflow", "stackoverflow.com"),
            ("askubuntu", "askubuntu.com"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect::<HashMap<String, String>>();
        let expected_scraped_data = ScrapedData {
            question_ids: vec![
                ("stackoverflow", vec!["11828270", "9171356"]),
                ("askubuntu", vec!["24406"]),
            ]
            .into_iter()
            .map(|(k, v)| {
                (
                    k.to_string(),
                    v.into_iter().map(|s| s.to_string()).collect(),
                )
            })
            .collect(),
            ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
                .into_iter()
                .map(|(k, v)| (k.to_string(), v))
                .collect(),
        };
        assert_eq!(
            DuckDuckGo.parse(html, &sites, 3).unwrap(),
            expected_scraped_data
        );
    }

    #[test]
    fn test_google_parser() {
        let html = include_str!("../../test/google/exit-vim.html");
        let sites = vec![
            ("stackoverflow", "stackoverflow.com"),
            ("askubuntu", "askubuntu.com"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect::<HashMap<String, String>>();
        let expected_scraped_data = ScrapedData {
            question_ids: vec![
                ("stackoverflow", vec!["11828270", "25919461"]),
                ("askubuntu", vec!["24406"]),
            ]
            .into_iter()
            .map(|(k, v)| {
                (
                    k.to_string(),
                    v.into_iter().map(|s| s.to_string()).collect(),
                )
            })
            .collect(),
            ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)]
                .into_iter()
                .map(|(k, v)| (k.to_string(), v))
                .collect(),
        };
        assert_eq!(
            Google.parse(html, &sites, 3).unwrap(),
            expected_scraped_data
        );
    }

    #[test]
    fn test_google_q_parser() {
        let html = include_str!("../../test/google/parsing-q.html");
        let mut sites = HashMap::new();
        sites.insert(
            String::from("stackoverflow"),
            String::from("stackoverflow.com"),
        );
        let expected_scraped_data = ScrapedData {
            question_ids: vec![(
                String::from("stackoverflow"),
                vec![
                    String::from("3940128"),
                    String::from("4647368"),
                    String::from("12336105"),
                ],
            )]
            .into_iter()
            .collect(),
            ordering: vec![
                (String::from("3940128"), 0),
                (String::from("4647368"), 1),
                (String::from("12336105"), 2),
            ]
            .into_iter()
            .collect(),
        };
        assert_eq!(
            Google.parse(html, &sites, 3).unwrap(),
            expected_scraped_data
        );
    }

    #[test]
    fn test_duckduckgo_blocker() -> Result<(), String> {
include_str!("../../test/duckduckgo/bad-user-agent.html"); let mut sites = HashMap::new(); sites.insert( String::from("stackoverflow"), String::from("stackoverflow.com"), ); match DuckDuckGo.parse(html, &sites, 2) { Err(Error::Scraping(s)) if s == *"DuckDuckGo blocked this request" => Ok(()), _ => Err(String::from("Failed to detect DuckDuckGo blocker")), } } #[test] fn test_question_url_to_id() { // Happy path let site_url = "stackoverflow.com"; let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor"; assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270"); // Happy path with variant /q/ let site_url = "stackoverflow.com"; let input = "/l/?kh=-1&uddg=https://stackoverflow.com/q/11828270"; assert_eq!(question_url_to_id(site_url, input).unwrap(), "11828270"); // Base site let site_url = "unix.stackoverflow.com"; let input = "/l/?kh=-1&uddg=https://unix.stackoverflow.com"; assert_eq!(question_url_to_id(site_url, input), None); // Tagged link let site_url = "meta.stackexchange.com"; let input = "/l/?kh=-1&uddg=https://meta.stackexchange.com/questions/tagged/stackexchange-tour"; assert_eq!(question_url_to_id(site_url, input), None); // Different site let site_url = "meta.stackexchange.com"; let input = "/l/?kh=-1&uddg=https://math.meta.stackexchange.com/q/11828270"; assert_eq!(question_url_to_id(site_url, input), None); } }