Diffstat (limited to 'src/stackexchange/scraper.rs')
-rw-r--r--  src/stackexchange/scraper.rs  228
1 file changed, 157 insertions(+), 71 deletions(-)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index e6376fa..2f62eb6 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -9,6 +9,7 @@ use crate::error::{Error, Result};
/// DuckDuckGo URL
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
+const GOOGLE_URL: &str = "https://google.com/search";
// Is question_id unique across all sites? If not, then this edge case is
// unaccounted for when sorting.
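For orientation: the `Scraper` trait itself is defined elsewhere in this file and never appears in these hunks. The following is a reconstruction inferred from the two impl blocks below, so the exact signatures in the repository may differ.

```rust
use std::collections::HashMap;
use url::Url;
use crate::error::Result;

/// Inferred shape of the Scraper trait (sketch, not part of the patch).
/// ScrapedData is the struct built by parse_with_selector below, holding
/// question_ids (site code -> question ids) and ordering (id -> rank).
pub trait Scraper {
    /// Extract (site, question_id) data from a search results page.
    fn parse(
        &self,
        html: &str,
        sites: &HashMap<String, String>,
        limit: u16,
    ) -> Result<ScrapedData>;

    /// Build the search URL for `query`, restricted to the given sites.
    fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
    where
        I: IntoIterator<Item = &'a String>;
}
```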
@@ -40,60 +41,23 @@ pub struct DuckDuckGo;
impl Scraper for DuckDuckGo {
/// Parse (site, question_id) pairs out of duckduckgo search results html
- // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
- // TODO pull out parts that are composable across different engines
fn parse(
&self,
html: &str,
sites: &HashMap<String, String>,
limit: u16,
) -> Result<ScrapedData> {
- let fragment = Html::parse_document(html);
let anchors = Selector::parse("a.result__a").unwrap();
- let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
- let mut ordering: HashMap<String, usize> = HashMap::new();
- let mut count = 0;
- for anchor in fragment.select(&anchors) {
- let url = anchor
- .value()
- .attr("href")
- .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
- .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
- sites
- .iter()
- .find_map(|(site_code, site_url)| {
- let id = question_url_to_id(site_url, &url)?;
- ordering.insert(id.to_owned(), count);
- match question_ids.entry(site_code.to_owned()) {
- Entry::Occupied(mut o) => o.get_mut().push(id),
- Entry::Vacant(o) => {
- o.insert(vec![id]);
- }
- }
- count += 1;
- Some(())
- })
- .ok_or_else(|| {
- Error::ScrapingError(
- "Duckduckgo returned results outside of SE network".to_string(),
- )
- })?;
- if count >= limit as usize {
- break;
+ parse_with_selector(anchors, html, sites, limit).and_then(|sd| {
+ // DDG never seems to return empty results, so assume the request was blocked
+ if sd.question_ids.is_empty() {
+ Err(Error::ScrapingError(String::from(
+ "DuckDuckGo blocked this request",
+ )))
+ } else {
+ Ok(sd)
}
- }
- // It doesn't seem possible for DDG to return no results, so assume this is
- // a bad user agent
- if count == 0 {
- Err(Error::ScrapingError(String::from(
- "DuckDuckGo blocked this request",
- )))
- } else {
- Ok(ScrapedData {
- question_ids,
- ordering,
- })
- }
+ })
}
/// Creates duckduckgo search url given sites and query
@@ -102,27 +66,7 @@ impl Scraper for DuckDuckGo {
where
I: IntoIterator<Item = &'a String>,
{
- let mut q = String::new();
- // Restrict to sites
- q.push('(');
- q.push_str(
- sites
- .into_iter()
- .map(|site| String::from("site:") + site)
- .collect::<Vec<_>>()
- .join(" OR ")
- .as_str(),
- );
- q.push_str(") ");
- // Search terms
- q.push_str(
- query
- .trim_end_matches('?')
- .split_whitespace()
- .collect::<Vec<_>>()
- .join(" ")
- .as_str(),
- );
+ let q = make_query_arg(query, sites);
Url::parse_with_params(
DUCKDUCKGO_URL,
&[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
@@ -131,6 +75,107 @@ impl Scraper for DuckDuckGo {
}
}
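As a usage sketch (illustrative, not part of the patch), this is roughly what the DuckDuckGo `get_url` yields; note that in the real call site the sites come from a HashMap, so the OR order inside the query is unspecified.

```rust
let sites = vec![
    String::from("stackoverflow.com"),
    String::from("askubuntu.com"),
];
let url = DuckDuckGo.get_url("how do I exit vim?", &sites);
// q carries "(site:stackoverflow.com OR site:askubuntu.com) how do I exit vim"
// (percent-encoded by Url::parse_with_params); kz=-1 and kh=-1 are DDG
// settings parameters, per https://duckduckgo.com/params.
assert_eq!(url.host_str(), Some("duckduckgo.com"));
```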
+pub struct Google;
+
+impl Scraper for Google {
+ /// Parse SE data out of google search results html
+ fn parse(
+ &self,
+ html: &str,
+ sites: &HashMap<String, String>,
+ limit: u16,
+ ) -> Result<ScrapedData> {
+ let anchors = Selector::parse("div.r > a").unwrap();
+ // TODO detect no results
+ // TODO detect blocked request
+ parse_with_selector(anchors, html, sites, limit)
+ }
+
+ /// Creates google search url given sites and query
+ fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+ where
+ I: IntoIterator<Item = &'a String>,
+ {
+ let q = make_query_arg(query, sites);
+ Url::parse_with_params(GOOGLE_URL, &[("q", q.as_str())]).unwrap()
+ }
+}
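The same sketch for Google (again illustrative):

```rust
let sites = vec![String::from("stackoverflow.com")];
let url = Google.get_url("how do I exit vim?", &sites);
// GOOGLE_URL already carries the /search path; only q is appended here,
// unlike the DDG variant, which also sets kz/kh.
assert_eq!(url.host_str(), Some("google.com"));
assert_eq!(url.path(), "/search");
```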
+
+fn make_query_arg<'a, I>(query: &str, sites: I) -> String
+where
+ I: IntoIterator<Item = &'a String>,
+{
+ let mut q = String::new();
+ // Restrict to sites
+ q.push('(');
+ q.push_str(
+ sites
+ .into_iter()
+ .map(|site| String::from("site:") + site)
+ .collect::<Vec<_>>()
+ .join(" OR ")
+ .as_str(),
+ );
+ q.push_str(") ");
+ // Search terms
+ q.push_str(
+ query
+ .trim_end_matches('?')
+ .split_whitespace()
+ .collect::<Vec<_>>()
+ .join(" ")
+ .as_str(),
+ );
+ q
+}
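A quick illustration of what `make_query_arg` produces: the trailing `?` is stripped and interior whitespace collapsed before the terms are appended to the `site:` restriction (values below are illustrative, not from the test fixtures).

```rust
let sites = vec![
    String::from("stackoverflow.com"),
    String::from("askubuntu.com"),
];
let q = make_query_arg("how do I exit  vim?", &sites);
assert_eq!(
    q,
    "(site:stackoverflow.com OR site:askubuntu.com) how do I exit vim"
);
```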
+
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+fn parse_with_selector(
+ anchors: Selector,
+ html: &str,
+ sites: &HashMap<String, String>,
+ limit: u16,
+) -> Result<ScrapedData> {
+ let fragment = Html::parse_document(html);
+ let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut ordering: HashMap<String, usize> = HashMap::new();
+ let mut count = 0;
+ for anchor in fragment.select(&anchors) {
+ let url = anchor
+ .value()
+ .attr("href")
+ .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+ .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+ sites
+ .iter()
+ .find_map(|(site_code, site_url)| {
+ let id = question_url_to_id(site_url, &url)?;
+ ordering.insert(id.to_owned(), count);
+ match question_ids.entry(site_code.to_owned()) {
+ Entry::Occupied(mut o) => o.get_mut().push(id),
+ Entry::Vacant(o) => {
+ o.insert(vec![id]);
+ }
+ }
+ count += 1;
+ Some(())
+ })
+ .ok_or_else(|| {
+ Error::ScrapingError(
+ "Search engine returned results outside of SE network".to_string(),
+ )
+ })?;
+ if count >= limit as usize {
+ break;
+ }
+ }
+ Ok(ScrapedData {
+ question_ids,
+ ordering,
+ })
+}
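The TODO above suggests running a regex over the decoded URL instead of walking the DOM per site. Purely as a hypothetical sketch of that alternative (the `regex` crate, helper name, and capture layout are assumptions, not part of this patch):

```rust
use regex::Regex;

// Hypothetical: pull the question id out of an already-decoded result URL,
// e.g. "https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor".
// For the speedup the TODO is after, the Regex should be compiled once per
// site (e.g. cached in a HashMap) rather than rebuilt per anchor as here.
fn question_id_via_regex(site_url: &str, url: &str) -> Option<String> {
    let pattern = format!(r"{}/questions/(\d+)", regex::escape(site_url));
    let re = Regex::new(&pattern).ok()?;
    re.captures(url)?.get(1).map(|m| m.as_str().to_owned())
}
```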
+
/// For example
/// ```
/// let site_url = "stackoverflow.com";
@@ -169,7 +214,7 @@ mod tests {
#[test]
fn test_duckduckgo_parser() {
- let html = include_str!("../../test/exit-vim.html");
+ let html = include_str!("../../test/duckduckgo/exit-vim.html");
let sites = vec![
("stackoverflow", "stackoverflow.com"),
("askubuntu", "askubuntu.com"),
@@ -196,21 +241,55 @@ mod tests {
.collect(),
};
assert_eq!(
- SearchEngine::DuckDuckGo.parse(html, &sites, 3).unwrap(),
+ DuckDuckGo.parse(html, &sites, 3).unwrap(),
+ expected_scraped_data
+ );
+ }
+
+ #[test]
+ fn test_google_parser() {
+ let html = include_str!("../../test/google/exit-vim.html");
+ let sites = vec![
+ ("stackoverflow", "stackoverflow.com"),
+ ("askubuntu", "askubuntu.com"),
+ ]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v.to_string()))
+ .collect::<HashMap<String, String>>();
+ let expected_scraped_data = ScrapedData {
+ question_ids: vec![
+ ("stackoverflow", vec!["11828270", "25919461"]),
+ ("askubuntu", vec!["24406"]),
+ ]
+ .into_iter()
+ .map(|(k, v)| {
+ (
+ k.to_string(),
+ v.into_iter().map(|s| s.to_string()).collect(),
+ )
+ })
+ .collect(),
+ ordering: vec![("11828270", 0), ("25919461", 1), ("24406", 2)]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v))
+ .collect(),
+ };
+ assert_eq!(
+ Google.parse(html, &sites, 3).unwrap(),
expected_scraped_data
);
}
#[test]
fn test_duckduckgo_blocker() -> Result<(), String> {
- let html = include_str!("../../test/bad-user-agent.html");
+ let html = include_str!("../../test/duckduckgo/bad-user-agent.html");
let mut sites = HashMap::new();
sites.insert(
String::from("stackoverflow"),
String::from("stackoverflow.com"),
);
- match SearchEngine::DuckDuckGo.parse(html, &sites, 2) {
+ match DuckDuckGo.parse(html, &sites, 2) {
Err(Error::ScrapingError(s)) if s == "DuckDuckGo blocked this request".to_string() => {
Ok(())
}
@@ -219,6 +298,13 @@ mod tests {
}
#[test]
+ // TODO Get a blocked request html
+ // note: this may only be possible at search.rs level (with non-200 code)
+ fn test_google_blocker() -> Result<(), String> {
+ Ok(())
+ }
+
+ #[test]
fn test_question_url_to_id() {
let site_url = "stackoverflow.com";
let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";