From 0e27fbe38f44155096f5c5c1d934d70d6e21eb7e Mon Sep 17 00:00:00 2001 From: Sam Tay Date: Wed, 30 Jun 2021 00:01:20 -0400 Subject: Fix google parser --- Cargo.toml | 4 +- src/stackexchange/scraper.rs | 31 ++-- test/google/exit-vim.html | 327 +++++++++++++++++++++++-------------------- test/google/parsing-q.html | 325 +++++++++++++++++++++++------------------- 4 files changed, 376 insertions(+), 311 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fbffa24..4cc43d6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,12 +21,12 @@ criterion = "0.3" [[bench]] name = "html_parsing" -path = "html_parsing.rs" +path = "benches/html_parsing.rs" harness = false [[bench]] name = "md_parsing" -path = "md_parsing.rs" +path = "benches/md_parsing.rs" harness = false [dependencies] diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs index e67f0f6..97c4bef 100644 --- a/src/stackexchange/scraper.rs +++ b/src/stackexchange/scraper.rs @@ -85,7 +85,7 @@ impl Scraper for Google { sites: &HashMap, limit: u16, ) -> Result { - let anchors = Selector::parse("div.r > a").unwrap(); + let anchors = Selector::parse("a").unwrap(); parse_with_selector(anchors, html, sites, limit) } @@ -139,23 +139,24 @@ fn parse_with_selector( let mut ordering: HashMap = HashMap::new(); let mut count = 0; for anchor in fragment.select(&anchors) { - let url = anchor + if let Some(url) = anchor .value() .attr("href") - .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string())) - .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?; - sites.iter().find_map(|(site_code, site_url)| { - let id = question_url_to_id(site_url, &url)?; - ordering.insert(id.to_owned(), count); - match question_ids.entry(site_code.to_owned()) { - Entry::Occupied(mut o) => o.get_mut().push(id), - Entry::Vacant(o) => { - o.insert(vec![id]); + .map(|href| percent_decode_str(href).decode_utf8_lossy()) + { + sites.iter().find_map(|(site_code, site_url)| { + let id = question_url_to_id(site_url, &url)?; + ordering.insert(id.to_owned(), count); + match question_ids.entry(site_code.to_owned()) { + Entry::Occupied(mut o) => o.get_mut().push(id), + Entry::Vacant(o) => { + o.insert(vec![id]); + } } - } - count += 1; - Some(()) - }); + count += 1; + Some(()) + }); + } if count >= limit as usize { break; } diff --git a/test/google/exit-vim.html b/test/google/exit-vim.html index bd74d4c..90abdf3 100644 --- a/test/google/exit-vim.html +++ b/test/google/exit-vim.html @@ -1,201 +1,230 @@ -(site:askubuntu.com OR site:stackoverflow.com) how do i exit nvim - Google Search

Accessibility Links