From 74bda95681c253eea5010417dc74e8569010f7f9 Mon Sep 17 00:00:00 2001
From: Sam Tay
Date: Tue, 23 Jun 2020 21:09:35 -0700
Subject: Maintain order of duckduckgo search results

---
 TODO.md              | 22 +++++---------
 roadmap.md           |  5 +++-
 src/stackexchange.rs | 83 +++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 67 insertions(+), 43 deletions(-)

diff --git a/TODO.md b/TODO.md
index 6b53cb2..4a97764 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,5 @@
 # TODO
 
-### v0.3.0
-1. Keep relevance ordering !!!
-
 ### v0.3.1
 1. Much of the code can be reused for google:
    * parsing href after `"url="` (similar to uddg)
@@ -15,9 +12,6 @@
 - api
 - scraper
 
-
-
-
 ### Endless future improvements for the TUI
 1. Init with smaller layout depending on initial screen size.
 2. Maybe cli `--auto-resize` option.
@@ -32,17 +26,15 @@
 ### resources for later
 
 #### scraping
-6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html))
-
 ```python
-# if necessary, choose one of these to mimic browswer request
+# if necessary, choose one of these to mimic browser request
 USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
-               'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
-               'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
-               ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
-                'Chrome/19.0.1084.46 Safari/536.5'),
-               ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
-                'Safari/536.5'), )
+'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
+'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
+('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
+'Chrome/19.0.1084.46 Safari/536.5'),
+('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
+'Safari/536.5'), )
 
 # checks for search engine blocks
 BLOCK_INDICATORS = (
diff --git a/roadmap.md b/roadmap.md
index b5d8d36..ec52411 100644
--- a/roadmap.md
+++ b/roadmap.md
@@ -21,7 +21,10 @@
 [x] Support multiple --site args & searches
 
 ### v0.3.0
-[ ] Add duckduckgo scraper
+[x] Add duckduckgo scraper
+
+### v0.3.1
+[ ] Add google scraper
 
 ### at some point
 [ ] use trust to distrubute app binaries
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 2939c29..a5f59e9 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -44,7 +44,6 @@ const USER_AGENT: &str =
 
 /// This structure allows interacting with parts of the StackExchange
 /// API, using the `Config` struct to determine certain API settings and options.
-// TODO should my se structs have &str instead of String?
 #[derive(Clone)]
 pub struct StackExchange {
     client: Client,
@@ -79,7 +78,6 @@ pub struct Answer {
 /// Represents a StackExchange question with a custom selection of fields from
 /// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
 // TODO container over answers should be generic iterator
-// TODO let body be a generic that implements Display!
 #[derive(Clone, Deserialize, Debug)]
 pub struct Question {
     #[serde(rename = "question_id")]
@@ -97,6 +95,20 @@ struct ResponseWrapper<T> {
     items: Vec<T>,
 }
 
+// Is question_id unique across all sites? If not, then this edge case is
+// unaccounted for when sorting.
+//
+// If this is ever an issue, it wouldn't be too hard to account for this; just
+// keep track of site in the `ordering` field and also return site from the
+// spawned per-site tasks.
+#[derive(Debug, PartialEq)]
+struct ScrapedData {
+    /// Mapping of site code to question ids
+    question_ids: HashMap<String, Vec<String>>,
+    /// Mapping of question_id to its ordinal place in search results
+    ordering: HashMap<String, usize>,
+}
+
 impl StackExchange {
     pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
         let client = Client::new();
@@ -164,17 +176,18 @@
             .await?
             .text()
             .await?;
-        let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
-        self.se_questions(ids).await
+        let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+        self.se_questions(data).await
     }
 
     /// Parallel searches against the SE question endpoint across the sites in `ids`.
     // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
-    async fn se_questions(
-        &self,
-        ids: HashMap<String, Vec<String>>,
-    ) -> Result<Vec<Question<String>>> {
-        futures::stream::iter(ids)
+    async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> {
+        let ScrapedData {
+            question_ids,
+            ordering,
+        } = data;
+        futures::stream::iter(question_ids)
             .map(|(site, ids)| {
                 let clone = self.clone();
                 tokio::spawn(async move {
@@ -189,8 +202,8 @@
             .map(|r| r.map_err(Error::from).and_then(|x| x))
             .collect::<Result<Vec<Vec<Question<String>>>>>()
             .map(|v| {
-                let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
-                // TODO sort by original ordering !
+                let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+                qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap());
                 qs
             })
     }
@@ -395,7 +408,7 @@ impl LocalStorage {
     }
 
     // TODO is this HM worth it? Probably only will ever have < 10 site codes to search...
-    // TODO store this as Option on self if other methods use it...
+    // maybe store this as Option on self if other methods use it...
     pub async fn find_invalid_site<'a, 'b>(
         &'b self,
         site_codes: &'a [String],
@@ -467,16 +480,16 @@ where
 }
 
 /// Parse (site, question_id) pairs out of duckduckgo search results html
-/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order !
-/// maybe this is as simple as a HashMap {qid: ordinal}
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
 fn parse_questions_from_ddg_html<'a>(
     html: &'a str,
     sites: &'a HashMap<String, String>,
     limit: u16,
-) -> Result<HashMap<String, Vec<String>>> {
+) -> Result<ScrapedData> {
     let fragment = Html::parse_document(html);
     let anchors = Selector::parse("a.result__a").unwrap();
-    let mut qids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut ordering: HashMap<String, usize> = HashMap::new();
     let mut count = 0;
     for anchor in fragment.select(&anchors) {
         let url = anchor
@@ -488,7 +501,8 @@
             .iter()
             .find_map(|(site_code, site_url)| {
                 let id = question_url_to_id(site_url, &url)?;
-                match qids.entry(site_code.to_owned()) {
+                ordering.insert(id.to_owned(), count);
+                match question_ids.entry(site_code.to_owned()) {
                     Entry::Occupied(mut o) => o.get_mut().push(id),
                     Entry::Vacant(o) => {
                         o.insert(vec![id]);
@@ -513,7 +527,10 @@
             "DuckDuckGo blocked this request",
         )))
     } else {
-        Ok(qids)
+        Ok(ScrapedData {
+            question_ids,
+            ordering,
+        })
     }
 }
 
@@ -532,8 +549,8 @@ fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
     Some(input[0..end].to_string())
 }
 
-// TODO figure out a query that returns no results so that I can test it and differentiate it from
-// a blocked request
+// TODO find a query that returns no results so that I can test it and
+// differentiate it from a blocked request
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -577,15 +594,27 @@
         .into_iter()
         .map(|(k, v)| (k.to_string(), v.to_string()))
         .collect::<HashMap<String, String>>();
-        let mut expected_question_ids = HashMap::new();
-        expected_question_ids.insert(
-            "stackoverflow".to_string(),
-            vec!["11828270".to_string(), "9171356".to_string()],
-        );
-        expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]);
+        let expected_scraped_data = ScrapedData {
+            question_ids: vec![
+                ("stackoverflow", vec!["11828270", "9171356"]),
+                ("askubuntu", vec!["24406"]),
+            ]
+            .into_iter()
+            .map(|(k, v)| {
+                (
+                    k.to_string(),
+                    v.into_iter().map(|s| s.to_string()).collect(),
+                )
+            })
+            .collect(),
+            ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
+                .into_iter()
+                .map(|(k, v)| (k.to_string(), v))
+                .collect(),
+        };
         assert_eq!(
             parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
-            expected_question_ids
+            expected_scraped_data
        );
     }
 
-- 
cgit v1.2.3
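
The core of the patch, seen in isolation: the scraper now records each question id's ordinal position on the DuckDuckGo results page in `ordering`, and after the per-site API calls complete concurrently (and so in arbitrary order), a single sort by that ordinal restores relevance order. Below is a minimal, self-contained sketch of the technique, reusing the ids and ordinals from the patch's own test; it is an approximation, not repo code — the real `se_questions` sorts `Question` values and looks each ordinal up with `ordering.get(..).unwrap()`, which likewise panics if an id is somehow missing.

```rust
use std::collections::HashMap;

fn main() {
    // Ordinals recorded while scraping the results page (same data as the
    // patch's test): page order is 11828270, 24406, 9171356.
    let ordering: HashMap<String, usize> = vec![("11828270", 0), ("24406", 1), ("9171356", 2)]
        .into_iter()
        .map(|(id, ord)| (id.to_string(), ord))
        .collect();

    // Concurrent per-site fetches merge in whatever order the tasks finish.
    let mut merged: Vec<String> = ["9171356", "24406", "11828270"]
        .iter()
        .map(|s| s.to_string())
        .collect();

    // One sort by the recorded ordinal recovers search-relevance order.
    merged.sort_unstable_by_key(|id| ordering[id]);
    assert_eq!(merged, ["11828270", "24406", "9171356"]);
}
```

`sort_unstable_by_key` is a sound choice here: each id carries a unique ordinal, so there are no ties for a stable sort to preserve, and the unstable variant avoids the stable sort's auxiliary allocation.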