Maintain order of duckduckgo search results (v0.3.0)

author: Sam Tay <sam.chong.tay@gmail.com> 2020-06-23 21:09:35 -0700
committer: Sam Tay <sam.chong.tay@gmail.com> 2020-06-23 21:09:35 -0700
commit: 74bda95681c253eea5010417dc74e8569010f7f9 (patch)
tree: dbb7168aab62e75c7ccbfcf3e109b5ac6dafe902
parent: 0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (diff)
3 files changed, 67 insertions, 43 deletions
diff --git a/TODO.md b/TODO.md
index 6b53cb2..4a97764 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,5 @@
 # TODO
 
-### v0.3.0
-1. Keep relevance ordering !!!
-
 ### v0.3.1
 1. Much of the code can be reused for google:
     * parsing href after `"url="` (similar to uddg)
@@ -15,9 +12,6 @@
      - api
      - scraper
 
-
-
-
 ### Endless future improvements for the TUI
 1. Init with smaller layout depending on initial screen size.
 2. Maybe cli `--auto-resize` option.
@@ -32,17 +26,15 @@
 ### resources for later
 
 #### scraping
-6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html))
-
 ```python
-# if necessary, choose one of these to mimic browswer request
+# if necessary, choose one of these to mimic browser request
 USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
-               'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
-                              'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
-                                             ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
-                                                             'Chrome/19.0.1084.46 Safari/536.5'),
-                                                                            ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
-                                                                                            'Safari/536.5'), )
+'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
+'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
+('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
+'Chrome/19.0.1084.46 Safari/536.5'),
+('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
+'Safari/536.5'), )
 
 # checks for search engine blocks
 BLOCK_INDICATORS = (
diff --git a/roadmap.md b/roadmap.md
index b5d8d36..ec52411 100644
--- a/roadmap.md
+++ b/roadmap.md
@@ -21,7 +21,10 @@
 [x] Support multiple --site args & searches
 
 ### v0.3.0
-[ ] Add duckduckgo scraper
+[x] Add duckduckgo scraper
+
+### v0.3.1
+[ ] Add google scraper
 
 ### at some point
 [ ] use trust to distrubute app binaries
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 2939c29..a5f59e9 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -44,7 +44,6 @@ const USER_AGENT: &str =
 
 /// This structure allows interacting with parts of the StackExchange
 /// API, using the `Config` struct to determine certain API settings and options.
-// TODO should my se structs have &str instead of String?
 #[derive(Clone)]
 pub struct StackExchange {
     client: Client,
@@ -79,7 +78,6 @@ pub struct Answer<S> {
 /// Represents a StackExchange question with a custom selection of fields from
 /// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
 // TODO container over answers should be generic iterator
-// TODO let body be a generic that implements Display!
 #[derive(Clone, Deserialize, Debug)]
 pub struct Question<S> {
     #[serde(rename = "question_id")]
@@ -97,6 +95,20 @@ struct ResponseWrapper<T> {
     items: Vec<T>,
 }
 
+// Is question_id unique across all sites? If not, then this edge case is
+// unaccounted for when sorting.
+//
+// If this is ever an issue, it wouldn't be too hard to account for this; just
+// keep track of site in the `ordering` field and also return site from the
+// spawned per-site tasks.
+#[derive(Debug, PartialEq)]
+struct ScrapedData {
+    /// Mapping of site code to question ids
+    question_ids: HashMap<String, Vec<String>>,
+    /// Mapping of question_id to its ordinal place in search results
+    ordering: HashMap<String, usize>,
+}
+
 impl StackExchange {
     pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
         let client = Client::new();
@@ -164,17 +176,18 @@ impl StackExchange {
             .await?
             .text()
             .await?;
-        let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
-        self.se_questions(ids).await
+        let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+        self.se_questions(data).await
     }
 
     /// Parallel searches against the SE question endpoint across the sites in `ids`.
     // TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
-    async fn se_questions(
-        &self,
-        ids: HashMap<String, Vec<String>>,
-    ) -> Result<Vec<Question<String>>> {
-        futures::stream::iter(ids)
+    async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> {
+        let ScrapedData {
+            question_ids,
+            ordering,
+        } = data;
+        futures::stream::iter(question_ids)
             .map(|(site, ids)| {
                 let clone = self.clone();
                 tokio::spawn(async move {
@@ -189,8 +202,8 @@ impl StackExchange {
             .map(|r| r.map_err(Error::from).and_then(|x| x))
             .collect::<Result<Vec<Vec<_>>>>()
             .map(|v| {
-                let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
-                // TODO sort by original ordering !
+                let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+                qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap());
                 qs
             })
     }
@@ -395,7 +408,7 @@ impl LocalStorage {
     }
 
     // TODO is this HM worth it? Probably only will ever have < 10 site codes to search...
-    // TODO store this as Option<HM> on self if other methods use it...
+    // maybe store this as Option<HM> on self if other methods use it...
     pub async fn find_invalid_site<'a, 'b>(
         &'b self,
         site_codes: &'a [String],
@@ -467,16 +480,16 @@ where
 }
 
 /// Parse (site, question_id) pairs out of duckduckgo search results html
-/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order !
-///      maybe this is as simple as a HashMap {qid: ordinal}
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
 fn parse_questions_from_ddg_html<'a>(
     html: &'a str,
     sites: &'a HashMap<String, String>,
     limit: u16,
-) -> Result<HashMap<String, Vec<String>>> {
+) -> Result<ScrapedData> {
     let fragment = Html::parse_document(html);
     let anchors = Selector::parse("a.result__a").unwrap();
-    let mut qids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+    let mut ordering: HashMap<String, usize> = HashMap::new();
     let mut count = 0;
     for anchor in fragment.select(&anchors) {
         let url = anchor
@@ -488,7 +501,8 @@ fn parse_questions_from_ddg_html<'a>(
             .iter()
             .find_map(|(site_code, site_url)| {
                 let id = question_url_to_id(site_url, &url)?;
-                match qids.entry(site_code.to_owned()) {
+                ordering.insert(id.to_owned(), count);
+                match question_ids.entry(site_code.to_owned()) {
                     Entry::Occupied(mut o) => o.get_mut().push(id),
                     Entry::Vacant(o) => {
                         o.insert(vec![id]);
@@ -513,7 +527,10 @@ fn parse_questions_from_ddg_html<'a>(
             "DuckDuckGo blocked this request",
         )))
     } else {
-        Ok(qids)
+        Ok(ScrapedData {
+            question_ids,
+            ordering,
+        })
     }
 }
 
@@ -532,8 +549,8 @@ fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
     Some(input[0..end].to_string())
 }
 
-// TODO figure out a query that returns no results so that I can test it and differentiate it from
-// a blocked request
+// TODO find a query that returns no results so that I can test it and
+// differentiate it from a blocked request
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -577,15 +594,27 @@ mod tests {
         .into_iter()
         .map(|(k, v)| (k.to_string(), v.to_string()))
         .collect::<HashMap<String, String>>();
-        let mut expected_question_ids = HashMap::new();
-        expected_question_ids.insert(
-            "stackoverflow".to_string(),
-            vec!["11828270".to_string(), "9171356".to_string()],
-        );
-        expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]);
+        let expected_scraped_data = ScrapedData {
+            question_ids: vec![
+                ("stackoverflow", vec!["11828270", "9171356"]),
+                ("askubuntu", vec!["24406"]),
+            ]
+            .into_iter()
+            .map(|(k, v)| {
+                (
+                    k.to_string(),
+                    v.into_iter().map(|s| s.to_string()).collect(),
+                )
+            })
+            .collect(),
+            ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
+                .into_iter()
+                .map(|(k, v)| (k.to_string(), v))
+                .collect(),
+        };
         assert_eq!(
             parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
-            expected_question_ids
+            expected_scraped_data
         );
     }
author	Sam Tay <sam.chong.tay@gmail.com>	2020-06-23 21:09:35 -0700
committer	Sam Tay <sam.chong.tay@gmail.com>	2020-06-23 21:09:35 -0700
commit	74bda95681c253eea5010417dc74e8569010f7f9 (patch)
tree	dbb7168aab62e75c7ccbfcf3e109b5ac6dafe902
parent	0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (diff)