summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Sam Tay <sam.chong.tay@gmail.com> 2020-06-23 21:09:35 -0700
committer Sam Tay <sam.chong.tay@gmail.com> 2020-06-23 21:09:35 -0700
commit 74bda95681c253eea5010417dc74e8569010f7f9 (patch)
tree dbb7168aab62e75c7ccbfcf3e109b5ac6dafe902
parent 0c4bafb3eb996b0e70707a32c11e8a1a2f9572ba (diff)
Maintain order of duckduckgo search results v0.3.0
-rw-r--r-- TODO.md 22
-rw-r--r-- roadmap.md 5
-rw-r--r-- src/stackexchange.rs 83
3 files changed, 67 insertions, 43 deletions
diff --git a/TODO.md b/TODO.md
index 6b53cb2..4a97764 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,8 +1,5 @@
# TODO
-### v0.3.0
-1. Keep relevance ordering !!!
-
### v0.3.1
1. Much of the code can be reused for google:
* parsing href after `"url="` (similar to uddg)
@@ -15,9 +12,6 @@
- api
- scraper
-
-
-
### Endless future improvements for the TUI
1. Init with smaller layout depending on initial screen size.
2. Maybe cli `--auto-resize` option.
@@ -32,17 +26,15 @@
### resources for later
#### scraping
-6. Google stuff [scraping with reqwest](https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html))
-
```python
-# if necessary, choose one of these to mimic browswer request
+# if necessary, choose one of these to mimic browser request
USER_AGENTS = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
- 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
- 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
- ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
- 'Chrome/19.0.1084.46 Safari/536.5'),
- ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
- 'Safari/536.5'), )
+'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0',
+'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
+('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
+'Chrome/19.0.1084.46 Safari/536.5'),
+('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46'
+'Safari/536.5'), )
# checks for search engine blocks
BLOCK_INDICATORS = (
diff --git a/roadmap.md b/roadmap.md
index b5d8d36..ec52411 100644
--- a/roadmap.md
+++ b/roadmap.md
@@ -21,7 +21,10 @@
[x] Support multiple --site args & searches
### v0.3.0
-[ ] Add duckduckgo scraper
+[x] Add duckduckgo scraper
+
+### v0.3.1
+[ ] Add google scraper
### at some point
[ ] use trust to distrubute app binaries
diff --git a/src/stackexchange.rs b/src/stackexchange.rs
index 2939c29..a5f59e9 100644
--- a/src/stackexchange.rs
+++ b/src/stackexchange.rs
@@ -44,7 +44,6 @@ const USER_AGENT: &str =
/// This structure allows interacting with parts of the StackExchange
/// API, using the `Config` struct to determine certain API settings and options.
-// TODO should my se structs have &str instead of String?
#[derive(Clone)]
pub struct StackExchange {
client: Client,
@@ -79,7 +78,6 @@ pub struct Answer<S> {
/// Represents a StackExchange question with a custom selection of fields from
/// the [StackExchange docs](https://api.stackexchange.com/docs/types/question)
// TODO container over answers should be generic iterator
-// TODO let body be a generic that implements Display!
#[derive(Clone, Deserialize, Debug)]
pub struct Question<S> {
#[serde(rename = "question_id")]
@@ -97,6 +95,20 @@ struct ResponseWrapper<T> {
items: Vec<T>,
}
+// Is question_id unique across all sites? If not, then this edge case is
+// unaccounted for when sorting.
+//
+// If this is ever an issue, it wouldn't be too hard to account for this; just
+// keep track of site in the `ordering` field and also return site from the
+// spawned per-site tasks.
+#[derive(Debug, PartialEq)]
+struct ScrapedData {
+ /// Mapping of site code to question ids
+ question_ids: HashMap<String, Vec<String>>,
+ /// Mapping of question_id to its ordinal place in search results
+ ordering: HashMap<String, usize>,
+}
+
impl StackExchange {
pub fn new(config: Config, local_storage: LocalStorage, query: String) -> Self {
let client = Client::new();
@@ -164,17 +176,18 @@ impl StackExchange {
.await?
.text()
.await?;
- let ids = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
- self.se_questions(ids).await
+ let data = parse_questions_from_ddg_html(&html, &self.sites, self.config.limit)?;
+ self.se_questions(data).await
}
/// Parallel searches against the SE question endpoint across the sites in `ids`.
// TODO I'm sure there is a way to DRY the se_question & se_search_advanced functions
- async fn se_questions(
- &self,
- ids: HashMap<String, Vec<String>>,
- ) -> Result<Vec<Question<String>>> {
- futures::stream::iter(ids)
+ async fn se_questions(&self, data: ScrapedData) -> Result<Vec<Question<String>>> {
+ let ScrapedData {
+ question_ids,
+ ordering,
+ } = data;
+ futures::stream::iter(question_ids)
.map(|(site, ids)| {
let clone = self.clone();
tokio::spawn(async move {
@@ -189,8 +202,8 @@ impl StackExchange {
.map(|r| r.map_err(Error::from).and_then(|x| x))
.collect::<Result<Vec<Vec<_>>>>()
.map(|v| {
- let qs: Vec<Question<String>> = v.into_iter().flatten().collect();
- // TODO sort by original ordering !
+ let mut qs: Vec<Question<String>> = v.into_iter().flatten().collect();
+ qs.sort_unstable_by_key(|q| ordering.get(&q.id.to_string()).unwrap());
qs
})
}
@@ -395,7 +408,7 @@ impl LocalStorage {
}
// TODO is this HM worth it? Probably only will ever have < 10 site codes to search...
- // TODO store this as Option<HM> on self if other methods use it...
+ // maybe store this as Option<HM> on self if other methods use it...
pub async fn find_invalid_site<'a, 'b>(
&'b self,
site_codes: &'a [String],
@@ -467,16 +480,16 @@ where
}
/// Parse (site, question_id) pairs out of duckduckgo search results html
-/// TODO currently hashmap {site: [qids]} BUT we should maintain relevance order !
-/// maybe this is as simple as a HashMap {qid: ordinal}
+// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
fn parse_questions_from_ddg_html<'a>(
html: &'a str,
sites: &'a HashMap<String, String>,
limit: u16,
-) -> Result<HashMap<String, Vec<String>>> {
+) -> Result<ScrapedData> {
let fragment = Html::parse_document(html);
let anchors = Selector::parse("a.result__a").unwrap();
- let mut qids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut ordering: HashMap<String, usize> = HashMap::new();
let mut count = 0;
for anchor in fragment.select(&anchors) {
let url = anchor
@@ -488,7 +501,8 @@ fn parse_questions_from_ddg_html<'a>(
.iter()
.find_map(|(site_code, site_url)| {
let id = question_url_to_id(site_url, &url)?;
- match qids.entry(site_code.to_owned()) {
+ ordering.insert(id.to_owned(), count);
+ match question_ids.entry(site_code.to_owned()) {
Entry::Occupied(mut o) => o.get_mut().push(id),
Entry::Vacant(o) => {
o.insert(vec![id]);
@@ -513,7 +527,10 @@ fn parse_questions_from_ddg_html<'a>(
"DuckDuckGo blocked this request",
)))
} else {
- Ok(qids)
+ Ok(ScrapedData {
+ question_ids,
+ ordering,
+ })
}
}
@@ -532,8 +549,8 @@ fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
Some(input[0..end].to_string())
}
-// TODO figure out a query that returns no results so that I can test it and differentiate it from
-// a blocked request
+// TODO find a query that returns no results so that I can test it and
+// differentiate it from a blocked request
#[cfg(test)]
mod tests {
use super::*;
@@ -577,15 +594,27 @@ mod tests {
.into_iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect::<HashMap<String, String>>();
- let mut expected_question_ids = HashMap::new();
- expected_question_ids.insert(
- "stackoverflow".to_string(),
- vec!["11828270".to_string(), "9171356".to_string()],
- );
- expected_question_ids.insert("askubuntu".to_string(), vec!["24406".to_string()]);
+ let expected_scraped_data = ScrapedData {
+ question_ids: vec![
+ ("stackoverflow", vec!["11828270", "9171356"]),
+ ("askubuntu", vec!["24406"]),
+ ]
+ .into_iter()
+ .map(|(k, v)| {
+ (
+ k.to_string(),
+ v.into_iter().map(|s| s.to_string()).collect(),
+ )
+ })
+ .collect(),
+ ordering: vec![("11828270", 0), ("9171356", 2), ("24406", 1)]
+ .into_iter()
+ .map(|(k, v)| (k.to_string(), v))
+ .collect(),
+ };
assert_eq!(
parse_questions_from_ddg_html(html, &sites, 3).unwrap(),
- expected_question_ids
+ expected_scraped_data
);
}