summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Sam Tay <sam.chong.tay@gmail.com>, 2020-06-24 02:36:26 -0700
committer: Sam Tay <sam.chong.tay@gmail.com>, 2020-06-24 02:39:39 -0700
commit: 95f429041ee505497f36530e1895c2ea3554d37b (patch)
tree: 93ae0d2fdbc85dc58884d75981cafe2d33640edb
parent: caa03bb164e40827e0d5f3f35522ac3cabc1e348 (diff)
Refactor search engine types
Still not sure exactly what the idiomatic representation is here
-rw-r--r-- TODO.md 5
-rw-r--r-- src/main.rs 8
-rw-r--r-- src/stackexchange/scraper.rs 179
-rw-r--r-- src/stackexchange/search.rs 3
4 files changed, 98 insertions, 97 deletions
diff --git a/TODO.md b/TODO.md
index 083d492..0e6be2a 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,17 +1,12 @@
# TODO
### v0.3.1
-0. Refactor the enum/struct for search engines
1. Much of the code can be reused for google:
* parsing href after `"url="` (similar to uddg)
* formatting `(site:stackoverflow.com OR site:unix.stackexchange.com) what is linux`
So make a `Scraper` trait and implement it for DDG & Google. Then
`stackexchange` can just code against `Scraper` and choose based on
`--search-engine | -e` argument
-2. Maybe reorganize to
- - stackexchange
- - api
- - scraper
### Endless future improvements for the TUI
1. Init with smaller layout depending on initial screen size.
diff --git a/src/main.rs b/src/main.rs
index ac176fc..e023d30 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -82,19 +82,19 @@ async fn run(skin: &mut MadSkin) -> Result<Option<Vec<Question<Markdown>>>> {
}
if let Some(q) = opts.query {
- let mut se = Search::new(config, ls, q);
+ let mut search = Search::new(config, ls, q);
if lucky {
- let md = se.search_lucky().await?;
+ let md = search.search_lucky().await?;
skin.print_text(&md);
skin.print_text("\nPress **[SPACE]** to see more results, or any other key to exit");
// Kick off the rest of the search in the background
- let qs = task::spawn(async move { se.search_md().await });
+ let qs = task::spawn(async move { search.search_md().await });
if !utils::wait_for_char(' ')? {
return Ok(None);
}
return Ok(Some(qs.await.unwrap()?));
} else {
- return Ok(Some(se.search_md().await?));
+ return Ok(Some(search.search_md().await?));
}
}
Ok(None)
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index 53ac08b..a0d29ab 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -10,7 +10,6 @@ use crate::error::{Error, Result};
/// DuckDuckGo URL
const DUCKDUCKGO_URL: &str = "https://duckduckgo.com";
-// TODO Should there be separate Unit-type structs for each one? With separate implementations?
pub enum SearchEngine {
DuckDuckGo,
}
@@ -29,6 +28,7 @@ pub struct ScrapedData {
pub ordering: HashMap<String, usize>,
}
+// TODO add this type system limitation to blog post
pub trait Scraper {
/// Parse data from search results html
fn parse(&self, html: &str, sites: &HashMap<String, String>, limit: u16)
@@ -48,7 +48,7 @@ impl Scraper for SearchEngine {
limit: u16,
) -> Result<ScrapedData> {
match &self {
- SearchEngine::DuckDuckGo => parse_duckduckgo(html, sites, limit),
+ Self::DuckDuckGo => DuckDuckGo.parse(html, sites, limit),
}
}
fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
@@ -56,64 +56,103 @@ impl Scraper for SearchEngine {
I: IntoIterator<Item = &'a String>,
{
match &self {
- SearchEngine::DuckDuckGo => duckduckgo_url(query, sites),
+ Self::DuckDuckGo => DuckDuckGo.get_url(query, sites),
}
}
}
-/// Parse (site, question_id) pairs out of duckduckgo search results html
-// TODO Benchmark this. It would likely be faster to use regex on the decoded url.
-// TODO pull out parts that are composable across different engines
-fn parse_duckduckgo<'a>(
- html: &'a str,
- sites: &'a HashMap<String, String>,
- limit: u16,
-) -> Result<ScrapedData> {
- let fragment = Html::parse_document(html);
- let anchors = Selector::parse("a.result__a").unwrap();
- let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
- let mut ordering: HashMap<String, usize> = HashMap::new();
- let mut count = 0;
- for anchor in fragment.select(&anchors) {
- let url = anchor
- .value()
- .attr("href")
- .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
- .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
- sites
- .iter()
- .find_map(|(site_code, site_url)| {
- let id = question_url_to_id(site_url, &url)?;
- ordering.insert(id.to_owned(), count);
- match question_ids.entry(site_code.to_owned()) {
- Entry::Occupied(mut o) => o.get_mut().push(id),
- Entry::Vacant(o) => {
- o.insert(vec![id]);
+struct DuckDuckGo;
+
+impl Scraper for DuckDuckGo {
+ /// Parse (site, question_id) pairs out of duckduckgo search results html
+ // TODO Benchmark this. It would likely be faster to use regex on the decoded url.
+ // TODO pull out parts that are composable across different engines
+ fn parse(
+ &self,
+ html: &str,
+ sites: &HashMap<String, String>,
+ limit: u16,
+ ) -> Result<ScrapedData> {
+ let fragment = Html::parse_document(html);
+ let anchors = Selector::parse("a.result__a").unwrap();
+ let mut question_ids: HashMap<String, Vec<String>> = HashMap::new();
+ let mut ordering: HashMap<String, usize> = HashMap::new();
+ let mut count = 0;
+ for anchor in fragment.select(&anchors) {
+ let url = anchor
+ .value()
+ .attr("href")
+ .ok_or_else(|| Error::ScrapingError("Anchor with no href".to_string()))
+ .map(|href| percent_decode_str(href).decode_utf8_lossy().into_owned())?;
+ sites
+ .iter()
+ .find_map(|(site_code, site_url)| {
+ let id = question_url_to_id(site_url, &url)?;
+ ordering.insert(id.to_owned(), count);
+ match question_ids.entry(site_code.to_owned()) {
+ Entry::Occupied(mut o) => o.get_mut().push(id),
+ Entry::Vacant(o) => {
+ o.insert(vec![id]);
+ }
}
- }
- count += 1;
- Some(())
+ count += 1;
+ Some(())
+ })
+ .ok_or_else(|| {
+ Error::ScrapingError(
+ "Duckduckgo returned results outside of SE network".to_string(),
+ )
+ })?;
+ if count >= limit as usize {
+ break;
+ }
+ }
+ // It doesn't seem possible for DDG to return no results, so assume this is
+ // a bad user agent
+ if count == 0 {
+ Err(Error::ScrapingError(String::from(
+ "DuckDuckGo blocked this request",
+ )))
+ } else {
+ Ok(ScrapedData {
+ question_ids,
+ ordering,
})
- .ok_or_else(|| {
- Error::ScrapingError(
- "Duckduckgo returned results outside of SE network".to_string(),
- )
- })?;
- if count >= limit as usize {
- break;
}
}
- // It doesn't seem possible for DDG to return no results, so assume this is
- // a bad user agent
- if count == 0 {
- Err(Error::ScrapingError(String::from(
- "DuckDuckGo blocked this request",
- )))
- } else {
- Ok(ScrapedData {
- question_ids,
- ordering,
- })
+
+ /// Creates duckduckgo search url given sites and query
+ /// See https://duckduckgo.com/params for more info
+ fn get_url<'a, I>(&self, query: &str, sites: I) -> Url
+ where
+ I: IntoIterator<Item = &'a String>,
+ {
+ let mut q = String::new();
+ // Restrict to sites
+ q.push('(');
+ q.push_str(
+ sites
+ .into_iter()
+ .map(|site| String::from("site:") + site)
+ .collect::<Vec<_>>()
+ .join(" OR ")
+ .as_str(),
+ );
+ q.push_str(") ");
+ // Search terms
+ q.push_str(
+ query
+ .trim_end_matches('?')
+ .split_whitespace()
+ .collect::<Vec<_>>()
+ .join(" ")
+ .as_str(),
+ );
+ Url::parse_with_params(
+ DUCKDUCKGO_URL,
+ &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
+ )
+ .unwrap()
}
}
@@ -132,40 +171,6 @@ fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
Some(input[0..end].to_string())
}
-/// Creates duckduckgo search url given sites and query
-/// See https://duckduckgo.com/params for more info
-fn duckduckgo_url<'a, I>(query: &str, sites: I) -> Url
-where
- I: IntoIterator<Item = &'a String>,
-{
- let mut q = String::new();
- // Restrict to sites
- q.push('(');
- q.push_str(
- sites
- .into_iter()
- .map(|site| String::from("site:") + site)
- .collect::<Vec<_>>()
- .join(" OR ")
- .as_str(),
- );
- q.push_str(") ");
- // Search terms
- q.push_str(
- query
- .trim_end_matches('?')
- .split_whitespace()
- .collect::<Vec<_>>()
- .join(" ")
- .as_str(),
- );
- Url::parse_with_params(
- DUCKDUCKGO_URL,
- &[("q", q.as_str()), ("kz", "-1"), ("kh", "-1")],
- )
- .unwrap()
-}
-
#[cfg(test)]
mod tests {
use super::*;
@@ -178,7 +183,7 @@ mod tests {
String::from("unix.stackexchange.com"),
];
assert_eq!(
- duckduckgo_url(q, &sites).as_str(),
+ DuckDuckGo.get_url(q, &sites).as_str(),
String::from(
"https://duckduckgo.com/\
?q=%28site%3Astackoverflow.com+OR+site%3Aunix.stackexchange.com%29\
diff --git a/src/stackexchange/search.rs b/src/stackexchange/search.rs
index 8e40cb4..ed89e15 100644
--- a/src/stackexchange/search.rs
+++ b/src/stackexchange/search.rs
@@ -23,6 +23,7 @@ const USER_AGENT: &str =
/// This structure provides methods to search queries and get StackExchange
/// questions/answers in return.
+// TODO this really needs a better name...
#[derive(Clone)]
pub struct Search {
api: Api,
@@ -89,7 +90,7 @@ impl Search {
}
/// Search query at duckduckgo and then fetch the resulting questions from SE.
- async fn search_by_engine(&self, search_engine: SearchEngine) -> Result<Vec<Question<String>>> {
+ async fn search_by_engine(&self, search_engine: impl Scraper) -> Result<Vec<Question<String>>> {
let url = search_engine.get_url(&self.query, self.sites.values());
let html = Client::new()
.get(url)