summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSam Tay <sam.chong.tay@gmail.com>2020-06-25 17:51:02 -0700
committerSam Tay <sam.chong.tay@gmail.com>2020-06-25 17:51:02 -0700
commit9b84ce4aaf1bae0b4b8913147a937f2bb52918d6 (patch)
tree351bd9775899f2c6e916d1f887e3a0f401f8c7ce
parentbd35184e2e7b344efe7a5062db37b3ea10782bed (diff)
Fix bug in google parser
-rw-r--r--TODO.md3
-rw-r--r--src/stackexchange/scraper.rs61
2 files changed, 48 insertions, 16 deletions
diff --git a/TODO.md b/TODO.md
index dc818de..bcead9b 100644
--- a/TODO.md
+++ b/TODO.md
@@ -8,8 +8,7 @@
4. Refactor layout handling (see TODO on `tui::views::LayoutView::relayout`)
### bugs
-1. Need to also allow /q/ in search results in addition to /questions/
-2.
+1.
```
so --search-engine google --site stackoverflow --site askubuntu how to stop typing sudo
```
diff --git a/src/stackexchange/scraper.rs b/src/stackexchange/scraper.rs
index b03c577..7c9047f 100644
--- a/src/stackexchange/scraper.rs
+++ b/src/stackexchange/scraper.rs
@@ -176,23 +176,27 @@ fn parse_with_selector(
})
}
-// TODO also allow /q/
/// For example
/// ```
/// let site_url = "stackoverflow.com";
/// let input = "/l/?kh=-1&uddg=https://stackoverflow.com/questions/11828270/how-do-i-exit-the-vim-editor";
-/// assert_eq!(question_url_to_id(site_url, input), "11828270")
+/// assert_eq!(question_url_to_id(site_url, input), Some(String::from("11828270")))
/// ```
+// TODO use str_prefix once it's stable
+// TODO benchmark this. regex is almost undoubtedly superior here
fn question_url_to_id(site_url: &str, input: &str) -> Option<String> {
- // TODO use str_prefix once its stable
- let fragment = site_url.trim_end_matches('/').to_owned() + "/questions/";
- let ix = input.find(&fragment)? + fragment.len();
- let input = &input[ix..];
- let end = input.find('/')?;
- Some(input[0..end].to_string())
+ ["/questions/", "/q/"].iter().find_map(|segment| {
+ let fragment = site_url.trim_end_matches('/').to_owned() + segment;
+ let ix = input.find(&fragment)? + fragment.len();
+ let input = &input[ix..];
+ if let Some(end) = input.find('/') {
+ Some(input[0..end].to_string())
+ } else {
+ Some(input[0..].to_string())
+ }
+ })
}
-// TODO test with google/parsing-q.html
#[cfg(test)]
mod tests {
use super::*;
@@ -283,6 +287,39 @@ mod tests {
}
#[test]
+ fn test_google_q_parser() {
+ let html = include_str!("../../test/google/parsing-q.html");
+ let mut sites = HashMap::new();
+ sites.insert(
+ String::from("stackoverflow"),
+ String::from("stackoverflow.com"),
+ );
+ let expected_scraped_data = ScrapedData {
+ question_ids: vec![(
+ String::from("stackoverflow"),
+ vec![
+ String::from("3940128"),
+ String::from("4647368"),
+ String::from("12336105"),
+ ],
+ )]
+ .into_iter()
+ .collect(),
+ ordering: vec![
+ (String::from("3940128"), 0),
+ (String::from("4647368"), 1),
+ (String::from("12336105"), 2),
+ ]
+ .into_iter()
+ .collect(),
+ };
+ assert_eq!(
+ Google.parse(html, &sites, 3).unwrap(),
+ expected_scraped_data
+ );
+ }
+
+ #[test]
fn test_duckduckgo_blocker() -> Result<(), String> {
let html = include_str!("../../test/duckduckgo/bad-user-agent.html");
let mut sites = HashMap::new();
@@ -299,12 +336,8 @@ mod tests {
}
}
- #[test]
- // TODO Get a blocked request html
+ // TODO Get blocked google request html
// note: this may only be possible at search.rs level (with non-200 code)
- fn test_google_blocker() -> Result<(), String> {
- Ok(())
- }
#[test]
fn test_question_url_to_id() {