Update Hacker News HTML text parsing logic (#99)

## Changes

- Update the parsing logic to reflect the new HN Algolia API change regarding the use of `<p>` for paragraph breaks
- Clean up the parsing code
author: Thang Pham <phamducthang1234@gmail.com> 2023-10-21 17:56:58 -0400
committer: GitHub <noreply@github.com> 2023-10-21 17:56:58 -0400
commit: 9301c39b3bd8cb8e1904b7680d6827d3a8f6a479 (patch)
tree: adbd659b4a6bdea6a7d8d6ff405397842e9f6532
parent: d09b1d24795beb3e90b3fd1d4f6df9bc2ce54af8 (diff)
5 files changed, 97 insertions, 83 deletions
diff --git a/hackernews_tui/src/client/mod.rs b/hackernews_tui/src/client/mod.rs
index ffab48d..b789a90 100644
--- a/hackernews_tui/src/client/mod.rs
+++ b/hackernews_tui/src/client/mod.rs
@@ -77,9 +77,7 @@ impl HNClient {
             format!("get item (id={item_id}) using {request_url}")
         );
 
-        // The item's text returned from HN official APIs may have `<p>` tags representing
-        // paragraph breaks. Convert `<p>` tags to newlines to make the text easier to read.
-        let text = decode_html(&item.text.unwrap_or_default()).replace("<p>", "\n\n");
+        let text = decode_html(&item.text.unwrap_or_default());
 
         // Construct the shortened text to represent the page's title if not exist
         let chars = text.replace('\n', " ").chars().collect::<Vec<_>>();
diff --git a/hackernews_tui/src/model.rs b/hackernews_tui/src/model.rs
index 175c27e..d2aabeb 100644
--- a/hackernews_tui/src/model.rs
+++ b/hackernews_tui/src/model.rs
@@ -59,10 +59,10 @@ pub struct VoteData {
 }
 
 #[derive(Debug, Clone)]
-/// A HackerNews item which can be either a story or a comment.
+/// A Hacker News item which can be either a story or a comment.
 ///
-/// This struct is a shared representation between a story and
-/// a comment for rendering the item's content.
+/// This struct is a shared representation between a story and a comment
+/// and is used to render their content.
 pub struct HnItem {
     pub id: u32,
     pub level: usize,
@@ -107,19 +107,18 @@ impl From<Story> for HnItem {
             ),
         ]);
 
-        let mut story_text = story.content;
+        // parse story's HTML content
+        let result = parse_hn_html_text(story.content, Style::default(), 0);
 
-        let minimized_text = if story_text.is_empty() {
+        // construct a minimized text representing the collapsed story's content
+        let minimized_text = if result.content.source().is_empty() {
             metadata.clone()
         } else {
-            story_text = format!("\n{story_text}");
-
             utils::combine_styled_strings([metadata.clone(), StyledString::plain("... (more)")])
         };
 
-        let mut text = metadata;
-        let result = parse_hn_html_text(story_text, Style::default(), 0);
-        text.append(result.s);
+        let text =
+            utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
 
         HnItem {
             id: story.id,
@@ -144,17 +143,20 @@ impl From<Comment> for HnItem {
             ),
         ]);
 
-        let mut text = utils::combine_styled_strings([metadata.clone(), StyledString::plain("\n")]);
+        // construct a minimized text representing the collapsed comment's content
         let minimized_text = utils::combine_styled_strings([
-            metadata,
+            metadata.clone(),
             StyledString::styled(
                 format!("({} more)", comment.n_children + 1),
                 component_style.metadata,
             ),
         ]);
 
+        // parse the comment's content
         let result = parse_hn_html_text(comment.content, Style::default(), 0);
-        text.append(result.s);
+
+        let text =
+            utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
 
         HnItem {
             id: comment.id,
diff --git a/hackernews_tui/src/parser/article.rs b/hackernews_tui/src/parser/article.rs
index bcf4073..7cac0ba 100644
--- a/hackernews_tui/src/parser/article.rs
+++ b/hackernews_tui/src/parser/article.rs
@@ -1,4 +1,4 @@
-use super::html::HTMLParsedResult;
+use super::html::HTMLTextParsedResult;
 use super::rcdom::{Handle, NodeData, RcDom};
 use crate::parser::html::HTMLTableParsedResult;
 use crate::prelude::*;
@@ -40,7 +40,7 @@ impl Article {
     /// # Arguments:
     /// * `max_width`: the maximum width of the parsed content. This is mostly used
     /// to construct a HTML table using `comfy_table`.
-    pub fn parse(&self, max_width: usize) -> Result<HTMLParsedResult> {
+    pub fn parse(&self, max_width: usize) -> Result<HTMLTextParsedResult> {
         debug!("parse article ({:?})", self);
 
         // parse HTML content into DOM node(s)
@@ -90,7 +90,7 @@ impl Article {
         base_link_id: usize,
         mut style: Style,
         mut args: ArticleParseArgs,
-    ) -> (HTMLParsedResult, bool) {
+    ) -> (HTMLTextParsedResult, bool) {
         // TODO: handle parsing <ol> tags correctly
 
         debug!(
@@ -98,13 +98,13 @@ impl Article {
             node, style, args
         );
 
-        let mut result = HTMLParsedResult::default();
+        let mut result = HTMLTextParsedResult::default();
         let mut suffix = StyledString::new();
 
         let mut visit_block_element_cb = || {
             if !args.is_first_element_in_block {
-                result.s.append_plain("\n\n");
-                result.s.append_styled(&args.prefix, style);
+                result.content.append_plain("\n\n");
+                result.content.append_styled(&args.prefix, style);
             }
             args.is_first_element_in_block = true;
         };
@@ -128,7 +128,7 @@ impl Article {
 
                 has_non_ws_text |= !text.trim().is_empty();
 
-                result.s.append_styled(text, style);
+                result.content.append_styled(text, style);
             }
             NodeData::Element {
                 ref name,
@@ -151,7 +151,9 @@ impl Article {
                         style = style.combine(component_style.header);
                     }
                     expanded_name!(html "br") => {
-                        result.s.append_styled(format!("\n{}", args.prefix), style);
+                        result
+                            .content
+                            .append_styled(format!("\n{}", args.prefix), style);
                     }
                     expanded_name!(html "p") => visit_block_element_cb(),
                     expanded_name!(html "code") => {
@@ -169,7 +171,7 @@ impl Article {
 
                         style = style.combine(component_style.multiline_code_block);
 
-                        result.s.append_styled("  ", style);
+                        result.content.append_styled("  ", style);
                     }
                     expanded_name!(html "blockquote") => {
                         visit_block_element_cb();
@@ -177,7 +179,7 @@ impl Article {
                         args.prefix = format!("{}▎ ", args.prefix);
                         style = style.combine(component_style.quote);
 
-                        result.s.append_styled("▎ ", style);
+                        result.content.append_styled("▎ ", style);
                     }
                     expanded_name!(html "table") => {
                         let mut table_result = HTMLTableParsedResult::default();
@@ -211,7 +213,7 @@ impl Article {
                             table.add_row(row.into_iter().map(|c| c.source().to_owned()));
                         }
 
-                        result.s.append_styled(format!("\n\n{table}"), style);
+                        result.content.append_styled(format!("\n\n{table}"), style);
 
                         return (result, true);
                     }
@@ -225,7 +227,7 @@ impl Article {
                         args.is_first_element_in_block = true;
 
                         result
-                            .s
+                            .content
                             .append_styled(format!("\n{}• ", args.prefix), style);
                     }
                     expanded_name!(html "img") => {
@@ -240,10 +242,12 @@ impl Article {
                         };
 
                         if !args.is_first_element_in_block {
-                            result.s.append_plain("\n\n");
+                            result.content.append_plain("\n\n");
                         }
-                        result.s.append_styled(&img_desc, style);
-                        result.s.append_styled(" (image)", component_style.metadata);
+                        result.content.append_styled(&img_desc, style);
+                        result
+                            .content
+                            .append_styled(" (image)", component_style.metadata);
                     }
                     expanded_name!(html "a") => {
                         // find `href` attribute of an <a> tag
@@ -291,7 +295,7 @@ impl Article {
             }
         });
 
-        result.s.append(suffix);
+        result.content.append(suffix);
         (result, has_non_ws_text)
     }
 
@@ -331,7 +335,7 @@ impl Article {
                         );
 
                         result.links.append(&mut child_result.links);
-                        s.append(child_result.s);
+                        s.append(child_result.content);
                     });
 
                     if !is_header {
diff --git a/hackernews_tui/src/parser/html.rs b/hackernews_tui/src/parser/html.rs
index a64a538..2b62f66 100644
--- a/hackernews_tui/src/parser/html.rs
+++ b/hackernews_tui/src/parser/html.rs
@@ -4,36 +4,36 @@ use once_cell::sync::Lazy;
 use regex::Regex;
 
 /// A regex to parse a HN text (in HTML).
-/// It consists of multiple regex(s) representing different elements.
+/// It consists of multiple regexes representing different components.
 static HN_TEXT_RE: Lazy<Regex> = Lazy::new(|| {
     Regex::new(&format!(
         "(({})|({})|({})|({})|({})|({}))",
-        // a regex that matches a HTML paragraph
+        // a regex matching a HTML paragraph
         r"<p>(?s)(?P<paragraph>(|[^>].*?))</p>",
-        // a regex that matches a paragraph quote (in markdown format)
+        // a regex matching a paragraph quote (in markdown format)
         r"<p>(?s)(?P<quote>>[> ]*)(?P<text>.*?)</p>",
-        // a regex that matches an HTML italic string
+        // a regex matching an HTML italic string
         r"<i>(?s)(?P<italic>.*?)</i>",
-        // a regex that matches a HTML code block (multiline)
+        // a regex matching a HTML code block (multiline)
         r"<pre><code>(?s)(?P<multiline_code>.*?)[\n]*</code></pre>",
-        // a regex that matches a single line code block (markdown format)
+        // a regex matching a single line code block (markdown format)
         r"`(?P<code>[^`]+?)`",
-        // a regex that matches a HTML link
+        // a regex matching a HTML link
         r#"<a\s+?href="(?P<link>.*?)"(?s).+?</a>"#,
     ))
     .unwrap()
 });
 
-/// A HTML parsed result.
+/// Parsed result of a HTML text
 #[derive(Debug, Default)]
-pub struct HTMLParsedResult {
-    /// a styled string representing the decorated HTML content
-    pub s: StyledString,
+pub struct HTMLTextParsedResult {
+    /// parsed HTML content
+    pub content: StyledString,
     /// a list of links inside the HTML document
     pub links: Vec<String>,
 }
 
-/// A HTML table parsed result.
+/// Parsed result of a HTML table
 #[derive(Debug, Default)]
 pub struct HTMLTableParsedResult {
     /// a list of links inside the HTML document
@@ -44,32 +44,46 @@ pub struct HTMLTableParsedResult {
     pub rows: Vec<Vec<StyledString>>,
 }
 
-impl HTMLParsedResult {
-    pub fn merge(&mut self, mut other: HTMLParsedResult) {
-        self.s.append(other.s);
+impl HTMLTextParsedResult {
+    /// merge two HTML parsed results
+    pub fn merge(&mut self, mut other: HTMLTextParsedResult) {
+        self.content.append(other.content);
         self.links.append(&mut other.links);
     }
 }
 
 /// parse a Hacker News HTML text
-pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLParsedResult {
+pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
     debug!("parse hn html text: {}", text);
 
-    let mut result = HTMLParsedResult::default();
-    // an index such that `text[curr_pos..]` represents the part of the
+    // pre-process the HTML text
+    let text = {
+        // The item's text returned from HN APIs may have `<p>` tags representing
+        // paragraph breaks. Convert `<p>` tags to <p></p> tag pairs to make the text
+        // easier to parse.
+        if text.is_empty() {
+            text
+        } else {
+            format!("<p>{}</p>", text.replace("<p>", "</p>\n<p>"))
+        }
+    };
+
+    parse(text, style, base_link_id)
+}
+
+/// a helper function of [parse_hn_html_text] for recursively parsing HTML elements inside the text
+fn parse(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
+    let mut result = HTMLTextParsedResult::default();
+    // an index such that `text[curr_pos..]` represents the slice of the
     // text that hasn't been parsed.
     let mut curr_pos = 0;
 
-    // This variable indicates whether we have parsed the first paragraph of the current text.
-    // It is used to add a break between 2 consecutive paragraphs.
-    let mut seen_first_paragraph = false;
-
     for caps in HN_TEXT_RE.captures_iter(&text) {
         // the part that doesn't match any patterns is rendered in the default style
         let whole_match = caps.get(0).unwrap();
         if curr_pos < whole_match.start() {
             result
-                .s
+                .content
                 .append_styled(&text[curr_pos..whole_match.start()], style);
         }
         curr_pos = whole_match.end();
@@ -77,71 +91,67 @@ pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HT
         let component_style = &config::get_config_theme().component_style;
 
         if let (Some(m_quote), Some(m_text)) = (caps.name("quote"), caps.name("text")) {
-            if seen_first_paragraph {
-                result.s.append_plain("\n");
-            } else {
-                seen_first_paragraph = true;
-            }
-
-            // render quote character `>` as indentation character
-            result.s.append_styled(
+            // quoted paragraph
+            // render each quote character `>` as a `▎` indentation character
+            result.content.append_styled(
                 "▎"
                     .to_string()
                     .repeat(m_quote.as_str().matches('>').count()),
                 style,
             );
-            result.merge(parse_hn_html_text(
+            result.merge(parse(
                 m_text.as_str().to_string(),
                 component_style.quote.into(),
                 base_link_id + result.links.len(),
             ));
 
-            result.s.append_plain("\n");
+            result.content.append_plain("\n");
         } else if let Some(m) = caps.name("paragraph") {
-            if seen_first_paragraph {
-                result.s.append_plain("\n");
-            } else {
-                seen_first_paragraph = true;
-            }
-
-            result.merge(parse_hn_html_text(
+            // normal paragraph
+            result.merge(parse(
                 m.as_str().to_string(),
                 style,
                 base_link_id + result.links.len(),
             ));
 
-            result.s.append_plain("\n");
+            result.content.append_plain("\n");
         } else if let Some(m) = caps.name("link") {
+            // HTML link
             result.links.push(m.as_str().to_string());
 
-            result.s.append_styled(
+            result.content.append_styled(
                 utils::shorten_url(m.as_str()),
                 style.combine(component_style.link),
             );
-            result.s.append_styled(" ", style);
-            result.s.append_styled(
+            result.content.append_styled(" ", style);
+            result.content.append_styled(
                 format!("[{}]", result.links.len() + base_link_id),
                 style.combine(component_style.link_id),
             );
         } else if let Some(m) = caps.name("multiline_code") {
-            result.s.append_styled(
+            // HTML code block
+            result.content.append_styled(
                 m.as_str(),
                 style.combine(component_style.multiline_code_block),
             );
-            result.s.append_plain("\n");
+            result.content.append_plain("\n");
         } else if let Some(m) = caps.name("code") {
+            // markdown single line code block
             result
-                .s
+                .content
                 .append_styled(m.as_str(), style.combine(component_style.single_code_block));
         } else if let Some(m) = caps.name("italic") {
+            // HTML italic
             result
-                .s
+                .content
                 .append_styled(m.as_str(), style.combine(component_style.italic));
         }
     }
 
     if curr_pos < text.len() {
-        result.s.append_styled(&text[curr_pos..text.len()], style);
+        result
+            .content
+            .append_styled(&text[curr_pos..text.len()], style);
     }
 
     result
diff --git a/hackernews_tui/src/view/article_view.rs b/hackernews_tui/src/view/article_view.rs
index dc83c62..04064f2 100644
--- a/hackernews_tui/src/view/article_view.rs
+++ b/hackernews_tui/src/view/article_view.rs
@@ -24,7 +24,7 @@ impl ViewWrapper for ArticleView {
 
             match self.article.parse(self.width.saturating_sub(5)) {
                 Ok(result) => {
-                    self.set_article_content(result.s);
+                    self.set_article_content(result.content);
                     self.links = result.links;
                 }
                 Err(err) => {
author	Thang Pham <phamducthang1234@gmail.com>	2023-10-21 17:56:58 -0400
committer	GitHub <noreply@github.com>	2023-10-21 17:56:58 -0400
commit	9301c39b3bd8cb8e1904b7680d6827d3a8f6a479 (patch)
tree	adbd659b4a6bdea6a7d8d6ff405397842e9f6532
parent	d09b1d24795beb3e90b3fd1d4f6df9bc2ce54af8 (diff)