From 9301c39b3bd8cb8e1904b7680d6827d3a8f6a479 Mon Sep 17 00:00:00 2001 From: Thang Pham Date: Sat, 21 Oct 2023 17:56:58 -0400 Subject: Update Hacker News HTML text parsing logic (#99) ## Changes - update parsing logic to reflect new HN Algolia API change regarding the use of `<p>` for paragraph breaks - cleanup parsing codes --- hackernews_tui/src/client/mod.rs | 4 +- hackernews_tui/src/model.rs | 28 +++++---- hackernews_tui/src/parser/article.rs | 38 ++++++----- hackernews_tui/src/parser/html.rs | 108 +++++++++++++++++--------------- hackernews_tui/src/view/article_view.rs | 2 +- 5 files changed, 97 insertions(+), 83 deletions(-) diff --git a/hackernews_tui/src/client/mod.rs b/hackernews_tui/src/client/mod.rs index ffab48d..b789a90 100644 --- a/hackernews_tui/src/client/mod.rs +++ b/hackernews_tui/src/client/mod.rs @@ -77,9 +77,7 @@ impl HNClient { format!("get item (id={item_id}) using {request_url}") ); - // The item's text returned from HN official APIs may have `<p>` tags representing - // paragraph breaks. Convert `<p>` tags to newlines to make the text easier to read. - let text = decode_html(&item.text.unwrap_or_default()).replace("

", "\n\n"); + let text = decode_html(&item.text.unwrap_or_default()); // Construct the shortened text to represent the page's title if not exist let chars = text.replace('\n', " ").chars().collect::>(); diff --git a/hackernews_tui/src/model.rs b/hackernews_tui/src/model.rs index 175c27e..d2aabeb 100644 --- a/hackernews_tui/src/model.rs +++ b/hackernews_tui/src/model.rs @@ -59,10 +59,10 @@ pub struct VoteData { } #[derive(Debug, Clone)] -/// A HackerNews item which can be either a story or a comment. +/// A Hacker News item which can be either a story or a comment. /// -/// This struct is a shared representation between a story and -/// a comment for rendering the item's content. +/// This struct is a shared representation between a story and a comment +/// and is used to render their content. pub struct HnItem { pub id: u32, pub level: usize, @@ -107,19 +107,18 @@ impl From for HnItem { ), ]); - let mut story_text = story.content; + // parse story's HTML content + let result = parse_hn_html_text(story.content, Style::default(), 0); - let minimized_text = if story_text.is_empty() { + // construct a minimized text representing the collapsed story's content + let minimized_text = if result.content.source().is_empty() { metadata.clone() } else { - story_text = format!("\n{story_text}"); - utils::combine_styled_strings([metadata.clone(), StyledString::plain("... 
(more)")]) }; - let mut text = metadata; - let result = parse_hn_html_text(story_text, Style::default(), 0); - text.append(result.s); + let text = + utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]); HnItem { id: story.id, @@ -144,17 +143,20 @@ impl From for HnItem { ), ]); - let mut text = utils::combine_styled_strings([metadata.clone(), StyledString::plain("\n")]); + // constructs a minimized text representing the collapsed comment's content let minimized_text = utils::combine_styled_strings([ - metadata, + metadata.clone(), StyledString::styled( format!("({} more)", comment.n_children + 1), component_style.metadata, ), ]); + // parse the comment's content let result = parse_hn_html_text(comment.content, Style::default(), 0); - text.append(result.s); + + let text = + utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]); HnItem { id: comment.id, diff --git a/hackernews_tui/src/parser/article.rs b/hackernews_tui/src/parser/article.rs index bcf4073..7cac0ba 100644 --- a/hackernews_tui/src/parser/article.rs +++ b/hackernews_tui/src/parser/article.rs @@ -1,4 +1,4 @@ -use super::html::HTMLParsedResult; +use super::html::HTMLTextParsedResult; use super::rcdom::{Handle, NodeData, RcDom}; use crate::parser::html::HTMLTableParsedResult; use crate::prelude::*; @@ -40,7 +40,7 @@ impl Article { /// # Arguments: /// * `max_width`: the maximum width of the parsed content. This is mostly used /// to construct a HTML table using `comfy_table`. - pub fn parse(&self, max_width: usize) -> Result { + pub fn parse(&self, max_width: usize) -> Result { debug!("parse article ({:?})", self); // parse HTML content into DOM node(s) @@ -90,7 +90,7 @@ impl Article { base_link_id: usize, mut style: Style, mut args: ArticleParseArgs, - ) -> (HTMLParsedResult, bool) { + ) -> (HTMLTextParsedResult, bool) { // TODO: handle parsing

    tags correctly debug!( @@ -98,13 +98,13 @@ impl Article { node, style, args ); - let mut result = HTMLParsedResult::default(); + let mut result = HTMLTextParsedResult::default(); let mut suffix = StyledString::new(); let mut visit_block_element_cb = || { if !args.is_first_element_in_block { - result.s.append_plain("\n\n"); - result.s.append_styled(&args.prefix, style); + result.content.append_plain("\n\n"); + result.content.append_styled(&args.prefix, style); } args.is_first_element_in_block = true; }; @@ -128,7 +128,7 @@ impl Article { has_non_ws_text |= !text.trim().is_empty(); - result.s.append_styled(text, style); + result.content.append_styled(text, style); } NodeData::Element { ref name, @@ -151,7 +151,9 @@ impl Article { style = style.combine(component_style.header); } expanded_name!(html "br") => { - result.s.append_styled(format!("\n{}", args.prefix), style); + result + .content + .append_styled(format!("\n{}", args.prefix), style); } expanded_name!(html "p") => visit_block_element_cb(), expanded_name!(html "code") => { @@ -169,7 +171,7 @@ impl Article { style = style.combine(component_style.multiline_code_block); - result.s.append_styled(" ", style); + result.content.append_styled(" ", style); } expanded_name!(html "blockquote") => { visit_block_element_cb(); @@ -177,7 +179,7 @@ impl Article { args.prefix = format!("{}▎ ", args.prefix); style = style.combine(component_style.quote); - result.s.append_styled("▎ ", style); + result.content.append_styled("▎ ", style); } expanded_name!(html "table") => { let mut table_result = HTMLTableParsedResult::default(); @@ -211,7 +213,7 @@ impl Article { table.add_row(row.into_iter().map(|c| c.source().to_owned())); } - result.s.append_styled(format!("\n\n{table}"), style); + result.content.append_styled(format!("\n\n{table}"), style); return (result, true); } @@ -225,7 +227,7 @@ impl Article { args.is_first_element_in_block = true; result - .s + .content .append_styled(format!("\n{}• ", args.prefix), style); } 
expanded_name!(html "img") => { @@ -240,10 +242,12 @@ impl Article { }; if !args.is_first_element_in_block { - result.s.append_plain("\n\n"); + result.content.append_plain("\n\n"); } - result.s.append_styled(&img_desc, style); - result.s.append_styled(" (image)", component_style.metadata); + result.content.append_styled(&img_desc, style); + result + .content + .append_styled(" (image)", component_style.metadata); } expanded_name!(html "a") => { // find `href` attribute of an tag @@ -291,7 +295,7 @@ impl Article { } }); - result.s.append(suffix); + result.content.append(suffix); (result, has_non_ws_text) } @@ -331,7 +335,7 @@ impl Article { ); result.links.append(&mut child_result.links); - s.append(child_result.s); + s.append(child_result.content); }); if !is_header { diff --git a/hackernews_tui/src/parser/html.rs b/hackernews_tui/src/parser/html.rs index a64a538..2b62f66 100644 --- a/hackernews_tui/src/parser/html.rs +++ b/hackernews_tui/src/parser/html.rs @@ -4,36 +4,36 @@ use once_cell::sync::Lazy; use regex::Regex; /// A regex to parse a HN text (in HTML). -/// It consists of multiple regex(s) representing different elements. +/// It consists of multiple regexes representing different components. static HN_TEXT_RE: Lazy = Lazy::new(|| { Regex::new(&format!( "(({})|({})|({})|({})|({})|({}))", - // a regex that matches a HTML paragraph + // a regex matching a HTML paragraph r"

    (?s)(?P(|[^>].*?))

    ", - // a regex that matches a paragraph quote (in markdown format) + // a regex matching a paragraph quote (in markdown format) r"

    (?s)(?P>[> ]*)(?P.*?)

    ", - // a regex that matches an HTML italic string + // a regex matching an HTML italic string r"(?s)(?P.*?)", - // a regex that matches a HTML code block (multiline) + // a regex matching a HTML code block (multiline) r"
    (?s)(?P.*?)[\n]*
    ", - // a regex that matches a single line code block (markdown format) + // a regex matching a single line code block (markdown format) r"`(?P[^`]+?)`", - // a regex that matches a HTML link + // a regex matching a HTML link r#".*?)"(?s).+?
    "#, )) .unwrap() }); -/// A HTML parsed result. +/// Parsed result of a HTML text #[derive(Debug, Default)] -pub struct HTMLParsedResult { - /// a styled string representing the decorated HTML content - pub s: StyledString, +pub struct HTMLTextParsedResult { + /// parsed HTML content + pub content: StyledString, /// a list of links inside the HTML document pub links: Vec, } -/// A HTML table parsed result. +/// Parsed result of a HTML table #[derive(Debug, Default)] pub struct HTMLTableParsedResult { /// a list of links inside the HTML document @@ -44,32 +44,46 @@ pub struct HTMLTableParsedResult { pub rows: Vec>, } -impl HTMLParsedResult { - pub fn merge(&mut self, mut other: HTMLParsedResult) { - self.s.append(other.s); +impl HTMLTextParsedResult { + /// merge two HTML parsed results + pub fn merge(&mut self, mut other: HTMLTextParsedResult) { + self.content.append(other.content); self.links.append(&mut other.links); } } /// parse a Hacker News HTML text -pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLParsedResult { +pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult { debug!("parse hn html text: {}", text); - let mut result = HTMLParsedResult::default(); - // an index such that `text[curr_pos..]` represents the part of the + // pre-processed the HTML text + let text = { + // The item's text returned from HN APIs may have `

    ` tags representing + // paragraph breaks. Convert `

    ` tags to

    tag pairs to make the text + // easier to parse. + if text.is_empty() { + text + } else { + format!("

    {}

    ", text.replace("

    ", "

    \n

    ")) + } + }; + + parse(text, style, base_link_id) +} + +/// a helper function of [parse_hn_html_text] for recursively parsing HTML elements inside the text +fn parse(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult { + let mut result = HTMLTextParsedResult::default(); + // an index such that `text[curr_pos..]` represents the slice of the // text that hasn't been parsed. let mut curr_pos = 0; - // This variable indicates whether we have parsed the first paragraph of the current text. - // It is used to add a break between 2 consecutive paragraphs. - let mut seen_first_paragraph = false; - for caps in HN_TEXT_RE.captures_iter(&text) { // the part that doesn't match any patterns is rendered in the default style let whole_match = caps.get(0).unwrap(); if curr_pos < whole_match.start() { result - .s + .content .append_styled(&text[curr_pos..whole_match.start()], style); } curr_pos = whole_match.end(); @@ -77,71 +91,67 @@ pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HT let component_style = &config::get_config_theme().component_style; if let (Some(m_quote), Some(m_text)) = (caps.name("quote"), caps.name("text")) { - if seen_first_paragraph { - result.s.append_plain("\n"); - } else { - seen_first_paragraph = true; - } - - // render quote character `>` as indentation character - result.s.append_styled( + // quoted paragraph + // render quote character `>` using the `|` indentation character + result.content.append_styled( "▎" .to_string() .repeat(m_quote.as_str().matches('>').count()), style, ); - result.merge(parse_hn_html_text( + result.merge(parse( m_text.as_str().to_string(), component_style.quote.into(), base_link_id + result.links.len(), )); - result.s.append_plain("\n"); + result.content.append_plain("\n"); } else if let Some(m) = caps.name("paragraph") { - if seen_first_paragraph { - result.s.append_plain("\n"); - } else { - seen_first_paragraph = true; - } - - result.merge(parse_hn_html_text( + // normal 
paragraph + result.merge(parse( m.as_str().to_string(), style, base_link_id + result.links.len(), )); - result.s.append_plain("\n"); + result.content.append_plain("\n"); } else if let Some(m) = caps.name("link") { + // HTML link result.links.push(m.as_str().to_string()); - result.s.append_styled( + result.content.append_styled( utils::shorten_url(m.as_str()), style.combine(component_style.link), ); - result.s.append_styled(" ", style); - result.s.append_styled( + result.content.append_styled(" ", style); + result.content.append_styled( format!("[{}]", result.links.len() + base_link_id), style.combine(component_style.link_id), ); } else if let Some(m) = caps.name("multiline_code") { - result.s.append_styled( + // HTML code block + result.content.append_styled( m.as_str(), style.combine(component_style.multiline_code_block), ); - result.s.append_plain("\n"); + result.content.append_plain("\n"); } else if let Some(m) = caps.name("code") { + // markdown single line code block result - .s + .content .append_styled(m.as_str(), style.combine(component_style.single_code_block)); } else if let Some(m) = caps.name("italic") { + // HTML italic result - .s + .content .append_styled(m.as_str(), style.combine(component_style.italic)); } } if curr_pos < text.len() { - result.s.append_styled(&text[curr_pos..text.len()], style); + result + .content + .append_styled(&text[curr_pos..text.len()], style); } result diff --git a/hackernews_tui/src/view/article_view.rs b/hackernews_tui/src/view/article_view.rs index dc83c62..04064f2 100644 --- a/hackernews_tui/src/view/article_view.rs +++ b/hackernews_tui/src/view/article_view.rs @@ -24,7 +24,7 @@ impl ViewWrapper for ArticleView { match self.article.parse(self.width.saturating_sub(5)) { Ok(result) => { - self.set_article_content(result.s); + self.set_article_content(result.content); self.links = result.links; } Err(err) => { -- cgit v1.2.3