summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThang Pham <phamducthang1234@gmail.com>2023-10-21 17:56:58 -0400
committerGitHub <noreply@github.com>2023-10-21 17:56:58 -0400
commit9301c39b3bd8cb8e1904b7680d6827d3a8f6a479 (patch)
treeadbd659b4a6bdea6a7d8d6ff405397842e9f6532
parentd09b1d24795beb3e90b3fd1d4f6df9bc2ce54af8 (diff)
Update Hacker News HTML text parsing logic (#99)
## Changes - update parsing logic to reflect the new HN Algolia API change regarding the use of `<p>` for paragraph breaks - clean up parsing code
-rw-r--r--hackernews_tui/src/client/mod.rs4
-rw-r--r--hackernews_tui/src/model.rs28
-rw-r--r--hackernews_tui/src/parser/article.rs38
-rw-r--r--hackernews_tui/src/parser/html.rs108
-rw-r--r--hackernews_tui/src/view/article_view.rs2
5 files changed, 97 insertions, 83 deletions
diff --git a/hackernews_tui/src/client/mod.rs b/hackernews_tui/src/client/mod.rs
index ffab48d..b789a90 100644
--- a/hackernews_tui/src/client/mod.rs
+++ b/hackernews_tui/src/client/mod.rs
@@ -77,9 +77,7 @@ impl HNClient {
format!("get item (id={item_id}) using {request_url}")
);
- // The item's text returned from HN official APIs may have `<p>` tags representing
- // paragraph breaks. Convert `<p>` tags to newlines to make the text easier to read.
- let text = decode_html(&item.text.unwrap_or_default()).replace("<p>", "\n\n");
+ let text = decode_html(&item.text.unwrap_or_default());
// Construct the shortened text to represent the page's title if not exist
let chars = text.replace('\n', " ").chars().collect::<Vec<_>>();
diff --git a/hackernews_tui/src/model.rs b/hackernews_tui/src/model.rs
index 175c27e..d2aabeb 100644
--- a/hackernews_tui/src/model.rs
+++ b/hackernews_tui/src/model.rs
@@ -59,10 +59,10 @@ pub struct VoteData {
}
#[derive(Debug, Clone)]
-/// A HackerNews item which can be either a story or a comment.
+/// A Hacker News item which can be either a story or a comment.
///
-/// This struct is a shared representation between a story and
-/// a comment for rendering the item's content.
+/// This struct is a shared representation between a story and a comment
+/// and is used to render their content.
pub struct HnItem {
pub id: u32,
pub level: usize,
@@ -107,19 +107,18 @@ impl From<Story> for HnItem {
),
]);
- let mut story_text = story.content;
+ // parse story's HTML content
+ let result = parse_hn_html_text(story.content, Style::default(), 0);
- let minimized_text = if story_text.is_empty() {
+ // construct a minimized text representing the collapsed story's content
+ let minimized_text = if result.content.source().is_empty() {
metadata.clone()
} else {
- story_text = format!("\n{story_text}");
-
utils::combine_styled_strings([metadata.clone(), StyledString::plain("... (more)")])
};
- let mut text = metadata;
- let result = parse_hn_html_text(story_text, Style::default(), 0);
- text.append(result.s);
+ let text =
+ utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
HnItem {
id: story.id,
@@ -144,17 +143,20 @@ impl From<Comment> for HnItem {
),
]);
- let mut text = utils::combine_styled_strings([metadata.clone(), StyledString::plain("\n")]);
+ // construct a minimized text representing the collapsed comment's content
let minimized_text = utils::combine_styled_strings([
- metadata,
+ metadata.clone(),
StyledString::styled(
format!("({} more)", comment.n_children + 1),
component_style.metadata,
),
]);
+ // parse the comment's content
let result = parse_hn_html_text(comment.content, Style::default(), 0);
- text.append(result.s);
+
+ let text =
+ utils::combine_styled_strings([metadata, StyledString::plain("\n"), result.content]);
HnItem {
id: comment.id,
diff --git a/hackernews_tui/src/parser/article.rs b/hackernews_tui/src/parser/article.rs
index bcf4073..7cac0ba 100644
--- a/hackernews_tui/src/parser/article.rs
+++ b/hackernews_tui/src/parser/article.rs
@@ -1,4 +1,4 @@
-use super::html::HTMLParsedResult;
+use super::html::HTMLTextParsedResult;
use super::rcdom::{Handle, NodeData, RcDom};
use crate::parser::html::HTMLTableParsedResult;
use crate::prelude::*;
@@ -40,7 +40,7 @@ impl Article {
/// # Arguments:
/// * `max_width`: the maximum width of the parsed content. This is mostly used
/// to construct a HTML table using `comfy_table`.
- pub fn parse(&self, max_width: usize) -> Result<HTMLParsedResult> {
+ pub fn parse(&self, max_width: usize) -> Result<HTMLTextParsedResult> {
debug!("parse article ({:?})", self);
// parse HTML content into DOM node(s)
@@ -90,7 +90,7 @@ impl Article {
base_link_id: usize,
mut style: Style,
mut args: ArticleParseArgs,
- ) -> (HTMLParsedResult, bool) {
+ ) -> (HTMLTextParsedResult, bool) {
// TODO: handle parsing <ol> tags correctly
debug!(
@@ -98,13 +98,13 @@ impl Article {
node, style, args
);
- let mut result = HTMLParsedResult::default();
+ let mut result = HTMLTextParsedResult::default();
let mut suffix = StyledString::new();
let mut visit_block_element_cb = || {
if !args.is_first_element_in_block {
- result.s.append_plain("\n\n");
- result.s.append_styled(&args.prefix, style);
+ result.content.append_plain("\n\n");
+ result.content.append_styled(&args.prefix, style);
}
args.is_first_element_in_block = true;
};
@@ -128,7 +128,7 @@ impl Article {
has_non_ws_text |= !text.trim().is_empty();
- result.s.append_styled(text, style);
+ result.content.append_styled(text, style);
}
NodeData::Element {
ref name,
@@ -151,7 +151,9 @@ impl Article {
style = style.combine(component_style.header);
}
expanded_name!(html "br") => {
- result.s.append_styled(format!("\n{}", args.prefix), style);
+ result
+ .content
+ .append_styled(format!("\n{}", args.prefix), style);
}
expanded_name!(html "p") => visit_block_element_cb(),
expanded_name!(html "code") => {
@@ -169,7 +171,7 @@ impl Article {
style = style.combine(component_style.multiline_code_block);
- result.s.append_styled(" ", style);
+ result.content.append_styled(" ", style);
}
expanded_name!(html "blockquote") => {
visit_block_element_cb();
@@ -177,7 +179,7 @@ impl Article {
args.prefix = format!("{}▎ ", args.prefix);
style = style.combine(component_style.quote);
- result.s.append_styled("▎ ", style);
+ result.content.append_styled("▎ ", style);
}
expanded_name!(html "table") => {
let mut table_result = HTMLTableParsedResult::default();
@@ -211,7 +213,7 @@ impl Article {
table.add_row(row.into_iter().map(|c| c.source().to_owned()));
}
- result.s.append_styled(format!("\n\n{table}"), style);
+ result.content.append_styled(format!("\n\n{table}"), style);
return (result, true);
}
@@ -225,7 +227,7 @@ impl Article {
args.is_first_element_in_block = true;
result
- .s
+ .content
.append_styled(format!("\n{}• ", args.prefix), style);
}
expanded_name!(html "img") => {
@@ -240,10 +242,12 @@ impl Article {
};
if !args.is_first_element_in_block {
- result.s.append_plain("\n\n");
+ result.content.append_plain("\n\n");
}
- result.s.append_styled(&img_desc, style);
- result.s.append_styled(" (image)", component_style.metadata);
+ result.content.append_styled(&img_desc, style);
+ result
+ .content
+ .append_styled(" (image)", component_style.metadata);
}
expanded_name!(html "a") => {
// find `href` attribute of an <a> tag
@@ -291,7 +295,7 @@ impl Article {
}
});
- result.s.append(suffix);
+ result.content.append(suffix);
(result, has_non_ws_text)
}
@@ -331,7 +335,7 @@ impl Article {
);
result.links.append(&mut child_result.links);
- s.append(child_result.s);
+ s.append(child_result.content);
});
if !is_header {
diff --git a/hackernews_tui/src/parser/html.rs b/hackernews_tui/src/parser/html.rs
index a64a538..2b62f66 100644
--- a/hackernews_tui/src/parser/html.rs
+++ b/hackernews_tui/src/parser/html.rs
@@ -4,36 +4,36 @@ use once_cell::sync::Lazy;
use regex::Regex;
/// A regex to parse a HN text (in HTML).
-/// It consists of multiple regex(s) representing different elements.
+/// It consists of multiple regexes representing different components.
static HN_TEXT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(&format!(
"(({})|({})|({})|({})|({})|({}))",
- // a regex that matches a HTML paragraph
+ // a regex matching a HTML paragraph
r"<p>(?s)(?P<paragraph>(|[^>].*?))</p>",
- // a regex that matches a paragraph quote (in markdown format)
+ // a regex matching a paragraph quote (in markdown format)
r"<p>(?s)(?P<quote>>[> ]*)(?P<text>.*?)</p>",
- // a regex that matches an HTML italic string
+ // a regex matching an HTML italic string
r"<i>(?s)(?P<italic>.*?)</i>",
- // a regex that matches a HTML code block (multiline)
+ // a regex matching a HTML code block (multiline)
r"<pre><code>(?s)(?P<multiline_code>.*?)[\n]*</code></pre>",
- // a regex that matches a single line code block (markdown format)
+ // a regex matching a single line code block (markdown format)
r"`(?P<code>[^`]+?)`",
- // a regex that matches a HTML link
+ // a regex matching a HTML link
r#"<a\s+?href="(?P<link>.*?)"(?s).+?</a>"#,
))
.unwrap()
});
-/// A HTML parsed result.
+/// Parsed result of a HTML text
#[derive(Debug, Default)]
-pub struct HTMLParsedResult {
- /// a styled string representing the decorated HTML content
- pub s: StyledString,
+pub struct HTMLTextParsedResult {
+ /// parsed HTML content
+ pub content: StyledString,
/// a list of links inside the HTML document
pub links: Vec<String>,
}
-/// A HTML table parsed result.
+/// Parsed result of a HTML table
#[derive(Debug, Default)]
pub struct HTMLTableParsedResult {
/// a list of links inside the HTML document
@@ -44,32 +44,46 @@ pub struct HTMLTableParsedResult {
pub rows: Vec<Vec<StyledString>>,
}
-impl HTMLParsedResult {
- pub fn merge(&mut self, mut other: HTMLParsedResult) {
- self.s.append(other.s);
+impl HTMLTextParsedResult {
+ /// merge two HTML parsed results
+ pub fn merge(&mut self, mut other: HTMLTextParsedResult) {
+ self.content.append(other.content);
self.links.append(&mut other.links);
}
}
/// parse a Hacker News HTML text
-pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLParsedResult {
+pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
debug!("parse hn html text: {}", text);
- let mut result = HTMLParsedResult::default();
- // an index such that `text[curr_pos..]` represents the part of the
+ // pre-process the HTML text
+ let text = {
+ // The item's text returned from HN APIs may use bare `<p>` tags as
+ // paragraph breaks. Wrap every paragraph in a `<p>`/`</p>` tag pair to make
+ // the text easier to parse.
+ if text.is_empty() {
+ text
+ } else {
+ format!("<p>{}</p>", text.replace("<p>", "</p>\n<p>"))
+ }
+ };
+
+ parse(text, style, base_link_id)
+}
+
+/// a helper function of [parse_hn_html_text] for recursively parsing HTML elements inside the text
+fn parse(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
+ let mut result = HTMLTextParsedResult::default();
+ // an index such that `text[curr_pos..]` represents the slice of the
// text that hasn't been parsed.
let mut curr_pos = 0;
- // This variable indicates whether we have parsed the first paragraph of the current text.
- // It is used to add a break between 2 consecutive paragraphs.
- let mut seen_first_paragraph = false;
-
for caps in HN_TEXT_RE.captures_iter(&text) {
// the part that doesn't match any patterns is rendered in the default style
let whole_match = caps.get(0).unwrap();
if curr_pos < whole_match.start() {
result
- .s
+ .content
.append_styled(&text[curr_pos..whole_match.start()], style);
}
curr_pos = whole_match.end();
@@ -77,71 +91,67 @@ pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HT
let component_style = &config::get_config_theme().component_style;
if let (Some(m_quote), Some(m_text)) = (caps.name("quote"), caps.name("text")) {
- if seen_first_paragraph {
- result.s.append_plain("\n");
- } else {
- seen_first_paragraph = true;
- }
-
- // render quote character `>` as indentation character
- result.s.append_styled(
+ // quoted paragraph
+ // render each quote character `>` as a `▎` indentation character
+ result.content.append_styled(
"▎"
.to_string()
.repeat(m_quote.as_str().matches('>').count()),
style,
);
- result.merge(parse_hn_html_text(
+ result.merge(parse(
m_text.as_str().to_string(),
component_style.quote.into(),
base_link_id + result.links.len(),
));
- result.s.append_plain("\n");
+ result.content.append_plain("\n");
} else if let Some(m) = caps.name("paragraph") {
- if seen_first_paragraph {
- result.s.append_plain("\n");
- } else {
- seen_first_paragraph = true;
- }
-
- result.merge(parse_hn_html_text(
+ // normal paragraph
+ result.merge(parse(
m.as_str().to_string(),
style,
base_link_id + result.links.len(),
));
- result.s.append_plain("\n");
+ result.content.append_plain("\n");
} else if let Some(m) = caps.name("link") {
+ // HTML link
result.links.push(m.as_str().to_string());
- result.s.append_styled(
+ result.content.append_styled(
utils::shorten_url(m.as_str()),
style.combine(component_style.link),
);
- result.s.append_styled(" ", style);
- result.s.append_styled(
+ result.content.append_styled(" ", style);
+ result.content.append_styled(
format!("[{}]", result.links.len() + base_link_id),
style.combine(component_style.link_id),
);
} else if let Some(m) = caps.name("multiline_code") {
- result.s.append_styled(
+ // HTML code block
+ result.content.append_styled(
m.as_str(),
style.combine(component_style.multiline_code_block),
);
- result.s.append_plain("\n");
+ result.content.append_plain("\n");
} else if let Some(m) = caps.name("code") {
+ // markdown single line code block
result
- .s
+ .content
.append_styled(m.as_str(), style.combine(component_style.single_code_block));
} else if let Some(m) = caps.name("italic") {
+ // HTML italic
result
- .s
+ .content
.append_styled(m.as_str(), style.combine(component_style.italic));
}
}
if curr_pos < text.len() {
- result.s.append_styled(&text[curr_pos..text.len()], style);
+ result
+ .content
+ .append_styled(&text[curr_pos..text.len()], style);
}
result
diff --git a/hackernews_tui/src/view/article_view.rs b/hackernews_tui/src/view/article_view.rs
index dc83c62..04064f2 100644
--- a/hackernews_tui/src/view/article_view.rs
+++ b/hackernews_tui/src/view/article_view.rs
@@ -24,7 +24,7 @@ impl ViewWrapper for ArticleView {
match self.article.parse(self.width.saturating_sub(5)) {
Ok(result) => {
- self.set_article_content(result.s);
+ self.set_article_content(result.content);
self.links = result.links;
}
Err(err) => {