hackernews_tui/src/parser/html.rs


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

use crate::prelude::*;
use crate::utils;
use once_cell::sync::Lazy;
use regex::Regex;

/// The regex used to split a HN text (in HTML) into styled components.
///
/// The final pattern is an alternation of the component patterns listed below.
/// Each component is wrapped in its own group and exposes its payload through
/// a named capture group (`paragraph`, `quote` + `text`, `italic`,
/// `multiline_code`, `code`, `link`).
static HN_TEXT_RE: Lazy<Regex> = Lazy::new(|| {
    let components = [
        // an HTML paragraph whose content is empty or doesn't start with `>`
        r"<p>(?s)(?P<paragraph>(|[^>].*?))</p>",
        // a quoted paragraph (markdown format): a run of `>`/space characters, then the text
        r"<p>(?s)(?P<quote>>[> ]*)(?P<text>.*?)</p>",
        // an HTML italic string
        r"<i>(?s)(?P<italic>.*?)</i>",
        // an HTML code block (multiline), trailing newlines are left out of the capture
        r"<pre><code>(?s)(?P<multiline_code>.*?)[\n]*</code></pre>",
        // a single line code block (markdown format)
        r"`(?P<code>[^`]+?)`",
        // an HTML link, only the `href` value is captured
        r#"<a\s+?href="(?P<link>.*?)"(?s).+?</a>"#,
    ];

    // every component gets its own group; the whole alternation sits in an outer group
    let pattern = format!("(({}))", components.join(")|("));
    Regex::new(&pattern).unwrap()
});

/// Parsed result of a HTML text
///
/// Holds the styled, ready-to-render content together with the links
/// that were found while parsing.
#[derive(Debug, Default)]
pub struct HTMLTextParsedResult {
    /// parsed HTML content, as a styled string
    pub content: StyledString,
    /// a list of links inside the HTML document, in the order they were collected
    pub links: Vec<String>,
}

/// Parsed result of a HTML table
///
/// NOTE(review): nothing in this part of the file constructs this type;
/// the field descriptions below follow the field names and types only.
#[derive(Debug, Default)]
pub struct HTMLTableParsedResult {
    /// a list of links inside the HTML document
    pub links: Vec<String>,
    /// parsed table headers, one styled string per header
    pub headers: Vec<StyledString>,
    /// parsed table rows; each inner vector presumably holds the cells of one row
    pub rows: Vec<Vec<StyledString>>,
}

impl HTMLTextParsedResult {
    /// merge two HTML parsed results
    pub fn merge(&mut self, mut other: HTMLTextParsedResult) {
        self.content.append(other.content);
        self.links.append(&mut other.links);
    }
}

/// Parse a Hacker News HTML text into a styled string and the links it contains.
///
/// `style` is the style used for text that doesn't belong to any special
/// component, and `base_link_id` offsets the ids rendered next to the links.
pub fn parse_hn_html_text(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
    debug!("parse hn html text: {}", text);

    // an empty text is handed to the parser unchanged
    if text.is_empty() {
        return parse(text, style, base_link_id);
    }

    // The item's text returned from HN APIs may have `<p>` tags representing
    // paragraph breaks. Pre-process the text by converting `<p>` tags to
    // <p></p> tag pairs, which makes the text easier to parse.
    let paragraphs = format!("<p>{}</p>", text.replace("<p>", "</p>\n<p>"));
    parse(paragraphs, style, base_link_id)
}

/// a helper function of [parse_hn_html_text] that walks through the text and recursively
/// parses the HTML elements nested inside paragraphs and quotes
///
/// `style` is applied to every part of the text that doesn't match any component pattern.
/// `base_link_id` is added to the 1-based position of each link found in this text to
/// form the `[id]` label rendered after the link.
fn parse(text: String, style: Style, base_link_id: usize) -> HTMLTextParsedResult {
    let mut parsed = HTMLTextParsedResult::default();
    // byte offset marking the start of the part of `text` that hasn't been consumed yet
    let mut cursor = 0;

    for caps in HN_TEXT_RE.captures_iter(&text) {
        // text sitting in front of the current match keeps the default style
        let matched = caps.get(0).unwrap();
        let (start, end) = (matched.start(), matched.end());
        if start > cursor {
            parsed.content.append_styled(&text[cursor..start], style);
        }
        cursor = end;

        let component_style = &config::get_config_theme().component_style;

        // quoted paragraph
        if let (Some(marker), Some(quoted)) = (caps.name("quote"), caps.name("text")) {
            // each quote character `>` is rendered as one `|`-like indentation character
            let depth = marker.as_str().matches('>').count();
            parsed.content.append_styled("▎".repeat(depth), style);

            let link_offset = base_link_id + parsed.links.len();
            let inner = parse(
                quoted.as_str().to_owned(),
                component_style.quote.into(),
                link_offset,
            );
            parsed.merge(inner);
            parsed.content.append_plain("\n");
            continue;
        }

        // normal paragraph
        if let Some(paragraph) = caps.name("paragraph") {
            let link_offset = base_link_id + parsed.links.len();
            let inner = parse(paragraph.as_str().to_owned(), style, link_offset);
            parsed.merge(inner);
            parsed.content.append_plain("\n");
            continue;
        }

        // HTML link: remember the url, then render its shortened form followed by its id
        if let Some(link) = caps.name("link") {
            let url = link.as_str();
            parsed.links.push(url.to_owned());

            parsed
                .content
                .append_styled(utils::shorten_url(url), style.combine(component_style.link));
            parsed.content.append_styled(" ", style);

            let link_id = base_link_id + parsed.links.len();
            parsed.content.append_styled(
                format!("[{}]", link_id),
                style.combine(component_style.link_id),
            );
            continue;
        }

        // HTML code block
        if let Some(block) = caps.name("multiline_code") {
            parsed.content.append_styled(
                block.as_str(),
                style.combine(component_style.multiline_code_block),
            );
            parsed.content.append_plain("\n");
            continue;
        }

        // markdown single line code block
        if let Some(code) = caps.name("code") {
            parsed.content.append_styled(
                code.as_str(),
                style.combine(component_style.single_code_block),
            );
            continue;
        }

        // HTML italic
        if let Some(italic) = caps.name("italic") {
            parsed
                .content
                .append_styled(italic.as_str(), style.combine(component_style.italic));
        }
    }

    // whatever follows the last match is rendered in the default style as well
    if cursor < text.len() {
        parsed.content.append_styled(&text[cursor..], style);
    }

    parsed
}