diff options
author | Jan-Erik Rediger <janerik@fnordig.de> | 2022-01-25 20:57:43 +0100 |
---|---|---|
committer | Jan-Erik Rediger <janerik@fnordig.de> | 2022-01-25 21:04:50 +0100 |
commit | adde0c8cfb1ef3199649562ae0d6cc21914befda (patch) | |
tree | 7e5407331ef7c1896969965dd8eaafaa53ccc81f | |
parent | e759070dc8c8d04f452a1c52cc22fd884b6ab0a0 (diff) |
Avoid roundtripping through pulldown-cmark
Roundtripping markdown is actually quite hard.
We don't actually require that.
All we need is to parse the markdown once to find the right marker and
the headings.
We then generate the TOC markdown manually, and all other content can be
copied over unparsed.
-rw-r--r-- | Cargo.lock | 16 | ||||
-rw-r--r-- | Cargo.toml | 5 | ||||
-rw-r--r-- | src/lib.rs | 75 | ||||
-rw-r--r-- | tests/adds_toc.out.md | 1 | ||||
-rw-r--r-- | tests/backslash_escapes.out.md | 8 | ||||
-rw-r--r-- | tests/handles_inline_code.in.md | 2 | ||||
-rw-r--r-- | tests/it.rs | 4 | ||||
-rw-r--r-- | tests/multi_header_linear.out.md | 5 | ||||
-rw-r--r-- | tests/tables_untouched.out.md | 7 | ||||
-rw-r--r-- | tests/tables_with_html.out.md | 6 |
10 files changed, 57 insertions, 72 deletions
@@ -845,8 +845,7 @@ dependencies = [ "log", "mdbook", "pretty_assertions", - "pulldown-cmark 0.8.0", - "pulldown-cmark-to-cmark", + "pulldown-cmark 0.9.1", "serde_json", "toml", ] @@ -1205,9 +1204,9 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffade02495f22453cd593159ea2f59827aae7f53fa8323f756799b670881dcf8" +checksum = "34f197a544b0c9ab3ae46c359a7ec9cbbb5c7bf97054266fecb7ead794a181d6" dependencies = [ "bitflags", "getopts", @@ -1216,15 +1215,6 @@ dependencies = [ ] [[package]] -name = "pulldown-cmark-to-cmark" -version = "6.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95048382115a9da7be92ad51c84064d585b7da17472dcaa7f5eed8853c4c3707" -dependencies = [ - "pulldown-cmark 0.8.0", -] - -[[package]] name = "quick-error" version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -10,9 +10,7 @@ edition = "2018" [dependencies] mdbook = "0.4.10" -pulldown-cmark = "0.8.0" -pulldown-cmark-to-cmark = "6.0.2" -env_logger = "0.8.4" +pulldown-cmark = "0.9.1" log = "0.4.11" clap = "2.33.3" serde_json = "1.0.57" @@ -20,3 +18,4 @@ toml = "0.5.6" [dev-dependencies] pretty_assertions = "0.6.1" +env_logger = "0.8.4" @@ -8,7 +8,6 @@ use mdbook::errors::{Error, Result}; use mdbook::preprocess::{Preprocessor, PreprocessorContext}; use pulldown_cmark::Tag::*; use pulldown_cmark::{Event, Options, Parser}; -use pulldown_cmark_to_cmark::{cmark_with_options, Options as COptions}; use toml::value::Table; pub struct Toc; @@ -110,6 +109,7 @@ fn build_toc(toc: &[(u32, String, String)]) -> String { let mut toc_iter = toc.iter().peekable(); // Start from the level of the first header. 
+ let min_level = toc.iter().map(|(lvl, _, _)| *lvl).min().unwrap_or(1); let mut last_lower = match toc_iter.peek() { Some((lvl, _, _)) => *lvl, None => 0, @@ -127,7 +127,7 @@ fn build_toc(toc: &[(u32, String, String)]) -> String { }); for (level, name, slug) in toc { - let width = 2 * (level - 1) as usize; + let width = 2 * (level - min_level) as usize; writeln!(result, "{1:0$}* [{2}](#{3})", width, "", name, slug).unwrap(); } @@ -135,7 +135,6 @@ fn build_toc(toc: &[(u32, String, String)]) -> String { } fn add_toc(content: &str, cfg: &Config) -> Result<String> { - let mut buf = String::with_capacity(content.len()); let mut toc_found = false; let mut toc_content = vec![]; @@ -150,40 +149,41 @@ fn add_toc(content: &str, cfg: &Config) -> Result<String> { opts.insert(Options::ENABLE_TASKLISTS); let mark: Vec<Event> = Parser::new(&cfg.marker).collect(); - let mut mark_start = -1; + let mut mark_start = None; + let mut mark_end = 0..0; let mut mark_loc = 0; - let mut c = -1; - for e in Parser::new_ext(&content, opts) { - c += 1; - log::trace!("Event: {:?}", e); + for (e, span) in Parser::new_ext(&content, opts).into_offset_iter() { + log::trace!("Event: {:?} (span: {:?})", e, span); if !toc_found { log::trace!( - "TOC not found yet. Location: {}, Start: {}", + "TOC not found yet. 
Location: {}, Start: {:?}", mark_loc, mark_start ); if e == mark[mark_loc] { - if mark_start == -1 { - mark_start = c; + if mark_start.is_none() { + mark_start = Some(span.clone()); } mark_loc += 1; if mark_loc >= mark.len() { + mark_end = span; toc_found = true } } else if mark_loc > 0 { mark_loc = 0; - mark_start = -1; + mark_start = None; } else { continue; } } - if let Event::Start(Heading(lvl)) = e { - current_header_level = Some(lvl); + if let Event::Start(Heading(lvl, fragment, classes)) = e { + log::trace!("Header(lvl={lvl}, fragment={fragment:?}, classes={classes:?})"); + current_header_level = Some(lvl as u32); continue; } - if let Event::End(Heading(_)) = e { + if let Event::End(Heading(..)) = e { // Skip if this header is nested too deeply. if let Some(level) = current_header_level.take() { let header = current_header.clone(); @@ -219,29 +219,30 @@ fn add_toc(content: &str, cfg: &Config) -> Result<String> { let toc = build_toc(&toc_content); log::trace!("Built TOC: {:?}", toc); - let toc_events = Parser::new(&toc).collect::<Vec<_>>(); - - let mut c = -1; - let events = Parser::new_ext(&content, opts) - .map(|e| { - c += 1; - if toc_found && c > mark_start && c < mark_start + (mark.len() as i32) { - vec![] - } else if toc_found && c == mark_start { - toc_events.clone() - } else { - vec![e] - } - }) - .flatten(); - - let opts = COptions { - newlines_after_codeblock: 1, - ..Default::default() + log::trace!("toc_found={toc_found} mark_start={mark_start:?} mark_end={mark_end:?}"); + + let content = if toc_found { + let mark_start = mark_start.unwrap(); + let content_before_toc = &content[0..mark_start.start]; + let content_after_toc = &content[mark_end.end..]; + log::trace!("content_before_toc={:?}", content_before_toc); + log::trace!("content_after_toc={:?}", content_after_toc); + // Multiline markers might have consumed trailing newlines, + // we ensure there's always one before the content. 
+ let extra = if content_after_toc.as_bytes()[0] == b'\n' { + "" + } else { + "\n" + }; + format!( + "{}{}{}{}", + content_before_toc, toc, extra, content_after_toc + ) + } else { + content.to_string() }; - cmark_with_options(events, &mut buf, None, opts) - .map(|_| buf) - .map_err(|err| Error::msg(format!("Markdown serialization failed: {}", err))) + + Ok(content) } impl Toc { diff --git a/tests/adds_toc.out.md b/tests/adds_toc.out.md index d23da60..a0f5312 100644 --- a/tests/adds_toc.out.md +++ b/tests/adds_toc.out.md @@ -18,3 +18,4 @@ ## Header 2.2 ### Header 2.2.1 + diff --git a/tests/backslash_escapes.out.md b/tests/backslash_escapes.out.md index 6be9dec..55fd035 100644 --- a/tests/backslash_escapes.out.md +++ b/tests/backslash_escapes.out.md @@ -1,9 +1,9 @@ -\*not emphasized\* +\*not emphasized* \<br/> not a tag -\[not a link\](/foo) -\`not code\` +\[not a link](/foo) +\`not code` \* not a list \# not a heading -\[foo\]: /url "not a reference" +\[foo]: /url "not a reference" \ö not a character entity 1\. not a list diff --git a/tests/handles_inline_code.in.md b/tests/handles_inline_code.in.md index bf49fa2..bd3181f 100644 --- a/tests/handles_inline_code.in.md +++ b/tests/handles_inline_code.in.md @@ -13,5 +13,3 @@ ##### Header 1.1.1.1.1 # Another header `with inline` code - - diff --git a/tests/it.rs b/tests/it.rs index b0d7e60..474b2dc 100644 --- a/tests/it.rs +++ b/tests/it.rs @@ -58,7 +58,7 @@ macro_rules! assert_toc { let chapter = Chapter::from_content(content); let result = Toc::add_toc(&chapter, &config); match result { - Ok(result) => assert_eq!(expected.trim_end(), result), + Ok(result) => assert_eq!(expected, result), Err(e) => panic!("{} failed. 
Error: {}", $name, e), } }; @@ -114,7 +114,7 @@ fn unique_slugs() { #[test] fn add_toc_with_github_marker() { - let marker = "* auto-gen TOC:\n{:toc}".to_owned(); + let marker = "* auto-gen TOC:\n{:toc}\n".to_owned(); assert_toc!("github_marker", with_marker(marker)); } diff --git a/tests/multi_header_linear.out.md b/tests/multi_header_linear.out.md index d5afac4..219ff6f 100644 --- a/tests/multi_header_linear.out.md +++ b/tests/multi_header_linear.out.md @@ -7,14 +7,9 @@ * [Level 1.2.1](#level-121) ## Level 1.1 - ### Level 1.1.1 - ### Level 1.1.2 - ## Level 1.2 - ### Level 1.2.1 text - diff --git a/tests/tables_untouched.out.md b/tests/tables_untouched.out.md index 7207799..1d668ed 100644 --- a/tests/tables_untouched.out.md +++ b/tests/tables_untouched.out.md @@ -1,5 +1,6 @@ # Heading -|Head 1|Head 2| -|------|------| -|Row 1|Row 2| +| Head 1 | Head 2 | +|--------|--------| +| Row 1 | Row 2 | + diff --git a/tests/tables_with_html.out.md b/tests/tables_with_html.out.md index 97e5b03..9f13052 100644 --- a/tests/tables_with_html.out.md +++ b/tests/tables_with_html.out.md @@ -1,5 +1,5 @@ # Heading -|Head 1|Head 2| -|------|------| -|<span>Row 1</span>|Row 2| +| Head 1 | Head 2 | +|--------|--------| +| <span>Row 1</span> | Row 2 | |