From da95ab11b41eec733dca5212ce16d16a15dc55d4 Mon Sep 17 00:00:00 2001
From: Jan Tojnar
Date: Mon, 12 Jul 2021 22:32:21 +0200
Subject: doc: Add helper for converting DocBook files to Markdown

---
Review changes relative to the original revision of this patch:
- escape-code-markup.py: serialize the escaped markup without its tail;
  replace_element_by_text already re-attaches it, so it used to be duplicated.
- replace-xrefs-by-empty-links.py: keep the text that follows an xref,
  drop an unused local.
- db-to-md.sh: pull in libxml2 for xmllint, quote helper paths, stop on a
  failing step, remove temporary files, write diagnostics with >&2.
- Both Python helpers: usage errors name the <input> <output> arguments
  and no longer rely on assert.
The index lines below still carry the blob ids of the original revision.

 maintainers/scripts/db-to-md.sh                    | 100 +++++++++++++++++++++
 maintainers/scripts/doc/escape-code-markup.py      | 107 ++++++++++++++++++++++
 .../scripts/doc/replace-xrefs-by-empty-links.py    |  35 +++++++
 maintainers/scripts/doc/unknown-code-language.lua  |  12 +++
 4 files changed, 254 insertions(+)
 create mode 100755 maintainers/scripts/db-to-md.sh
 create mode 100755 maintainers/scripts/doc/escape-code-markup.py
 create mode 100755 maintainers/scripts/doc/replace-xrefs-by-empty-links.py
 create mode 100644 maintainers/scripts/doc/unknown-code-language.lua

(limited to 'maintainers')

diff --git a/maintainers/scripts/db-to-md.sh b/maintainers/scripts/db-to-md.sh
new file mode 100755
index 000000000000..01357d1e2412
--- /dev/null
+++ b/maintainers/scripts/db-to-md.sh
@@ -0,0 +1,100 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=. -i bash -p pandoc -p libxml2
+
+# This script is temporarily needed while we transition the manual to
+# CommonMark. It converts DocBook files into our CommonMark flavour.
+#
+# Usage: db-to-md.sh [--debug] FILE...
+#
+# The result is written next to each input file, with the extension chosen
+# by its root element (.chapter.md or .section.md). With --debug, pandoc
+# emits its JSON AST (.json) instead and the intermediate files are kept.
+
+debug=
+files=()
+
+while [ "$#" -gt 0 ]; do
+    i="$1"; shift 1
+    case "$i" in
+        --debug)
+            debug=1
+            ;;
+        *)
+            files+=("$i")
+            ;;
+    esac
+done
+
+echo "WARNING: This is an experimental script and might not preserve all formatting." >&2
+echo "Please report any issues you discover." >&2
+
+outExtension="md"
+if [[ $debug ]]; then
+    outExtension="json"
+fi
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+# NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
+# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
+pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
+targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
+if [[ $debug ]]; then
+    targetLang=json
+fi
+pandoc_flags=(
+    # Not needed:
+    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
+    # - media extraction (was only required for diagram generator)
+    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
+    # - link-unix-man-references.lua (links should only be added to display output)
+    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
+    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
+    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
+    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
+    "--lua-filter=$DIR/doc/unknown-code-language.lua"
+    -f docbook
+    -t "$targetLang"
+    --tab-stop=2
+    --wrap=none
+)
+
+for file in "${files[@]}"; do
+    if [[ ! -f "$file" ]]; then
+        echo "db-to-md.sh: $file does not exist" >&2
+        exit 1
+    else
+        rootElement=$(xmllint --xpath 'name(//*)' "$file")
+
+        if [[ $rootElement = chapter ]]; then
+            extension=".chapter.$outExtension"
+        elif [[ $rootElement = section ]]; then
+            extension=".section.$outExtension"
+        else
+            echo "db-to-md.sh: $file contains an unsupported root element $rootElement" >&2
+            exit 1
+        fi
+
+        outFile="${file%".section.xml"}"
+        outFile="${outFile%".chapter.xml"}"
+        outFile="${outFile%".xml"}$extension"
+        # Abort when a step fails so that pandoc never converts a truncated
+        # intermediate file and reports success.
+        temp1=$(mktemp) || exit 1
+        "$DIR/doc/escape-code-markup.py" "$file" "$temp1" || exit 1
+        if [[ $debug ]]; then
+            echo "Converted $file to $temp1" >&2
+        fi
+        temp2=$(mktemp) || exit 1
+        "$DIR/doc/replace-xrefs-by-empty-links.py" "$temp1" "$temp2" || exit 1
+        if [[ $debug ]]; then
+            echo "Converted $temp1 to $temp2" >&2
+        fi
+        pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}" || exit 1
+        echo "Converted $file to $outFile" >&2
+        # The intermediate files are only worth keeping when debugging.
+        if [[ ! $debug ]]; then
+            rm -f "$temp1" "$temp2"
+        fi
+    fi
+done
diff --git a/maintainers/scripts/doc/escape-code-markup.py b/maintainers/scripts/doc/escape-code-markup.py
new file mode 100755
index 000000000000..015435b698e6
--- /dev/null
+++ b/maintainers/scripts/doc/escape-code-markup.py
@@ -0,0 +1,107 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
+
+"""
+Pandoc will strip any markup within code elements so
+let’s escape them so that they can be handled manually.
+"""
+
+import lxml.etree as ET
+import re
+import sys
+
+def replace_element_by_text(el: ET.Element, text: str) -> None:
+    """
+    Replaces the element in its parent by the given text.
+
+    The tail of the element is preserved after the inserted text;
+    lxml would otherwise drop it together with the removed element.
+    Elements without a parent are left untouched.
+
+    Author: bernulf
+    Source: https://stackoverflow.com/a/10520552/160386
+    SPDX-License-Identifier: CC-BY-SA-3.0
+    """
+    text = text + (el.tail or "")
+    parent = el.getparent()
+    if parent is not None:
+        previous = el.getprevious()
+        if previous is not None:
+            previous.tail = (previous.tail or "") + text
+        else:
+            parent.text = (parent.text or "") + text
+        parent.remove(el)
+
+DOCBOOK_NS = "http://docbook.org/ns/docbook"
+
+# List of elements that pandoc’s DocBook reader strips markup from.
+# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
+code_elements = [
+    # CodeBlock
+    "literallayout",
+    "screen",
+    "programlisting",
+    # Code (inline)
+    "classname",
+    "code",
+    "filename",
+    "envar",
+    "literal",
+    "computeroutput",
+    "prompt",
+    "parameter",
+    "option",
+    "markup",
+    "wordasword",
+    "command",
+    "varname",
+    "function",
+    "type",
+    "symbol",
+    "constant",
+    "userinput",
+    "systemitem",
+]
+
+XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
+ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')
+
+def remove_xmlns(match: re.Match) -> str:
+    """
+    Removes xmlns attributes.
+
+    Expects a match containing an opening tag.
+    """
+    return XMLNS_REGEX.sub('', match.group(0))
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        sys.exit("usage: escape-code-markup.py <input> <output>")
+
+    tree = ET.parse(sys.argv[1])
+    name_predicate = " or ".join([f"local-name()='{el}'" for el in code_elements])
+
+    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
+        # Serialize the element only. replace_element_by_text re-attaches
+        # the tail itself, so including it here as well would duplicate
+        # the text that follows the markup.
+        text = ET.tostring(markup, encoding=str, with_tail=False)
+
+        # tostring adds xmlns attributes to the element we want to stringify
+        # as if it was supposed to be usable standalone.
+        # We are just converting it to CDATA so we do not care.
+        # Let’s strip the namespace declarations to keep the code clean.
+        #
+        # Note that this removes even namespaces that were potentially
+        # in the original file. Though, that should be very rare –
+        # most of the time, we will stringify empty DocBook elements
+        # like <xref/> or <co/> or, at worst, <link> with xlink:href attribute.
+        #
+        # Also note that the regex expects the root element to be first
+        # thing in the string. But that should be fine, the tostring method
+        # does not produce XML declaration or doctype by default.
+        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)
+
+        replace_element_by_text(markup, text)
+
+    tree.write(sys.argv[2])
diff --git a/maintainers/scripts/doc/replace-xrefs-by-empty-links.py b/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
new file mode 100755
index 000000000000..2006ef897f7a
--- /dev/null
+++ b/maintainers/scripts/doc/replace-xrefs-by-empty-links.py
@@ -0,0 +1,35 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
+
+"""
+Pandoc will try to resolve xrefs and replace them with regular links.
+Let’s replace them with links with empty labels which MyST
+and our pandoc filters recognize as cross-references.
+"""
+
+import lxml.etree as ET
+import sys
+
+XLINK_NS = "http://www.w3.org/1999/xlink"
+
+ns = {
+    "db": "http://docbook.org/ns/docbook",
+}
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        sys.exit("usage: replace-xrefs-by-empty-links.py <input> <output>")
+
+    tree = ET.parse(sys.argv[1])
+    for xref in tree.findall(".//db:xref", ns):
+        parent = xref.getparent()
+        link = parent.makeelement('link')
+        target_name = xref.get("linkend")
+        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
+        # lxml detaches the tail together with the replaced element,
+        # so carry over the text that follows the xref explicitly.
+        link.tail = xref.tail
+        parent.replace(xref, link)
+
+    tree.write(sys.argv[2])
diff --git a/maintainers/scripts/doc/unknown-code-language.lua b/maintainers/scripts/doc/unknown-code-language.lua
new file mode 100644
index 000000000000..85d8df4690ba
--- /dev/null
+++ b/maintainers/scripts/doc/unknown-code-language.lua
@@ -0,0 +1,12 @@
+--[[
+Adds “unknown” class to CodeBlock AST nodes without any classes.
+
+This will cause Pandoc to use fenced code block, which we prefer.
+]]
+
+function CodeBlock(elem)
+  if #elem.classes == 0 then
+    elem.classes:insert('unknown')
+    return elem
+  end
+end
--
cgit v1.2.3