From a82d2700fcc772aada15d65b8f76913ca23f7404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?= Date: Sat, 4 Jan 2020 11:28:19 +0100 Subject: markup/goldmark: Make auto IDs GitHub compatible You can turn off this behaviour: ```toml [markup] [markup.goldmark] [markup.goldmark.parser] autoHeadingIDAsciiOnly = true ``` Note that the `anchorize` now adapts its behaviour depending on the default Markdown handler. Fixes #6616 --- markup/blackfriday/convert.go | 7 +- markup/converter/converter.go | 5 ++ markup/goldmark/autoid.go | 125 ++++++++++++++++++++++++++++++ markup/goldmark/autoid_test.go | 121 +++++++++++++++++++++++++++++ markup/goldmark/convert.go | 20 ++++- markup/goldmark/convert_test.go | 54 +++++++++---- markup/goldmark/goldmark_config/config.go | 4 + 7 files changed, 319 insertions(+), 17 deletions(-) create mode 100644 markup/goldmark/autoid.go create mode 100644 markup/goldmark/autoid_test.go (limited to 'markup') diff --git a/markup/blackfriday/convert.go b/markup/blackfriday/convert.go index 3df23c7ae..bbbc2b377 100644 --- a/markup/blackfriday/convert.go +++ b/markup/blackfriday/convert.go @@ -60,6 +60,10 @@ type blackfridayConverter struct { cfg converter.ProviderConfig } +func (c *blackfridayConverter) SanitizeAnchorName(s string) string { + return blackfriday.SanitizedAnchorName(s) +} + func (c *blackfridayConverter) AnchorSuffix() string { if c.bf.PlainIDAnchors { return "" @@ -204,5 +208,6 @@ var blackfridayExtensionMap = map[string]int{ } var ( - _ converter.DocumentInfo = (*blackfridayConverter)(nil) + _ converter.DocumentInfo = (*blackfridayConverter)(nil) + _ converter.AnchorNameSanitizer = (*blackfridayConverter)(nil) ) diff --git a/markup/converter/converter.go b/markup/converter/converter.go index a4585bd03..b8a5c92c1 100644 --- a/markup/converter/converter.go +++ b/markup/converter/converter.go @@ -87,6 +87,11 @@ type TableOfContentsProvider interface { TableOfContents() tableofcontents.Root } +// AnchorNameSanitizer 
tells how a converter sanitizes anchor names. +type AnchorNameSanitizer interface { + SanitizeAnchorName(s string) string +} + // Bytes holds a byte slice and implements the Result interface. type Bytes []byte diff --git a/markup/goldmark/autoid.go b/markup/goldmark/autoid.go new file mode 100644 index 000000000..6599f08d9 --- /dev/null +++ b/markup/goldmark/autoid.go @@ -0,0 +1,125 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package goldmark + +import ( + "bytes" + "strconv" + "unicode" + "unicode/utf8" + + "github.com/gohugoio/hugo/common/text" + + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/util" + + bp "github.com/gohugoio/hugo/bufferpool" +) + +func sanitizeAnchorNameString(s string, asciiOnly bool) string { + return string(sanitizeAnchorName([]byte(s), asciiOnly)) +} + +func sanitizeAnchorName(b []byte, asciiOnly bool) []byte { + return sanitizeAnchorNameWithHook(b, asciiOnly, nil) +} + +func sanitizeAnchorNameWithHook(b []byte, asciiOnly bool, hook func(buf *bytes.Buffer)) []byte { + buf := bp.GetBuffer() + + if asciiOnly { + // Strip accents first so accented letters keep their ASCII base instead of being dropped below. 
+ b = text.RemoveAccents(b) + } + + for len(b) > 0 { + r, size := utf8.DecodeRune(b) + switch { + case asciiOnly && size != 1: + case isSpace(r): + buf.WriteString("-") + case r == '-' || isAlphaNumeric(r): + buf.WriteRune(unicode.ToLower(r)) + default: + } + + b = b[size:] + } + + if hook != nil { + hook(buf) + } + + result := make([]byte, buf.Len()) + copy(result, buf.Bytes()) + + bp.PutBuffer(buf) + + return result +} + +func isAlphaNumeric(r rune) bool { + return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r) +} + +func isSpace(r rune) bool { + return r == ' ' || r == '\t' +} + +var _ parser.IDs = (*idFactory)(nil) + +type idFactory struct { + asciiOnly bool + vals map[string]struct{} +} + +func newIDFactory(asciiOnly bool) *idFactory { + return &idFactory{ + vals: make(map[string]struct{}), + asciiOnly: asciiOnly, + } +} + +func (ids *idFactory) Generate(value []byte, kind ast.NodeKind) []byte { + return sanitizeAnchorNameWithHook(value, ids.asciiOnly, func(buf *bytes.Buffer) { + if buf.Len() == 0 { + if kind == ast.KindHeading { + buf.WriteString("heading") + } else { + buf.WriteString("id") + } + } + + if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; found { + // Append a hyphen and a number, starting with 1. + buf.WriteRune('-') + pos := buf.Len() + for i := 1; ; i++ { + buf.WriteString(strconv.Itoa(i)) + if _, found := ids.vals[util.BytesToReadOnlyString(buf.Bytes())]; !found { + break + } + buf.Truncate(pos) + } + } + + ids.vals[buf.String()] = struct{}{} + + }) +} + +func (ids *idFactory) Put(value []byte) { + ids.vals[util.BytesToReadOnlyString(value)] = struct{}{} +} diff --git a/markup/goldmark/autoid_test.go b/markup/goldmark/autoid_test.go new file mode 100644 index 000000000..915c6a03c --- /dev/null +++ b/markup/goldmark/autoid_test.go @@ -0,0 +1,121 @@ +// Copyright 2019 The Hugo Authors. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package goldmark + +import ( + "strings" + "testing" + + qt "github.com/frankban/quicktest" +) + +func TestSanitizeAnchorName(t *testing.T) { + c := qt.New(t) + + // Tests generated manually on github.com + tests := ` +God is good: 神真美好 +Number 32 +Question? +1+2=3 +Special !"#$%&(parens)=?´* chars +Resumé +One-Hyphen +Multiple--Hyphens +Trailing hyphen- +Many spaces here +Forward/slash +Backward\slash +Under_score +` + + expect := ` +god-is-good-神真美好 +number-32 +question +123 +special-parens-chars +resumé +one-hyphen +multiple--hyphens +trailing-hyphen- +many---spaces--here +forwardslash +backwardslash +under_score +` + + tests, expect = strings.TrimSpace(tests), strings.TrimSpace(expect) + + testlines, expectlines := strings.Split(tests, "\n"), strings.Split(expect, "\n") + + if len(testlines) != len(expectlines) { + panic("test setup failed") + } + + for i, input := range testlines { + input := input + expect := expectlines[i] + c.Run(input, func(c *qt.C) { + b := []byte(input) + got := string(sanitizeAnchorName(b, false)) + c.Assert(got, qt.Equals, expect) + c.Assert(sanitizeAnchorNameString(input, false), qt.Equals, expect) + c.Assert(string(b), qt.Equals, input) + }) + } +} + +func TestSanitizeAnchorNameAsciiOnly(t *testing.T) { + c := qt.New(t) + + c.Assert(sanitizeAnchorNameString("god is神真美好 good", true), qt.Equals, "god-is-good") + c.Assert(sanitizeAnchorNameString("Resumé", true), qt.Equals, 
"resume") + +} + +func BenchmarkSanitizeAnchorName(b *testing.B) { + input := []byte("God is good: 神真美好") + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorName(input, false) + if len(result) != 24 { + b.Fatalf("got %d", len(result)) + + } + } +} + +func BenchmarkSanitizeAnchorNameAsciiOnly(b *testing.B) { + input := []byte("God is good: 神真美好") + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorName(input, true) + if len(result) != 12 { + b.Fatalf("got %d", len(result)) + + } + } +} + +func BenchmarkSanitizeAnchorNameString(b *testing.B) { + input := "God is good: 神真美好" + b.ResetTimer() + for i := 0; i < b.N; i++ { + result := sanitizeAnchorNameString(input, false) + if len(result) != 24 { + b.Fatalf("got %d", len(result)) + } + } +} diff --git a/markup/goldmark/convert.go b/markup/goldmark/convert.go index af204125f..7d50839e2 100644 --- a/markup/goldmark/convert.go +++ b/markup/goldmark/convert.go @@ -50,19 +50,33 @@ type provide struct { func (p provide) New(cfg converter.ProviderConfig) (converter.Provider, error) { md := newMarkdown(cfg) + return converter.NewProvider("goldmark", func(ctx converter.DocumentContext) (converter.Converter, error) { return &goldmarkConverter{ ctx: ctx, cfg: cfg, md: md, + sanitizeAnchorName: func(s string) string { + return sanitizeAnchorNameString(s, cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly) + }, }, nil }), nil } +var ( + _ converter.AnchorNameSanitizer = (*goldmarkConverter)(nil) +) + type goldmarkConverter struct { md goldmark.Markdown ctx converter.DocumentContext cfg converter.ProviderConfig + + sanitizeAnchorName func(s string) string +} + +func (c *goldmarkConverter) SanitizeAnchorName(s string) string { + return c.sanitizeAnchorName(s) } func newMarkdown(pcfg converter.ProviderConfig) goldmark.Markdown { @@ -226,7 +240,7 @@ func (c *goldmarkConverter) Convert(ctx converter.RenderContext) (result convert buf := &bufWriter{Buffer: &bytes.Buffer{}} result = buf - pctx 
:= newParserContext(ctx) + pctx := c.newParserContext(ctx) reader := text.NewReader(ctx.Src) doc := c.md.Parser().Parse( @@ -265,8 +279,8 @@ func (c *goldmarkConverter) Supports(feature identity.Identity) bool { return featureSet[feature.GetIdentity()] } -func newParserContext(rctx converter.RenderContext) *parserContext { - ctx := parser.NewContext() +func (c *goldmarkConverter) newParserContext(rctx converter.RenderContext) *parserContext { + ctx := parser.NewContext(parser.WithIDs(newIDFactory(c.cfg.MarkupConfig.Goldmark.Parser.AutoHeadingIDAsciiOnly))) ctx.Set(tocEnableKey, rctx.RenderTOC) return &parserContext{ Context: ctx, diff --git a/markup/goldmark/convert_test.go b/markup/goldmark/convert_test.go index 2a9727606..b9bf01ef5 100644 --- a/markup/goldmark/convert_test.go +++ b/markup/goldmark/convert_test.go @@ -28,6 +28,23 @@ import ( qt "github.com/frankban/quicktest" ) +func convert(c *qt.C, mconf markup_config.Config, content string) converter.Result { + + p, err := Provider.New( + converter.ProviderConfig{ + MarkupConfig: mconf, + Logger: loggers.NewErrorLogger(), + }, + ) + c.Assert(err, qt.IsNil) + conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"}) + c.Assert(err, qt.IsNil) + b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)}) + c.Assert(err, qt.IsNil) + + return b +} + func TestConvert(t *testing.T) { c := qt.New(t) @@ -92,29 +109,23 @@ description : the description for the content. +## 神真美好 + +## 神真美好 + +## 神真美好 + [^1]: And that's the footnote. 
` // Code fences content = strings.Replace(content, "§§§", "```", -1) - mconf := markup_config.Default mconf.Highlight.NoClasses = false mconf.Goldmark.Renderer.Unsafe = true - p, err := Provider.New( - converter.ProviderConfig{ - MarkupConfig: mconf, - Logger: loggers.NewErrorLogger(), - }, - ) - c.Assert(err, qt.IsNil) - conv, err := p.New(converter.DocumentContext{DocumentID: "thedoc"}) - c.Assert(err, qt.IsNil) - b, err := conv.Convert(converter.RenderContext{RenderTOC: true, Src: []byte(content)}) - c.Assert(err, qt.IsNil) - + b := convert(c, mconf, content) got := string(b.Bytes()) // Links @@ -123,6 +134,9 @@ description // Header IDs c.Assert(got, qt.Contains, `

Custom ID

`, qt.Commentf(got)) c.Assert(got, qt.Contains, `

Auto ID

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) + c.Assert(got, qt.Contains, `

神真美好

`, qt.Commentf(got)) // Code fences c.Assert(got, qt.Contains, "
LINE1\n
") @@ -148,6 +162,20 @@ description } +func TestConvertAutoIDAsciiOnly(t *testing.T) { + c := qt.New(t) + + content := ` +## God is Good: 神真美好 +` + mconf := markup_config.Default + mconf.Goldmark.Parser.AutoHeadingIDAsciiOnly = true + b := convert(c, mconf, content) + got := string(b.Bytes()) + + c.Assert(got, qt.Contains, "

") +} + func TestCodeFence(t *testing.T) { c := qt.New(t) diff --git a/markup/goldmark/goldmark_config/config.go b/markup/goldmark/goldmark_config/config.go index bf18a384d..2454eb46f 100644 --- a/markup/goldmark/goldmark_config/config.go +++ b/markup/goldmark/goldmark_config/config.go @@ -69,6 +69,10 @@ type Parser struct { // auto generated heading ids. AutoHeadingID bool + // When AutoHeadingID is enabled this will generate IDs with Ascii + // characters only. + AutoHeadingIDAsciiOnly bool + // Enables custom attributes. Attribute bool } -- cgit v1.2.3