summaryrefslogtreecommitdiffstats
path: root/publisher
diff options
context:
space:
mode:
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>2021-05-13 13:10:32 +0200
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>2021-05-17 17:15:32 +0200
commitef0f1a726901d6c614040cfc2d7e8f9a2ca97816 (patch)
treeae56b2ac4b307d421bfbebc3efaa83abb16e0f59 /publisher
parentabbc99d4c60b102e2779e4362ceb433095719384 (diff)
publisher: Make the HTML element collector more robust
Fixes #8530
Diffstat (limited to 'publisher')
-rw-r--r--publisher/htmlElementsCollector.go379
-rw-r--r--publisher/htmlElementsCollector_test.go38
2 files changed, 264 insertions, 153 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 9dc28c4c2..1bc1a09bc 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,12 +19,51 @@ import (
"sort"
"strings"
"sync"
+ "unicode"
+ "unicode/utf8"
"golang.org/x/net/html"
"github.com/gohugoio/hugo/helpers"
)
+const eof = -1
+
+var (
+ htmlJsonFixer = strings.NewReplacer(", ", "\n")
+ jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
+ classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
+
+ skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
+ skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
+ endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
+
+ exceptionList = map[string]bool{
+ "thead": true,
+ "tbody": true,
+ "tfoot": true,
+ "td": true,
+ "tr": true,
+ }
+)
+
+func newHTMLElementsCollector() *htmlElementsCollector {
+ return &htmlElementsCollector{
+ elementSet: make(map[string]bool),
+ }
+}
+
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+ w := &htmlElementsCollectorWriter{
+ collector: collector,
+ state: htmlLexStart,
+ }
+
+ w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
+
+ return w
+}
+
// HTMLElements holds lists of tags and attribute values for classes and id.
type HTMLElements struct {
Tags []string `json:"tags"`
@@ -48,6 +87,12 @@ func (h *HTMLElements) Sort() {
sort.Strings(h.IDs)
}
+type htmlElement struct {
+ Tag string
+ Classes []string
+ IDs []string
+}
+
type htmlElementsCollector struct {
// Contains the raw HTML string. We will get the same element
// several times, and want to avoid costly reparsing when this
@@ -59,12 +104,6 @@ type htmlElementsCollector struct {
mu sync.RWMutex
}
-func newHTMLElementsCollector() *htmlElementsCollector {
- return &htmlElementsCollector{
- elementSet: make(map[string]bool),
- }
-}
-
func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
var (
classes []string
@@ -93,114 +132,118 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
type htmlElementsCollectorWriter struct {
collector *htmlElementsCollector
- buff bytes.Buffer
- isCollecting bool
- inPreTag string
+ r rune // Current rune
+ width int // The width in bytes of r
+ input []byte // The current slice written to Write
+ pos int // The current position in input
- inQuote bool
- quoteValue byte
-}
+ err error
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
- return &htmlElementsCollectorWriter{
- collector: collector,
- }
+ inQuote rune
+
+ buff bytes.Buffer
+
+ // Current state
+ state htmlCollectorStateFunc
+
+ // Precompiled state funcs
+ defaultLexElementInside htmlCollectorStateFunc
}
-// Write splits the incoming stream into single html element.
+// Write collects HTML elements from p.
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p)
- i := 0
-
- for i < len(p) {
- // If we are not collecting, cycle through byte stream until start bracket "<" is found.
- if !w.isCollecting {
- for ; i < len(p); i++ {
- b := p[i]
- if b == '<' {
- w.startCollecting()
- break
- }
- }
+ w.input = p
+ w.pos = 0
+
+ for {
+ w.r = w.next()
+ if w.r == eof {
+ return
}
+ w.state = w.state(w)
+ }
+}
- if w.isCollecting {
- // If we are collecting, cycle through byte stream until end bracket ">" is found,
- // disregard any ">" if within a quote,
- // write bytes until found to buffer.
- for ; i < len(p); i++ {
- b := p[i]
- w.toggleIfQuote(b)
- w.buff.WriteByte(b)
-
- if !w.inQuote && b == '>' {
- w.endCollecting()
- break
- }
- }
+func (l *htmlElementsCollectorWriter) backup() {
+ l.pos -= l.width
+ l.r, _ = utf8.DecodeRune(l.input[l.pos:])
+}
+
+func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+ var s htmlCollectorStateFunc
+ s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ w.buff.WriteRune(w.r)
+ if condition() {
+ w.buff.Reset()
+ return resolve
}
+ return s
+ }
+ return s
+}
- // If no end bracket ">" is found while collecting, but the stream ended
- // this could mean we received chunks of a stream from e.g. the minify functionality
- // next if loop will be skipped.
+func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+ var s htmlCollectorStateFunc
+ s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ if condition(w.r) {
+ return resolve
+ }
+ return s
+ }
+ return s
+}
- // At this point we have collected an element line between angle brackets "<" and ">".
- if !w.isCollecting {
- if w.buff.Len() == 0 {
- continue
+// Starts with e.g. "<body " or "<div"
+func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
+ var s htmlCollectorStateFunc
+ s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ w.buff.WriteRune(w.r)
+
+ // Skip any text inside a quote.
+ if w.r == '\'' || w.r == '"' {
+ if w.inQuote == w.r {
+ w.inQuote = 0
+ } else if w.inQuote == 0 {
+ w.inQuote = w.r
}
+ }
- if w.inPreTag != "" { // within preformatted code block
- s := w.buff.String()
- w.buff.Reset()
- if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
- w.inPreTag = ""
- }
- continue
- }
+ if w.inQuote != 0 {
+ return s
+ }
- // First check if we have processed this element before.
- w.collector.mu.RLock()
+ if w.r == '>' {
// Work with the bytes slice as long as it's practical,
// to save memory allocations.
b := w.buff.Bytes()
- // See https://github.com/dominikh/go-tools/issues/723
- //lint:ignore S1030 This construct avoids memory allocation for the string.
+ defer func() {
+ w.buff.Reset()
+ }()
+
+ // First check if we have processed this element before.
+ w.collector.mu.RLock()
+
seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock()
if seen {
- w.buff.Reset()
- continue
- }
-
- // Filter out unwanted tags
- // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
- // comments and doctype tags
- // end tags.
- switch {
- case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
- w.buff.Reset()
- continue
- case bytes.HasPrefix(b, []byte("</")): // end tag
- w.buff.Reset()
- continue
+ return resolve
}
s := w.buff.String()
- w.buff.Reset()
- // Check if a preformatted code block started.
- if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
- w.inPreTag = tagName
+ if s == "" {
+ return resolve
}
// Parse each collected element.
el, err := parseHTMLElement(s)
if err != nil {
- return n, err
+ w.err = err
+ return resolve
}
// Write this tag to the element set.
@@ -208,109 +251,137 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
w.collector.elementSet[s] = true
w.collector.elements = append(w.collector.elements, el)
w.collector.mu.Unlock()
+
+ return resolve
+
}
+
+ return s
}
- return
+ return s
}
-func (c *htmlElementsCollectorWriter) startCollecting() {
- c.isCollecting = true
-}
+func (l *htmlElementsCollectorWriter) next() rune {
+ if l.pos >= len(l.input) {
+ l.width = 0
+ return eof
+ }
-func (c *htmlElementsCollectorWriter) endCollecting() {
- c.isCollecting = false
- c.inQuote = false
+ runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
+ l.width = runeWidth
+ l.pos += l.width
+ return runeValue
}
-func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
- if isQuote(b) {
- if c.inQuote && b == c.quoteValue {
- c.inQuote = false
- } else if !c.inQuote {
- c.inQuote = true
- c.quoteValue = b
+// returns the next state in HTML element scanner.
+type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
+
+// At "<", buffer empty.
+// Potentially starting a HTML element.
+func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ if w.r == '>' || unicode.IsSpace(w.r) {
+ if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
+ w.buff.Reset()
+ return htmlLexStart
}
- }
-}
-func isQuote(b byte) bool {
- return b == '"' || b == '\''
-}
+ tagName := w.buff.Bytes()[1:]
-func parseStartTag(s string) (string, bool) {
- s = strings.TrimPrefix(s, "<")
- s = strings.TrimSuffix(s, ">")
+ switch {
+ case skipInnerElementRe.Match(tagName):
+ // pre, script etc. We collect classes etc. on the surrounding
+ // element, but skip the inner content.
+ w.backup()
- spaceIndex := strings.Index(s, " ")
- if spaceIndex != -1 {
- s = s[:spaceIndex]
+ // tagName will be overwritten, so make a copy.
+ tagNameCopy := make([]byte, len(tagName))
+ copy(tagNameCopy, tagName)
+
+ return w.lexElementInside(
+ w.consumeBuffUntil(
+ func() bool {
+ if w.r != '>' {
+ return false
+ }
+ m := endTagRe.FindSubmatch(w.buff.Bytes())
+ if m == nil {
+ return false
+ }
+ return bytes.EqualFold(m[1], tagNameCopy)
+ },
+ htmlLexStart,
+ ))
+ case skipAllElementRe.Match(tagName):
+ // E.g. "<!DOCTYPE ..."
+ w.buff.Reset()
+ return w.consumeRuneUntil(func(r rune) bool {
+ return r == '>'
+ }, htmlLexStart)
+ default:
+ w.backup()
+ return w.defaultLexElementInside
+ }
}
- return strings.ToLower(strings.TrimSpace(s)), true
-}
+ w.buff.WriteRune(w.r)
-func parseEndTag(s string) (string, bool) {
- if !strings.HasPrefix(s, "</") {
- return "", false
+ // If it's a comment, skip to its end.
+ if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
+ w.buff.Reset()
+ return htmlLexToEndOfComment
}
- s = strings.TrimPrefix(s, "</")
- s = strings.TrimSuffix(s, ">")
-
- return strings.ToLower(strings.TrimSpace(s)), true
+ return htmlLexElementStart
}
-// No need to look inside these for HTML elements.
-func isPreFormatted(s string) bool {
- return s == "pre" || s == "textarea" || s == "script" || s == "style"
-}
+// Entry state func.
+// Looks for a opening bracket, '<'.
+func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ if w.r == '<' {
+ w.backup()
+ w.buff.Reset()
+ return htmlLexElementStart
+ }
-type htmlElement struct {
- Tag string
- Classes []string
- IDs []string
+ return htmlLexStart
}
-var (
- htmlJsonFixer = strings.NewReplacer(", ", "\n")
- jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
- classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
+// After "<!--", buff empty.
+func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
+ w.buff.WriteRune(w.r)
- exceptionList = map[string]bool{
- "thead": true,
- "tbody": true,
- "tfoot": true,
- "td": true,
- "tr": true,
+ if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
+ // Done, start looking for HTML elements again.
+ return htmlLexStart
}
-)
+
+ return htmlLexToEndOfComment
+}
func parseHTMLElement(elStr string) (el htmlElement, err error) {
- var tagBuffer string = ""
- tagName, ok := parseStartTag(elStr)
- if !ok {
- return
- }
+ tagName := parseStartTag(elStr)
+
+ el.Tag = strings.ToLower(tagName)
+ tagNameToParse := el.Tag
// The net/html parser does not handle single table elements as input, e.g. tbody.
// We only care about the element/class/ids, so just store away the original tag name
// and pretend it's a <div>.
- if exceptionList[tagName] {
- tagBuffer = tagName
+ if exceptionList[el.Tag] {
elStr = strings.Replace(elStr, tagName, "div", 1)
+ tagNameToParse = "div"
}
n, err := html.Parse(strings.NewReader(elStr))
if err != nil {
return
}
+
var walk func(*html.Node)
walk = func(n *html.Node) {
- if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
- el.Tag = n.Data
-
+ if n.Type == html.ElementNode && n.Data == tagNameToParse {
for _, a := range n.Attr {
switch {
case strings.EqualFold(a.Key, "id"):
@@ -345,10 +416,20 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
walk(n)
- // did we replaced the start tag?
- if tagBuffer != "" {
- el.Tag = tagBuffer
+ return
+}
+
+// Variants of s
+// <body class="b a">
+// <div>
+func parseStartTag(s string) string {
+ spaceIndex := strings.IndexFunc(s, func(r rune) bool {
+ return unicode.IsSpace(r)
+ })
+
+ if spaceIndex == -1 {
+ return s[1 : len(s)-1]
}
- return
+ return s[1:spaceIndex]
}
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 0c8b2b65b..2eac31f73 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -15,8 +15,12 @@ package publisher
import (
"fmt"
+ "math/rand"
"strings"
"testing"
+ "time"
+
+ "github.com/gohugoio/hugo/common/text"
"github.com/gohugoio/hugo/media"
"github.com/gohugoio/hugo/minifiers"
@@ -28,6 +32,7 @@ import (
func TestClassCollector(t *testing.T) {
c := qt.New((t))
+ rnd := rand.New(rand.NewSource(time.Now().Unix()))
f := func(tags, classes, ids string) HTMLElements {
var tagss, classess, idss []string
@@ -57,14 +62,20 @@ func TestClassCollector(t *testing.T) {
expect HTMLElements
}{
{"basic", `<body class="b a"></body>`, f("body", "a b", "")},
- {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")},
+ {"duplicates", `<div class="b a b"></div><div class="b a b"></div>x'`, f("div", "a b", "")},
{"single quote", `<body class='b a'></body>`, f("body", "a b", "")},
{"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")},
+ {"short", `<i>`, f("i", "", "")},
+ {"invalid", `< body class="b a"></body><div></div>`, f("div", "", "")},
// https://github.com/gohugoio/hugo/issues/7318
{"thead", `<table class="cl1">
<thead class="cl2"><tr class="cl3"><td class="cl4"></td></tr></thead>
<tbody class="cl5"><tr class="cl6"><td class="cl7"></td></tr></tbody>
</table>`, f("table tbody td thead tr", "cl1 cl2 cl3 cl4 cl5 cl6 cl7", "")},
+ {"thead uppercase", `<TABLE class="CL1">
+ <THEAD class="CL2"><TR class="CL3"><TD class="CL4"></TD></TR></THEAD>
+ <TBODY class="CL5"><TR class="CL6"><TD class="CL7"></TD></TR></TBODY>
+</TABLE>`, f("table tbody td thead tr", "CL1 CL2 CL3 CL4 CL5 CL6 CL7", "")},
// https://github.com/gohugoio/hugo/issues/7161
{"minified a href", `<a class="b a" href=/></a>`, f("a", "a b", "")},
{"AlpineJS bind 1", `<body>
@@ -98,6 +109,11 @@ func TestClassCollector(t *testing.T) {
{"Textarea tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
{"DOCTYPE should beskipped", `<!DOCTYPE html>`, f("", "", "")},
{"Comments should be skipped", `<!-- example comment -->`, f("", "", "")},
+ {"Comments with elements before and after", `<div></div><!-- example comment --><span><span>`, f("div span", "", "")},
+ // Issue #8530
+ {"Comment with single quote", `<!-- Hero Area Image d'accueil --><i class="foo">`, f("i", "foo", "")},
+ {"Uppercase tags", `<DIV></DIV>`, f("div", "", "")},
+ {"Predefined tags with distinct casing", `<script>if (a < b) { nothing(); }</SCRIPT><div></div>`, f("div script", "", "")},
// Issue #8417
{"Tabs inline", `<hr id="a" class="foo"><div class="bar">d</div>`, f("div hr", "bar foo", "a")},
{"Tabs on multiple rows", `<form
@@ -108,16 +124,29 @@ func TestClassCollector(t *testing.T) {
<div id="b" class="foo">d</div>`, f("div form", "foo", "a b")},
} {
- for _, minify := range []bool{false, true} {
- c.Run(fmt.Sprintf("%s--minify-%t", test.name, minify), func(c *qt.C) {
+ for _, variant := range []struct {
+ minify bool
+ stream bool
+ }{
+ {minify: false, stream: false},
+ {minify: true, stream: false},
+ {minify: false, stream: true},
+ } {
+
+ c.Run(fmt.Sprintf("%s--minify-%t--stream-%t", test.name, variant.minify, variant.stream), func(c *qt.C) {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
- if minify {
+ if variant.minify {
if skipMinifyTest[test.name] {
c.Skip("skip minify test")
}
v := viper.New()
m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
m.Minify(media.HTMLType, w, strings.NewReader(test.html))
+ } else if variant.stream {
+ chunks := text.Chunk(test.html, rnd.Intn(41)+1)
+ for _, chunk := range chunks {
+ fmt.Fprint(w, chunk)
+ }
} else {
fmt.Fprint(w, test.html)
}
@@ -126,6 +155,7 @@ func TestClassCollector(t *testing.T) {
})
}
}
+
}
func BenchmarkElementsCollectorWriter(b *testing.B) {