summary refs log tree commit diff stats
path: root/publisher
diff options
context:
space:
mode:
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-06 18:19:25 +0200
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-07 00:26:02 +0200
commit 8a308944e46f8c2aa054005d5aed89f2711f9c1d (patch)
tree 7f993ed28e021396c31d94aa87f49617ac6c4195 /publisher
parent 7b4ade56dd50d89a91760fc5ef8e2f151874de96 (diff)
publisher: Skip script, pre and textarea content when looking for HTML elements
Updates #7567
Diffstat (limited to 'publisher')
-rw-r--r-- publisher/htmlElementsCollector.go 87
-rw-r--r-- publisher/htmlElementsCollector_test.go 6
2 files changed, 60 insertions, 33 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 1823a8327..d9479aafa 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
buff bytes.Buffer
isCollecting bool
- dropValue bool
+ inPreTag string
inQuote bool
quoteValue byte
@@ -90,49 +90,58 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
b := p[i]
w.toggleIfQuote(b)
if !w.inQuote && b == '>' {
- w.endCollecting(false)
+ w.endCollecting()
break
}
w.buff.WriteByte(b)
}
if !w.isCollecting {
- if w.dropValue {
- w.buff.Reset()
- } else {
- // First check if we have processed this element before.
- w.collector.mu.RLock()
-
- // See https://github.com/dominikh/go-tools/issues/723
- //lint:ignore S1030 This construct avoids memory allocation for the string.
- seen := w.collector.elementSet[string(w.buff.Bytes())]
- w.collector.mu.RUnlock()
- if seen {
- w.buff.Reset()
- continue
+ if w.inPreTag != "" {
+ s := w.buff.String()
+ if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
+ w.inPreTag = ""
}
+ w.buff.Reset()
+ continue
+ }
- s := w.buff.String()
+ // First check if we have processed this element before.
+ w.collector.mu.RLock()
+ // See https://github.com/dominikh/go-tools/issues/723
+ //lint:ignore S1030 This construct avoids memory allocation for the string.
+ seen := w.collector.elementSet[string(w.buff.Bytes())]
+ w.collector.mu.RUnlock()
+ if seen {
w.buff.Reset()
+ continue
+ }
- if strings.HasPrefix(s, "</") {
- continue
- }
+ s := w.buff.String()
- key := s
+ w.buff.Reset()
- s, tagName := w.insertStandinHTMLElement(s)
- el := parseHTMLElement(s)
- el.Tag = tagName
+ if strings.HasPrefix(s, "</") {
+ continue
+ }
- w.collector.mu.Lock()
- w.collector.elementSet[key] = true
- if el.Tag != "" {
- w.collector.elements = append(w.collector.elements, el)
- }
- w.collector.mu.Unlock()
+ key := s
+
+ s, tagName := w.insertStandinHTMLElement(s)
+ el := parseHTMLElement(s)
+ el.Tag = tagName
+ if w.isPreFormatted(tagName) {
+ w.inPreTag = tagName
}
+
+ w.collector.mu.Lock()
+ w.collector.elementSet[key] = true
+ if el.Tag != "" {
+ w.collector.elements = append(w.collector.elements, el)
+ }
+ w.collector.mu.Unlock()
+
}
}
}
@@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
return
}
+// No need to look inside these for HTML elements.
+func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
+ return s == "pre" || s == "textarea" || s == "script"
+}
+
// The net/html parser does not handle single table elements as input, e.g. tbody.
// We only care about the element/class/ids, so just store away the original tag name
// and pretend it's a <div>.
@@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
return newv, strings.ToLower(tag)
}
-func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
+ if !strings.HasPrefix(s, "</") {
+ return "", false
+ }
+ s = strings.TrimPrefix(s, "</")
+ s = strings.TrimSuffix(s, ">")
+ return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+func (c *cssClassCollectorWriter) endCollecting() {
c.isCollecting = false
c.inQuote = false
- c.dropValue = drop
+
}
func (c *cssClassCollectorWriter) startCollecting() {
c.isCollecting = true
- c.dropValue = false
+
}
func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 2c2fd3733..5a1802234 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -89,8 +89,12 @@ func TestClassCollector(t *testing.T) {
{"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
- // https://github.com/gohugoio/hugo/issues/7746
+ // Issue #7746
{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
+ // Issue #7567
+ {"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+ {"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
+ {"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
} {
c.Run(test.name, func(c *qt.C) {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())