publisher: Skip script, pre and textarea content when looking for HTML elements

Updates #7567
author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-06 18:19:25 +0200
committer: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-07 00:26:02 +0200
commit: 8a308944e46f8c2aa054005d5aed89f2711f9c1d (patch)
tree: 7f993ed28e021396c31d94aa87f49617ac6c4195 /publisher
parent: 7b4ade56dd50d89a91760fc5ef8e2f151874de96 (diff)
2 files changed, 60 insertions, 33 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 1823a8327..d9479aafa 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -64,7 +64,7 @@ type cssClassCollectorWriter struct {
 	buff      bytes.Buffer
 
 	isCollecting bool
-	dropValue    bool
+	inPreTag     string
 
 	inQuote    bool
 	quoteValue byte
@@ -90,49 +90,58 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 				b := p[i]
 				w.toggleIfQuote(b)
 				if !w.inQuote && b == '>' {
-					w.endCollecting(false)
+					w.endCollecting()
 					break
 				}
 				w.buff.WriteByte(b)
 			}
 
 			if !w.isCollecting {
-				if w.dropValue {
-					w.buff.Reset()
-				} else {
-					// First check if we have processed this element before.
-					w.collector.mu.RLock()
-
-					// See https://github.com/dominikh/go-tools/issues/723
-					//lint:ignore S1030 This construct avoids memory allocation for the string.
-					seen := w.collector.elementSet[string(w.buff.Bytes())]
-					w.collector.mu.RUnlock()
-					if seen {
-						w.buff.Reset()
-						continue
+				if w.inPreTag != "" {
+					s := w.buff.String()
+					if tagName, isEnd := w.parseEndTag(s); isEnd && w.inPreTag == tagName {
+						w.inPreTag = ""
 					}
+					w.buff.Reset()
+					continue
+				}
 
-					s := w.buff.String()
+				// First check if we have processed this element before.
+				w.collector.mu.RLock()
 
+				// See https://github.com/dominikh/go-tools/issues/723
+				//lint:ignore S1030 This construct avoids memory allocation for the string.
+				seen := w.collector.elementSet[string(w.buff.Bytes())]
+				w.collector.mu.RUnlock()
+				if seen {
 					w.buff.Reset()
+					continue
+				}
 
-					if strings.HasPrefix(s, "</") {
-						continue
-					}
+				s := w.buff.String()
 
-					key := s
+				w.buff.Reset()
 
-					s, tagName := w.insertStandinHTMLElement(s)
-					el := parseHTMLElement(s)
-					el.Tag = tagName
+				if strings.HasPrefix(s, "</") {
+					continue
+				}
 
-					w.collector.mu.Lock()
-					w.collector.elementSet[key] = true
-					if el.Tag != "" {
-						w.collector.elements = append(w.collector.elements, el)
-					}
-					w.collector.mu.Unlock()
+				key := s
+
+				s, tagName := w.insertStandinHTMLElement(s)
+				el := parseHTMLElement(s)
+				el.Tag = tagName
+				if w.isPreFormatted(tagName) {
+					w.inPreTag = tagName
 				}
+
+				w.collector.mu.Lock()
+				w.collector.elementSet[key] = true
+				if el.Tag != "" {
+					w.collector.elements = append(w.collector.elements, el)
+				}
+				w.collector.mu.Unlock()
+
 			}
 		}
 	}
@@ -140,6 +149,11 @@ func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) {
 	return
 }
 
+// isPreFormatted reports whether s is a tag (pre, textarea or script) whose content is not searched for HTML elements.
+func (c *cssClassCollectorWriter) isPreFormatted(s string) bool {
+	return s == "pre" || s == "textarea" || s == "script"
+}
+
 // The net/html parser does not handle single table elements as input, e.g. tbody.
 // We only care about the element/class/ids, so just store away the original tag name
 // and pretend it's a <div>.
@@ -154,15 +168,24 @@ func (c *cssClassCollectorWriter) insertStandinHTMLElement(el string) (string, s
 	return newv, strings.ToLower(tag)
 }
 
-func (c *cssClassCollectorWriter) endCollecting(drop bool) {
+func (c *cssClassCollectorWriter) parseEndTag(s string) (string, bool) {
+	if !strings.HasPrefix(s, "</") {
+		return "", false
+	}
+	s = strings.TrimPrefix(s, "</")
+	s = strings.TrimSuffix(s, ">")
+	return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+func (c *cssClassCollectorWriter) endCollecting() {
 	c.isCollecting = false
 	c.inQuote = false
-	c.dropValue = drop
+
 }
 
 func (c *cssClassCollectorWriter) startCollecting() {
 	c.isCollecting = true
-	c.dropValue = false
+
 }
 
 func (c *cssClassCollectorWriter) toggleIfQuote(b byte) {
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 2c2fd3733..5a1802234 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -89,8 +89,12 @@ func TestClassCollector(t *testing.T) {
 
 		{"Alpine transition 1", `<div x-transition:enter-start="opacity-0 transform mobile:-translate-x-8 sm:-translate-y-8">`, f("div", "mobile:-translate-x-8 opacity-0 sm:-translate-y-8 transform", "")},
 		{"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")},
-		// https://github.com/gohugoio/hugo/issues/7746
+		// Issue #7746
 		{"Apostrophe inside attribute value", `<a class="missingclass" title="Plus d'information">my text</a><div></div>`, f("a div", "missingclass", "")},
+		// Issue #7567
+		{"Script tags content should be skipped", `<script><span>foo</span><span>bar</span></script><div class="foo"></div>`, f("div script", "foo", "")},
+		{"Pre tags content should be skipped", `<pre class="preclass"><span>foo</span><span>bar</span></pre><div class="foo"></div>`, f("div pre", "foo preclass", "")},
+		{"Textare tags content should be skipped", `<textarea class="textareaclass"><span>foo</span><span>bar</span></textarea><div class="foo"></div>`, f("div textarea", "foo textareaclass", "")},
 	} {
 		c.Run(test.name, func(c *qt.C) {
 			w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>	2021-04-06 18:19:25 +0200
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>	2021-04-07 00:26:02 +0200
commit	8a308944e46f8c2aa054005d5aed89f2711f9c1d (patch)
tree	7f993ed28e021396c31d94aa87f49617ac6c4195 /publisher
parent	7b4ade56dd50d89a91760fc5ef8e2f151874de96 (diff)