summary | refs | log | tree | commit | diff | stats
path: root/publisher
diff options
context:
space:
mode:
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-20 16:50:03 +0200
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2021-04-20 17:24:17 +0200
commit ef34dd8f0e94e52ba6f1d5d607e4ac3ae98a7abb (patch)
tree 6cccbecf2bf0a899bb61c87eb52e981198355ad9 /publisher
parent bc80022e033a5462d1a9ce541f40a050994011cc (diff)
publisher: Some performance tweaks for the HTML elements collector
Diffstat (limited to 'publisher')
-rw-r--r-- publisher/htmlElementsCollector.go 80
-rw-r--r-- publisher/htmlElementsCollector_test.go 70
2 files changed, 49 insertions, 101 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index 9f4be1ff5..13387a7ee 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -108,13 +108,13 @@ func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlEleme
}
}
-// Write splits the incoming stream into single html element and writes these into elementSet
+// Write splits the incoming stream into single html element.
func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
n = len(p)
i := 0
for i < len(p) {
- // if is not collecting, cycle through byte stream until start bracket "<" is found
+ // If we are not collecting, cycle through byte stream until start bracket "<" is found.
if !w.isCollecting {
for ; i < len(p); i++ {
b := p[i]
@@ -126,9 +126,9 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
}
if w.isCollecting {
- // if is collecting, cycle through byte stream until end bracket ">" is found
- // disregard any ">" if within a quote
- // write bytes until found to buffer
+ // If we are collecting, cycle through byte stream until end bracket ">" is found,
+ // disregard any ">" if within a quote,
+ // write bytes until found to buffer.
for ; i < len(p); i++ {
b := p[i]
w.toggleIfQuote(b)
@@ -141,54 +141,69 @@ func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
}
}
- // if no end bracket ">" is found while collecting, but the stream ended
+ // If no end bracket ">" is found while collecting, but the stream ended
// this could mean we received chunks of a stream from e.g. the minify functionality
- // next if loop will be skipped
+ // next if loop will be skipped.
- // at this point we have collected an element line between angle brackets "<" and ">"
+ // At this point we have collected an element line between angle brackets "<" and ">".
if !w.isCollecting {
- s := w.buff.String()
- w.buff.Reset()
-
- // filter out unwanted tags
- // empty string, just in case
- // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
- // comments and doctype tags
- // end tags
- switch {
- case s == "": // empty string
+ if w.buff.Len() == 0 {
continue
- case w.inPreTag != "": // within preformatted code block
+ }
+
+ if w.inPreTag != "" { // within preformatted code block
+ s := w.buff.String()
+ w.buff.Reset()
if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
w.inPreTag = ""
}
continue
- case strings.HasPrefix(s, "<!"): // comment or doctype tag
- continue
- case strings.HasPrefix(s, "</"): // end tag
- continue
}
- // check if we have processed this element before.
+ // First check if we have processed this element before.
w.collector.mu.RLock()
- seen := w.collector.elementSet[s]
+
+ // Work with the bytes slice as long as it's practical,
+ // to save memory allocations.
+ b := w.buff.Bytes()
+
+ // See https://github.com/dominikh/go-tools/issues/723
+ //lint:ignore S1030 This construct avoids memory allocation for the string.
+ seen := w.collector.elementSet[string(b)]
w.collector.mu.RUnlock()
if seen {
+ w.buff.Reset()
continue
}
- // check if a preformatted code block started
+ // Filter out unwanted tags
+ // if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+ // comments and doctype tags
+ // end tags.
+ switch {
+ case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+ w.buff.Reset()
+ continue
+ case bytes.HasPrefix(b, []byte("</")): // end tag
+ w.buff.Reset()
+ continue
+ }
+
+ s := w.buff.String()
+ w.buff.Reset()
+
+ // Check if a preformatted code block started.
if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
w.inPreTag = tagName
}
- // parse each collected element
+ // Parse each collected element.
el, err := parseHTMLElement(s)
if err != nil {
return n, err
}
- // write this tag to the element set
+ // Write this tag to the element set.
w.collector.mu.Lock()
w.collector.elementSet[s] = true
w.collector.elements = append(w.collector.elements, el)
@@ -265,17 +280,18 @@ var (
htmlJsonFixer = strings.NewReplacer(", ", "\n")
jsonAttrRe = regexp.MustCompile(`'?(.*?)'?:.*`)
classAttrRe = regexp.MustCompile(`(?i)^class$|transition`)
-)
-func parseHTMLElement(elStr string) (el htmlElement, err error) {
- var tagBuffer string = ""
- exceptionList := map[string]bool{
+ exceptionList = map[string]bool{
"thead": true,
"tbody": true,
"tfoot": true,
"td": true,
"tr": true,
}
+)
+
+func parseHTMLElement(elStr string) (el htmlElement, err error) {
+ var tagBuffer string = ""
tagName, ok := parseStartTag(elStr)
if !ok {
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 1ada27c18..0c8b2b65b 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -14,7 +14,6 @@
package publisher
import (
- "bytes"
"fmt"
"strings"
"testing"
@@ -129,33 +128,8 @@ func TestClassCollector(t *testing.T) {
}
}
-func BenchmarkClassCollectorWriter(b *testing.B) {
+func BenchmarkElementsCollectorWriter(b *testing.B) {
const benchHTML = `
-<html>
-<body id="i1" class="a b c d">
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a id="i2" class="c d e f"></a>
-<a id="i3" class="c d e"></a>
-<a class="c d e"></a>
-<br>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-<a class="c d e"></a>
-</body>
-</html>
-`
- for i := 0; i < b.N; i++ {
- w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
- fmt.Fprint(w, benchHTML)
- }
-}
-
-const benchHTML = `
<!DOCTYPE html>
<html>
<head>
@@ -207,51 +181,9 @@ const benchHTML = `
</body>
</html>
`
-
-func BenchmarkElementsCollectorWriter(b *testing.B) {
- b.ReportAllocs()
for i := 0; i < b.N; i++ {
w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
fmt.Fprint(w, benchHTML)
- }
-}
-
-func BenchmarkElementsCollectorWriterMinified(b *testing.B) {
- b.ReportAllocs()
- v := viper.New()
- m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
- var buf bytes.Buffer
- m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
- fmt.Fprint(w, buf.String())
- }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyStream(b *testing.B) {
- b.ReportAllocs()
- v := viper.New()
- m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
- b.ResetTimer()
-
- for i := 0; i < b.N; i++ {
- w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
- m.Minify(media.HTMLType, w, strings.NewReader(benchHTML))
- }
-}
-
-func BenchmarkElementsCollectorWriterWithMinifyString(b *testing.B) {
- b.ReportAllocs()
- v := viper.New()
- m, _ := minifiers.New(media.DefaultTypes, output.DefaultFormats, v)
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- var buf bytes.Buffer
- m.Minify(media.HTMLType, &buf, strings.NewReader(benchHTML))
- w := newHTMLElementsCollectorWriter(newHTMLElementsCollector())
- fmt.Fprint(w, buf.String())
}
}