summary | refs | log | tree | commit | diff | stats
path: root/publisher
diff options
context:
space:
mode:
author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>, 2023-02-05 15:14:30 +0100
committer: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>, 2023-02-05 20:01:39 +0100
commit: f9fc0e045bc1f72ba61fdf4a79b10a75a240394e (patch)
tree: d2c622aa89ff0aa20acfaf53e684833255671864 /publisher
parent: 4f4a1c00bfdc385c5afda9dcc1f259b1f9956991 (diff)
Fix slow HTML elements collector for the pre case
```
name                           old time/op    new time/op    delta
ElementsCollectorWriterPre-10    25.2µs ± 1%     3.4µs ± 0%  -86.54%  (p=0.029 n=4+4)

name                           old alloc/op   new alloc/op   delta
ElementsCollectorWriterPre-10      624B ± 0%      142B ± 0%  -77.18%  (p=0.029 n=4+4)

name                           old allocs/op  new allocs/op  delta
ElementsCollectorWriterPre-10      16.0 ± 0%       6.0 ± 0%  -62.50%  (p=0.029 n=4+4)
```

Fixes #10698
Diffstat (limited to 'publisher')
-rw-r--r-- publisher/htmlElementsCollector.go | 73
-rw-r--r-- publisher/htmlElementsCollector_test.go | 28
2 files changed, 93 insertions, 8 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go
index ca6e2d940..91e1237a9 100644
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -36,7 +36,6 @@ var (
skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
skipAllElementRe = regexp.MustCompile(`(?i)^!DOCTYPE`)
- endTagRe = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
exceptionList = map[string]bool{
"thead": true,
@@ -312,11 +311,7 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
if w.r != '>' {
return false
}
- m := endTagRe.FindSubmatch(w.buff.Bytes())
- if m == nil {
- return false
- }
- return bytes.EqualFold(m[1], tagNameCopy)
+ return isClosedByTag(w.buff.Bytes(), tagNameCopy)
},
htmlLexStart,
))
@@ -428,8 +423,9 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {
}
// Variants of s
-// <body class="b a">
-// <div>
+//
+// <body class="b a">
+// <div>
func parseStartTag(s string) string {
spaceIndex := strings.IndexFunc(s, func(r rune) bool {
return unicode.IsSpace(r)
@@ -441,3 +437,64 @@ func parseStartTag(s string) string {
return s[1:spaceIndex]
}
+
// isClosedByTag reports whether b ends with a closing tag for tagName, e.g.
// "</pre>" for tagName "pre". The tag name is compared case-insensitively and
// ASCII whitespace is tolerated around the slash and the name ("< / pre \n>").
//
// b is scanned backwards from the trailing '>' and the scan stops at the
// first '<' it meets. It returns false if b does not end with '>', if no
// "<" ... "/" pair is found, if anything other than whitespace sits between
// the '<' and the '/', or if there is no tag name at all.
func isClosedByTag(b, tagName []byte) bool {
	if len(b) == 0 || b[len(b)-1] != '>' {
		return false
	}

	var (
		// b[lo:hi] is the candidate tag name: the word that directly
		// follows the slash.
		lo int
		hi int

		// state tracks the markers seen so far while walking backwards:
		// 0 = none, 1 = '/' seen, 2 = '/' and then '<' seen.
		state  int
		inWord bool
	)

LOOP:
	for i := len(b) - 2; i >= 0; i-- {
		switch {
		case b[i] == '<':
			if state != 1 {
				// A '<' without a slash after it is a start tag or stray text.
				return false
			}
			state = 2
			break LOOP
		case b[i] == '/':
			if state != 0 {
				// More than one slash; not a plain closing tag.
				return false
			}
			state++
			if inWord {
				lo = i + 1
				inWord = false
			}
		case isSpace(b[i]):
			if inWord {
				lo = i + 1
				inWord = false
			}
		default:
			if !inWord {
				hi = i + 1
				inWord = true
			}
		}
	}

	// lo >= hi means there is no usable tag name: either it is missing
	// ("</>") or a word sits between '<' and '/' ("<x/div>"), which moved hi
	// in front of lo. Without this guard b[lo:hi] panics on the latter.
	if state != 2 || lo >= hi {
		return false
	}

	return bytes.EqualFold(tagName, b[lo:hi])
}

// isSpace reports whether b is one of the ASCII whitespace bytes: space,
// tab, line feed, form feed or carriage return.
func isSpace(b byte) bool {
	switch b {
	case ' ', '\t', '\n', '\f', '\r':
		return true
	}
	return false
}
diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go
index 8be8c46ac..11590e0a3 100644
--- a/publisher/htmlElementsCollector_test.go
+++ b/publisher/htmlElementsCollector_test.go
@@ -155,6 +155,34 @@ func TestClassCollector(t *testing.T) {
}
+func TestEndsWithTag(t *testing.T) {
+ c := qt.New((t))
+
+ for _, test := range []struct {
+ name string
+ s string
+ tagName string
+ expect bool
+ }{
+ {"empty", "", "div", false},
+ {"no match", "foo", "div", false},
+ {"no close", "foo<div>", "div", false},
+ {"no close 2", "foo/div>", "div", false},
+ {"no close 2", "foo//div>", "div", false},
+ {"no tag", "foo</>", "div", false},
+ {"match", "foo</div>", "div", true},
+ {"match space", "foo< / div>", "div", true},
+ {"match space 2", "foo< / div \n>", "div", true},
+ {"match case", "foo</DIV>", "div", true},
+ } {
+ c.Run(test.name, func(c *qt.C) {
+ got := isClosedByTag([]byte(test.s), []byte(test.tagName))
+ c.Assert(got, qt.Equals, test.expect)
+ })
+ }
+
+}
+
func BenchmarkElementsCollectorWriter(b *testing.B) {
const benchHTML = `
<!DOCTYPE html>