diff options
Diffstat (limited to 'publisher')
-rw-r--r-- | publisher/htmlElementsCollector.go | 268 | ||||
-rw-r--r-- | publisher/htmlElementsCollector_test.go | 81 | ||||
-rw-r--r-- | publisher/publisher.go | 47 | ||||
-rw-r--r-- | publisher/publisher_test.go | 14 |
4 files changed, 386 insertions, 24 deletions
diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go new file mode 100644 index 000000000..c6e0d3f0f --- /dev/null +++ b/publisher/htmlElementsCollector.go @@ -0,0 +1,268 @@ +// Copyright 2020 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package publisher + +import ( + "github.com/gohugoio/hugo/helpers" + "golang.org/x/net/html" + yaml "gopkg.in/yaml.v2" + + "bytes" + "sort" + "strings" + "sync" +) + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { + return &cssClassCollectorWriter{ + collector: collector, + } +} + +// HTMLElements holds lists of tags and attribute values for classes and id. +type HTMLElements struct { + Tags []string `json:"tags"` + Classes []string `json:"classes"` + IDs []string `json:"ids"` +} + +func (h *HTMLElements) Merge(other HTMLElements) { + h.Tags = append(h.Tags, other.Tags...) + h.Classes = append(h.Classes, other.Classes...) + h.IDs = append(h.IDs, other.IDs...) + + h.Tags = helpers.UniqueStringsReuse(h.Tags) + h.Classes = helpers.UniqueStringsReuse(h.Classes) + h.IDs = helpers.UniqueStringsReuse(h.IDs) + +} + +func (h *HTMLElements) Sort() { + sort.Strings(h.Tags) + sort.Strings(h.Classes) + sort.Strings(h.IDs) +} + +type cssClassCollectorWriter struct { + collector *htmlElementsCollector + buff bytes.Buffer + + isCollecting bool + dropValue bool + inQuote bool +} + +func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { + n = len(p) + i := 0 + + for i < len(p) { + if !w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if b == '<' { + w.startCollecting() + break + } + } + } + + if w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if !w.inQuote && b == '/' { + // End element, we don't care about those. + w.endCollecting(true) + break + } + w.toggleIfQuote(b) + if !w.inQuote && b == '>' { + w.endCollecting(false) + break + } + w.buff.WriteByte(b) + } + + if !w.isCollecting { + if w.dropValue { + w.buff.Reset() + } else { + // First check if we have processed this element before. + w.collector.mu.RLock() + + // See https://github.com/dominikh/go-tools/issues/723 + //lint:ignore S1030 This construct avoids memory allocation for the string. + seen := w.collector.elementSet[string(w.buff.Bytes())] + w.collector.mu.RUnlock() + if seen { + w.buff.Reset() + continue + } + + s := w.buff.String() + + w.buff.Reset() + + el := parseHTMLElement(s) + + w.collector.mu.Lock() + w.collector.elementSet[s] = true + if el.Tag != "" { + w.collector.elements = append(w.collector.elements, el) + } + w.collector.mu.Unlock() + } + } + } + } + + return +} + +func (c *cssClassCollectorWriter) endCollecting(drop bool) { + c.isCollecting = false + c.inQuote = false + c.dropValue = drop +} + +func (c *cssClassCollectorWriter) startCollecting() { + c.isCollecting = true + c.dropValue = false +} + +func (c *cssClassCollectorWriter) toggleIfQuote(b byte) { + if isQuote(b) { + c.inQuote = !c.inQuote + } +} + +type htmlElement struct { + Tag string + Classes []string + IDs []string +} + +type htmlElementsCollector struct { + // Contains the raw HTML string. We will get the same element + // several times, and want to avoid costly reparsing when this + // is used for aggregated data only. + elementSet map[string]bool + + elements []htmlElement + + mu sync.RWMutex +} + +func (c *htmlElementsCollector) getHTMLElements() HTMLElements { + + var ( + classes []string + ids []string + tags []string + ) + + for _, el := range c.elements { + classes = append(classes, el.Classes...) + ids = append(ids, el.IDs...) + tags = append(tags, el.Tag) + } + + classes = helpers.UniqueStringsSorted(classes) + ids = helpers.UniqueStringsSorted(ids) + tags = helpers.UniqueStringsSorted(tags) + + els := HTMLElements{ + Classes: classes, + IDs: ids, + Tags: tags, + } + + return els +} + +func isQuote(b byte) bool { + return b == '"' || b == '\'' +} + +var htmlJsonFixer = strings.NewReplacer(", ", "\n") + +func parseHTMLElement(elStr string) (el htmlElement) { + elStr = strings.TrimSpace(elStr) + if !strings.HasSuffix(elStr, ">") { + elStr += ">" + } + n, err := html.Parse(strings.NewReader(elStr)) + if err != nil { + return + } + var walk func(*html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) { + el.Tag = n.Data + + for _, a := range n.Attr { + switch { + case strings.EqualFold(a.Key, "id"): + // There should be only one, but one never knows... + el.IDs = append(el.IDs, a.Val) + default: + if strings.EqualFold(a.Key, "class") { + el.Classes = append(el.Classes, strings.Fields(a.Val)...) + } else { + key := strings.ToLower(a.Key) + val := strings.TrimSpace(a.Val) + if strings.Contains(key, "class") && strings.HasPrefix(val, "{") { + // This looks like a Vue or AlpineJS class binding. + // Try to unmarshal it as YAML and pull the keys. + // This may look odd, as the source is (probably) JS (JSON), but the YAML + // parser is much more lenient with simple JS input, it seems. + m := make(map[string]interface{}) + val = htmlJsonFixer.Replace(strings.Trim(val, "{}")) + // Remove leading space to make it look like YAML. + lines := strings.Split(val, "\n") + for i, l := range lines { + lines[i] = strings.TrimSpace(l) + } + val = strings.Join(lines, "\n") + err := yaml.Unmarshal([]byte(val), &m) + if err == nil { + for k := range m { + el.Classes = append(el.Classes, strings.Fields(k)...) + } + } else { + // Just insert the raw values. This is used for CSS class pruning + // so, it's important not to leave out values that may be a CSS class. + el.Classes = append(el.Classes, strings.Fields(val)...) + } + } + } + } + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + + walk(n) + + return +} diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go new file mode 100644 index 000000000..3ef159d8b --- /dev/null +++ b/publisher/htmlElementsCollector_test.go @@ -0,0 +1,81 @@ +// Copyright 2020 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package publisher + +import ( + "fmt" + "strings" + "testing" + + qt "github.com/frankban/quicktest" +) + +func TestClassCollector(t *testing.T) { + c := qt.New((t)) + + f := func(tags, classes, ids string) HTMLElements { + var tagss, classess, idss []string + if tags != "" { + tagss = strings.Split(tags, " ") + } + if classes != "" { + classess = strings.Split(classes, " ") + } + if ids != "" { + idss = strings.Split(ids, " ") + } + return HTMLElements{ + Tags: tagss, + Classes: classess, + IDs: idss, + } + } + + for _, test := range []struct { + name string + html string + expect HTMLElements + }{ + {"basic", `<body class="b a"></body>`, f("body", "a b", "")}, + {"duplicates", `<div class="b a b"></div>`, f("div", "a b", "")}, + {"single quote", `<body class='b a'></body>`, f("body", "a b", "")}, + {"no quote", `<body class=b id=myelement></body>`, f("body", "b", "myelement")}, + + {"AlpineJS bind 1", `<body> + <div x-bind:class="{ + 'class1': data.open, + 'class2 class3': data.foo == 'bar' + }"> + </div> + </body>`, f("body div", "class1 class2 class3", "")}, + + {"Alpine bind 2", `<div x-bind:class="{ 'bg-black': filter.checked }" + class="inline-block mr-1 mb-2 rounded bg-gray-300 px-2 py-2">FOO</div>`, + f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", "")}, + + {"Alpine bind 3", `<div x-bind:class="{ 'text-gray-800': !checked, 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")}, + {"Alpine bind 4", `<div x-bind:class="{ 'text-gray-800': !checked, + 'text-white': checked }"></div>`, f("div", "text-gray-800 text-white", "")}, + + {"Vue bind", `<div v-bind:class="{ active: isActive }"></div>`, f("div", "active", "")}, + } { + c.Run(test.name, func(c *qt.C) { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, test.html) + got := w.collector.getHTMLElements() + c.Assert(got, qt.DeepEquals, test.expect) + }) + } + +} diff --git a/publisher/publisher.go b/publisher/publisher.go index f30073c08..8b8d2fa63 100644 --- a/publisher/publisher.go +++ b/publisher/publisher.go @@ -1,4 +1,4 @@ -// Copyright 2019 The Hugo Authors. All rights reserved. +// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,7 +18,8 @@ import ( "io" "sync/atomic" - "github.com/gohugoio/hugo/config" + "github.com/gohugoio/hugo/resources" + "github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/minifiers" @@ -68,17 +69,21 @@ type Descriptor struct { // DestinationPublisher is the default and currently only publisher in Hugo. This // publisher prepares and publishes an item to the defined destination, e.g. /public. type DestinationPublisher struct { - fs afero.Fs - min minifiers.Client + fs afero.Fs + min minifiers.Client + htmlElementsCollector *htmlElementsCollector } // NewDestinationPublisher creates a new DestinationPublisher. -func NewDestinationPublisher(fs afero.Fs, outputFormats output.Formats, mediaTypes media.Types, cfg config.Provider) (pub DestinationPublisher, err error) { - pub = DestinationPublisher{fs: fs} - pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg) - if err != nil { - return +func NewDestinationPublisher(rs *resources.Spec, outputFormats output.Formats, mediaTypes media.Types) (pub DestinationPublisher, err error) { + fs := rs.BaseFs.PublishFs + cfg := rs.Cfg + var classCollector *htmlElementsCollector + if rs.BuildConfig.WriteStats { + classCollector = newHTMLElementsCollector() } + pub = DestinationPublisher{fs: fs, htmlElementsCollector: classCollector} + pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg) return } @@ -111,16 +116,38 @@ func (p DestinationPublisher) Publish(d Descriptor) error { } defer f.Close() - _, err = io.Copy(f, src) + var w io.Writer = f + + if p.htmlElementsCollector != nil && d.OutputFormat.IsHTML { + w = io.MultiWriter(w, newHTMLElementsCollectorWriter(p.htmlElementsCollector)) + } + + _, err = io.Copy(w, src) if err == nil && d.StatCounter != nil { atomic.AddUint64(d.StatCounter, uint64(1)) } + return err } +func (p DestinationPublisher) PublishStats() PublishStats { + if p.htmlElementsCollector == nil { + return PublishStats{} + } + + return PublishStats{ + HTMLElements: p.htmlElementsCollector.getHTMLElements(), + } +} + +type PublishStats struct { + HTMLElements HTMLElements `json:"htmlElements"` +} + // Publisher publishes a result file. type Publisher interface { Publish(d Descriptor) error + PublishStats() PublishStats } // XML transformer := transform.New(urlreplacers.NewAbsURLInXMLTransformer(path)) diff --git a/publisher/publisher_test.go b/publisher/publisher_test.go deleted file mode 100644 index 200accc8b..000000000 --- a/publisher/publisher_test.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2018 The Hugo Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package publisher |