-rw-r--r--  .gitignore                    2
-rw-r--r--  transform/absurl.go          59
-rw-r--r--  transform/absurlreplacer.go 325
-rw-r--r--  transform/chain_test.go      22
4 files changed, 356 insertions, 52 deletions
diff --git a/.gitignore b/.gitignore
index 3ea8aedd6..41162a757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ hugo
docs/public*
hugo.exe
*.test
+*.prof
+nohup.out
cover.out
*.swp
*.swo
diff --git a/transform/absurl.go b/transform/absurl.go
index 0a0cd7239..0efe624ac 100644
--- a/transform/absurl.go
+++ b/transform/absurl.go
@@ -1,64 +1,33 @@
package transform
import (
- "bytes"
- "net/url"
- "strings"
+ "sync"
)
-func AbsURL(absURL string) (trs []link, err error) {
- var baseURL *url.URL
+var absUrlInit sync.Once
+var ar *absurlReplacer
- if baseURL, err = url.Parse(absURL); err != nil {
- return
- }
+// for performance reasons, we reuse the first baseUrl given
+func initAbsurlReplacer(baseURL string) {
+ absUrlInit.Do(func() {
+ ar = newAbsurlReplacer(baseURL)
+ })
+}
- base := strings.TrimRight(baseURL.String(), "/")
+func AbsURL(absURL string) (trs []link, err error) {
+ initAbsurlReplacer(absURL)
- var (
- srcdq = []byte(" src=\"" + base + "/")
- hrefdq = []byte(" href=\"" + base + "/")
- srcsq = []byte(" src='" + base + "/")
- hrefsq = []byte(" href='" + base + "/")
- )
trs = append(trs, func(content []byte) []byte {
- content = guardReplace(content, []byte(" src=\"//"), []byte(" src=\"/"), srcdq)
- content = guardReplace(content, []byte(" src='//"), []byte(" src='/"), srcsq)
- content = guardReplace(content, []byte(" href=\"//"), []byte(" href=\"/"), hrefdq)
- content = guardReplace(content, []byte(" href='//"), []byte(" href='/"), hrefsq)
- return content
+ return ar.replaceInHtml(content)
})
return
}
func AbsURLInXML(absURL string) (trs []link, err error) {
- var baseURL *url.URL
+ initAbsurlReplacer(absURL)
- if baseURL, err = url.Parse(absURL); err != nil {
- return
- }
-
- base := strings.TrimRight(baseURL.String(), "/")
-
- var (
- srcedq = []byte(" src=&#34;" + base + "/")
- hrefedq = []byte(" href=&#34;" + base + "/")
- srcesq = []byte(" src=&#39;" + base + "/")
- hrefesq = []byte(" href=&#39;" + base + "/")
- )
trs = append(trs, func(content []byte) []byte {
- content = guardReplace(content, []byte(" src=&#34;//"), []byte(" src=&#34;/"), srcedq)
- content = guardReplace(content, []byte(" src=&#39;//"), []byte(" src=&#39;/"), srcesq)
- content = guardReplace(content, []byte(" href=&#34;//"), []byte(" href=&#34;/"), hrefedq)
- content = guardReplace(content, []byte(" href=&#39;//"), []byte(" href=&#39;/"), hrefesq)
- return content
+ return ar.replaceInXml(content)
})
return
}
-
-func guardReplace(content, guard, match, replace []byte) []byte {
- if !bytes.Contains(content, guard) {
- content = bytes.Replace(content, match, replace, -1)
- }
- return content
-}
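
Note that the sync.Once above means the first base URL passed to AbsURL or AbsURLInXML wins for the lifetime of the process; later calls with a different base silently reuse the first replacer. A minimal sketch of the resulting in-package call pattern (the content value is illustrative):

    trs, _ := AbsURL("http://base")
    content := []byte(`<a href="/foo">foo</a>`)
    for _, tr := range trs {
        content = tr(content)
    }
    // content should now read `<a href="http://base/foo">foo</a>`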
diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
new file mode 100644
index 000000000..7b6f72379
--- /dev/null
+++ b/transform/absurlreplacer.go
@@ -0,0 +1,325 @@
+package transform
+
+import (
+ "bytes"
+ bp "github.com/spf13/hugo/bufferpool"
+ "net/url"
+ "strings"
+ "sync"
+ "unicode/utf8"
+)
+
+// position (in bytes)
+type pos int
+
+type matchState int
+
+const (
+ matchStateNone matchState = iota
+ matchStateWhitespace
+ matchStatePartial
+ matchStateFull
+)
+
+type item struct {
+ typ itemType
+ pos pos
+ val []byte
+}
+
+type itemType int
+
+const (
+ tText itemType = iota
+
+ // matches
+ tSrcdq
+ tHrefdq
+ tSrcsq
+ tHrefsq
+ // guards
+ tGsrcdq
+ tGhrefdq
+ tGsrcsq
+ tGhrefsq
+)
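+// tSrcdq..tHrefsq mark attribute values to be rewritten against the
+// base URL; the guard types mark schemaless ("//host/...") URLs whose
+// presence disables rewriting for the corresponding quote style.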
+
+type contentlexer struct {
+ content []byte
+
+ pos pos // input position
+ start pos // item start position
+ width pos // width of last element
+
+ matchers []absurlMatcher
+ state stateFunc
+ prefixLookup *prefixes
+
+ // items delivered to client
+ items []item
+}
+
+type stateFunc func(*contentlexer) stateFunc
+
+type prefixRunes []rune
+
+type prefixes struct {
+ pr []prefixRunes
+ curr prefixRunes // current prefix lookup table
+ i int // current index
+
+ // first rune in potential match
+ first rune
+
+ // match-state:
+ // none, whitespace, partial, full
+ ms matchState
+}
+
+// match records the match state for the prefix in play:
+// - it's a full match if all prefix runes have checked out in a row
+// - it's a partial match if it's on its way towards a full match
+func (l *contentlexer) match(r rune) {
+ p := l.prefixLookup
+ if p.curr == nil {
+ // this assumes all prefixes start on a different rune,
+ // which holds for the special case in play: href, src
+ p.i = 0
+ for _, pr := range p.pr {
+ if pr[p.i] == r {
+ fullMatch := len(pr) == 1
+ p.first = r
+ if !fullMatch {
+ p.curr = pr
+ l.prefixLookup.ms = matchStatePartial
+ } else {
+ l.prefixLookup.ms = matchStateFull
+ }
+ return
+ }
+ }
+ } else {
+ p.i++
+ if p.curr[p.i] == r {
+ fullMatch := len(p.curr) == p.i+1
+ if fullMatch {
+ p.curr = nil
+ l.prefixLookup.ms = matchStateFull
+ } else {
+ l.prefixLookup.ms = matchStatePartial
+ }
+ return
+ }
+
+ p.curr = nil
+ }
+
+ l.prefixLookup.ms = matchStateNone
+}
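+
+// Illustrative walk-through: after a space, the input runes
+// 's', 'r', 'c', '=' step through the {'s', 'r', 'c', '='} prefix:
+// 's' selects it (matchStatePartial), 'r' and 'c' keep it partial,
+// and '=' completes it (matchStateFull), triggering checkCandidate.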
+
+func (l *contentlexer) emit(t itemType) {
+ l.items = append(l.items, item{t, l.start, l.content[l.start:l.pos]})
+ l.start = l.pos
+}
+
+var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+
+var itemSlicePool = &sync.Pool{
+ New: func() interface{} {
+ return make([]item, 0, 8)
+ },
+}
+
+func replace(content []byte, matchers []absurlMatcher) *contentlexer {
+ var items []item
+ if x := itemSlicePool.Get(); x != nil {
+ // the slice is handed back to the pool by doReplace once both
+ // passes are done reading it; putting it back here would let
+ // another goroutine reuse the backing array while still in use
+ items = x.([]item)[:0]
+ } else {
+ items = make([]item, 0, 8)
+ }
+
+ lexer := &contentlexer{content: content,
+ items: items,
+ prefixLookup: &prefixes{pr: mainPrefixRunes},
+ matchers: matchers}
+
+ lexer.runReplacer()
+ return lexer
+}
+
+func (l *contentlexer) runReplacer() {
+ for l.state = lexReplacements; l.state != nil; {
+ l.state = l.state(l)
+ }
+}
+
+type absurlMatcher struct {
+ replaceType itemType
+ guardType itemType
+ match []byte
+ guard []byte
+ replacement []byte
+ guarded bool
+}
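+
+// For the double-quoted HTML case the fields read: match `"/`,
+// guard `"//`, replacement `"` + base + `/`.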
+
+func (a absurlMatcher) isSourceType() bool {
+ return a.replaceType == tSrcdq || a.replaceType == tSrcsq
+}
+
+func lexReplacements(l *contentlexer) stateFunc {
+ contentLength := len(l.content)
+ var r rune
+
+ for {
+ if int(l.pos) >= contentLength {
+ l.width = 0
+ break
+ }
+
+ width := 1
+ r = rune(l.content[l.pos])
+ if r >= utf8.RuneSelf {
+ r, width = utf8.DecodeRune(l.content[l.pos:])
+ }
+ l.width = pos(width)
+ l.pos += l.width
+
+ if r == ' ' {
+ l.prefixLookup.ms = matchStateWhitespace
+ } else if l.prefixLookup.ms != matchStateNone {
+ l.match(r)
+ if l.prefixLookup.ms == matchStateFull {
+ checkCandidate(l)
+ }
+ }
+
+ }
+
+ // done: emit any remaining text as a final item
+ if l.pos > l.start {
+ l.emit(tText)
+ }
+ return nil
+}
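+
+// For ` <a href="/x">` the emitted items are tText(` <a href=`),
+// tHrefdq (consuming `"/`) and a trailing tText(`x">`); doReplace
+// below decides what each non-text item is rendered as.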
+
+func checkCandidate(l *contentlexer) {
+ isSource := l.prefixLookup.first == 's'
+ for _, m := range l.matchers {
+
+ if m.guarded {
+ continue
+ }
+
+ if isSource != m.isSourceType() {
+ continue
+ }
+
+ s := l.content[l.pos:]
+ if bytes.HasPrefix(s, m.guard) {
+ if l.pos > l.start {
+ l.emit(tText)
+ }
+ l.pos += pos(len(m.guard))
+ l.emit(m.guardType)
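+ // m is a copy taken by the range loop, so this flag does not
+ // persist beyond this call; the guard accounting that actually
+ // suppresses replacements happens in doReplace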
+ m.guarded = true
+ return
+ } else if bytes.HasPrefix(s, m.match) {
+ if l.pos > l.start {
+ l.emit(tText)
+ }
+ l.pos += pos(len(m.match))
+ l.emit(m.replaceType)
+ return
+
+ }
+ }
+}
+
+func doReplace(content []byte, matchers []absurlMatcher) []byte {
+ b := bp.GetBuffer()
+ defer bp.PutBuffer(b)
+
+ guards := make([]bool, len(matchers))
+ replaced := replace(content, matchers)
+ // hand the pooled item slice back only after both passes below
+ defer itemSlicePool.Put(replaced.items[:0])
+
+ // first pass: check guards
+ for _, item := range replaced.items {
+ if item.typ != tText {
+ for i, e := range matchers {
+ if item.typ == e.guardType {
+ guards[i] = true
+ break
+ }
+ }
+ }
+ }
+ // second pass: do replacements for non-guarded tokens
+ for _, token := range replaced.items {
+ switch token.typ {
+ case tText:
+ b.Write(token.val)
+ default:
+ for i, e := range matchers {
+ if token.typ == e.replaceType && !guards[i] {
+ b.Write(e.replacement)
+ } else if token.typ == e.replaceType || token.typ == e.guardType {
+ b.Write(token.val)
+ }
+ }
+ }
+ }
+
+ // copy out of the pooled buffer: the deferred PutBuffer returns b
+ // to the pool, where another goroutine may overwrite its bytes
+ return append([]byte(nil), b.Bytes()...)
+}
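+
+// Guard semantics in practice: one `href="//cdn/x"` in the content
+// emits a guard token, sets guards[i] for the double-quoted href
+// matcher, and leaves every later `href="/local"` in the same
+// content untouched, mirroring the old guardReplace behaviour.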
+
+type absurlReplacer struct {
+ htmlMatchers []absurlMatcher
+ xmlMatchers []absurlMatcher
+}
+
+func newAbsurlReplacer(baseUrl string) *absurlReplacer {
+ u, _ := url.Parse(baseUrl)
+ base := strings.TrimRight(u.String(), "/")
+
+ // HTML
+ dqHtmlMatch := []byte("\"/")
+ sqHtmlMatch := []byte("'/")
+
+ dqGuard := []byte("\"//")
+ sqGuard := []byte("'//")
+
+ // XML
+ dqXmlMatch := []byte("&#34;/")
+ sqXmlMatch := []byte("&#39;/")
+
+ dqXmlGuard := []byte("&#34;//")
+ sqXmlGuard := []byte("&#39;//")
+
+ dqHtml := []byte("\"" + base + "/")
+ sqHtml := []byte("'" + base + "/")
+
+ dqXml := []byte("&#34;" + base + "/")
+ sqXml := []byte("&#39;" + base + "/")
+
+ return &absurlReplacer{htmlMatchers: []absurlMatcher{
+ {tSrcdq, tGsrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
+ {tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
+ {tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
+ {tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
+ xmlMatchers: []absurlMatcher{
+ {tSrcdq, tGsrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+ {tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+ {tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
+ {tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+ }}
+
+}
+
+func (au *absurlReplacer) replaceInHtml(content []byte) []byte {
+ return doReplace(content, au.htmlMatchers)
+}
+
+func (au *absurlReplacer) replaceInXml(content []byte) []byte {
+ return doReplace(content, au.xmlMatchers)
+}
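
A short in-package sketch of the new replacer itself, guard case included (expected output derived from the matcher tables above; the sample strings are illustrative):

    ar := newAbsurlReplacer("http://base")

    html := ar.replaceInHtml([]byte(`<script src="/foo.js"></script>`))
    // expected: `<script src="http://base/foo.js"></script>`

    guarded := ar.replaceInHtml([]byte(`<img src="//cdn/a.png"> <img src="/b.png">`))
    // expected: unchanged, because the schemaless //cdn URL emits a
    // guard token that suppresses the double-quoted src replacement
    // for this content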
diff --git a/transform/chain_test.go b/transform/chain_test.go
index 71037d455..a88d84533 100644
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -14,21 +14,29 @@ const CORRECT_OUTPUT_SRC_HREF_DQ = "<!DOCTYPE html><html><head><script src=\"foo
const CORRECT_OUTPUT_SRC_HREF_SQ = "<!DOCTYPE html><html><head><script src='foobar.js'></script><script src='http://base/barfoo.js'></script></head><body><nav><h1>title</h1></nav><article>content <a href='foobar'>foobar</a>. <a href='http://base/foobar'>Follow up</a></article></body></html>"
const H5_XML_CONTENT_ABS_URL = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
-const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://xml/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://xml/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
+const CORRECT_OUTPUT_SRC_HREF_IN_XML = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;http://base/foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;http://base/foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
const H5_XML_CONTENT_GUARDED = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\" ?><feed xmlns=\"http://www.w3.org/2005/Atom\"><entry><content type=\"html\">&lt;p&gt;&lt;a href=&#34;//foobar&#34;&gt;foobar&lt;/a&gt;&lt;/p&gt; &lt;p&gt;A video: &lt;iframe src=&#39;//foo&#39;&gt;&lt;/iframe&gt;&lt;/p&gt;</content></entry></feed>"
-var abs_url_tests = []test{
+// additional sanity tests for the replacement code
+const REPLACE_1 = "No replacements."
+const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᛁᚱᚪ ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
+
+var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
{H5_JS_CONTENT_ABS_URL, H5_JS_CONTENT_ABS_URL},
{H5_JS_CONTENT_ABS_URL_SCHEMALESS, H5_JS_CONTENT_ABS_URL_SCHEMALESS},
}
-var xml_abs_url_tests = []test{
+var xml_abs_url_bench_tests = []test{
{H5_XML_CONTENT_ABS_URL, CORRECT_OUTPUT_SRC_HREF_IN_XML},
{H5_XML_CONTENT_GUARDED, H5_XML_CONTENT_GUARDED},
}
+var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
+var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
+var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
+
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()
in := new(bytes.Buffer)
@@ -44,7 +52,7 @@ func BenchmarkAbsUrl(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
- apply(b.Errorf, tr, abs_url_tests)
+ apply(b.Errorf, tr, abs_url_bench_tests)
}
}
@@ -57,17 +65,17 @@ func TestAbsUrl(t *testing.T) {
}
func BenchmarkXmlAbsUrl(b *testing.B) {
- absURLInXML, _ := AbsURLInXML("http://xml")
+ absURLInXML, _ := AbsURLInXML("http://base")
tr := NewChain(absURLInXML...)
b.ResetTimer()
for i := 0; i < b.N; i++ {
- apply(b.Errorf, tr, xml_abs_url_tests)
+ apply(b.Errorf, tr, xml_abs_url_bench_tests)
}
}
func TestXMLAbsUrl(t *testing.T) {
- absURLInXML, _ := AbsURLInXML("http://xml")
+ absURLInXML, _ := AbsURLInXML("http://base")
tr := NewChain(absURLInXML...)
apply(t.Errorf, tr, xml_abs_url_tests)
}
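
The *_bench_tests fixtures keep the benchmark loops focused on realistic replacement work, while the sanity cases (a string with no matches and a multi-byte runic string that exercises the utf8.DecodeRune path in the lexer) run in the tests only; go test -bench=AbsUrl ./transform exercises both benchmark functions.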