author	bep <bjorn.erik.pedersen@gmail.com>	2015-05-03 19:54:17 +0200
committer	bep <bjorn.erik.pedersen@gmail.com>	2015-05-03 19:54:23 +0200
commit	bee6a69846f476013e78c32d55f6c508ed8f1a99 (patch)
tree	848ec15f48bce9f7f3ed1c093a92ca23fea3edaa /transform
parent	be6482603ee8ac3516ae19ceebb35662f038b25e (diff)
canonifyurls in srcset
Speed is about the same as before, uses slightly less memory:

```
benchmark             old ns/op    new ns/op    delta
BenchmarkAbsURL       17302        17713        +2.38%
BenchmarkXMLAbsURL    9463         9470         +0.07%

benchmark             old allocs   new allocs   delta
BenchmarkAbsURL       28           24           -14.29%
BenchmarkXMLAbsURL    14           12           -14.29%

benchmark             old bytes    new bytes    delta
BenchmarkAbsURL       3422         3144         -8.12%
BenchmarkXMLAbsURL    1985         1864         -6.10%
```

Fixes #1059
Diffstat (limited to 'transform')
-rw-r--r--	transform/absurlreplacer.go	285
-rw-r--r--	transform/chain_test.go	59
2 files changed, 247 insertions, 97 deletions
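
For orientation before the diff: canonifyurls now also rewrites root-relative URLs inside `srcset` attributes, while schemaless `//host/...` URLs are left alone. Below is a minimal, hedged Go sketch of that behaviour only; `absSrcset` is a hypothetical helper, and the actual change works rune by rune on the raw content inside the lexer, as the diff shows.

```go
package main

import (
	"fmt"
	"strings"
)

// absSrcset sketches the intent of this commit: rewrite root-relative URLs
// ("/img.jpg") inside a srcset value to absolute URLs, leaving schemaless
// URLs ("//cdn/img.jpg") untouched. Not the actual implementation.
func absSrcset(baseURL, srcset string) string {
	fields := strings.Fields(srcset)
	for i, f := range fields {
		if strings.HasPrefix(f, "/") && !strings.HasPrefix(f, "//") {
			fields[i] = strings.TrimRight(baseURL, "/") + f
		}
	}
	return strings.Join(fields, " ")
}

func main() {
	fmt.Println(absSrcset("http://base", "/img/small.jpg 200w /img/big.jpg 700w"))
	// http://base/img/small.jpg 200w http://base/img/big.jpg 700w
}
```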
diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
index 9acdfc8de..9d3aa8901 100644
--- a/transform/absurlreplacer.go
+++ b/transform/absurlreplacer.go
@@ -29,69 +29,98 @@ type contentlexer struct {
start int // item start position
width int // width of last element
- matchers []absURLMatcher
- state stateFunc
- prefixLookup *prefixes
+ matchers []absURLMatcher
+ state stateFunc
+
+ ms matchState
+ matches [3]bool // track matches of the 3 prefixes
+ i int // last index in matches checked
w io.Writer
}
type stateFunc func(*contentlexer) stateFunc
-type prefixRunes []rune
-
-type prefixes struct {
- pr []prefixRunes
- curr prefixRunes // current prefix lookup table
- i int // current index
+type prefix struct {
+ r []rune
+ f func(l *contentlexer)
+}
- // first rune in potential match
- first rune
+var prefixes = []*prefix{
+ &prefix{r: []rune{'s', 'r', 'c', '='}, f: checkCandidateSrc},
+ &prefix{r: []rune{'s', 'r', 'c', 's', 'e', 't', '='}, f: checkCandidateSrcset},
+ &prefix{r: []rune{'h', 'r', 'e', 'f', '='}, f: checkCandidateHref}}
- // match-state:
- // none, whitespace, partial, full
- ms matchState
+type absURLMatcher struct {
+ prefix int
+ match []byte
+ quote []byte
+ replacementURL []byte
}
-// match returns partial and full match for the prefix in play
-// - it's a full match if all prefix runes has checked out in row
-// - it's a partial match if it's on its way towards a full match
func (l *contentlexer) match(r rune) {
- p := l.prefixLookup
- if p.curr == nil {
- // assumes prefixes all start off on a different rune
- // works in this special case: href, src
- p.i = 0
- for _, pr := range p.pr {
- if pr[p.i] == r {
- fullMatch := len(p.pr) == 1
- p.first = r
- if !fullMatch {
- p.curr = pr
- l.prefixLookup.ms = matchStatePartial
- } else {
- l.prefixLookup.ms = matchStateFull
+
+ var found bool
+
+ // note: several prefixes can start with the same runes, e.g.
+ // src and srcset.
+ if l.ms == matchStateWhitespace {
+ l.i = 0
+ for j, p := range prefixes {
+ if r == p.r[l.i] {
+ l.matches[j] = true
+ found = true
+ if l.checkMatchState(r, j) {
+ return
}
- return
+ } else {
+ l.matches[j] = false
}
}
- } else {
- p.i++
- if p.curr[p.i] == r {
- fullMatch := len(p.curr) == p.i+1
- if fullMatch {
- p.curr = nil
- l.prefixLookup.ms = matchStateFull
+
+ if !found {
+ l.ms = matchStateNone
+ }
+
+ return
+ }
+
+ l.i++
+ for j, m := range l.matches {
+ // still a match?
+ if m {
+ if prefixes[j].r[l.i] == r {
+ found = true
+ if l.checkMatchState(r, j) {
+ return
+ }
} else {
- l.prefixLookup.ms = matchStatePartial
+ l.matches[j] = false
}
- return
}
+ }
- p.curr = nil
+ if found {
+ return
}
- l.prefixLookup.ms = matchStateNone
+ l.ms = matchStateNone
+}
+
+func (l *contentlexer) checkMatchState(r rune, idx int) bool {
+ if r == '=' {
+ l.ms = matchStateFull
+ for k := range l.matches {
+ if k != idx {
+ l.matches[k] = false
+ }
+ }
+ return true
+ }
+
+ l.ms = matchStatePartial
+
+ return false
}
func (l *contentlexer) emit() {
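
The hunk above replaces the single-prefix lookup table with a small state machine that matches all three attribute prefixes in parallel, dropping candidates rune by rune and declaring a full match when `=` is reached. A hedged standalone sketch of that technique follows; `attrPrefixes` and `matchAttr` are illustrative names, while the real lexer keeps this state on `contentlexer` and dispatches via `checkMatchState`.

```go
package main

import "fmt"

// the three attribute prefixes the lexer cares about
var attrPrefixes = [][]rune{
	[]rune("src="),
	[]rune("srcset="),
	[]rune("href="),
}

// matchAttr feeds the word rune by rune to all prefixes at once, drops the
// ones that stop matching, and reports a full match when '=' is reached.
func matchAttr(word string) int {
	alive := [3]bool{true, true, true}
	for i, r := range []rune(word) {
		for j, p := range attrPrefixes {
			if !alive[j] {
				continue
			}
			if i >= len(p) || p[i] != r {
				alive[j] = false
				continue
			}
			if r == '=' {
				return j // full match
			}
		}
	}
	return -1 // no prefix matched in full
}

func main() {
	fmt.Println(matchAttr("src="), matchAttr("srcset="), matchAttr("href="), matchAttr("alt="))
	// 0 1 2 -1
}
```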
@@ -99,49 +128,108 @@ func (l *contentlexer) emit() {
l.start = l.pos
}
-var mainPrefixRunes = []prefixRunes{{'s', 'r', 'c', '='}, {'h', 'r', 'e', 'f', '='}}
+func (a absURLMatcher) isSourceType() bool {
+ return a.prefix == matchPrefixSrc
+}
-type absURLMatcher struct {
- prefix int
- match []byte
- replacement []byte
+func checkCandidateSrc(l *contentlexer) {
+ for _, m := range l.matchers {
+ if !m.isSourceType() {
+ continue
+ }
+ l.replaceSimple(m)
+ }
}
-func (a absURLMatcher) isSourceType() bool {
- return a.prefix == matchPrefixSrc
+func checkCandidateHref(l *contentlexer) {
+ for _, m := range l.matchers {
+ if m.isSourceType() {
+ continue
+ }
+ l.replaceSimple(m)
+ }
}
-func checkCandidate(l *contentlexer) {
- isSource := l.prefixLookup.first == 's'
+func checkCandidateSrcset(l *contentlexer) {
+ // special case, not frequent (I think)
for _, m := range l.matchers {
+ if m.isSourceType() {
+ continue
+ }
- if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
+ if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue
}
- if bytes.HasPrefix(l.content[l.pos:], m.match) {
- // check for schemaless URLs
- posAfter := l.pos + len(m.match)
- if posAfter >= len(l.content) {
- return
- }
- r, _ := utf8.DecodeRune(l.content[posAfter:])
- if r == '/' {
- // schemaless: skip
- return
- }
- if l.pos > l.start {
- l.emit()
- }
- l.pos += len(m.match)
- l.w.Write(m.replacement)
- l.start = l.pos
+ // check for schemaless URLs
+ posAfter := l.pos + len(m.match)
+ if posAfter >= len(l.content) {
+ return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {
+ // schemaless: skip
+ continue
+ }
+
+ posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
+
+ // safeguard: bail out if no closing quote is found within range
+ if posLastQuote < 0 || posLastQuote > 2000 {
return
+ }
+ if l.pos > l.start {
+ l.emit()
}
+
+ section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
+
+ fields := bytes.Fields(section)
+ l.w.Write([]byte(m.quote))
+ for i, f := range fields {
+ if f[0] == '/' {
+ l.w.Write(m.replacementURL)
+ l.w.Write(f[1:])
+
+ } else {
+ l.w.Write(f)
+ }
+
+ if i < len(fields)-1 {
+ l.w.Write([]byte(" "))
+ }
+ }
+
+ l.w.Write(m.quote)
+ l.pos += len(section) + (len(m.quote) * 2)
+ l.start = l.pos
}
}
+func (l *contentlexer) replaceSimple(m absURLMatcher) {
+ if !bytes.HasPrefix(l.content[l.pos:], m.match) {
+ return
+ }
+ // check for schemaless URLs
+ posAfter := l.pos + len(m.match)
+ if posAfter >= len(l.content) {
+ return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {
+ // schemaless: skip
+ return
+ }
+ if l.pos > l.start {
+ l.emit()
+ }
+ l.pos += len(m.match)
+ l.w.Write(m.quote)
+ l.w.Write(m.replacementURL)
+ l.start = l.pos
+}
+
func (l *contentlexer) replace() {
contentLength := len(l.content)
var r rune
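
checkCandidateSrcset above has to rewrite the whole quoted attribute value, so it first locates the closing quote within a bounded window and only then splits the value into fields. A hedged sketch of that scanning step; `srcsetValue` is a hypothetical helper, and the 2000-byte cap is the same safety guard the replacer uses.

```go
package main

import (
	"bytes"
	"fmt"
)

// srcsetValue assumes content is positioned at the opening quote of a srcset
// value; it looks for the closing quote within a bounded window and returns
// the bytes between the quotes.
func srcsetValue(content []byte, pos int, quote []byte) ([]byte, bool) {
	end := bytes.Index(content[pos+1:], quote)
	if end < 0 || end > 2000 {
		return nil, false
	}
	return content[pos+len(quote) : pos+end+1], true
}

func main() {
	content := []byte(`srcset="/a.jpg 200w /b.jpg 700w" alt="x"`)
	pos := bytes.IndexByte(content, '"') // opening quote
	v, ok := srcsetValue(content, pos, []byte(`"`))
	fmt.Println(ok, string(v)) // true /a.jpg 200w /b.jpg 700w
}
```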
@@ -152,7 +240,7 @@ func (l *contentlexer) replace() {
break
}
- var width int = 1
+ var width = 1
r = rune(l.content[l.pos])
if r >= utf8.RuneSelf {
r, width = utf8.DecodeRune(l.content[l.pos:])
@@ -160,14 +248,24 @@ func (l *contentlexer) replace() {
l.width = width
l.pos += l.width
if r == ' ' {
- l.prefixLookup.ms = matchStateWhitespace
- } else if l.prefixLookup.ms != matchStateNone {
+ l.ms = matchStateWhitespace
+ } else if l.ms != matchStateNone {
l.match(r)
- if l.prefixLookup.ms == matchStateFull {
- checkCandidate(l)
+ if l.ms == matchStateFull {
+ var p *prefix
+ for i, m := range l.matches {
+ if m {
+ p = prefixes[i]
+ }
+ l.matches[i] = false
+ }
+ if p == nil {
+ panic("illegal state: curr is nil when state is full")
+ }
+ l.ms = matchStateNone
+ p.f(l)
}
}
-
}
// Done!
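
The replace loop above walks the content as bytes and only pays for utf8.DecodeRune when a byte could start a multi-byte rune, keeping the common ASCII path cheap. A minimal standalone illustration of that pattern; `decodeAll` is a hypothetical helper, not part of the package.

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// decodeAll mirrors the byte/rune walk used by replace(): a plain ASCII byte
// is taken as-is, and DecodeRune is only called for potential multi-byte runes.
func decodeAll(content []byte) []rune {
	var out []rune
	for pos := 0; pos < len(content); {
		r, width := rune(content[pos]), 1
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(content[pos:])
		}
		out = append(out, r)
		pos += width
	}
	return out
}

func main() {
	fmt.Printf("%q\n", decodeAll([]byte("src=ᚠ")))
	// ['s' 'r' 'c' '=' 'ᚠ']
}
```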
@@ -177,15 +275,12 @@ func (l *contentlexer) replace() {
}
func doReplace(ct contentTransformer, matchers []absURLMatcher) {
-
lexer := &contentlexer{
- content: ct.Content(),
- w: ct,
- prefixLookup: &prefixes{pr: mainPrefixRunes},
- matchers: matchers}
+ content: ct.Content(),
+ w: ct,
+ matchers: matchers}
lexer.replace()
-
}
type absURLReplacer struct {
@@ -195,7 +290,7 @@ type absURLReplacer struct {
func newAbsURLReplacer(baseURL string) *absURLReplacer {
u, _ := url.Parse(baseURL)
- base := strings.TrimRight(u.String(), "/")
+ base := []byte(strings.TrimRight(u.String(), "/") + "/")
// HTML
dqHTMLMatch := []byte("\"/")
@@ -205,23 +300,23 @@ func newAbsURLReplacer(baseURL string) *absURLReplacer {
dqXMLMatch := []byte("&#34;/")
sqXMLMatch := []byte("&#39;/")
- dqHTML := []byte("\"" + base + "/")
- sqHTML := []byte("'" + base + "/")
+ dqHTML := []byte("\"")
+ sqHTML := []byte("'")
- dqXML := []byte("&#34;" + base + "/")
- sqXML := []byte("&#39;" + base + "/")
+ dqXML := []byte("&#34;")
+ sqXML := []byte("&#39;")
return &absURLReplacer{
htmlMatchers: []absURLMatcher{
- {matchPrefixSrc, dqHTMLMatch, dqHTML},
- {matchPrefixSrc, sqHTMLMatch, sqHTML},
- {matchPrefixHref, dqHTMLMatch, dqHTML},
- {matchPrefixHref, sqHTMLMatch, sqHTML}},
+ {matchPrefixSrc, dqHTMLMatch, dqHTML, base},
+ {matchPrefixSrc, sqHTMLMatch, sqHTML, base},
+ {matchPrefixHref, dqHTMLMatch, dqHTML, base},
+ {matchPrefixHref, sqHTMLMatch, sqHTML, base}},
xmlMatchers: []absURLMatcher{
- {matchPrefixSrc, dqXMLMatch, dqXML},
- {matchPrefixSrc, sqXMLMatch, sqXML},
- {matchPrefixHref, dqXMLMatch, dqXML},
- {matchPrefixHref, sqXMLMatch, sqXML},
+ {matchPrefixSrc, dqXMLMatch, dqXML, base},
+ {matchPrefixSrc, sqXMLMatch, sqXML, base},
+ {matchPrefixHref, dqXMLMatch, dqXML, base},
+ {matchPrefixHref, sqXMLMatch, sqXML, base},
}}
}
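
The matcher construction above now keeps the quote and the base URL separate instead of pre-concatenating them: simple src/href replacement writes quote plus base, while the srcset path needs the bare quote to re-wrap the rewritten value. A hedged sketch of the resulting layout, using illustrative names rather than the package's own types:

```go
package main

import (
	"fmt"
	"strings"
)

// matcher mirrors the shape of absURLMatcher after this change (names here
// are illustrative, not the package's).
type matcher struct {
	match          []byte // opening quote followed by a root-relative slash, e.g. `"/`
	quote          []byte // the bare quote, e.g. `"`
	replacementURL []byte // base URL with trailing slash, e.g. `http://base/`
}

func htmlMatchers(baseURL string) []matcher {
	base := []byte(strings.TrimRight(baseURL, "/") + "/")
	return []matcher{
		{match: []byte(`"/`), quote: []byte(`"`), replacementURL: base},
		{match: []byte(`'/`), quote: []byte(`'`), replacementURL: base},
	}
}

func main() {
	for _, m := range htmlMatchers("http://base") {
		// a simple src/href replacement writes quote + base in place of quote + "/"
		fmt.Printf("%s -> %s%s\n", m.match, m.quote, m.replacementURL)
	}
}
```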
diff --git a/transform/chain_test.go b/transform/chain_test.go
index 683ac77c1..242d571e7 100644
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -25,8 +25,42 @@ const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᛁᚱᚪ ᚷᛖ
// Issue: 816, schemaless links combined with others
const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML = `Pre. src=&#34;//schemaless&#34; src=&#34;/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
-const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#34;//schemaless&#34; src=&#34;http://base/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src=&#39;//schemaless&#39; src=&#39;/normal&#39; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#39;//schemaless&#39; src=&#39;http://base/normal&#39; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+
+// srcset=
+const SRCSET_BASIC = `Pre. <img srcset="/img/small.jpg 200w /img/big.jpg 700w" alt="text" src="/img/foo.jpg">`
+const SRCSET_BASIC_CORRECT = `Pre. <img srcset="http://base/img/small.jpg 200w http://base/img/big.jpg 700w" alt="text" src="http://base/img/foo.jpg">`
+const SRCSET_SINGLE_QUOTE = `Pre. <img srcset='/img/small.jpg 200w /img/big.jpg 700w' alt="text" src="/img/foo.jpg"> POST.`
+const SRCSET_SINGLE_QUOTE_CORRECT = `Pre. <img srcset='http://base/img/small.jpg 200w http://base/img/big.jpg 700w' alt="text" src="http://base/img/foo.jpg"> POST.`
+const SRCSET_XML_BASIC = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
+const SRCSET_XML_BASIC_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
+const SRCSET_XML_SINGLE_QUOTE = `Pre. <img srcset=&#34;/img/small.jpg 200w /img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;/img/foo.jpg&#34;>`
+const SRCSET_XML_SINGLE_QUOTE_CORRECT = `Pre. <img srcset=&#34;http://base/img/small.jpg 200w http://base/img/big.jpg 700w&#34; alt=&#34;text&#34; src=&#34;http://base/img/foo.jpg&#34;>`
+const SRCSET_VARIATIONS = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='/img/foo.jpg'> FOO.
+<img srcset='/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_VARIATIONS_CORRECT = `Pre.
+Missing start quote: <img srcset=/img/small.jpg 200w /img/big.jpg 700w" alt="text"> src='http://base/img/foo.jpg'> FOO.
+<img srcset='http://base/img.jpg'>
+schemaless: <img srcset='//img.jpg' src='//basic.jpg'>
+schemaless2: <img srcset="//img.jpg" src="//basic.jpg2> POST
+`
+const SRCSET_XML_VARIATIONS = `Pre.
+Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;/img/foo.jpg&#39;&gt; FOO.
+&lt;img srcset=&#39;/img.jpg&#39;&gt;
+schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
+schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
+`
+const SRCSET_XML_VARIATIONS_CORRECT = `Pre.
+Missing start quote: &lt;img srcset=/img/small.jpg 200w /img/big.jpg 700w&quot; alt=&quot;text&quot;&gt; src=&#39;http://base/img/foo.jpg&#39;&gt; FOO.
+&lt;img srcset=&#39;http://base/img.jpg&#39;&gt;
+schemaless: &lt;img srcset=&#39;//img.jpg&#39; src=&#39;//basic.jpg&#39;&gt;
+schemaless2: &lt;img srcset=&quot;//img.jpg&quot; src=&quot;//basic.jpg2&gt; POST
+`
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
@@ -46,6 +80,12 @@ var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests
var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
+var srcset_tests = []test{{SRCSET_BASIC, SRCSET_BASIC_CORRECT}, {SRCSET_SINGLE_QUOTE, SRCSET_SINGLE_QUOTE_CORRECT}, {SRCSET_VARIATIONS, SRCSET_VARIATIONS_CORRECT}}
+var srcset_xml_tests = []test{
+ {SRCSET_XML_BASIC, SRCSET_XML_BASIC_CORRECT},
+ {SRCSET_XML_SINGLE_QUOTE, SRCSET_XML_SINGLE_QUOTE_CORRECT},
+ {SRCSET_XML_VARIATIONS, SRCSET_XML_VARIATIONS_CORRECT}}
+
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()
in := new(bytes.Buffer)
@@ -102,6 +142,21 @@ func TestAbsURL(t *testing.T) {
}
+func TestAbsURLSrcSet(t *testing.T) {
+ absURL, _ := absURLFromURL("http://base")
+ tr := NewChain(absURL...)
+
+ apply(t.Errorf, tr, srcset_tests)
+}
+
+func TestAbsXMLURLSrcSet(t *testing.T) {
+ absURLInXML, _ := absURLInXMLFromURL("http://base")
+ tr := NewChain(absURLInXML...)
+
+ apply(t.Errorf, tr, srcset_xml_tests)
+
+}
+
func BenchmarkXMLAbsURL(b *testing.B) {
absURLInXML, _ := absURLInXMLFromURL("http://base")
tr := NewChain(absURLInXML...)
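
For completeness, a hedged example of exercising the new srcset handling with the helpers already used in this test file (absURLFromURL, NewChain, apply, and the test struct). TestAbsURLSrcSetBasic is a hypothetical addition and assumes it lives alongside the tests above in transform/chain_test.go.

```go
func TestAbsURLSrcSetBasic(t *testing.T) {
	// build the same chain the srcset tests above use
	absURL, _ := absURLFromURL("http://base")
	tr := NewChain(absURL...)

	// a single fixture: root-relative srcset entries gain the base URL
	apply(t.Errorf, tr, []test{
		{SRCSET_BASIC, SRCSET_BASIC_CORRECT},
	})
}
```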