author     bep <bjorn.erik.pedersen@gmail.com>  2015-02-17 04:33:44 +0100
committer  bep <bjorn.erik.pedersen@gmail.com>  2015-02-17 11:07:24 +0100
commit     c2e29138728ffbae19b4b0a3e59503c47dd986a5 (patch)
tree       407b3eb0e80bc5ff568236d1e021147d62c563e5 /transform
parent     6b28e38cea0dec3e3f045ab8ec833608b91a946f (diff)
Fix canonifyurl vs schemaless links
And looks even faster. Compared to previous attempt:

benchmark            old ns/op    new ns/op    delta
BenchmarkAbsUrl      30902        27206        -11.96%
BenchmarkXmlAbsUrl   15389        14216        -7.62%

benchmark            old allocs   new allocs   delta
BenchmarkAbsUrl      33           28           -15.15%
BenchmarkXmlAbsUrl   16           14           -12.50%

benchmark            old bytes    new bytes    delta
BenchmarkAbsUrl      4167         3504         -15.91%
BenchmarkXmlAbsUrl   2057         2048         -0.44%

Compared to before I started all of this:

benchmark            old ns/op    new ns/op    delta
BenchmarkAbsUrl      36219        27206        -24.88%
BenchmarkXmlAbsUrl   23903        14216        -40.53%

benchmark            old allocs   new allocs   delta
BenchmarkAbsUrl      60           28           -53.33%
BenchmarkXmlAbsUrl   30           14           -53.33%

benchmark            old bytes    new bytes    delta
BenchmarkAbsUrl      5842         3504         -40.02%
BenchmarkXmlAbsUrl   3754         2048         -45.44%

Fixes #816
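The core of the change is visible in checkCandidate below: instead of emitting separate guard tokens for "//" prefixes and filtering them out in a second pass, the lexer now looks one rune past a matched quote-plus-slash prefix and bails out when that rune is another '/', i.e. when the link is protocol-relative (schemaless). A minimal standalone sketch of that check; the function name, signature, and the driver in main are illustrative only, not the transform package's API:

package main

import (
	"bytes"
	"fmt"
	"unicode/utf8"
)

// skipCandidate reports whether a matched prefix at pos should be left
// untouched: either nothing follows the match, or the next rune is another
// '/', meaning the URL is protocol-relative ("//host/...") and must not get
// the base URL spliced in.
func skipCandidate(content, match []byte, pos int) bool {
	if !bytes.HasPrefix(content[pos:], match) {
		return false
	}
	after := pos + len(match)
	if after >= len(content) {
		return true
	}
	r, _ := utf8.DecodeRune(content[after:])
	return r == '/'
}

func main() {
	content := []byte(`<a href="//cdn.example.org/x">x</a> <a href="/about">about</a>`)
	match := []byte(`"/`) // the double-quote HTML matcher

	first := bytes.Index(content, match)
	second := first + 1 + bytes.Index(content[first+1:], match)

	fmt.Println(skipCandidate(content, match, first))  // true:  //cdn... is left alone
	fmt.Println(skipCandidate(content, match, second)) // false: /about gets absolutized
}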
Diffstat (limited to 'transform')
-rw-r--r--  transform/absurlreplacer.go  108
-rw-r--r--  transform/chain_test.go       12
2 files changed, 46 insertions(+), 74 deletions(-)
diff --git a/transform/absurlreplacer.go b/transform/absurlreplacer.go
index 7b6f72379..f558fcaae 100644
--- a/transform/absurlreplacer.go
+++ b/transform/absurlreplacer.go
@@ -37,11 +37,6 @@ const (
tHrefdq
tSrcsq
tHrefsq
- // guards
- tGrcdq
- tGhrefdq
- tGsrcsq
- tGhrefsq
)
type contentlexer struct {
@@ -130,24 +125,6 @@ var itemSlicePool = &sync.Pool{
},
}
-func replace(content []byte, matchers []absurlMatcher) *contentlexer {
- var items []item
- if x := itemSlicePool.Get(); x != nil {
- items = x.([]item)[:0]
- defer itemSlicePool.Put(items)
- } else {
- items = make([]item, 0, 8)
- }
-
- lexer := &contentlexer{content: content,
- items: items,
- prefixLookup: &prefixes{pr: mainPrefixRunes},
- matchers: matchers}
-
- lexer.runReplacer()
- return lexer
-}
-
func (l *contentlexer) runReplacer() {
for l.state = lexReplacements; l.state != nil; {
l.state = l.state(l)
@@ -156,11 +133,8 @@ func (l *contentlexer) runReplacer() {
type absurlMatcher struct {
replaceType itemType
- guardType itemType
match []byte
- guard []byte
replacement []byte
- guarded bool
}
func (a absurlMatcher) isSourceType() bool {
@@ -207,24 +181,21 @@ func checkCandidate(l *contentlexer) {
isSource := l.prefixLookup.first == 's'
for _, m := range l.matchers {
- if m.guarded {
- continue
- }
-
if isSource && !m.isSourceType() || !isSource && m.isSourceType() {
continue
}
- s := l.content[l.pos:]
- if bytes.HasPrefix(s, m.guard) {
- if l.pos > l.start {
- l.emit(tText)
+ if bytes.HasPrefix(l.content[l.pos:], m.match) {
+ // check for schemaless urls
+ posAfter := pos(int(l.pos) + len(m.match))
+ if int(posAfter) >= len(l.content) {
+ return
+ }
+ r, _ := utf8.DecodeRune(l.content[posAfter:])
+ if r == '/' {
+ // schemaless: skip
+ return
}
- l.pos += pos(len(m.guard))
- l.emit(m.guardType)
- m.guarded = true
- return
- } else if bytes.HasPrefix(s, m.match) {
if l.pos > l.start {
l.emit(tText)
}
@@ -240,31 +211,30 @@ func doReplace(content []byte, matchers []absurlMatcher) []byte {
b := bp.GetBuffer()
defer bp.PutBuffer(b)
- guards := make([]bool, len(matchers))
- replaced := replace(content, matchers)
-
- // first pass: check guards
- for _, item := range replaced.items {
- if item.typ != tText {
- for i, e := range matchers {
- if item.typ == e.guardType {
- guards[i] = true
- break
- }
- }
- }
+ var items []item
+ if x := itemSlicePool.Get(); x != nil {
+ items = x.([]item)[:0]
+ defer itemSlicePool.Put(items)
+ } else {
+ items = make([]item, 0, 8)
}
- // second pass: do replacements for non-guarded tokens
- for _, token := range replaced.items {
+
+ lexer := &contentlexer{content: content,
+ items: items,
+ prefixLookup: &prefixes{pr: mainPrefixRunes},
+ matchers: matchers}
+
+ lexer.runReplacer()
+
+ for _, token := range lexer.items {
switch token.typ {
case tText:
b.Write(token.val)
default:
- for i, e := range matchers {
- if token.typ == e.replaceType && !guards[i] {
+ for _, e := range matchers {
+ if token.typ == e.replaceType {
b.Write(e.replacement)
- } else if token.typ == e.replaceType || token.typ == e.guardType {
- b.Write(token.val)
+ break
}
}
}
@@ -286,16 +256,10 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
dqHtmlMatch := []byte("\"/")
sqHtmlMatch := []byte("'/")
- dqGuard := []byte("\"//")
- sqGuard := []byte("'//")
-
// XML
dqXmlMatch := []byte("&#34;/")
sqXmlMatch := []byte("&#39;/")
- dqXmlGuard := []byte("&#34;//")
- sqXmlGuard := []byte("&#39;//")
-
dqHtml := []byte("\"" + base + "/")
sqHtml := []byte("'" + base + "/")
@@ -303,15 +267,15 @@ func newAbsurlReplacer(baseUrl string) *absurlReplacer {
sqXml := []byte("&#39;" + base + "/")
return &absurlReplacer{htmlMatchers: []absurlMatcher{
- {tSrcdq, tGrcdq, dqHtmlMatch, dqGuard, dqHtml, false},
- {tSrcsq, tGsrcsq, sqHtmlMatch, sqGuard, sqHtml, false},
- {tHrefdq, tGhrefdq, dqHtmlMatch, dqGuard, dqHtml, false},
- {tHrefsq, tGhrefsq, sqHtmlMatch, sqGuard, sqHtml, false}},
+ {tSrcdq, dqHtmlMatch, dqHtml},
+ {tSrcsq, sqHtmlMatch, sqHtml},
+ {tHrefdq, dqHtmlMatch, dqHtml},
+ {tHrefsq, sqHtmlMatch, sqHtml}},
xmlMatchers: []absurlMatcher{
- {tSrcdq, tGrcdq, dqXmlMatch, dqXmlGuard, dqXml, false},
- {tSrcsq, tGsrcsq, sqXmlMatch, sqXmlGuard, sqXml, false},
- {tHrefdq, tGhrefdq, dqXmlMatch, dqXmlGuard, dqXml, false},
- {tHrefsq, tGhrefsq, sqXmlMatch, sqXmlGuard, sqXml, false},
+ {tSrcdq, dqXmlMatch, dqXml},
+ {tSrcsq, sqXmlMatch, sqXml},
+ {tHrefdq, dqXmlMatch, dqXml},
+ {tHrefsq, sqXmlMatch, sqXml},
}}
}
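Taken together, the absurlreplacer.go changes above reduce doReplace to a single lexer pass followed by a straight token walk, with schemaless links passing through as plain text. A self-contained sketch of the observable behaviour for the double-quote HTML matcher; it ignores the src=/href= prefix detection the real lexer performs, and absify with its hard-coded match bytes is illustrative only:

package main

import (
	"bytes"
	"fmt"
)

// absify naively reproduces what the patched doReplace achieves for one
// matcher pair: every `"/` gets the base URL spliced in, while `"//`
// (protocol-relative) is copied through untouched. It skips the lexer's
// src=/href= prefix detection entirely, so it is only a behavioural sketch.
func absify(content []byte, base string) []byte {
	match := []byte(`"/`)
	repl := []byte(`"` + base + `/`)

	var out bytes.Buffer
	for {
		i := bytes.Index(content, match)
		if i < 0 {
			out.Write(content)
			return out.Bytes()
		}
		out.Write(content[:i])
		rest := content[i:]
		if bytes.HasPrefix(rest, []byte(`"//`)) {
			// schemaless: emit the quote and keep scanning after it
			out.WriteByte('"')
			content = rest[1:]
			continue
		}
		out.Write(repl)
		content = rest[len(match):]
	}
}

func main() {
	in := []byte(`<a href="/docs/">docs</a> <img src="//cdn.example.org/a.png">`)
	fmt.Println(string(absify(in, "http://base")))
	// <a href="http://base/docs/">docs</a> <img src="//cdn.example.org/a.png">
}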
diff --git a/transform/chain_test.go b/transform/chain_test.go
index a88d84533..cd7749ec7 100644
--- a/transform/chain_test.go
+++ b/transform/chain_test.go
@@ -21,6 +21,12 @@ const H5_XML_CONTENT_GUARDED = "<?xml version=\"1.0\" encoding=\"utf-8\" standal
const REPLACE_1 = "No replacements."
const REPLACE_2 = "ᚠᛇᚻ ᛒᛦᚦ ᚠᚱᚩᚠᚢᚱ\nᚠᛁᚱᚪ ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
+// Issue: 816, schemaless links combined with others
+const REPLACE_SCHEMALESS_HTML = `Pre. src='//schemaless' src='/normal' <a href="//schemaless">Schemaless</a>. <a href="/normal">normal</a>. Post.`
+const REPLACE_SCHEMALESS_HTML_CORRECT = `Pre. src='//schemaless' src='http://base/normal' <a href="//schemaless">Schemaless</a>. <a href="http://base/normal">normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML = `Pre. src=&#34;//schemaless&#34; src=&#34;/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;/normal&#39;>normal</a>. Post.`
+const REPLACE_SCHEMALESS_XML_CORRECT = `Pre. src=&#34;//schemaless&#34; src=&#34;http://base/normal&#34; <a href=&#39;//schemaless&#39;>Schemaless</a>. <a href=&#39;http://base/normal&#39;>normal</a>. Post.`
+
var abs_url_bench_tests = []test{
{H5_JS_CONTENT_DOUBLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_DQ},
{H5_JS_CONTENT_SINGLE_QUOTE, CORRECT_OUTPUT_SRC_HREF_SQ},
@@ -34,8 +40,10 @@ var xml_abs_url_bench_tests = []test{
}
var sanity_tests = []test{{REPLACE_1, REPLACE_1}, {REPLACE_2, REPLACE_2}}
-var abs_url_tests = append(abs_url_bench_tests, sanity_tests...)
-var xml_abs_url_tests = append(xml_abs_url_bench_tests, sanity_tests...)
+var extra_tests_html = []test{{REPLACE_SCHEMALESS_HTML, REPLACE_SCHEMALESS_HTML_CORRECT}}
+var abs_url_tests = append(abs_url_bench_tests, append(sanity_tests, extra_tests_html...)...)
+var extra_tests_xml = []test{{REPLACE_SCHEMALESS_XML, REPLACE_SCHEMALESS_XML_CORRECT}}
+var xml_abs_url_tests = append(xml_abs_url_bench_tests, append(sanity_tests, extra_tests_xml...)...)
func TestChainZeroTransformers(t *testing.T) {
tr := NewChain()