author    Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>  2022-07-07 16:11:47 +0200
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>  2022-07-09 16:03:11 +0200
commit    223bf2800488ad5d38854bbb595d789bc35ebe32
tree      84b04f8f50b4450cf5f87943befe31fd9d7b8b90
parent    72b0ccdb010fcdfeb3bb4a955d4fc04529816c0d
parser/pageparser: Don't store the byte slices
On its own this change doesn't do any magic, but it is part of a bigger effort to make Hugo leaner in the memory usage department.
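The gist of the change, as a minimal sketch (simplified stand-in types, not Hugo's exact definitions): instead of every parsed item carrying its own `Val []byte`, an item becomes a pair of offsets into the shared source, and the bytes are sliced out only when a caller asks for them.

```go
package main

import "fmt"

// Before: each item pinned (or copied) its own slice of the source.
type itemBefore struct {
	Val []byte
}

// After: an item is just a pair of offsets; the value is resolved on
// demand against the one shared source buffer.
type itemAfter struct {
	low, high int
}

func (i itemAfter) Val(source []byte) []byte { return source[i.low:i.high] }

func main() {
	source := []byte("{{< hello >}}")
	it := itemAfter{low: 4, high: 9}
	fmt.Printf("%s\n", it.Val(source)) // hello
}
```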
-rw-r--r--  .github/workflows/test.yml                     |   1
-rw-r--r--  hugolib/page.go                                |  24
-rw-r--r--  hugolib/page__content.go                       |   2
-rw-r--r--  hugolib/shortcode.go                           |  26
-rw-r--r--  hugolib/shortcode_test.go                      |   6
-rw-r--r--  parser/pageparser/item.go                      |  71
-rw-r--r--  parser/pageparser/item_test.go                 |  27
-rw-r--r--  parser/pageparser/pagelexer.go                 |  54
-rw-r--r--  parser/pageparser/pageparser.go                |  79
-rw-r--r--  parser/pageparser/pageparser_intro_test.go     | 118
-rw-r--r--  parser/pageparser/pageparser_main_test.go      |  22
-rw-r--r--  parser/pageparser/pageparser_shortcode_test.go | 140
-rw-r--r--  parser/pageparser/pageparser_test.go           |  13
13 files changed, 385 insertions, 198 deletions
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1403c4d57..e1b78e1a5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -50,6 +50,7 @@ jobs:
- if: matrix.os == 'windows-latest'
run: |
choco install pandoc
+ choco install mingw --version 10.2.0 --allow-downgrade
- run: pandoc -v
- if: matrix.os == 'ubuntu-latest'
name: Install dart-sass-embedded Linux
diff --git a/hugolib/page.go b/hugolib/page.go
index e37b47300..4752d11f1 100644
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -639,7 +639,7 @@ func (p *pageState) mapContentForResult(
if fe, ok := err.(herrors.FileError); ok {
return fe
}
- return p.parseError(err, iter.Input(), i.Pos)
+ return p.parseError(err, result.Input(), i.Pos())
}
// the parser is guaranteed to return items in proper order or fail, so …
@@ -656,14 +656,14 @@ Loop:
case it.Type == pageparser.TypeIgnore:
case it.IsFrontMatter():
f := pageparser.FormatFromFrontMatterType(it.Type)
- m, err := metadecoders.Default.UnmarshalToMap(it.Val, f)
+ m, err := metadecoders.Default.UnmarshalToMap(it.Val(result.Input()), f)
if err != nil {
if fe, ok := err.(herrors.FileError); ok {
pos := fe.Position()
// Apply the error to the content file.
pos.Filename = p.File().Filename()
// Offset the starting position of front matter.
- offset := iter.LineNumber() - 1
+ offset := iter.LineNumber(result.Input()) - 1
if f == metadecoders.YAML {
offset -= 1
}
@@ -687,7 +687,7 @@ Loop:
next := iter.Peek()
if !next.IsDone() {
- p.source.posMainContent = next.Pos
+ p.source.posMainContent = next.Pos()
}
if !p.s.shouldBuild(p) {
@@ -699,10 +699,10 @@ Loop:
posBody := -1
f := func(item pageparser.Item) bool {
if posBody == -1 && !item.IsDone() {
- posBody = item.Pos
+ posBody = item.Pos()
}
- if item.IsNonWhitespace() {
+ if item.IsNonWhitespace(result.Input()) {
p.truncated = true
// Done
@@ -712,7 +712,7 @@ Loop:
}
iter.PeekWalk(f)
- p.source.posSummaryEnd = it.Pos
+ p.source.posSummaryEnd = it.Pos()
p.source.posBodyStart = posBody
p.source.hasSummaryDivider = true
@@ -727,13 +727,13 @@ Loop:
// let extractShortcode handle left delim (will do so recursively)
iter.Backup()
- currShortcode, err := s.extractShortcode(ordinal, 0, iter)
+ currShortcode, err := s.extractShortcode(ordinal, 0, result.Input(), iter)
if err != nil {
return fail(err, it)
}
- currShortcode.pos = it.Pos
- currShortcode.length = iter.Current().Pos - it.Pos
+ currShortcode.pos = it.Pos()
+ currShortcode.length = iter.Current().Pos() - it.Pos()
if currShortcode.placeholder == "" {
currShortcode.placeholder = createShortcodePlaceholder("s", currShortcode.ordinal)
}
@@ -754,7 +754,7 @@ Loop:
rn.AddShortcode(currShortcode)
case it.Type == pageparser.TypeEmoji:
- if emoji := helpers.Emoji(it.ValStr()); emoji != nil {
+ if emoji := helpers.Emoji(it.ValStr(result.Input())); emoji != nil {
rn.AddReplacement(emoji, it)
} else {
rn.AddBytes(it)
@@ -762,7 +762,7 @@ Loop:
case it.IsEOF():
break Loop
case it.IsError():
- err := fail(errors.New(it.ValStr()), it)
+ err := fail(errors.New(it.ValStr(result.Input())), it)
currShortcode.err = err
return err
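A consequence visible throughout page.go: positions are now plain offsets, so helpers that used to read an item's own bytes need the source handed in. A small self-contained sketch of the `LineNumber(source)` pattern (a hypothetical wrapper mirroring the `bytes.Count` logic in the new iterator code):

```go
package main

import (
	"bytes"
	"fmt"
)

// lineNumber is a stand-in for the new Iterator.LineNumber(source):
// line numbers can no longer be derived from the item alone, so they
// are counted from the shared source up to the item's offset.
func lineNumber(source []byte, pos int) int {
	return bytes.Count(source[:pos], []byte("\n")) + 1
}

func main() {
	source := []byte("---\nfoo: \"bar\"\n---\nSome text.\n")
	pos := bytes.Index(source, []byte("Some")) // offset of the body text
	fmt.Println(lineNumber(source, pos))       // 4
}
```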
diff --git a/hugolib/page__content.go b/hugolib/page__content.go
index bf69fafcd..a721d1fce 100644
--- a/hugolib/page__content.go
+++ b/hugolib/page__content.go
@@ -45,7 +45,7 @@ func (p pageContent) contentToRender(parsed pageparser.Result, pm *pageContentMa
for _, it := range pm.items {
switch v := it.(type) {
case pageparser.Item:
- c = append(c, source[v.Pos:v.Pos+len(v.Val)]...)
+ c = append(c, source[v.Pos():v.Pos()+len(v.Val(source))]...)
case pageContentReplacement:
c = append(c, v.val...)
case *shortcode:
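The rebuild step above is the payoff: `contentToRender` reassembles the output by slicing each item's span back out of the source rather than keeping per-item copies. A hedged stand-alone sketch of that pattern (simplified `item` type, not the real one):

```go
package main

import "fmt"

// Simplified stand-in for pageparser.Item.
type item struct{ low, high int }

func (i item) Pos() int              { return i.low }
func (i item) Val(src []byte) []byte { return src[i.low:i.high] }

func main() {
	source := []byte("Hello {{< sc >}} world")
	// The text items around a shortcode; the shortcode itself is
	// rendered separately and spliced in elsewhere.
	items := []item{{0, 6}, {17, 22}}
	var c []byte
	for _, v := range items {
		c = append(c, source[v.Pos():v.Pos()+len(v.Val(source))]...)
	}
	fmt.Printf("%s\n", c) // "Hello world"
}
```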
diff --git a/hugolib/shortcode.go b/hugolib/shortcode.go
index 33767fc68..1627acacb 100644
--- a/hugolib/shortcode.go
+++ b/hugolib/shortcode.go
@@ -509,7 +509,7 @@ func (s *shortcodeHandler) parseError(err error, input []byte, pos int) error {
// pageTokens state:
// - before: positioned just before the shortcode start
// - after: shortcode(s) consumed (plural when they are nested)
-func (s *shortcodeHandler) extractShortcode(ordinal, level int, pt *pageparser.Iterator) (*shortcode, error) {
+func (s *shortcodeHandler) extractShortcode(ordinal, level int, source []byte, pt *pageparser.Iterator) (*shortcode, error) {
if s == nil {
panic("handler nil")
}
@@ -520,7 +520,7 @@ func (s *shortcodeHandler) extractShortcode(ordinal, level int, pt *pageparser.I
pt.Backup()
item := pt.Next()
if item.IsIndentation() {
- sc.indentation = string(item.Val)
+ sc.indentation = item.ValStr(source)
}
}
@@ -530,7 +530,7 @@ func (s *shortcodeHandler) extractShortcode(ordinal, level int, pt *pageparser.I
const errorPrefix = "failed to extract shortcode"
fail := func(err error, i pageparser.Item) error {
- return s.parseError(fmt.Errorf("%s: %w", errorPrefix, err), pt.Input(), i.Pos)
+ return s.parseError(fmt.Errorf("%s: %w", errorPrefix, err), source, i.Pos())
}
Loop:
@@ -550,7 +550,7 @@ Loop:
if cnt > 0 {
// nested shortcode; append it to inner content
pt.Backup()
- nested, err := s.extractShortcode(nestedOrdinal, nextLevel, pt)
+ nested, err := s.extractShortcode(nestedOrdinal, nextLevel, source, pt)
nestedOrdinal++
if nested != nil && nested.name != "" {
s.addName(nested.name)
@@ -589,7 +589,7 @@ Loop:
// return that error, more specific
continue
}
- return sc, fail(fmt.Errorf("shortcode %q has no .Inner, yet a closing tag was provided", next.Val), next)
+ return sc, fail(fmt.Errorf("shortcode %q has no .Inner, yet a closing tag was provided", next.ValStr(source)), next)
}
}
if next.IsRightShortcodeDelim() {
@@ -602,11 +602,11 @@ Loop:
return sc, nil
case currItem.IsText():
- sc.inner = append(sc.inner, currItem.ValStr())
+ sc.inner = append(sc.inner, currItem.ValStr(source))
case currItem.Type == pageparser.TypeEmoji:
// TODO(bep) avoid the duplication of these "text cases", to prevent
// more of #6504 in the future.
- val := currItem.ValStr()
+ val := currItem.ValStr(source)
if emoji := helpers.Emoji(val); emoji != nil {
sc.inner = append(sc.inner, string(emoji))
} else {
@@ -614,7 +614,7 @@ Loop:
}
case currItem.IsShortcodeName():
- sc.name = currItem.ValStr()
+ sc.name = currItem.ValStr(source)
// Used to check if the template expects inner content.
templs := s.s.Tmpl().LookupVariants(sc.name)
@@ -625,7 +625,7 @@ Loop:
sc.info = templs[0].(tpl.Info)
sc.templs = templs
case currItem.IsInlineShortcodeName():
- sc.name = currItem.ValStr()
+ sc.name = currItem.ValStr(source)
sc.isInline = true
case currItem.IsShortcodeParam():
if !pt.IsValueNext() {
@@ -634,11 +634,11 @@ Loop:
// named params
if sc.params == nil {
params := make(map[string]any)
- params[currItem.ValStr()] = pt.Next().ValTyped()
+ params[currItem.ValStr(source)] = pt.Next().ValTyped(source)
sc.params = params
} else {
if params, ok := sc.params.(map[string]any); ok {
- params[currItem.ValStr()] = pt.Next().ValTyped()
+ params[currItem.ValStr(source)] = pt.Next().ValTyped(source)
} else {
return sc, errShortCodeIllegalState
}
@@ -647,11 +647,11 @@ Loop:
// positional params
if sc.params == nil {
var params []any
- params = append(params, currItem.ValTyped())
+ params = append(params, currItem.ValTyped(source))
sc.params = params
} else {
if params, ok := sc.params.([]any); ok {
- params = append(params, currItem.ValTyped())
+ params = append(params, currItem.ValTyped(source))
sc.params = params
} else {
return sc, errShortCodeIllegalState
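Shortcode params are now resolved from `source` first and then coerced via `ValTyped(source)`. A rough sketch of that coercion, with the caveat that the real `ValTyped` also special-cases quoted values (`isString`) and the exact coercion order here is illustrative:

```go
package main

import (
	"fmt"
	"strconv"
)

// valTyped sketches the type coercion applied to shortcode params once
// the raw bytes have been resolved from the source.
func valTyped(raw string) any {
	if i, err := strconv.Atoi(raw); err == nil {
		return i
	}
	if f, err := strconv.ParseFloat(raw, 64); err == nil {
		return f
	}
	if b, err := strconv.ParseBool(raw); err == nil {
		return b
	}
	return raw
}

func main() {
	for _, raw := range []string{"true", "33", "3.14", "314x"} {
		v := valTyped(raw)
		fmt.Printf("%v (%T)\n", v, v) // matches the typed-params test output
	}
}
```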
diff --git a/hugolib/shortcode_test.go b/hugolib/shortcode_test.go
index 5b8a5c295..3f9190962 100644
--- a/hugolib/shortcode_test.go
+++ b/hugolib/shortcode_test.go
@@ -112,7 +112,7 @@ title: "Shortcodes Galore!"
handler := newShortcodeHandler(nil, s)
iter := p.Iterator()
- short, err := handler.extractShortcode(0, 0, iter)
+ short, err := handler.extractShortcode(0, 0, p.Input(), iter)
test.check(c, short, err)
})
@@ -763,7 +763,7 @@ title: "Hugo Rocks!"
)
}
-func TestShortcodeTypedParams(t *testing.T) {
+func TestShortcodeParams(t *testing.T) {
t.Parallel()
c := qt.New(t)
@@ -778,6 +778,7 @@ title: "Hugo Rocks!"
types positional: {{< hello true false 33 3.14 >}}
types named: {{< hello b1=true b2=false i1=33 f1=3.14 >}}
types string: {{< hello "true" trues "33" "3.14" >}}
+escaped quote: {{< hello "hello \"world\"." >}}
`).WithTemplatesAdded(
@@ -796,6 +797,7 @@ Get: {{ printf "%v (%T)" $b1 $b1 | safeHTML }}
"types positional: - 0: true (bool) - 1: false (bool) - 2: 33 (int) - 3: 3.14 (float64)",
"types named: - b1: true (bool) - b2: false (bool) - f1: 3.14 (float64) - i1: 33 (int) Get: true (bool) ",
"types string: - 0: true (string) - 1: trues (string) - 2: 33 (string) - 3: 3.14 (string) ",
+ "hello &#34;world&#34;. (string)",
)
}
diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go
index 52546be41..2083be70a 100644
--- a/parser/pageparser/item.go
+++ b/parser/pageparser/item.go
@@ -22,21 +22,59 @@ import (
"github.com/yuin/goldmark/util"
)
+type lowHigh struct {
+ Low int
+ High int
+}
+
type Item struct {
- Type ItemType
- Pos int
- Val []byte
+ Type ItemType
+ Err error
+
+ // The common case is a single segment.
+ low int
+ high int
+
+ // This is the uncommon case.
+ segments []lowHigh
+
+ // Used for validation.
+ firstByte byte
+
isString bool
}
type Items []Item
-func (i Item) ValStr() string {
- return string(i.Val)
+func (i Item) Pos() int {
+ if len(i.segments) > 0 {
+ return i.segments[0].Low
+ }
+ return i.low
+}
+
+func (i Item) Val(source []byte) []byte {
+ if len(i.segments) == 0 {
+ return source[i.low:i.high]
+ }
+
+ if len(i.segments) == 1 {
+ return source[i.segments[0].Low:i.segments[0].High]
+ }
+
+ var b bytes.Buffer
+ for _, s := range i.segments {
+ b.Write(source[s.Low:s.High])
+ }
+ return b.Bytes()
+}
+
+func (i Item) ValStr(source []byte) string {
+ return string(i.Val(source))
}
-func (i Item) ValTyped() any {
- str := i.ValStr()
+func (i Item) ValTyped(source []byte) any {
+ str := i.ValStr(source)
if i.isString {
// A quoted value that is a string even if it looks like a number etc.
return str
@@ -73,8 +111,8 @@ func (i Item) IsIndentation() bool {
return i.Type == tIndentation
}
-func (i Item) IsNonWhitespace() bool {
- return len(bytes.TrimSpace(i.Val)) > 0
+func (i Item) IsNonWhitespace(source []byte) bool {
+ return len(bytes.TrimSpace(i.Val(source))) > 0
}
func (i Item) IsShortcodeName() bool {
@@ -125,20 +163,21 @@ func (i Item) IsError() bool {
return i.Type == tError
}
-func (i Item) String() string {
+func (i Item) ToString(source []byte) string {
+ val := i.Val(source)
switch {
case i.Type == tEOF:
return "EOF"
case i.Type == tError:
- return string(i.Val)
+ return string(val)
case i.Type == tIndentation:
- return fmt.Sprintf("%s:[%s]", i.Type, util.VisualizeSpaces(i.Val))
+ return fmt.Sprintf("%s:[%s]", i.Type, util.VisualizeSpaces(val))
case i.Type > tKeywordMarker:
- return fmt.Sprintf("<%s>", i.Val)
- case len(i.Val) > 50:
- return fmt.Sprintf("%v:%.20q...", i.Type, i.Val)
+ return fmt.Sprintf("<%s>", val)
+ case len(val) > 50:
+ return fmt.Sprintf("%v:%.20q...", i.Type, val)
}
- return fmt.Sprintf("%v:[%s]", i.Type, i.Val)
+ return fmt.Sprintf("%v:[%s]", i.Type, val)
}
type ItemType int
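`Val(source)` is the heart of the new `Item`: one `[low, high)` span in the common case, and a list of `lowHigh` segments when escape characters were elided from the value. A runnable sketch of the multi-segment path:

```go
package main

import (
	"bytes"
	"fmt"
)

type lowHigh struct{ Low, High int }

// Simplified item: the common case is a single span; escaped values
// fall back to segments that are stitched together on demand.
type item struct {
	low, high int
	segments  []lowHigh
}

func (i item) Val(source []byte) []byte {
	if len(i.segments) == 0 {
		return source[i.low:i.high]
	}
	var b bytes.Buffer
	for _, s := range i.segments {
		b.Write(source[s.Low:s.High])
	}
	return b.Bytes()
}

func main() {
	source := []byte(`say \"hi\"`)
	// Three segments that skip the backslashes at offsets 4 and 8.
	it := item{segments: []lowHigh{{0, 4}, {5, 8}, {9, 10}}}
	fmt.Printf("%s\n", it.Val(source)) // say "hi"
}
```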
diff --git a/parser/pageparser/item_test.go b/parser/pageparser/item_test.go
index cd01202c6..db4cc127a 100644
--- a/parser/pageparser/item_test.go
+++ b/parser/pageparser/item_test.go
@@ -22,13 +22,22 @@ import (
func TestItemValTyped(t *testing.T) {
c := qt.New(t)
- c.Assert(Item{Val: []byte("3.14")}.ValTyped(), qt.Equals, float64(3.14))
- c.Assert(Item{Val: []byte(".14")}.ValTyped(), qt.Equals, float64(.14))
- c.Assert(Item{Val: []byte("314")}.ValTyped(), qt.Equals, 314)
- c.Assert(Item{Val: []byte("314x")}.ValTyped(), qt.Equals, "314x")
- c.Assert(Item{Val: []byte("314 ")}.ValTyped(), qt.Equals, "314 ")
- c.Assert(Item{Val: []byte("314"), isString: true}.ValTyped(), qt.Equals, "314")
- c.Assert(Item{Val: []byte("true")}.ValTyped(), qt.Equals, true)
- c.Assert(Item{Val: []byte("false")}.ValTyped(), qt.Equals, false)
- c.Assert(Item{Val: []byte("trues")}.ValTyped(), qt.Equals, "trues")
+ source := []byte("3.14")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, float64(3.14))
+ source = []byte(".14")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, float64(0.14))
+ source = []byte("314")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, 314)
+ source = []byte("314")
+ c.Assert(Item{low: 0, high: len(source), isString: true}.ValTyped(source), qt.Equals, "314")
+ source = []byte("314x")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, "314x")
+ source = []byte("314 ")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, "314 ")
+ source = []byte("true")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, true)
+ source = []byte("false")
+ c.Assert(Item{low: 0, high: len(source)}.ValTyped(source), qt.Equals, false)
+ source = []byte("trued")
+
}
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
index 770f26eb9..a7e6b6cd4 100644
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -54,7 +54,7 @@ type pageLexer struct {
// Implement the Result interface
func (l *pageLexer) Iterator() *Iterator {
- return l.newIterator()
+ return NewIterator(l.items)
}
func (l *pageLexer) Input() []byte {
@@ -85,10 +85,6 @@ func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
return lexer
}
-func (l *pageLexer) newIterator() *Iterator {
- return &Iterator{l: l, lastPos: -1}
-}
-
// main loop
func (l *pageLexer) run() *pageLexer {
for l.state = l.stateStart; l.state != nil; {
@@ -136,6 +132,13 @@ func (l *pageLexer) backup() {
l.pos -= l.width
}
+func (l *pageLexer) append(item Item) {
+ if item.Pos() < len(l.input) {
+ item.firstByte = l.input[item.Pos()]
+ }
+ l.items = append(l.items, item)
+}
+
// sends an item back to the client.
func (l *pageLexer) emit(t ItemType) {
defer func() {
@@ -151,11 +154,11 @@ func (l *pageLexer) emit(t ItemType) {
break
}
if i == l.start && b != '\n' {
- l.items = append(l.items, Item{tIndentation, l.start, l.input[l.start:l.pos], false})
+ l.append(Item{Type: tIndentation, low: l.start, high: l.pos})
return
} else if b == '\n' && i < l.pos-1 {
- l.items = append(l.items, Item{t, l.start, l.input[l.start : i+1], false})
- l.items = append(l.items, Item{tIndentation, i + 1, l.input[i+1 : l.pos], false})
+ l.append(Item{Type: t, low: l.start, high: i + 1})
+ l.append(Item{Type: tIndentation, low: i + 1, high: l.pos})
return
} else if b == '\n' && i == l.pos-1 {
break
@@ -164,13 +167,13 @@ func (l *pageLexer) emit(t ItemType) {
}
}
- l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], false})
+ l.append(Item{Type: t, low: l.start, high: l.pos})
}
// sends a string item back to the client.
func (l *pageLexer) emitString(t ItemType) {
- l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos], true})
+ l.append(Item{Type: t, low: l.start, high: l.pos, isString: true})
l.start = l.pos
}
@@ -180,14 +183,33 @@ func (l *pageLexer) isEOF() bool {
// special case, do not send '\\' back to client
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType, isString bool) {
- val := bytes.Map(func(r rune) rune {
+ i := l.start
+ k := i
+
+ var segments []lowHigh
+
+ for i < l.pos {
+ r, w := utf8.DecodeRune(l.input[i:l.pos])
if r == '\\' {
- return -1
+ if i > k {
+ segments = append(segments, lowHigh{k, i})
+ }
+ l.append(Item{Type: TypeIgnore, low: i, high: i + w})
+ k = i + w
}
- return r
- }, l.input[l.start:l.pos])
- l.items = append(l.items, Item{t, l.start, val, isString})
+ i += w
+ }
+
+ if k < l.pos {
+ segments = append(segments, lowHigh{k, l.pos})
+ }
+
+ if len(segments) > 0 {
+ l.append(Item{Type: t, segments: segments})
+ }
+
l.start = l.pos
+
}
// gets the current value (for debugging and error handling)
@@ -204,7 +226,7 @@ var lf = []byte("\n")
// nil terminates the parser
func (l *pageLexer) errorf(format string, args ...any) stateFunc {
- l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...)), true})
+ l.append(Item{Type: tError, Err: fmt.Errorf(format, args...)})
return nil
}
diff --git a/parser/pageparser/pageparser.go b/parser/pageparser/pageparser.go
index 67abefc30..0a9fc61af 100644
--- a/parser/pageparser/pageparser.go
+++ b/parser/pageparser/pageparser.go
@@ -15,6 +15,7 @@ package pageparser
import (
"bytes"
+ "errors"
"fmt"
"io"
"io/ioutil"
@@ -33,9 +34,6 @@ type Result interface {
var _ Result = (*pageLexer)(nil)
// Parse parses the page in the given reader according to the given Config.
-// TODO(bep) now that we have improved the "lazy order" init, it *may* be
-// some potential saving in doing a buffered approach where the first pass does
-// the frontmatter only.
func Parse(r io.Reader, cfg Config) (Result, error) {
return parseSection(r, cfg, lexIntroSection)
}
@@ -63,12 +61,12 @@ func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
walkFn := func(item Item) bool {
if frontMatterSource != nil {
// The rest is content.
- cf.Content = psr.Input()[item.Pos:]
+ cf.Content = psr.Input()[item.low:]
// Done
return false
} else if item.IsFrontMatter() {
cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
- frontMatterSource = item.Val
+ frontMatterSource = item.Val(psr.Input())
}
return true
}
@@ -113,10 +111,15 @@ func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
return lexer, nil
}
+// NewIterator creates a new Iterator.
+func NewIterator(items Items) *Iterator {
+ return &Iterator{items: items, lastPos: -1}
+}
+
// An Iterator has methods to iterate a parsed page with support going back
// if needed.
type Iterator struct {
- l *pageLexer
+ items Items
lastPos int // position of the last item returned by nextItem
}
@@ -126,19 +129,14 @@ func (t *Iterator) Next() Item {
return t.Current()
}
-// Input returns the input source.
-func (t *Iterator) Input() []byte {
- return t.l.Input()
-}
-
-var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens"), true}
+var errIndexOutOfBounds = Item{Type: tError, Err: errors.New("no more tokens")}
// Current will repeatably return the current item.
func (t *Iterator) Current() Item {
- if t.lastPos >= len(t.l.items) {
+ if t.lastPos >= len(t.items) {
return errIndexOutOfBounds
}
- return t.l.items[t.lastPos]
+ return t.items[t.lastPos]
}
// backs up one token.
@@ -163,14 +161,14 @@ func (t *Iterator) IsValueNext() bool {
// look at, but do not consume, the next item
// repeated, sequential calls will return the same item
func (t *Iterator) Peek() Item {
- return t.l.items[t.lastPos+1]
+ return t.items[t.lastPos+1]
}
// PeekWalk will feed the next items in the iterator to walkFn
// until it returns false.
func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
- for i := t.lastPos + 1; i < len(t.l.items); i++ {
- item := t.l.items[i]
+ for i := t.lastPos + 1; i < len(t.items); i++ {
+ item := t.items[i]
if !walkFn(item) {
break
}
@@ -190,6 +188,49 @@ func (t *Iterator) Consume(cnt int) {
}
// LineNumber returns the current line number. Used for logging.
-func (t *Iterator) LineNumber() int {
- return bytes.Count(t.l.input[:t.Current().Pos], lf) + 1
+func (t *Iterator) LineNumber(source []byte) int {
+ return bytes.Count(source[:t.Current().low], lf) + 1
+}
+
+// IsProbablySourceOfItems returns true if the given source looks like original
+// source of the items.
+// There may be some false positives, but that is highly unlikely and good enough
+// for the planned purpose.
+// It will also return false if the last item is not EOF (error situations)
+// and if both source and items are empty.
+func IsProbablySourceOfItems(source []byte, items Items) bool {
+ if len(source) == 0 && len(items) == 0 {
+ return false
+ }
+ if len(items) == 0 {
+ return false
+ }
+
+ last := items[len(items)-1]
+ if last.Type != tEOF {
+ return false
+ }
+
+ if last.Pos() != len(source) {
+ return false
+ }
+
+ for _, item := range items {
+ if item.Type == tError {
+ return false
+ }
+ if item.Type == tEOF {
+ return true
+ }
+
+ if item.Pos() >= len(source) {
+ return false
+ }
+
+ if item.firstByte != source[item.Pos()] {
+ return false
+ }
+ }
+
+ return true
}
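`IsProbablySourceOfItems` exists because offset-only items are meaningless against the wrong buffer; the memoized `firstByte` makes the check cheap. A simplified, self-contained sketch of the validation (stand-in types, not the exported API):

```go
package main

import "fmt"

const tEOF = 1

// Simplified item mirroring the validation fields in the diff.
type item struct {
	typ       int
	low, high int
	firstByte byte
}

// isProbablySource: offsets must stay inside the buffer, the items must
// end in EOF positioned at len(source), and each item's memoized first
// byte must match the byte at its offset. Cheap, with the small
// false-positive risk the doc comment accepts.
func isProbablySource(source []byte, items []item) bool {
	if len(items) == 0 {
		return false
	}
	last := items[len(items)-1]
	if last.typ != tEOF || last.low != len(source) {
		return false
	}
	for _, it := range items {
		if it.typ == tEOF {
			return true
		}
		if it.low >= len(source) || it.firstByte != source[it.low] {
			return false
		}
	}
	return true
}

func main() {
	source := []byte("abc")
	items := []item{{low: 0, high: 3, firstByte: 'a'}, {typ: tEOF, low: 3}}
	fmt.Println(isProbablySource(source, items))        // true
	fmt.Println(isProbablySource([]byte("xbc"), items)) // false: firstByte mismatch
}
```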
diff --git a/parser/pageparser/pageparser_intro_test.go b/parser/pageparser/pageparser_intro_test.go
index 1b903d546..1b2d59ccc 100644
--- a/parser/pageparser/pageparser_intro_test.go
+++ b/parser/pageparser/pageparser_intro_test.go
@@ -15,19 +15,25 @@ package pageparser
import (
"fmt"
- "reflect"
"strings"
"testing"
+
+ qt "github.com/frankban/quicktest"
)
type lexerTest struct {
name string
input string
- items []Item
+ items []typeText
+}
+
+type typeText struct {
+ typ ItemType
+ text string
}
-func nti(tp ItemType, val string) Item {
- return Item{tp, 0, []byte(val), false}
+func nti(tp ItemType, val string) typeText {
+ return typeText{typ: tp, text: val}
}
var (
@@ -52,48 +58,79 @@ var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$")
// TODO(bep) a way to toggle ORG mode vs the rest.
var frontMatterTests = []lexerTest{
- {"empty", "", []Item{tstEOF}},
- {"Byte order mark", "\ufeff\nSome text.\n", []Item{nti(TypeIgnore, "\ufeff"), tstSomeText, tstEOF}},
- {"HTML Document", ` <html> `, []Item{nti(tError, "plain HTML documents not supported")}},
- {"HTML Document with shortcode", `<html>{{< sc1 >}}</html>`, []Item{nti(tError, "plain HTML documents not supported")}},
- {"No front matter", "\nSome text.\n", []Item{tstSomeText, tstEOF}},
- {"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},
- {"YAML empty front matter", "---\n---\n\nSome text.\n", []Item{nti(TypeFrontMatterYAML, ""), tstSomeText, tstEOF}},
- {"YAML commented out front matter", "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(TypeIgnore, "-->"), tstSomeText, tstEOF}},
- {"YAML commented out front matter, no end", "<!--\n---\nfoo: \"bar\"\n---\nSome text.\n", []Item{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(tError, "starting HTML comment with no end")}},
+ {"empty", "", []typeText{tstEOF}},
+ {"Byte order mark", "\ufeff\nSome text.\n", []typeText{nti(TypeIgnore, "\ufeff"), tstSomeText, tstEOF}},
+ {"HTML Document", ` <html> `, []typeText{nti(tError, "plain HTML documents not supported")}},
+ {"HTML Document with shortcode", `<html>{{< sc1 >}}</html>`, []typeText{nti(tError, "plain HTML documents not supported")}},
+ {"No front matter", "\nSome text.\n", []typeText{tstSomeText, tstEOF}},
+ {"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []typeText{tstFrontMatterYAML, tstSomeText, tstEOF}},
+ {"YAML empty front matter", "---\n---\n\nSome text.\n", []typeText{nti(TypeFrontMatterYAML, ""), tstSomeText, tstEOF}},
+ {"YAML commented out front matter", "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n", []typeText{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(TypeIgnore, "-->"), tstSomeText, tstEOF}},
+ {"YAML commented out front matter, no end", "<!--\n---\nfoo: \"bar\"\n---\nSome text.\n", []typeText{nti(TypeIgnore, "<!--\n"), tstFrontMatterYAML, nti(tError, "starting HTML comment with no end")}},
// Note that we keep all bytes as they are, but we need to handle CRLF
- {"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
- {"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},
- {"JSON front matter", tstJSON + "\r\n\nSome text.\n", []Item{tstFrontMatterJSON, tstSomeText, tstEOF}},
- {"ORG front matter", tstORG + "\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, tstEOF}},
- {"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []Item{tstFrontMatterORG, tstSomeText, nti(TypeLeadSummaryDivider, "# more\n"), nti(tText, "Some text.\n"), tstEOF}},
- {"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstSummaryDivider, nti(tText, "Some text.\n"), tstEOF}},
- {"Summary divider same line", "+++\nfoo = \"bar\"\n+++\n\nSome text.<!--more-->Some text.\n", []Item{tstFrontMatterTOML, nti(tText, "\nSome text."), nti(TypeLeadSummaryDivider, "<!--more-->"), nti(tText, "Some text.\n"), tstEOF}},
+ {"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []typeText{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
+ {"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []typeText{tstFrontMatterTOML, tstSomeText, tstEOF}},
+ {"JSON front matter", tstJSON + "\r\n\nSome text.\n", []typeText{tstFrontMatterJSON, tstSomeText, tstEOF}},
+ {"ORG front matter", tstORG + "\nSome text.\n", []typeText{tstFrontMatterORG, tstSomeText, tstEOF}},
+ {"Summary divider ORG", tstORG + "\nSome text.\n# more\nSome text.\n", []typeText{tstFrontMatterORG, tstSomeText, nti(TypeLeadSummaryDivider, "# more\n"), nti(tText, "Some text.\n"), tstEOF}},
+ {"Summary divider", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n<!--more-->\nSome te