// Copyright 2018 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
// It's on YouTube, Google it!.
// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
package pageparser
import (
"bytes"
"fmt"
"unicode"
"unicode/utf8"
)
const eof = -1
// returns the next state in scanner.
type stateFunc func(*pageLexer) stateFunc
type lexerShortcodeState struct {
currLeftDelimItem ItemType
currRightDelimItem ItemType
currShortcodeName string // is only set when a shortcode is in opened state
closingState int // > 0 = on its way to be closed
elementStepNum int // step number in element
paramElements int // number of elements (name + value = 2) found first
openShortcodes map[string]bool // set of shortcodes in open state
}
type pageLexer struct {
input []byte
stateStart stateFunc
state stateFunc
pos int // input position
start int // item start position
width int // width of last element
// The summary divider to look for.
summaryDivider []byte
// Set when we have parsed any summary divider
summaryDividerChecked bool
lexerShortcodeState
// items delivered to client
items Items
}
// Implement the Result interface
func (l *pageLexer) Iterator() *Iterator {
return l.newIterator()
}
func (l *pageLexer) Input() []byte {
return l.input
}
// note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known
func newPageLexer(input []byte, inputPosition int, stateStart stateFunc) *pageLexer {
lexer := &pageLexer{
input: input,
pos: inputPosition,
stateStart: stateStart,
lexerShortcodeState: lexerShortcodeState{
currLeftDelimItem: tLeftDelimScNoMarkup,
currRightDelimItem: tRightDelimScNoMarkup,
openShortcodes: make(map[string]bool),
},
items: make([]Item, 0, 5),
}
return lexer
}
func (l *pageLexer) newIterator() *Iterator {
return &Iterator{l: l, lastPos: -1}
}
// main loop
func (l *pageLexer) run() *pageLexer {
for l.state = l.stateStart; l.state != nil; {
l.state = l.state(l)
}
return l
}
// Shortcode syntax
var (
leftDelimSc = []byte("{{")
leftDelimScNoMarkup = []byte("{{<")
rightDelimScNoMarkup = []byte(">}}")
leftDelimScWithMarkup = []byte("{{%")
rightDelimScWithMarkup = []byte("%}}")
leftComment = []byte("/*") // comments in this context us used to to mark shortcodes as "not really a shortcode"
rightComment = []byte("*/")
)
// Page syntax
var (
byteOrderMark = '\ufeff'
summaryDivider = []byte("<!--more-->")
summaryDividerOrg = []byte("# more")
delimTOML = []byte("+++")
delimYAML = []byte("---")
delimOrg = []byte("#+")
htmlCommentStart = []byte("<!--")
htmlCOmmentEnd = []byte("-->")
)
func (l *pageLexer) next() rune {
if int(l.pos) >= len(l.input) {
l.width = 0
return eof
}
runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
l.width = runeWidth
l.pos += l.width
return runeValue
}
// peek, but no consume
func (l *pageLexer) peek() rune {
r := l.next()