From 1e3e34002dae3d4a980141efcc86886e7de5bef8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Erik=20Pedersen?=
Date: Thu, 18 Oct 2018 10:21:23 +0200
Subject: hugolib: Integrate new page parser

See #5324
---
 parser/frontmatter.go                      |   1 +
 parser/metadecoders/decoder.go             |  95 ++++++++++++++
 parser/metadecoders/json.go                |  31 ++++++
 parser/metadecoders/yaml.go                |  84 ++++++++++++++
 parser/pageparser/item.go                  |  60 +++++-----
 parser/pageparser/pagelexer.go             | 170 +++++++++++++++++++----------
 parser/pageparser/pagelexer_test.go        |  29 +++++
 parser/pageparser/pageparser.go            | 100 ++++++++++-------
 parser/pageparser/pageparser_intro_test.go |  33 +++---
 9 files changed, 466 insertions(+), 137 deletions(-)
 create mode 100644 parser/metadecoders/decoder.go
 create mode 100644 parser/metadecoders/json.go
 create mode 100644 parser/metadecoders/yaml.go
 create mode 100644 parser/pageparser/pagelexer_test.go

(limited to 'parser')

diff --git a/parser/frontmatter.go b/parser/frontmatter.go
index 3716dc112..284d3f955 100644
--- a/parser/frontmatter.go
+++ b/parser/frontmatter.go
@@ -203,6 +203,7 @@ func removeTOMLIdentifier(datum []byte) []byte {
 
 // HandleYAMLMetaData unmarshals YAML-encoded datum and returns a Go interface
 // representing the encoded data structure.
+// TODO(bep) 2errors remove these handlers (and hopefully the package)
 func HandleYAMLMetaData(datum []byte) (map[string]interface{}, error) {
 	m := map[string]interface{}{}
 	err := yaml.Unmarshal(datum, &m)
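
Until that TODO lands, this handler stays the entry point for legacy callers. A minimal usage sketch (the front matter literal is invented for illustration; a real caller would propagate the error):

    package main

    import (
    	"fmt"

    	"github.com/gohugoio/hugo/parser"
    )

    func main() {
    	// Invented YAML front matter; any map-shaped document works.
    	m, err := parser.HandleYAMLMetaData([]byte("title: My Post\ndraft: true\n"))
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(m["title"], m["draft"]) // expected: My Post true
    }
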
diff --git a/parser/metadecoders/decoder.go b/parser/metadecoders/decoder.go
new file mode 100644
index 000000000..7527d7a08
--- /dev/null
+++ b/parser/metadecoders/decoder.go
@@ -0,0 +1,95 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadecoders
+
+import (
+	"encoding/json"
+
+	"github.com/BurntSushi/toml"
+	"github.com/chaseadamsio/goorgeous"
+	"github.com/gohugoio/hugo/parser/pageparser"
+	"github.com/pkg/errors"
+	yaml "gopkg.in/yaml.v1"
+)
+
+type Format string
+
+const (
+	// These are the supported metadata formats in Hugo. Most of these are also
+	// supported as /data formats.
+	ORG  Format = "org"
+	JSON Format = "json"
+	TOML Format = "toml"
+	YAML Format = "yaml"
+)
+
+// FormatFromFrontMatterType will return empty if not supported.
+func FormatFromFrontMatterType(typ pageparser.ItemType) Format {
+	switch typ {
+	case pageparser.TypeFrontMatterJSON:
+		return JSON
+	case pageparser.TypeFrontMatterORG:
+		return ORG
+	case pageparser.TypeFrontMatterTOML:
+		return TOML
+	case pageparser.TypeFrontMatterYAML:
+		return YAML
+	default:
+		return ""
+	}
+}
+
+// UnmarshalToMap will unmarshal data in format f into a new map. This is
+// what's needed for Hugo's front matter decoding.
+func UnmarshalToMap(data []byte, f Format) (map[string]interface{}, error) {
+	m := make(map[string]interface{})
+
+	if data == nil {
+		return m, nil
+	}
+
+	var err error
+
+	switch f {
+	case ORG:
+		m, err = goorgeous.OrgHeaders(data)
+	case JSON:
+		err = json.Unmarshal(data, &m)
+	case TOML:
+		_, err = toml.Decode(string(data), &m)
+	case YAML:
+		err = yaml.Unmarshal(data, &m)
+
+		// To support boolean keys, the `yaml` package unmarshals maps to
+		// map[interface{}]interface{}. Here we recurse through the result
+		// and change all maps to map[string]interface{} like we would've
+		// gotten from `json`.
+		if err == nil {
+			for k, v := range m {
+				if vv, changed := stringifyMapKeys(v); changed {
+					m[k] = vv
+				}
+			}
+		}
+	default:
+		return nil, errors.Errorf("unmarshal of format %q is not supported", f)
+	}
+
+	if err != nil {
+		return nil, errors.Wrapf(err, "unmarshal failed for format %q", f)
+	}
+
+	return m, nil
+
+}
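
A sketch of the intended call pattern for UnmarshalToMap (the TOML snippet is invented for illustration; error handling is reduced to a panic for brevity):

    package main

    import (
    	"fmt"

    	"github.com/gohugoio/hugo/parser/metadecoders"
    )

    func main() {
    	// Invented TOML front matter.
    	data := []byte("title = \"Hello\"\ndraft = true\n")

    	m, err := metadecoders.UnmarshalToMap(data, metadecoders.TOML)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(m["title"], m["draft"]) // expected: Hello true
    }
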
diff --git a/parser/metadecoders/json.go b/parser/metadecoders/json.go
new file mode 100644
index 000000000..21ca8a3b9
--- /dev/null
+++ b/parser/metadecoders/json.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadecoders
+
+import "encoding/json"
+
+// HandleJSONData unmarshals JSON-encoded datum and returns a Go interface
+// representing the encoded data structure.
+func HandleJSONData(datum []byte) (interface{}, error) {
+	if datum == nil {
+		// Package json returns an error on nil input.
+		// Return an empty map to be consistent with our other supported
+		// formats.
+		return make(map[string]interface{}), nil
+	}
+
+	var f interface{}
+	err := json.Unmarshal(datum, &f)
+	return f, err
+}
diff --git a/parser/metadecoders/yaml.go b/parser/metadecoders/yaml.go
new file mode 100644
index 000000000..3a520ac07
--- /dev/null
+++ b/parser/metadecoders/yaml.go
@@ -0,0 +1,84 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The metadecoders package contains functions to decode metadata (e.g. page front matter)
+// from different formats: TOML, YAML, JSON.
+package metadecoders
+
+import (
+	"fmt"
+
+	"github.com/spf13/cast"
+	yaml "gopkg.in/yaml.v1"
+)
+
+// HandleYAMLData unmarshals YAML-encoded datum and returns a Go interface
+// representing the encoded data structure.
+func HandleYAMLData(datum []byte) (interface{}, error) {
+	var m interface{}
+	err := yaml.Unmarshal(datum, &m)
+	if err != nil {
+		return nil, err
+	}
+
+	// To support boolean keys, the `yaml` package unmarshals maps to
+	// map[interface{}]interface{}. Here we recurse through the result
+	// and change all maps to map[string]interface{} like we would've
+	// gotten from `json`.
+	if mm, changed := stringifyMapKeys(m); changed {
+		return mm, nil
+	}
+
+	return m, nil
+}
+
+// stringifyMapKeys recurses into in and changes all instances of
+// map[interface{}]interface{} to map[string]interface{}. This is useful to
+// work around the impedance mismatch between JSON and YAML unmarshaling that's
+// described here: https://github.com/go-yaml/yaml/issues/139
+//
+// Inspired by https://github.com/stripe/stripe-mock, MIT licensed
+func stringifyMapKeys(in interface{}) (interface{}, bool) {
+	switch in := in.(type) {
+	case []interface{}:
+		for i, v := range in {
+			if vv, replaced := stringifyMapKeys(v); replaced {
+				in[i] = vv
+			}
+		}
+	case map[interface{}]interface{}:
+		res := make(map[string]interface{})
+		var (
+			ok  bool
+			err error
+		)
+		for k, v := range in {
+			var ks string
+
+			if ks, ok = k.(string); !ok {
+				ks, err = cast.ToStringE(k)
+				if err != nil {
+					ks = fmt.Sprintf("%v", k)
+				}
+			}
+			if vv, replaced := stringifyMapKeys(v); replaced {
+				res[ks] = vv
+			} else {
+				res[ks] = v
+			}
+		}
+		return res, true
+	}
+
+	return nil, false
+}
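
The effect of stringifyMapKeys is easiest to see through HandleYAMLData. A small sketch, assuming yaml.v1 resolves the bare `true` key to a boolean as YAML 1.1 prescribes:

    package main

    import (
    	"fmt"

    	"github.com/gohugoio/hugo/parser/metadecoders"
    )

    func main() {
    	// A nested map plus a boolean key; yaml.v1 alone would return
    	// map[interface{}]interface{} for both levels.
    	v, err := metadecoders.HandleYAMLData([]byte("outer:\n  inner: value\ntrue: 1\n"))
    	if err != nil {
    		panic(err)
    	}
    	m := v.(map[string]interface{})
    	fmt.Printf("%T\n", m["outer"]) // expected: map[string]interface {}
    	fmt.Println(m["true"])         // boolean key stringified; expected: 1
    }
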
diff --git a/parser/pageparser/item.go b/parser/pageparser/item.go
index 6e93bb696..d97fed734 100644
--- a/parser/pageparser/item.go
+++ b/parser/pageparser/item.go
@@ -16,87 +16,95 @@ package pageparser
 
 import "fmt"
 
 type Item struct {
-	typ itemType
+	Typ ItemType
 	pos pos
 	Val []byte
 }
 
+type Items []Item
+
 func (i Item) ValStr() string {
 	return string(i.Val)
 }
 
 func (i Item) IsText() bool {
-	return i.typ == tText
+	return i.Typ == tText
 }
 
 func (i Item) IsShortcodeName() bool {
-	return i.typ == tScName
+	return i.Typ == tScName
 }
 
 func (i Item) IsLeftShortcodeDelim() bool {
-	return i.typ == tLeftDelimScWithMarkup || i.typ == tLeftDelimScNoMarkup
+	return i.Typ == tLeftDelimScWithMarkup || i.Typ == tLeftDelimScNoMarkup
 }
 
 func (i Item) IsRightShortcodeDelim() bool {
-	return i.typ == tRightDelimScWithMarkup || i.typ == tRightDelimScNoMarkup
+	return i.Typ == tRightDelimScWithMarkup || i.Typ == tRightDelimScNoMarkup
 }
 
 func (i Item) IsShortcodeClose() bool {
-	return i.typ == tScClose
+	return i.Typ == tScClose
 }
 
 func (i Item) IsShortcodeParam() bool {
-	return i.typ == tScParam
+	return i.Typ == tScParam
 }
 
 func (i Item) IsShortcodeParamVal() bool {
-	return i.typ == tScParamVal
+	return i.Typ == tScParamVal
 }
 
 func (i Item) IsShortcodeMarkupDelimiter() bool {
-	return i.typ == tLeftDelimScWithMarkup || i.typ == tRightDelimScWithMarkup
+	return i.Typ == tLeftDelimScWithMarkup || i.Typ == tRightDelimScWithMarkup
+}
+
+func (i Item) IsFrontMatter() bool {
+	return i.Typ >= TypeFrontMatterYAML && i.Typ <= TypeFrontMatterORG
 }
 
 func (i Item) IsDone() bool {
-	return i.typ == tError || i.typ == tEOF
+	return i.Typ == tError || i.Typ == tEOF
 }
 
 func (i Item) IsEOF() bool {
-	return i.typ == tEOF
+	return i.Typ == tEOF
 }
 
 func (i Item) IsError() bool {
-	return i.typ == tError
+	return i.Typ == tError
 }
 
 func (i Item) String() string {
 	switch {
-	case i.typ == tEOF:
+	case i.Typ == tEOF:
 		return "EOF"
-	case i.typ == tError:
+	case i.Typ == tError:
 		return string(i.Val)
-	case i.typ > tKeywordMarker:
+	case i.Typ > tKeywordMarker:
 		return fmt.Sprintf("<%s>", i.Val)
 	case len(i.Val) > 50:
-		return fmt.Sprintf("%v:%.20q...", i.typ, i.Val)
+		return fmt.Sprintf("%v:%.20q...", i.Typ, i.Val)
 	}
-	return fmt.Sprintf("%v:[%s]", i.typ, i.Val)
+	return fmt.Sprintf("%v:[%s]", i.Typ, i.Val)
 }
 
-type itemType int
+type ItemType int
 
 const (
-	tError itemType = iota
+	tError ItemType = iota
 	tEOF
 
 	// page items
-	tHTMLLead          // <
-	tSummaryDivider    // <!--more-->
-	tSummaryDividerOrg // # more
-	tFrontMatterYAML
-	tFrontMatterTOML
-	tFrontMatterJSON
-	tFrontMatterORG
+	TypeHTMLDocument       // document starting with < as first non-whitespace
+	TypeHTMLComment        // We ignore leading comments
+	TypeLeadSummaryDivider // <!--more-->
+	TypeSummaryDividerOrg  // # more
+	TypeFrontMatterYAML
+	TypeFrontMatterTOML
+	TypeFrontMatterJSON
+	TypeFrontMatterORG
+	TypeIgnore // The BOM Unicode byte order marker and possibly others
 
 	// shortcode items
 	tLeftDelimScNoMarkup
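
A consumer-side sketch of how these predicates combine with the Iterator this commit introduces in pageparser.go further down (the page literal is invented for illustration; delimiter items simply fall through the switch):

    package main

    import (
    	"fmt"
    	"strings"

    	"github.com/gohugoio/hugo/parser/pageparser"
    )

    func main() {
    	// Invented page: TOML front matter, text, and one shortcode.
    	page := "+++\ntitle = \"T\"\n+++\nHello {{< myshortcode >}} world."

    	res, err := pageparser.Parse(strings.NewReader(page))
    	if err != nil {
    		panic(err)
    	}

    	it := res.Iterator()
    	for {
    		item := it.Next()
    		switch {
    		case item.IsFrontMatter():
    			fmt.Printf("front matter: %q\n", item.ValStr())
    		case item.IsShortcodeName():
    			fmt.Printf("shortcode:    %q\n", item.ValStr())
    		case item.IsText():
    			fmt.Printf("text:         %q\n", item.ValStr())
    		case item.IsDone():
    			return // tEOF or tError; nothing more to read
    		}
    	}
    }
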
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go
index c15e977ca..7768b0b2f 100644
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -33,8 +33,8 @@ const eof = -1
 type stateFunc func(*pageLexer) stateFunc
 
 type lexerShortcodeState struct {
-	currLeftDelimItem  itemType
-	currRightDelimItem itemType
+	currLeftDelimItem  ItemType
+	currRightDelimItem ItemType
 	currShortcodeName  string // is only set when a shortcode is in opened state
 	closingState       int    // > 0 = on its way to be closed
 	elementStepNum     int    // step number in element
@@ -50,14 +50,24 @@ type pageLexer struct {
 	pos   pos // input position
 	start pos // item start position
 	width pos // width of last element
-	lastPos pos // position of the last item returned by nextItem
 
-	contentSections int
+	// Set when we have parsed any summary divider
+	summaryDividerChecked bool
 
 	lexerShortcodeState
 
 	// items delivered to client
-	items []Item
+	items Items
+}
+
+// Implement the Result interface
+func (l *pageLexer) Iterator() *Iterator {
+	return l.newIterator()
+}
+
+func (l *pageLexer) Input() []byte {
+	return l.input
+
 }
 
 // note: the input position here is normally 0 (start), but
@@ -79,6 +89,10 @@ func newPageLexer(input []byte, inputPosition pos, stateStart stateFunc) *pageLe
 	return lexer
 }
 
+func (l *pageLexer) newIterator() *Iterator {
+	return &Iterator{l: l, lastPos: -1}
+}
+
 // main loop
 func (l *pageLexer) run() *pageLexer {
 	for l.state = l.stateStart; l.state != nil; {
@@ -89,6 +103,7 @@ func (l *pageLexer) run() *pageLexer {
 
 // Shortcode syntax
 var (
+	leftDelimSc            = []byte("{{")
 	leftDelimScNoMarkup    = []byte("{{<")
 	rightDelimScNoMarkup   = []byte(">}}")
 	leftDelimScWithMarkup  = []byte("{{%")
@@ -99,11 +114,14 @@ var (
 
 // Page syntax
 var (
+	byteOrderMark     = '\ufeff'
 	summaryDivider    = []byte("<!--more-->")
 	summaryDividerOrg = []byte("# more")
 	delimTOML         = []byte("+++")
 	delimYAML         = []byte("---")
 	delimOrg          = []byte("#+")
+	htmlCOmmentStart  = []byte("<!--")
+	htmlCOmmentEnd    = []byte("-->")
 )
 
 func (l *pageLexer) next() rune {
@@ -131,13 +149,13 @@ func (l *pageLexer) backup() {
 }
 
 // sends an item back to the client.
-func (l *pageLexer) emit(t itemType) {
+func (l *pageLexer) emit(t ItemType) {
 	l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos]})
 	l.start = l.pos
 }
 
 // special case, do not send '\\' back to client
-func (l *pageLexer) ignoreEscapesAndEmit(t itemType) {
+func (l *pageLexer) ignoreEscapesAndEmit(t ItemType) {
 	val := bytes.Map(func(r rune) rune {
 		if r == '\\' {
 			return -1
@@ -160,25 +178,12 @@ func (l *pageLexer) ignore() {
 
 var lf = []byte("\n")
 
-// nice to have in error logs
-func (l *pageLexer) lineNum() int {
-	return bytes.Count(l.input[:l.lastPos], lf) + 1
-}
-
 // nil terminates the parser
 func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
 	l.items = append(l.items, Item{tError, l.start, []byte(fmt.Sprintf(format, args...))})
 	return nil
 }
 
-// consumes and returns the next item
-func (l *pageLexer) nextItem() Item {
-	item := l.items[0]
-	l.items = l.items[1:]
-	l.lastPos = item.pos
-	return item
-}
-
 func (l *pageLexer) consumeCRLF() bool {
 	var consumed bool
 	for _, r := range crLf {
@@ -192,12 +197,28 @@ func (l *pageLexer) consumeCRLF() bool {
 }
 
 func lexMainSection(l *pageLexer) stateFunc {
+	// Fast forward as far as possible.
+	var l1, l2, l3 int
+	if !l.summaryDividerChecked {
+		// TODO(bep) 2errors make the summary divider per type
+		l1 = l.index(summaryDivider)
+		l2 = l.index(summaryDividerOrg)
+		if l1 == -1 && l2 == -1 {
+			l.summaryDividerChecked = true
+		}
+	}
+	l3 = l.index(leftDelimSc)
+	skip := minPositiveIndex(l1, l2, l3)
+	if skip > 0 {
+		l.pos += pos(skip)
+	}
+
 	for {
 		if l.isShortCodeStart() {
 			if l.pos > l.start {
 				l.emit(tText)
 			}
-			if bytes.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) {
+			if l.hasPrefix(leftDelimScWithMarkup) {
 				l.currLeftDelimItem = tLeftDelimScWithMarkup
 				l.currRightDelimItem = tRightDelimScWithMarkup
 			} else {
@@ -207,21 +228,21 @@ func lexMainSection(l *pageLexer) stateFunc {
 			return lexShortcodeLeftDelim
 		}
 
-		if l.contentSections <= 1 {
-			if bytes.HasPrefix(l.input[l.pos:], summaryDivider) {
+		if !l.summaryDividerChecked {
+			if l.hasPrefix(summaryDivider) {
 				if l.pos > l.start {
 					l.emit(tText)
 				}
-				l.contentSections++
+				l.summaryDividerChecked = true
 				l.pos += pos(len(summaryDivider))
-				l.emit(tSummaryDivider)
-			} else if bytes.HasPrefix(l.input[l.pos:], summaryDividerOrg) {
+				l.emit(TypeLeadSummaryDivider)
+			} else if l.hasPrefix(summaryDividerOrg) {
 				if l.pos > l.start {
 					l.emit(tText)
 				}
-				l.contentSections++
+				l.summaryDividerChecked = true
 				l.pos += pos(len(summaryDividerOrg))
-				l.emit(tSummaryDividerOrg)
+				l.emit(TypeSummaryDividerOrg)
 			}
 		}
 
@@ -237,7 +258,7 @@ func lexMainSection(l *pageLexer) stateFunc {
 }
 
 func (l *pageLexer) isShortCodeStart() bool {
-	return bytes.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || bytes.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup)
+	return l.hasPrefix(leftDelimScWithMarkup) || l.hasPrefix(leftDelimScNoMarkup)
 }
 
 func lexIntroSection(l *pageLexer) stateFunc {
@@ -250,28 +271,37 @@ LOOP:
 
 		switch {
 		case r == '+':
-			return l.lexFrontMatterSection(tFrontMatterTOML, r, "TOML", delimTOML)
+			return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
 		case r == '-':
-			return l.lexFrontMatterSection(tFrontMatterYAML, r, "YAML", delimYAML)
+			return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
 		case r == '{':
 			return lexFrontMatterJSON
 		case r == '#':
 			return lexFrontMatterOrgMode
+		case r == byteOrderMark:
+			l.emit(TypeIgnore)
 		case !isSpace(r) && !isEndOfLine(r):
+			// No front matter.
 			if r == '<' {
-				l.emit(tHTMLLead)
-				// No need to look further. Hugo treats this as plain HTML,
-				// no front matter, no shortcodes, no nothing.
-				l.pos = pos(len(l.input))
-				l.emit(tText)
-				break LOOP
+				l.backup()
+				if l.hasPrefix(htmlCOmmentStart) {
+					right := l.index(htmlCOmmentEnd)
+					if right == -1 {
+						return l.errorf("starting HTML comment with no end")
+					}
+					l.pos += pos(right) + pos(len(htmlCOmmentEnd))
+					l.emit(TypeHTMLComment)
+				} else {
+					// No need to look further. Hugo treats this as plain HTML,
+					// no front matter, no shortcodes, no nothing.
+					l.pos = pos(len(l.input))
+					l.emit(TypeHTMLDocument)
+				}
 			}
-			return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
+			break LOOP
 		}
 	}
 
-	l.contentSections = 1
-
 	// Now move on to the shortcodes.
 	return lexMainSection
 }
@@ -324,7 +354,7 @@ func lexFrontMatterJSON(l *pageLexer) stateFunc {
 	}
 
 	l.consumeCRLF()
-	l.emit(tFrontMatterJSON)
+	l.emit(TypeFrontMatterJSON)
 
 	return lexMainSection
 }
@@ -338,7 +368,7 @@ func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
 
 	l.backup()
 
-	if !bytes.HasPrefix(l.input[l.pos:], delimOrg) {
+	if !l.hasPrefix(delimOrg) {
 		// TODO(bep) consider error
 		return lexMainSection
 	}
@@ -351,7 +381,7 @@ LOOP:
 
 		switch {
 		case r == '\n':
-			if !bytes.HasPrefix(l.input[l.pos:], delimOrg) {
+			if !l.hasPrefix(delimOrg) {
 				break LOOP
 			}
 		case r == eof:
@@ -360,24 +390,25 @@ LOOP:
 		}
 	}
 
-	l.emit(tFrontMatterORG)
+	l.emit(TypeFrontMatterORG)
 
 	return lexMainSection
 }
 
+func (l *pageLexer) printCurrentInput() {
+	fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
+}
+
 // Handle YAML or TOML front matter.
-func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string, delim []byte) stateFunc {
+func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
+
 	for i := 0; i < 2; i++ {
 		if r := l.next(); r != delimr {
 			return l.errorf("invalid %s delimiter", name)
 		}
 	}
 
-	if !l.consumeCRLF() {
-		return l.errorf("invalid %s delimiter", name)
-	}
-
 	// We don't care about the delimiters.
 	l.ignore()
@@ -387,7 +418,7 @@ func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string,
 			return l.errorf("EOF looking for end %s front matter delimiter", name)
 		}
 		if isEndOfLine(r) {
-			if bytes.HasPrefix(l.input[l.pos:], delim) {
+			if l.hasPrefix(delim) {
 				l.emit(tp)
 				l.pos += 3
 				l.consumeCRLF()
@@ -402,7 +433,7 @@ func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string,
 
 func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
 	l.pos += pos(len(l.currentLeftShortcodeDelim()))
-	if bytes.HasPrefix(l.input[l.pos:], leftComment) {
+	if l.hasPrefix(leftComment) {
 		return lexShortcodeComment
 	}
 	l.emit(l.currentLeftShortcodeDelimItem())
@@ -412,7 +443,7 @@ func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
 }
 
 func lexShortcodeComment(l *pageLexer) stateFunc {
-	posRightComment := bytes.Index(l.input[l.pos:], append(rightComment, l.currentRightShortcodeDelim()...))
+	posRightComment := l.index(append(rightComment, l.currentRightShortcodeDelim()...))
 	if posRightComment <= 1 {
 		return l.errorf("comment must be closed")
 	}
@@ -493,7 +524,7 @@ func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
 
 }
 
-func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {
+func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
 	openQuoteFound := false
 	escapedInnerQuoteFound := false
 	escapedQuoteState := 0
@@ -592,7 +623,7 @@ Loop:
 }
 
 func lexEndOfShortcode(l *pageLexer) stateFunc {
-	if bytes.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
+	if l.hasPrefix(l.currentRightShortcodeDelim()) {
 		return lexShortcodeRightDelim
 	}
 	switch r := l.next(); {
@@ -606,7 +637,7 @@ func lexEndOfShortcode(l *pageLexer) stateFunc {
 
 // scans the elements inside shortcode tags
 func lexInsideShortcode(l *pageLexer) stateFunc {
-	if bytes.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
+	if l.hasPrefix(l.currentRightShortcodeDelim()) {
 		return lexShortcodeRightDelim
 	}
 	switch r := l.next(); {
@@ -643,11 +674,19 @@ func lexInsideShortcode(l *pageLexer) stateFunc {
 
 // state helpers
 
-func (l *pageLexer) currentLeftShortcodeDelimItem() itemType {
+func (l *pageLexer) index(sep []byte) int {
+	return bytes.Index(l.input[l.pos:], sep)
+}
+
+func (l *pageLexer) hasPrefix(prefix []byte) bool {
+	return bytes.HasPrefix(l.input[l.pos:], prefix)
+}
+
+func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
 	return l.currLeftDelimItem
 }
 
-func (l *pageLexer) currentRightShortcodeDelimItem() itemType {
+func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
 	return l.currRightDelimItem
 }
 
@@ -668,6 +707,23 @@ func (l *pageLexer) currentRightShortcodeDelim() []byte {
 
 // helper functions
 
+// returns the minimum index > 0
+func minPositiveIndex(indices ...int) int {
+	min := -1
+
+	for _, j := range indices {
+		if j <= 0 {
+			continue
+		}
+		if min == -1 {
+			min = j
+		} else if j < min {
+			min = j
+		}
+	}
+	return min
+}
+
 func isSpace(r rune) bool {
 	return r == ' ' || r == '\t'
 }
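
The run loop above is the classic state-function lexer pattern from the talk linked in pageparser.go's header. A freestanding sketch of the pattern, independent of Hugo's types:

    package main

    import "fmt"

    // stateFn mirrors pageparser's stateFunc: a state does some work and
    // returns the next state, or nil to stop the machine.
    type stateFn func(*lexer) stateFn

    type lexer struct {
    	input string
    	pos   int
    	words []string
    }

    // lexWord emits one space-separated word per invocation.
    func lexWord(l *lexer) stateFn {
    	start := l.pos
    	for l.pos < len(l.input) && l.input[l.pos] != ' ' {
    		l.pos++
    	}
    	l.words = append(l.words, l.input[start:l.pos])
    	if l.pos == len(l.input) {
    		return nil // end of input terminates the loop, like tEOF
    	}
    	l.pos++ // skip the separator
    	return lexWord
    }

    func main() {
    	l := &lexer{input: "state functions drive the lexer"}
    	for state := stateFn(lexWord); state != nil; {
    		state = state(l)
    	}
    	fmt.Println(l.words) // expected: [state functions drive the lexer]
    }
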
diff --git a/parser/pageparser/pagelexer_test.go b/parser/pageparser/pagelexer_test.go
new file mode 100644
index 000000000..5c85df017
--- /dev/null
+++ b/parser/pageparser/pagelexer_test.go
@@ -0,0 +1,29 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestMinPositiveIndex(t *testing.T) {
+	assert := require.New(t)
+	assert.Equal(1, minPositiveIndex(4, 1, 2, 3))
+	assert.Equal(2, minPositiveIndex(4, 0, -2, 2, 5))
+	assert.Equal(-1, minPositiveIndex())
+	assert.Equal(-1, minPositiveIndex(-2, -3))
+
+}
diff --git a/parser/pageparser/pageparser.go b/parser/pageparser/pageparser.go
index 948c05edf..b4cdef75c 100644
--- a/parser/pageparser/pageparser.go
+++ b/parser/pageparser/pageparser.go
@@ -17,72 +17,90 @@
 // See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
 package pageparser
 
-func Parse(input []byte) *Tokens {
-	return ParseFrom(input, 0)
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+
+	"github.com/pkg/errors"
+)
+
+// Result holds the parse result.
+type Result interface {
+	// Iterator returns a new Iterator positioned at the beginning of the parse tree.
+	Iterator() *Iterator
+	// Input returns the input to Parse.
+	Input() []byte
 }
 
-func ParseFrom(input []byte, from int) *Tokens {
-	lexer := newPageLexer(input, pos(from), lexMainSection) // TODO(bep) 2errors
+var _ Result = (*pageLexer)(nil)
+
+// Parse parses the page in the given reader.
+func Parse(r io.Reader) (Result, error) {
+	b, err := ioutil.ReadAll(r)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to read page content")
+	}
+	lexer := newPageLexer(b, 0, lexIntroSection)
 	lexer.run()
-	return &Tokens{lexer: lexer}
+	return lexer, nil
+
 }
 
-type Tokens struct {
-	lexer     *pageLexer
-	token     [3]Item // 3-item look-ahead is what we currently need
-	peekCount int
+func parseMainSection(input []byte, from int) Result {
+	lexer := newPageLexer(input, pos(from), lexMainSection) // TODO(bep) 2errors
+	lexer.run()
+	return lexer
 }
 
-func (t *Tokens) Next() Item {
-	if t.peekCount > 0 {
-		t.peekCount--
-	} else {
-		t.token[0] = t.lexer.nextItem()
-	}
-	return t.token[t.peekCount]
+// An Iterator has methods to iterate a parsed page with support going back
+// if needed.
+type Iterator struct {
+	l       *pageLexer
+	lastPos pos // position of the last item returned by nextItem
 }
 
-// backs up one token.
-func (t *Tokens) Backup() {
-	t.peekCount++
+// consumes and returns the next item
+func (t *Iterator) Next() Item {
+	t.lastPos++
+	return t.current()
 }
 
-// backs up two tokens.
-func (t *Tokens) Backup2(t1 Item) {
-	t.token[1] = t1
-	t.peekCount = 2
+var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens")}
+
+func (t *Iterator) current() Item {
+	if t.lastPos >= pos(len(t.l.items)) {
+		return errIndexOutOfBounds
+	}
+	return t.l.items[t.lastPos]
 }
 
-// backs up three tokens.
-func (t *Tokens) Backup3(t2, t1 Item) {
-	t.token[1] = t1
-	t.token[2] = t2
-	t.peekCount = 3
+// backs up one token.
+func (t *Iterator) Backup() {
+	if t.lastPos < 0 {
+		panic("need to go forward before going back")
+	}
+	t.lastPos--
 }
 
 // check for non-error and non-EOF types coming next
-func (t *Tokens) IsValueNext() bool {
+func (t *Iterator) IsValueNext() bool {
 	i := t.Peek()
-	return i.typ != tError && i.typ != tEOF
+	return i.Typ != tError && i.Typ != tEOF
 }
 
 // look at, but do not consume, the next item
 // repeated, sequential calls will return the same item
-func (t *Tokens) Peek() Item {
-	if t.peekCount > 0 {
-		return t.token[t.peekCount-1]
-	}
-	t.peekCount = 1
-	t.token[0] = t.lexer.nextItem()
-	return t.token[0]
+func (t *Iterator) Peek() Item {
+	return t.l.items[t.lastPos+1]
 }
 
 // Consume is a convenience method to consume the next n tokens,
 // but back off Errors and EOF.
-func (t *Tokens) Consume(cnt int) {
+func (t *Iterator) Consume(cnt int) {
 	for i := 0; i < cnt; i++ {
 		token := t.Next()
-		if token.typ == tError || token.typ == tEOF {
+		if token.Typ == tError || token.Typ == tEOF {
 			t.Backup()
 			break
 		}
@@ -90,6 +108,6 @@ func (t *Tokens) Consume(cnt int) {
 }
 
 // LineNumber returns the current line number. Used for logging.
-func (t *Tokens) LineNumber() int {
-	return t.lexer.lineNum()
+func (t *Iterator) LineNumber() int {
+	return bytes.Count(t.l.input[:t.current().pos], lf) + 1
 }
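
A sketch of Parse and the Iterator's Peek/Next/Backup contract (the input is invented for illustration; with this input the first emitted item is the YAML front matter):

    package main

    import (
    	"fmt"
    	"strings"

    	"github.com/gohugoio/hugo/parser/pageparser"
    )

    func main() {
    	res, err := pageparser.Parse(strings.NewReader("---\ntitle: \"T\"\n---\nSome text.\n"))
    	if err != nil {
    		panic(err)
    	}

    	it := res.Iterator()

    	first := it.Peek()                 // look ahead without consuming
    	fmt.Println(first.IsFrontMatter()) // expected: true

    	got := it.Next()                  // now consume it
    	fmt.Println(got.Typ == first.Typ) // expected: true

    	it.Backup()                             // step back one item...
    	fmt.Println(it.Next().Typ == first.Typ) // ...and read it again: true
    }
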
diff --git a/parser/pageparser/pageparser_intro_test.go b/parser/pageparser/pageparser_intro_test.go
index 19e30dc9a..bfd19c250 100644
--- a/parser/pageparser/pageparser_intro_test.go
+++ b/parser/pageparser/pageparser_intro_test.go
@@ -26,27 +26,26 @@ type lexerTest struct {
 	items []Item
 }
 
-func nti(tp itemType, val string) Item {
+func nti(tp ItemType, val string) Item {
 	return Item{tp, 0, []byte(val)}
 }
 
 var (
 	tstJSON                = `{ "a": { "b": "\"Hugo\"}" } }`
-	tstHTMLLead            = nti(tHTMLLead, "  <")
-	tstFrontMatterTOML     = nti(tFrontMatterTOML, "foo = \"bar\"\n")
-	tstFrontMatterYAML     = nti(tFrontMatterYAML, "foo: \"bar\"\n")
-	tstFrontMatterYAMLCRLF = nti(tFrontMatterYAML, "foo: \"bar\"\r\n")
-	tstFrontMatterJSON     = nti(tFrontMatterJSON, tstJSON+"\r\n")
+	tstFrontMatterTOML     = nti(TypeFrontMatterTOML, "\nfoo = \"bar\"\n")
+	tstFrontMatterYAML     = nti(TypeFrontMatterYAML, "\nfoo: \"bar\"\n")
+	tstFrontMatterYAMLCRLF = nti(TypeFrontMatterYAML, "\r\nfoo: \"bar\"\r\n")
+	tstFrontMatterJSON     = nti(TypeFrontMatterJSON, tstJSON+"\r\n")
 	tstSomeText            = nti(tText, "\nSome text.\n")
-	tstSummaryDivider      = nti(tSummaryDivider, "<!--more-->")
-	tstSummaryDividerOrg   = nti(tSummaryDividerOrg, "# more")
+	tstSummaryDivider      = nti(TypeLeadSummaryDivider, "<!--more-->")
+	tstSummaryDividerOrg   = nti(TypeSummaryDividerOrg, "# more")
 
 	tstORG = `
 #+TITLE: T1
 #+AUTHOR: A1
 #+DESCRIPTION: D1
 `
-	tstFrontMatterORG = nti(tFrontMatterORG, tstORG)
+	tstFrontMatterORG = nti(TypeFrontMatterORG, tstORG)
 )
 
 var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$")
@@ -54,8 +53,15 @@ var crLfReplacer = strings.NewReplacer("\r", "#", "\n", "$")
 // TODO(bep) a way to toggle ORG mode vs the rest.
 var frontMatterTests = []lexerTest{
 	{"empty", "", []Item{tstEOF}},
-	{"HTML Document", `  <html>  `, []Item{tstHTMLLead, nti(tText, "html>  "), tstEOF}},
+	{"Byte order mark", "\ufeff\nSome text.\n", []Item{nti(TypeIgnore, "\ufeff"), tstSomeText, tstEOF}},
+	{"HTML Document", `  <html>  `, []Item{nti(TypeHTMLDocument, "  <html>  "), tstEOF}},
+	{"HTML Document 2", `<html><body>Hugo Rocks</body></html>`, []Item{nti(TypeHTMLDocument, "<html><body>Hugo Rocks</body></html>"), tstEOF}},
+	{"No front matter", "\nSome text.\n", []Item{tstSomeText, tstEOF}},
 	{"YAML front matter", "---\nfoo: \"bar\"\n---\n\nSome text.\n", []Item{tstFrontMatterYAML, tstSomeText, tstEOF}},
+	{"YAML empty front matter", "---\n---\n\nSome text.\n", []Item{nti(TypeFrontMatterYAML, "\n"), tstSomeText, tstEOF}},
+
+	{"YAML commented out front matter", "<!--\n---\nfoo: \"bar\"\n---\n-->\nSome text.\n", []Item{nti(TypeHTMLComment, "<!--\n---\nfoo: \"bar\"\n---\n-->"), tstSomeText, tstEOF}},
+
 	// Note that we keep all bytes as they are, but we need to handle CRLF
 	{"YAML front matter CRLF", "---\r\nfoo: \"bar\"\r\n---\n\nSome text.\n", []Item{tstFrontMatterYAMLCRLF, tstSomeText, tstEOF}},
 	{"TOML front matter", "+++\nfoo = \"bar\"\n+++\n\nSome text.\n", []Item{tstFrontMatterTOML, tstSomeText, tstEOF}},
@@ -80,11 +86,12 @@ func TestFrontMatter(t *testing.T) {
 func collect(input []byte, skipFrontMatter bool, stateStart stateFunc) (items []Item) {
 	l := newPageLexer(input, 0, stateStart)
 	l.run()
+	t := l.newIterator()
 
 	for {
-		item := l.nextItem()
+		item := t.Next()
 		items = append(items, item)
-		if item.typ == tEOF || item.typ == tError {
+		if item.Typ == tEOF || item.Typ == tError {
 			break
 		}
 	}
@@ -97,7 +104,7 @@ func equal(i1, i2 []Item) bool {
 		return false
 	}
 	for k := range i1 {
-		if i1[k].typ != i2[k].typ {
+		if i1[k].Typ != i2[k].Typ {
 			return false
 		}
 		if !reflect.DeepEqual(i1[k].Val, i2[k].Val) {
-- 
cgit v1.2.3