1 files changed, 494 insertions, 0 deletions
diff --git a/common/paths/pathparser.go b/common/paths/pathparser.go
new file mode 100644
index 000000000..842d9307b
--- /dev/null
+++ b/common/paths/pathparser.go
@@ -0,0 +1,494 @@
+// Copyright 2024 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paths
+
+import (
+	"path"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"github.com/gohugoio/hugo/common/types"
+	"github.com/gohugoio/hugo/hugofs/files"
+)
+
+// defaultPathParser is the parser used by the package-level Parse function.
+// It is declared here as a zero value, i.e. without a LanguageIndex.
+var defaultPathParser PathParser
+
+// PathParser parses a path into a Path.
+type PathParser struct {
+	// Maps the language code to its index in the languages/sites slice.
+	// When parsing, only key membership is consulted: a file name identifier
+	// is treated as a language when it is a key in this map.
+	LanguageIndex map[string]int
+}
+
+// Parse parses component c with path s into Path using the default path parser.
+// It delegates to PathParser.Parse, which panics if parsing reports an error.
+func Parse(c, s string) *Path {
+	return defaultPathParser.Parse(c, s)
+}
+
+// NormalizePathStringBasic returns s normalized using the very basic Hugo
+// rules: the string is lower-cased and every space is replaced with a hyphen.
+func NormalizePathStringBasic(s string) string {
+	lowered := strings.ToLower(s)
+	return strings.ReplaceAll(lowered, " ", "-")
+}
+
+// Parse parses component c with path s into Path using Hugo's content path rules.
+// An error from the underlying parse is not returned; it is raised as a panic.
+func (pp PathParser) Parse(c, s string) *Path {
+	parsed, err := pp.parse(c, s)
+	if err == nil {
+		return parsed
+	}
+	panic(err)
+}
+
+// parse parses s twice when needed: once in its normalized form (see
+// NormalizePathStringBasic), which becomes the returned Path, and once as
+// given, which is attached as the returned Path's unnormalized variant.
+// When normalization does not change s, the returned Path is its own
+// unnormalized variant.
+func (pp *PathParser) parse(component, s string) (*Path, error) {
+	normalized := NormalizePathStringBasic(s)
+
+	p, err := pp.doParse(component, normalized)
+	if err != nil {
+		return nil, err
+	}
+
+	if normalized == s {
+		p.unnormalized = p
+		return p, nil
+	}
+
+	// Preserve the original case for titles etc.
+	original, err := pp.doParse(component, s)
+	if err != nil {
+		return nil, err
+	}
+	p.unnormalized = original
+
+	return p, nil
+}
+
+// doParse builds a Path for s within the given component.
+//
+// s is first brought into canonical shape (leading slash, no trailing slash;
+// on Windows it is also converted to forward slashes and cleaned). The string
+// is then scanned once from its last byte to its first, recording the
+// positions of the identifiers (dot separated suffixes of the last path
+// element) and of the slashes that delimit the last element, its container
+// and the section. Finally, for content files, the bundle type is derived
+// from the base name of the last element.
+//
+// The returned error is always nil in this implementation.
+func (pp *PathParser) doParse(component, s string) (*Path, error) {
+	p := &Path{
+		component:             component,
+		posContainerLow:       -1,
+		posContainerHigh:      -1,
+		posSectionHigh:        -1,
+		posIdentifierLanguage: -1,
+	}
+
+	// Language identifiers are only recognized when a language index is
+	// configured and the component is content or layouts.
+	hasLang := pp.LanguageIndex != nil
+	hasLang = hasLang && (component == files.ComponentFolderContent || component == files.ComponentFolderLayouts)
+
+	if runtime.GOOS == "windows" {
+		s = path.Clean(filepath.ToSlash(s))
+		if s == "." {
+			s = ""
+		}
+	}
+
+	if s == "" {
+		s = "/"
+	}
+
+	// Leading slash, no trailing slash.
+	if !strings.HasPrefix(s, "/") {
+		s = "/" + s
+	}
+
+	if s != "/" && s[len(s)-1] == '/' {
+		s = s[:len(s)-1]
+	}
+
+	p.s = s
+	slashCount := 0
+
+	// Scan right to left. posContainerHigh stays -1 until the first slash is
+	// seen, so dots are only considered inside the last path element.
+	for i := len(s) - 1; i >= 0; i-- {
+		c := s[i]
+
+		switch c {
+		case '.':
+			if p.posContainerHigh == -1 {
+				// The candidate identifier ends where the previously
+				// stored identifier's dot is, or at the end of the string.
+				var high int
+				if len(p.identifiers) > 0 {
+					high = p.identifiers[len(p.identifiers)-1].Low - 1
+				} else {
+					high = len(p.s)
+				}
+				id := types.LowHigh{Low: i + 1, High: high}
+				if len(p.identifiers) == 0 {
+					// The rightmost identifier (the extension) is always stored.
+					p.identifiers = append(p.identifiers, id)
+				} else if len(p.identifiers) == 1 {
+					// Check for a valid language.
+					s := p.s[id.Low:id.High]
+
+					if hasLang {
+						if _, found := pp.LanguageIndex[s]; found {
+							p.posIdentifierLanguage = 1
+							p.identifiers = append(p.identifiers, id)
+						}
+					}
+				}
+			}
+		case '/':
+			slashCount++
+			// First slash seen: start of the last element.
+			// Second slash seen: start of the container's name.
+			if p.posContainerHigh == -1 {
+				p.posContainerHigh = i + 1
+			} else if p.posContainerLow == -1 {
+				p.posContainerLow = i + 1
+			}
+			// Overwritten on every non-leading slash, so it ends up at the
+			// leftmost one, i.e. the end of the first path element.
+			if i > 0 {
+				p.posSectionHigh = i
+			}
+		}
+	}
+
+	isContentComponent := p.component == files.ComponentFolderContent || p.component == files.ComponentFolderArchetypes
+	isContent := isContentComponent && files.IsContentExt(p.Ext())
+
+	if isContent {
+		// NOTE(review): assumes files.IsContentExt is false for an empty
+		// extension, so p.identifiers is non-empty here — confirm.
+		id := p.identifiers[len(p.identifiers)-1]
+		// The name of the last element with all stored identifiers removed.
+		b := p.s[p.posContainerHigh : id.Low-1]
+		switch b {
+		case "index":
+			p.bundleType = PathTypeLeaf
+		case "_index":
+			p.bundleType = PathTypeBranch
+		default:
+			p.bundleType = PathTypeContentSingle
+		}
+
+		// A leaf bundle directly below the root (e.g. /my-bundle/index.md)
+		// gets no section: Section returns "" for posSectionHigh <= 0.
+		if slashCount == 2 && p.IsLeafBundle() {
+			p.posSectionHigh = 0
+		}
+	}
+
+	return p, nil
+}
+
+// ModifyPathBundleTypeResource changes the bundle type of p in place to a
+// resource type: PathTypeContentResource if p currently is a content file,
+// PathTypeFile otherwise.
+func ModifyPathBundleTypeResource(p *Path) {
+	resourceType := PathTypeFile
+	if p.IsContent() {
+		resourceType = PathTypeContentResource
+	}
+	p.bundleType = resourceType
+}
+
+// PathType classifies a Path. The constants are ordered, and the order is
+// significant: IsContent, isContentPage and IsBundle use >= comparisons
+// against these values.
+type PathType int
+
+const (
+	// A generic resource, e.g. a JSON file.
+	PathTypeFile PathType = iota
+
+	// All below are content files.
+	// A resource of a content type with front matter.
+	PathTypeContentResource
+
+	// E.g. /blog/my-post.md
+	PathTypeContentSingle
+
+	// All below are bundled content files.
+
+	// Leaf bundles, e.g. /blog/my-post/index.md
+	PathTypeLeaf
+
+	// Branch bundles, e.g. /blog/_index.md
+	PathTypeBranch
+)
+
+// Path is a parsed path. It stores the path string once, together with the
+// byte positions needed to slice out its parts on demand.
+type Path struct {
+	// The path with a leading slash and no trailing slash (the root is "/").
+	s string
+
+	// Index in s where the name of the container (the directory holding the
+	// last element) starts, or -1 if there is none.
+	posContainerLow int
+	// Index in s where the last path element starts.
+	posContainerHigh int
+	// Index in s of the slash that ends the first path element;
+	// values <= 0 mean that there is no section.
+	posSectionHigh int
+
+	// The component this path belongs to (e.g. "content").
+	component  string
+	bundleType PathType
+
+	// Byte ranges in s of the identifiers found in the last path element,
+	// ordered right to left: index 0 is the extension.
+	identifiers []types.LowHigh
+
+	// Index into identifiers of the language identifier, or -1 if none.
+	posIdentifierLanguage int
+
+	// If set, the leading slash is stripped from the strings passed through norm.
+	trimLeadingSlash bool
+
+	// The Path parsed from the original (not normalized) string.
+	// Points to the Path itself when normalization did not change the string.
+	unnormalized *Path
+}
+
+// TrimLeadingSlash returns a copy of the Path with the leading slash removed.
+// The receiver is passed by value, so the original Path is left untouched;
+// the copy is shallow.
+func (p Path) TrimLeadingSlash() *Path {
+	trimmed := p
+	trimmed.trimLeadingSlash = true
+	return &trimmed
+}
+
+// norm applies the Path's output options to s: when trimLeadingSlash is set,
+// a single leading slash is removed; otherwise s is returned unchanged.
+func (p *Path) norm(s string) string {
+	if !p.trimLeadingSlash {
+		return s
+	}
+	return strings.TrimPrefix(s, "/")
+}
+
+// IdentifierBase satisfies identity.Identity.
+// It returns Base without its leading slash.
+func (p *Path) IdentifierBase() string {
+	// Base may already lack the leading slash (see TrimLeadingSlash), so the
+	// first byte must not be sliced off unconditionally: that would drop a
+	// real character, or panic on the root path whose trimmed base is empty.
+	return strings.TrimPrefix(p.Base(), "/")
+}
+
+// Component returns the component for this path (e.g. "content"),
+// as passed to the parser.
+func (p *Path) Component() string {
+	return p.component
+}
+
+// Container returns the base name of the container directory for this path,
+// i.e. the name of the directory holding the last path element.
+// It returns an empty string when the last element sits directly below the root.
+func (p *Path) Container() string {
+	low := p.posContainerLow
+	if low == -1 {
+		return ""
+	}
+	// posContainerHigh is the start of the last element; step back over the slash.
+	high := p.posContainerHigh - 1
+	return p.norm(p.s[low:high])
+}
+
+// ContainerDir returns the container directory for this path.
+// For content bundles this will be the parent directory, i.e. the directory
+// of the bundle's container; for everything else it is the same as Dir.
+func (p *Path) ContainerDir() string {
+	if p.posContainerLow != -1 && p.IsBundle() {
+		return p.norm(p.s[:p.posContainerLow-1])
+	}
+	return p.Dir()
+}
+
+// Section returns the first path element (section),
+// or an empty string if the path has no section.
+func (p *Path) Section() string {
+	if end := p.posSectionHigh; end > 0 {
+		// Skip the leading slash.
+		return p.norm(p.s[1:end])
+	}
+	return ""
+}
+
+// IsContent returns true if the path is a content file (e.g. mypost.md).
+// Note that this will also return true for content files in a bundle,
+// including content resources.
+func (p *Path) IsContent() bool {
+	return p.bundleType >= PathTypeContentResource
+}
+
+// isContentPage returns true if the path is a content page: a single content
+// file (e.g. mypost.md) or the index file of a leaf or branch bundle.
+// It returns false for content resources and for non-content files.
+func (p *Path) isContentPage() bool {
+	return p.bundleType >= PathTypeContentSingle
+}
+
+// Name returns the last element of path, including any identifiers.
+func (p *Path) Name() string {
+	if start := p.posContainerHigh; start > 0 {
+		return p.s[start:]
+	}
+	return p.s
+}
+
+// NameNoExt returns the last element of path without any extension.
+// Only the extension (the rightmost identifier) is removed.
+func (p *Path) NameNoExt() string {
+	if len(p.identifiers) == 0 {
+		return p.s[p.posContainerHigh:]
+	}
+	// Stop right before the dot that introduces the extension.
+	ext := p.identifiers[0]
+	return p.s[p.posContainerHigh : ext.Low-1]
+}
+
+// NameNoLang returns the last element of path without any language identifier.
+// If the path has no language identifier, this is the same as Name.
+func (p *Path) NameNoLang() string {
+	if idx := p.identifierIndex(p.posIdentifierLanguage); idx != -1 {
+		lang := p.identifiers[idx]
+		// Cut out ".<lang>": everything up to its dot plus what follows it.
+		return p.s[p.posContainerHigh:lang.Low-1] + p.s[lang.High:]
+	}
+
+	return p.Name()
+}
+
+// BaseNameNoIdentifier returns the logical base name for a resource without any identifier (e.g. no extension).
+// For bundles this will be the containing directory's name, e.g. "blog".
+func (p *Path) BaseNameNoIdentifier() string {
+	if !p.IsBundle() {
+		return p.NameNoIdentifier()
+	}
+	return p.Container()
+}
+
+// NameNoIdentifier returns the last element of path without any identifier (e.g. no extension).
+func (p *Path) NameNoIdentifier() string {
+	n := len(p.identifiers)
+	if n == 0 {
+		return p.s[p.posContainerHigh:]
+	}
+	// The last stored identifier is the leftmost one in the name.
+	leftmost := p.identifiers[n-1]
+	return p.s[p.posContainerHigh : leftmost.Low-1]
+}
+
+// Dir returns all but the last element of path, typically the path's directory.
+// The directory of an element directly below the root is "/".
+func (p *Path) Dir() (d string) {
+	d = "/"
+	// posContainerHigh == 1 means the last element directly follows the
+	// leading slash; the directory is then the root.
+	if p.posContainerHigh > 1 {
+		d = p.s[:p.posContainerHigh-1]
+	}
+	return p.norm(d)
+}
+
+// Path returns the full path, including all identifiers.
+// The leading slash is removed if the Path was created with TrimLeadingSlash.
+func (p *Path) Path() (d string) {
+	return p.norm(p.s)
+}
+
+// Unmormalized returns the Path parsed from the original, unnormalized
+// string, i.e. with the original case preserved.
+// This is the Path itself when normalization did not change the string.
+func (p *Path) Unmormalized() *Path {
+	return p.unnormalized
+}
+
+// PathNoLang returns the Path but with any language identifier removed.
+// The extension is preserved.
+func (p *Path) PathNoLang() string {
+	return p.base(true, false)
+}
+
+// PathNoIdentifier returns the Path but with any identifier (ext, lang) removed.
+// Unlike Base, the last path element is kept for bundles.
+func (p *Path) PathNoIdentifier() string {
+	return p.base(false, false)
+}
+
+// PathRel returns the path relative to the given owner.
+// The owner's base path, followed by a slash, is removed from the start of
+// this path; if this path does not start with it, it is returned unchanged.
+func (p *Path) PathRel(owner *Path) string {
+	prefix := owner.Base()
+	if !strings.HasSuffix(prefix, "/") {
+		prefix = prefix + "/"
+	}
+	rel := strings.TrimPrefix(p.Path(), prefix)
+	return rel
+}
+
+// BaseRel returns the base path relative to the given owner.
+//
+// The owner's base is skipped by length, not by prefix match, so the result
+// is only meaningful when p is located below owner. If p's base is too short
+// to have anything below the owner's base (e.g. p and owner are the same
+// path), an empty string is returned.
+func (p *Path) BaseRel(owner *Path) string {
+	ob := owner.Base()
+	if ob == "/" {
+		ob = ""
+	}
+
+	base := p.Base()
+	// Skip the owner's base and the slash that follows it.
+	start := len(ob) + 1
+	if start > len(base) {
+		// Slicing would be out of range and panic.
+		return ""
+	}
+	return base[start:]
+}
+
+// Base returns the base path.
+//
+// For content files, this is the path without any identifiers (extension,
+// language code etc.). Any 'index' as the last path element is ignored.
+//
+// For other files (Resources), any extension is kept.
+func (p *Path) Base() string {
+	preserveExt := !p.isContentPage()
+	return p.base(preserveExt, p.IsBundle())
+}
+
+// BaseNoLeadingSlash returns the base path without the leading slash.
+func (p *Path) BaseNoLeadingSlash() string {
+	// Base may already lack the leading slash (see TrimLeadingSlash), so the
+	// first byte must not be sliced off unconditionally: that would drop a
+	// real character, or panic on the root path whose trimmed base is empty.
+	return strings.TrimPrefix(p.Base(), "/")
+}
+
+// base returns the path with identifiers removed.
+//
+// If preserveExt is set, the extension is kept and only the other identifiers
+// (i.e. the language) are removed. If isBundle is set, the last path element
+// is dropped as well, so the result is the bundle's directory.
+func (p *Path) base(preserveExt, isBundle bool) string {
+	n := len(p.identifiers)
+
+	// Nothing to remove: no identifiers at all, or only the extension
+	// which is to be preserved.
+	if n == 0 || (preserveExt && n == 1) {
+		return p.norm(p.s)
+	}
+
+	var high int
+	if isBundle {
+		// Cut before the slash in front of the last element.
+		high = p.posContainerHigh - 1
+	} else {
+		// Cut before the dot of the leftmost identifier.
+		high = p.identifiers[n-1].Low - 1
+	}
+
+	// Never cut away the leading slash.
+	if high == 0 {
+		high = 1
+	}
+
+	result := p.s[:high]
+	if preserveExt {
+		// For txt files etc. we want to preserve the extension.
+		ext := p.identifiers[0]
+		result += p.s[ext.Low-1 : ext.High]
+	}
+
+	return p.norm(result)
+}
+
+// Ext returns the extension of the last path element without the leading
+// dot, or an empty string if there is none.
+func (p *Path) Ext() string {
+	return p.identifierAsString(0)
+}
+
+// Lang returns the identifier at index 1, or an empty string if there is
+// none. The parser only stores a second identifier when it is a key in its
+// LanguageIndex, so this is the language code.
+func (p *Path) Lang() string {
+	return p.identifierAsString(1)
+}
+
+// Identifier returns the identifier at index i, counted from the right
+// (index 0 is the extension), or an empty string if i is out of range.
+func (p *Path) Identifier(i int) string {
+	return p.identifierAsString(i)
+}
+
+// Identifiers returns all identifiers of the path as strings, in stored
+// order: the extension first. The result is never nil.
+func (p *Path) Identifiers() []string {
+	out := make([]string, 0, len(p.identifiers))
+	for _, r := range p.identifiers {
+		out = append(out, p.s[r.Low:r.High])
+	}
+	return out
+}
+
+// IsHTML reports whether files.IsHTML accepts the path's extension.
+func (p *Path) IsHTML() bool {
+	return files.IsHTML(p.Ext())
+}
+
+// BundleType returns the PathType of this path.
+func (p *Path) BundleType() PathType {
+	return p.bundleType
+}
+
+// IsBundle reports whether the path is the index file of a bundle,
+// i.e. its type is PathTypeLeaf or PathTypeBranch.
+func (p *Path) IsBundle() bool {
+	return p.bundleType >= PathTypeLeaf
+}
+
+// IsBranchBundle reports whether the path is of type PathTypeBranch.
+func (p *Path) IsBranchBundle() bool {
+	return p.bundleType == PathTypeBranch
+}
+
+// IsLeafBundle reports whether the path is of type PathTypeLeaf.
+func (p *Path) IsLeafBundle() bool {
+	return p.bundleType == PathTypeLeaf
+}
+
+// identifierAsString returns the text of the identifier at index i,
+// or an empty string if i is not a valid identifier index.
+func (p *Path) identifierAsString(i int) string {
+	if idx := p.identifierIndex(i); idx != -1 {
+		r := p.identifiers[idx]
+		return p.s[r.Low:r.High]
+	}
+
+	return ""
+}
+
+// identifierIndex returns i if it is a valid index into the path's
+// identifiers, and -1 otherwise.
+func (p *Path) identifierIndex(i int) int {
+	if i >= 0 && i < len(p.identifiers) {
+		return i
+	}
+	return -1
+}
+
+// HasExt returns true if the Unix styled path has an extension,
+// i.e. if the element after the last slash contains a dot.
+func HasExt(p string) bool {
+	// LastIndexByte returns -1 when there is no slash, making the whole
+	// string the last element.
+	last := p[strings.LastIndexByte(p, '/')+1:]
+	return strings.IndexByte(last, '.') != -1
+}