summaryrefslogtreecommitdiffstats
path: root/related
diff options
context:
space:
mode:
authorBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>2017-08-19 13:16:00 +0200
committerBjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>2017-09-06 00:20:02 +0200
commit3b4f17bbc9ff789faa581ac278ad109d1ac5b816 (patch)
tree7b706ad5fce15afa1825b6565bae09bc517cc687 /related
parent16c9127663951ace1a3901cf669c49cc72780ced (diff)
hugolib: Implement "related content"
This closes #98, even if this commit does not do full content text search. We may revisit that problem in the future, but that deserves its own issue. Fixes #98
Diffstat (limited to 'related')
-rw-r--r--related/inverted_index.go450
-rw-r--r--related/inverted_index_test.go276
2 files changed, 726 insertions, 0 deletions
diff --git a/related/inverted_index.go b/related/inverted_index.go
new file mode 100644
index 000000000..f0d598d33
--- /dev/null
+++ b/related/inverted_index.go
@@ -0,0 +1,450 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package related holds code to help finding related content.
+package related
+
+import (
+ "errors"
+ "fmt"
+ "math"
+ "sort"
+ "strings"
+ "time"
+
+ "github.com/gohugoio/hugo/common/types"
+ "github.com/mitchellh/mapstructure"
+)
+
+var (
+ _ Keyword = (*StringKeyword)(nil)
+ zeroDate = time.Time{}
+
+ // DefaultConfig is the default related config.
+ DefaultConfig = Config{
+ Threshold: 80,
+ Indices: IndexConfigs{
+ IndexConfig{Name: "keywords", Weight: 100},
+ IndexConfig{Name: "date", Weight: 10},
+ },
+ }
+)
+
+/*
+Config is the top level configuration element used to configure how to retrieve
+related content in Hugo.
+
+An example site config.toml:
+
+ [related]
+ threshold = 1
+ [[related.indices]]
+ name = "keywords"
+ weight = 200
+ [[related.indices]]
+ name = "tags"
+ weight = 100
+ [[related.indices]]
+ name = "date"
+ weight = 1
+ pattern = "2006"
+*/
+type Config struct {
+ // Only include matches >= threshold, a normalized rank between 0 and 100.
+ Threshold int
+
+ // To get stable "See also" sections we, by default, exclude newer related pages.
+ IncludeNewer bool
+
+ // Will lower case all string values and queries to the indices.
+ // May get better results, but at a slight performance cost.
+ ToLower bool
+
+ Indices IndexConfigs
+}
+
+func (c *Config) Add(index IndexConfig) {
+ if c.ToLower {
+ index.ToLower = true
+ }
+ c.Indices = append(c.Indices, index)
+}
+
+// IndexConfigs holds a set of index configurations.
+type IndexConfigs []IndexConfig
+
+// IndexConfig configures an index.
+type IndexConfig struct {
+ // The index name. This directly maps to a field or Param name.
+ Name string
+
+ // Contextual pattern used to convert the Param value into a string.
+ // Currently only used for dates. Can be used to, say, bump posts in the same
+ // time frame when searching for related documents.
+ // For dates it follows Go's time.Format patterns, i.e.
+ // "2006" for YYYY and "200601" for YYYYMM.
+ Pattern string
+
+ // This field's weight when doing multi-index searches. Higher is "better".
+ Weight int
+
+ // Will lower case all string values in and queries tothis index.
+ // May get better accurate results, but at a slight performance cost.
+ ToLower bool
+}
+
+// Document is the interface an indexable document in Hugo must fulfill.
+type Document interface {
+ // SearchKeywords returns a list of keywords for the given index config.
+ SearchKeywords(cfg IndexConfig) ([]Keyword, error)
+
+ // When this document was or will be published.
+ PubDate() time.Time
+}
+
+// InvertedIndex holds an inverted index, also sometimes named posting list, which
+// lists, for every possible search term, the documents that contain that term.
+type InvertedIndex struct {
+ cfg Config
+ index map[string]map[Keyword][]Document
+
+ minWeight int
+ maxWeight int
+}
+
+func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
+ for _, conf := range idx.cfg.Indices {
+ if conf.Name == name {
+ return conf, true
+ }
+ }
+
+ return IndexConfig{}, false
+}
+
+// NewInvertedIndex creates a new InvertedIndex.
+// Documents to index must be added in Add.
+func NewInvertedIndex(cfg Config) *InvertedIndex {
+ idx := &InvertedIndex{index: make(map[string]map[Keyword][]Document), cfg: cfg}
+ for _, conf := range cfg.Indices {
+ idx.index[conf.Name] = make(map[Keyword][]Document)
+ if conf.Weight < idx.minWeight {
+ // By default, the weight scale starts at 0, but we allow
+ // negative weights.
+ idx.minWeight = conf.Weight
+ }
+ if conf.Weight > idx.maxWeight {
+ idx.maxWeight = conf.Weight
+ }
+ }
+ return idx
+}
+
+// Add documents to the inverted index.
+// The value must support == and !=.
+func (idx *InvertedIndex) Add(docs ...Document) error {
+ var err error
+ for _, config := range idx.cfg.Indices {
+ if config.Weight == 0 {
+ // Disabled
+ continue
+ }
+ setm := idx.index[config.Name]
+
+ for _, doc := range docs {
+ var words []Keyword
+ words, err = doc.SearchKeywords(config)
+ if err != nil {
+ continue
+ }
+
+ for _, keyword := range words {
+ setm[keyword] = append(setm[keyword], doc)
+ }
+ }
+ }
+
+ return err
+
+}
+
+// queryElement holds the index name and keywords that can be used to compose a
+// search for related content.
+type queryElement struct {
+ Index string
+ Keywords []Keyword
+}
+
+func newQueryElement(index string, keywords ...Keyword) queryElement {
+ return queryElement{Index: index, Keywords: keywords}
+}
+
+type ranks []*rank
+
+type rank struct {
+ Doc Document
+ Weight int
+ Matches int
+}
+
+func (r *rank) addWeight(w int) {
+ r.Weight += w
+ r.Matches++
+}
+
+func newRank(doc Document, weight int) *rank {
+ return &rank{Doc: doc, Weight: weight, Matches: 1}
+}
+
+func (r ranks) Len() int { return len(r) }
+func (r ranks) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
+func (r ranks) Less(i, j int) bool {
+ if r[i].Weight == r[j].Weight {
+ return r[i].Doc.PubDate().After(r[j].Doc.PubDate())
+ }
+ return r[i].Weight > r[j].Weight
+}
+
+// SearchDoc finds the documents matching any of the keywords in the given indices
+// against the given document.
+// The resulting document set will be sorted according to number of matches
+// and the index weights, and any matches with a rank below the configured
+// threshold (normalize to 0..100) will be removed.
+// If an index name is provided, only that index will be queried.
+func (idx *InvertedIndex) SearchDoc(doc Document, indices ...string) ([]Document, error) {
+ var q []queryElement
+
+ var configs IndexConfigs
+
+ if len(indices) == 0 {
+ configs = idx.cfg.Indices
+ } else {
+ configs = make(IndexConfigs, len(indices))
+ for i, indexName := range indices {
+ cfg, found := idx.getIndexCfg(indexName)
+ if !found {
+ return nil, fmt.Errorf("index %q not found", indexName)
+ }
+ configs[i] = cfg
+ }
+ }
+
+ for _, cfg := range configs {
+ keywords, err := doc.SearchKeywords(cfg)
+ if err != nil {
+ return nil, err
+ }
+
+ q = append(q, newQueryElement(cfg.Name, keywords...))
+
+ }
+
+ return idx.searchDate(doc.PubDate(), q...)
+}
+
+func (cfg IndexConfig) ToKeywords(v interface{}) ([]Keyword, error) {
+ var (
+ keywords []Keyword
+ toLower = cfg.ToLower
+ )
+ switch vv := v.(type) {
+ case string:
+ if toLower {
+ vv = strings.ToLower(vv)
+ }
+ keywords = append(keywords, StringKeyword(vv))
+ case []string:
+ if toLower {
+ for i := 0; i < len(vv); i++ {
+ vv[i] = strings.ToLower(vv[i])
+ }
+ }
+ keywords = append(keywords, StringsToKeywords(vv...)...)
+ case time.Time:
+ layout := "2006"
+ if cfg.Pattern != "" {
+ layout = cfg.Pattern
+ }
+ keywords = append(keywords, StringKeyword(vv.Format(layout)))
+ case nil:
+ return keywords, nil
+ default:
+ return keywords, fmt.Errorf("indexing currently not supported for for index %q and type %T", cfg.Name, vv)
+ }
+
+ return keywords, nil
+}
+
+// SearchKeyValues finds the documents matching any of the keywords in the given indices.
+// The resulting document set will be sorted according to number of matches
+// and the index weights, and any matches with a rank below the configured
+// threshold (normalize to 0..100) will be removed.
+func (idx *InvertedIndex) SearchKeyValues(args ...types.KeyValues) ([]Document, error) {
+ q := make([]queryElement, len(args))
+
+ for i, arg := range args {
+ var keywords []Keyword
+ key := arg.KeyString()
+ if key == "" {
+ return nil, fmt.Errorf("index %q not valid", arg.Key)
+ }
+ conf, found := idx.getIndexCfg(key)
+ if !found {
+ return nil, fmt.Errorf("index %q not found", key)
+ }
+
+ for _, val := range arg.Values {
+ k, err := conf.ToKeywords(val)
+ if err != nil {
+ return nil, err
+ }
+ keywords = append(keywords, k...)
+ }
+
+ q[i] = newQueryElement(conf.Name, keywords...)
+
+ }
+
+ return idx.search(q...)
+}
+
+func (idx *InvertedIndex) search(query ...queryElement) ([]Document, error) {
+ return idx.searchDate(zeroDate, query...)
+}
+
+func (idx *InvertedIndex) searchDate(upperDate time.Time, query ...queryElement) ([]Document, error) {
+ matchm := make(map[Document]*rank, 200)
+ applyDateFilter := !idx.cfg.IncludeNewer && !upperDate.IsZero()
+
+ for _, el := range query {
+ setm, found := idx.index[el.Index]
+ if !found {
+ return []Document{}, fmt.Errorf("index for %q not found", el.Index)
+ }
+
+ config, found := idx.getIndexCfg(el.Index)
+ if !found {
+ return []Document{}, fmt.Errorf("index config for %q not found", el.Index)
+ }
+
+ for _, kw := range el.Keywords {
+ if docs, found := setm[kw]; found {
+ for _, doc := range docs {
+ if applyDateFilter {
+ // Exclude newer than the limit given
+ if doc.PubDate().After(upperDate) {
+ continue
+ }
+ }
+ r, found := matchm[doc]
+ if !found {
+ matchm[doc] = newRank(doc, config.Weight)
+ } else {
+ r.addWeight(config.Weight)
+ }
+ }
+ }
+ }
+ }
+
+ if len(matchm) == 0 {
+ return []Document{}, nil
+ }
+
+ matches := make(ranks, 0, 100)
+
+ for _, v := range matchm {
+ avgWeight := v.Weight / v.Matches
+ weight := norm(avgWeight, idx.minWeight, idx.maxWeight)
+ threshold := idx.cfg.Threshold / v.Matches
+
+ if weight >= threshold {
+ matches = append(matches, v)
+ }
+ }
+
+ sort.Stable(matches)
+
+ result := make([]Document, len(matches))
+
+ for i, m := range matches {
+ result[i] = m.Doc
+ }
+
+ return result, nil
+}
+
+// normalizes num to a number between 0 and 100.
+func norm(num, min, max int) int {
+ if min > max {
+ panic("min > max")
+ }
+ return int(math.Floor((float64(num-min) / float64(max-min) * 100) + 0.5))
+}
+
+// DecodeConfig decodes a slice of map into Config.
+func DecodeConfig(in interface{}) (Config, error) {
+ if in == nil {
+ return Config{}, errors.New("no related config provided")
+ }
+
+ m, ok := in.(map[string]interface{})
+ if !ok {
+ return Config{}, fmt.Errorf("expected map[string]interface {} got %T", in)
+ }
+
+ if len(m) == 0 {
+ return Config{}, errors.New("empty related config provided")
+ }
+
+ var c Config
+
+ if err := mapstructure.WeakDecode(m, &c); err != nil {
+ return c, err
+ }
+
+ if c.Threshold < 0 || c.Threshold > 100 {
+ return Config{}, errors.New("related threshold must be between 0 and 100")
+ }
+
+ if c.ToLower {
+ for i, _ := range c.Indices {
+ c.Indices[i].ToLower = true
+ }
+ }
+
+ return c, nil
+}
+
+// StringKeyword is a string search keyword.
+type StringKeyword string
+
+func (s StringKeyword) String() string {
+ return string(s)
+}
+
+// Keyword is the interface a keyword in the search index must implement.
+type Keyword interface {
+ String() string
+}
+
+// StringsToKeywords converts the given slice of strings to a slice of Keyword.
+func StringsToKeywords(s ...string) []Keyword {
+ kw := make([]Keyword, len(s))
+
+ for i := 0; i < len(s); i++ {
+ kw[i] = StringKeyword(s[i])
+ }
+
+ return kw
+}
diff --git a/related/inverted_index_test.go b/related/inverted_index_test.go
new file mode 100644
index 000000000..781a969fb
--- /dev/null
+++ b/related/inverted_index_test.go
@@ -0,0 +1,276 @@
+// Copyright 2017-present The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package related
+
+import (
+ "fmt"
+ "math/rand"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+)
+
+type testDoc struct {
+ keywords map[string][]Keyword
+ date time.Time
+}
+
+func (k *testDoc) String() string {
+ s := "\n"
+ for k, v := range k.keywords {
+ s += k + ":\t\t"
+ for _, vv := range v {
+ s += " " + vv.String()
+ }
+ s += "\n"
+ }
+ return s
+}
+
+func newTestDoc(name string, keywords ...string) *testDoc {
+ km := make(map[string][]Keyword)
+
+ time.Sleep(1 * time.Millisecond)
+ kw := &testDoc{keywords: km, date: time.Now()}
+
+ kw.addKeywords(name, keywords...)
+ return kw
+}
+
+func (t *testDoc) addKeywords(name string, keywords ...string) *testDoc {
+ keywordm := createTestKeywords(name, keywords...)
+
+ for k, v := range keywordm {
+ keywords := make([]Keyword, len(v))
+ for i := 0; i < len(v); i++ {
+ keywords[i] = StringKeyword(v[i])
+ }
+ t.keywords[k] = keywords
+ }
+ return t
+}
+
+func createTestKeywords(name string, keywords ...string) map[string][]string {
+ return map[string][]string{
+ name: keywords,
+ }
+}
+
+func (k *testDoc) SearchKeywords(cfg IndexConfig) ([]Keyword, error) {
+ return k.keywords[cfg.Name], nil
+}
+
+func (k *testDoc) PubDate() time.Time {
+ return k.date
+}
+
+func TestSearch(t *testing.T) {
+
+ config := Config{
+ Threshold: 90,
+ IncludeNewer: false,
+ Indices: IndexConfigs{
+ IndexConfig{Name: "tags", Weight: 50},
+ IndexConfig{Name: "keywords", Weight: 65},
+ },
+ }
+
+ idx := NewInvertedIndex(config)
+ //idx.debug = true
+
+ docs := []Document{
+ newTestDoc("tags", "a", "b", "c", "d"),
+ newTestDoc("tags", "b", "d", "g"),
+ newTestDoc("tags", "b", "h").addKeywords("keywords", "a"),
+ newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b"),
+ }
+
+ idx.Add(docs...)
+
+ t.Run("count", func(t *testing.T) {
+ assert := require.New(t)
+ assert.Len(idx.index, 2)
+ set1, found := idx.index["tags"]
+ assert.True(found)
+ // 6 tags
+ assert.Len(set1, 6)
+
+ set2, found := idx.index["keywords"]
+ assert.True(found)
+ assert.Len(set2, 2)
+
+ })
+
+ t.Run("search-tags", func(t *testing.T) {
+ assert := require.New(t)
+ m, err := idx.search(newQueryElement("tags", StringsToKeywords("a", "b", "d", "z")...))
+ assert.NoError(err)
+ assert.Len(m, 2)
+ assert.Equal(docs[0], m[0])
+ assert.Equal(docs[1], m[1])
+ })
+
+ t.Run("search-tags-and-keywords", func(t *testing.T) {
+ assert := require.New(t)
+ m, err := idx.search(
+ newQueryElement("tags", StringsToKeywords("a", "b", "z")...),
+ newQueryElement("keywords", StringsToKeywords("a", "b")...))
+ assert.NoError(err)
+ assert.Len(m, 3)
+ assert.Equal(docs[3], m[0])
+ assert.Equal(docs[2], m[1])
+ assert.Equal(docs[0], m[2])
+ })
+
+ t.Run("searchdoc-all", func(t *testing.T) {
+ assert := require.New(t)
+ doc := newTestDoc("tags", "a").addKeywords("keywords", "a")
+ m, err := idx.SearchDoc(doc)
+ assert.NoError(err)
+ assert.Len(m, 2)
+ assert.Equal(docs[3], m[0])
+ assert.Equal(docs[2], m[1])
+ })
+
+ t.Run("searchdoc-tags", func(t *testing.T) {
+ assert := require.New(t)
+ doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b")
+ m, err := idx.SearchDoc(doc, "tags")
+ assert.NoError(err)
+ assert.Len(m, 2)
+ assert.Equal(docs[0], m[0])
+ assert.Equal(docs[1], m[1])
+ })
+
+ t.Run("searchdoc-keywords-date", func(t *testing.T) {
+ assert := require.New(t)
+ doc := newTestDoc("tags", "a", "b", "d", "z").addKeywords("keywords", "a", "b")
+ // This will get a date newer than the others.
+ newDoc := newTestDoc("keywords", "a", "b")
+ idx.Add(newDoc)
+
+ m, err := idx.SearchDoc(doc, "keywords")
+ assert.NoError(err)
+ assert.Len(m, 2)
+ assert.Equal(docs[3], m[0])
+ })
+
+}
+
+func BenchmarkRelatedNewIndex(b *testing.B) {
+
+ pages := make([]*testDoc, 100)
+ numkeywords := 30
+ allKeywords := make([]string, numkeywords)
+ for i := 0; i < numkeywords; i++ {
+ allKeywords[i] = fmt.Sprintf("keyword%d", i+1)
+ }
+
+ for i := 0; i < len(pages); i++ {
+ start := rand.Intn(len(allKeywords))
+ end := start + 3
+ if end >= len(allKeywords) {
+ end = start + 1
+ }
+
+ kw := newTestDoc("tags", allKeywords[start:end]...)
+ if i%5 == 0 {
+ start := rand.Intn(len(allKeywords))
+ end := start + 3
+ if end >= len(allKeywords) {
+ end = start + 1
+ }
+ kw.addKeywords("keywords", allKeywords[start:end]...)
+ }
+
+ pages[i] = kw
+ }
+
+ cfg := Config{
+ Threshold: 50,
+ Indices: IndexConfigs{
+ IndexConfig{Name: "tags", Weight: 100},
+ IndexConfig{Name: "keywords", Weight: 200},
+ },
+ }
+
+ b.Run("singles", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ idx := NewInvertedIndex(cfg)
+ for _, doc := range pages {
+ idx.Add(doc)
+ }
+ }
+ })
+
+ b.Run("all", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ idx := NewInvertedIndex(cfg)
+ docs := make([]Document, len(pages))
+ for i := 0; i < len(pages); i++ {
+ docs[i] = pages[i]
+ }
+ idx.Add(docs...)
+ }
+ })
+
+}
+
+func BenchmarkRelatedMatchesIn(b *testing.B) {
+
+ q1 := newQueryElement("tags", StringsToKeywords("keyword2", "keyword5", "keyword32", "asdf")...)
+ q2 := newQueryElement("keywords", StringsToKeywords("keyword3", "keyword4")...)
+
+ docs := make([]*testDoc, 1000)
+ numkeywords := 20
+ allKeywords := make([]string, numkeywords)
+ for i := 0; i < numkeywords; i++ {
+ allKeywords[i] = fmt.Sprintf("keyword%d", i+1)
+ }
+
+ cfg := Config{
+ Threshold: 20,
+ Indices: IndexConfigs{
+ IndexConfig{Name: "tags", Weight: 100},
+ IndexConfig{Name: "keywords", Weight: 200},
+ },
+ }
+
+ idx := NewInvertedIndex(cfg)
+
+ for i := 0; i < len(docs); i++ {
+ start := rand.Intn(len(allKeywords))
+ end := start + 3
+ if end >= len(allKeywords) {
+ end = start + 1
+ }
+
+ index := "tags"
+ if i%5 == 0 {
+ index = "keywords"
+ }
+
+ idx.Add(newTestDoc(index, allKeywords[start:end]...))
+ }
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ if i%10 == 0 {
+ idx.search(q2)
+ } else {
+ idx.search(q1)
+ }
+ }
+}