summary | refs | log | tree | commit | diff | stats
path: root/hugolib
diff options
context:
space:
mode:
author: coderzh <pythonzh@gmail.com> 2015-09-03 18:22:20 +0800
committer: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> 2015-10-07 15:14:57 +0200
commit: 823334875d396bdc15770c335c2029a01a7ef2ce (patch)
tree: 26930dcd01a5433322ed38f63562421fd080a96f /hugolib
parent: 2c045ac449fbdca33daae828813a3b4a08224ef7 (diff)
WordCount and Summary: support CJK languages
* Add a global `hasCJKLanguage` flag; if true, auto-detection of CJK languages is turned on.
* Add an `isCJKLanguage` front matter field to explicitly specify whether a page is in a CJK language; when set, it takes precedence over auto-detection.
* For .Summary: if isCJKLanguage is true, use runes as the basis for truncation; otherwise keep the existing behavior.
* For WordCount: if isCJKLanguage is true, use runes as the basis for the calculation; otherwise keep the existing behavior.
* Unexport RuneCount.

Fixes #1377
Diffstat (limited to 'hugolib')
-rw-r--r-- hugolib/page.go 81
-rw-r--r-- hugolib/page_test.go 129
2 files changed, 171 insertions, 39 deletions
diff --git a/hugolib/page.go b/hugolib/page.go
index c50e2da18..e08e764af 100644
--- a/hugolib/page.go
+++ b/hugolib/page.go
@@ -28,6 +28,7 @@ import (
"net/url"
"path"
"path/filepath"
+ "regexp"
"strings"
"sync"
"time"
@@ -42,6 +43,10 @@ import (
"github.com/spf13/viper"
)
+var (
+ cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`)
+)
+
type Page struct {
Params map[string]interface{}
Content template.HTML
@@ -67,7 +72,6 @@ type Page struct {
contentShortCodes map[string]string
plain string // TODO should be []byte
plainWords []string
- plainRuneCount int
plainInit sync.Once
plainSecondaryInit sync.Once
renderingConfig *helpers.Blackfriday
@@ -78,6 +82,7 @@ type Page struct {
Node
pageMenus PageMenus
pageMenusInit sync.Once
+ isCJKLanguage bool
}
type Source struct {
@@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string {
return p.plainWords
}
-// RuneCount returns the rune count, excluding any whitespace, of the plain content.
-func (p *Page) RuneCount() int {
- p.initPlainSecondary()
- return p.plainRuneCount
-}
-
func (p *Page) initPlain() {
p.plainInit.Do(func() {
p.plain = helpers.StripHTML(string(p.Content))
@@ -125,20 +124,6 @@ func (p *Page) initPlain() {
})
}
-func (p *Page) initPlainSecondary() {
- p.plainSecondaryInit.Do(func() {
- p.initPlain()
- runeCount := 0
- for _, r := range p.plain {
- if !helpers.IsWhitespace(r) {
- runeCount++
- }
- }
- p.plainRuneCount = runeCount
- return
- })
-}
-
func (p *Page) IsNode() bool {
return false
}
@@ -218,7 +203,13 @@ func (p *Page) setSummary() {
} else {
// If hugo defines split:
// render, strip html, then split
- summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+ var summary string
+ var truncated bool
+ if p.isCJKLanguage {
+ summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength)
+ } else {
+ summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength)
+ }
p.Summary = template.HTML(summary)
p.Truncated = truncated
@@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) {
}
func (p *Page) analyzePage() {
- p.WordCount = 0
- for _, word := range p.PlainWords() {
- runeCount := utf8.RuneCountInString(word)
- if len(word) == runeCount {
- p.WordCount++
- } else {
- p.WordCount += runeCount
+ if p.isCJKLanguage {
+ p.WordCount = 0
+ for _, word := range p.PlainWords() {
+ runeCount := utf8.RuneCountInString(word)
+ if len(word) == runeCount {
+ p.WordCount++
+ } else {
+ p.WordCount += runeCount
+ }
}
+ } else {
+ p.WordCount = len(p.PlainWords())
}
-
+
p.FuzzyWordCount = int((p.WordCount+100)/100) * 100
- p.ReadingTime = int((p.WordCount + 212) / 213)
+
+ if p.isCJKLanguage {
+ p.ReadingTime = int((p.WordCount + 500) / 501)
+ } else {
+ p.ReadingTime = int((p.WordCount + 212) / 213)
+ }
}
func (p *Page) permalink() (*url.URL, error) {
@@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error {
}
m := f.(map[string]interface{})
var err error
- var draft, published *bool
+ var draft, published, isCJKLanguage *bool
for k, v := range m {
loki := strings.ToLower(k)
switch loki {
@@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error {
p.Status = cast.ToString(v)
case "sitemap":
p.Sitemap = parseSitemap(cast.ToStringMap(v))
+ case "iscjklanguage":
+ isCJKLanguage = new(bool)
+ *isCJKLanguage = cast.ToBool(v)
default:
// If not one of the explicit values, store in Params
switch vv := v.(type) {
@@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error {
p.Lastmod = p.Date
}
+ if isCJKLanguage != nil {
+ p.isCJKLanguage = *isCJKLanguage
+ } else if viper.GetBool("HasCJKLanguage") {
+ if cjk.Match(p.rawContent) {
+ p.isCJKLanguage = true
+ } else {
+ p.isCJKLanguage = false
+ }
+ }
+
return nil
}
@@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error {
p.renderable = psr.IsRenderable()
p.frontmatter = psr.FrontMatter()
+ p.rawContent = psr.Content()
+
meta, err := psr.Metadata()
if meta != nil {
if err != nil {
@@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error {
}
}
- p.rawContent = psr.Content()
-
return nil
}
diff --git a/hugolib/page_test.go b/hugolib/page_test.go
index c3506d48d..9134ba6c6 100644
--- a/hugolib/page_test.go
+++ b/hugolib/page_test.go
@@ -146,16 +146,67 @@ Summary Same Line<!--more-->
Some more text
`
- SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `---
+ SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `---
title: Simple
---
€ € € € €
+你好
+도형이
+カテゴリー
`
+ SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `---
+title: Simple
+---
+
+
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+In Chinese, 好 means good. In Chinese, 好 means good.
+More then 70 words.
+
+
+`
+ SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good. " +
+ "In Chinese, 好 means good. In Chinese, 好 means good."
+
+ SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `---
+title: Simple
+isCJKLanguage: false
+---
+
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀 means good.
+In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough.
+More then 70 words.
+
+
+`
+ SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " +
+ "In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough."
+
SIMPLE_PAGE_WITH_LONG_CONTENT = `---
title: Simple
---
@@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) {
checkPageDate(t, p, d)
}
-func TestRuneCount(t *testing.T) {
+func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) {
+ viper.Reset()
+
p, _ := NewPage("simple.md")
- _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES))
+ _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
p.Convert()
p.analyzePage()
if err != nil {
t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
}
- if p.RuneCount() != 5 {
- t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount())
+ if p.WordCount != 8 {
+ t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount)
+ }
+}
+
+func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) {
+ viper.Reset()
+ defer viper.Reset()
+
+ viper.Set("HasCJKLanguage", true)
+
+ p, _ := NewPage("simple.md")
+ _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES))
+ p.Convert()
+ p.analyzePage()
+ if err != nil {
+ t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+ }
+
+ if p.WordCount != 15 {
+ t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 15, p.WordCount)
+ }
+}
+
+func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) {
+ viper.Reset()
+ defer viper.Reset()
+
+ viper.Set("HasCJKLanguage", true)
+
+ p, _ := NewPage("simple.md")
+ _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES))
+ p.Convert()
+ p.analyzePage()
+ if err != nil {
+ t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+ }
+
+ if p.WordCount != 74 {
+ t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount)
+ }
+
+ if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY {
+ t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+ SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary)
+ }
+}
+
+func TestWordCountWithIsCJKLanguageFalse(t *testing.T) {
+ viper.Reset()
+ defer viper.Reset()
+
+ viper.Set("HasCJKLanguage", true)
+
+ p, _ := NewPage("simple.md")
+ _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE))
+ p.Convert()
+ p.analyzePage()
+ if err != nil {
+ t.Fatalf("Unable to create a page with frontmatter and body content: %s", err)
+ }
+
+ if p.WordCount != 75 {
+ t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount)
+ }
+ if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY {
+ t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain,
+ SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary)
}
}