diff options
author | coderzh <pythonzh@gmail.com> | 2015-09-03 18:22:20 +0800 |
---|---|---|
committer | Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com> | 2015-10-07 15:14:57 +0200 |
commit | 823334875d396bdc15770c335c2029a01a7ef2ce (patch) | |
tree | 26930dcd01a5433322ed38f63562421fd080a96f /hugolib | |
parent | 2c045ac449fbdca33daae828813a3b4a08224ef7 (diff) |
WordCount and Summary support CJK Language
* Add a global `hasCJKLanguage` flag; if true, each page's content is auto-detected for CJK (Han, Hangul, Hiragana, Katakana) characters
* Add an `isCJKLanguage` front matter field to explicitly specify whether a page is in a CJK language, overriding auto-detection
* For .Summary: if isCJKLanguage is true, use runes as the basis for truncation; otherwise keep the existing behavior.
* For WordCount: if isCJKLanguage is true, use runes as the basis for the calculation; otherwise keep the existing behavior.
* Unexport RuneCount
Fixes #1377
Diffstat (limited to 'hugolib')
-rw-r--r-- | hugolib/page.go | 81 | ||||
-rw-r--r-- | hugolib/page_test.go | 129 |
2 files changed, 171 insertions, 39 deletions
diff --git a/hugolib/page.go b/hugolib/page.go index c50e2da18..e08e764af 100644 --- a/hugolib/page.go +++ b/hugolib/page.go @@ -28,6 +28,7 @@ import ( "net/url" "path" "path/filepath" + "regexp" "strings" "sync" "time" @@ -42,6 +43,10 @@ import ( "github.com/spf13/viper" ) +var ( + cjk = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`) +) + type Page struct { Params map[string]interface{} Content template.HTML @@ -67,7 +72,6 @@ type Page struct { contentShortCodes map[string]string plain string // TODO should be []byte plainWords []string - plainRuneCount int plainInit sync.Once plainSecondaryInit sync.Once renderingConfig *helpers.Blackfriday @@ -78,6 +82,7 @@ type Page struct { Node pageMenus PageMenus pageMenusInit sync.Once + isCJKLanguage bool } type Source struct { @@ -111,12 +116,6 @@ func (p *Page) PlainWords() []string { return p.plainWords } -// RuneCount returns the rune count, excluding any whitespace, of the plain content. -func (p *Page) RuneCount() int { - p.initPlainSecondary() - return p.plainRuneCount -} - func (p *Page) initPlain() { p.plainInit.Do(func() { p.plain = helpers.StripHTML(string(p.Content)) @@ -125,20 +124,6 @@ func (p *Page) initPlain() { }) } -func (p *Page) initPlainSecondary() { - p.plainSecondaryInit.Do(func() { - p.initPlain() - runeCount := 0 - for _, r := range p.plain { - if !helpers.IsWhitespace(r) { - runeCount++ - } - } - p.plainRuneCount = runeCount - return - }) -} - func (p *Page) IsNode() bool { return false } @@ -218,7 +203,13 @@ func (p *Page) setSummary() { } else { // If hugo defines split: // render, strip html, then split - summary, truncated := helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) + var summary string + var truncated bool + if p.isCJKLanguage { + summary, truncated = helpers.TruncateWordsByRune(p.PlainWords(), helpers.SummaryLength) + } else { + summary, truncated = helpers.TruncateWordsToWholeSentence(p.PlainWords(), helpers.SummaryLength) + } p.Summary 
= template.HTML(summary) p.Truncated = truncated @@ -363,18 +354,27 @@ func (p *Page) ReadFrom(buf io.Reader) (int64, error) { } func (p *Page) analyzePage() { - p.WordCount = 0 - for _, word := range p.PlainWords() { - runeCount := utf8.RuneCountInString(word) - if len(word) == runeCount { - p.WordCount++ - } else { - p.WordCount += runeCount + if p.isCJKLanguage { + p.WordCount = 0 + for _, word := range p.PlainWords() { + runeCount := utf8.RuneCountInString(word) + if len(word) == runeCount { + p.WordCount++ + } else { + p.WordCount += runeCount + } } + } else { + p.WordCount = len(p.PlainWords()) } - + p.FuzzyWordCount = int((p.WordCount+100)/100) * 100 - p.ReadingTime = int((p.WordCount + 212) / 213) + + if p.isCJKLanguage { + p.ReadingTime = int((p.WordCount + 500) / 501) + } else { + p.ReadingTime = int((p.WordCount + 212) / 213) + } } func (p *Page) permalink() (*url.URL, error) { @@ -481,7 +481,7 @@ func (p *Page) update(f interface{}) error { } m := f.(map[string]interface{}) var err error - var draft, published *bool + var draft, published, isCJKLanguage *bool for k, v := range m { loki := strings.ToLower(k) switch loki { @@ -542,6 +542,9 @@ func (p *Page) update(f interface{}) error { p.Status = cast.ToString(v) case "sitemap": p.Sitemap = parseSitemap(cast.ToStringMap(v)) + case "iscjklanguage": + isCJKLanguage = new(bool) + *isCJKLanguage = cast.ToBool(v) default: // If not one of the explicit values, store in Params switch vv := v.(type) { @@ -596,6 +599,16 @@ func (p *Page) update(f interface{}) error { p.Lastmod = p.Date } + if isCJKLanguage != nil { + p.isCJKLanguage = *isCJKLanguage + } else if viper.GetBool("HasCJKLanguage") { + if cjk.Match(p.rawContent) { + p.isCJKLanguage = true + } else { + p.isCJKLanguage = false + } + } + return nil } @@ -766,6 +779,8 @@ func (p *Page) parse(reader io.Reader) error { p.renderable = psr.IsRenderable() p.frontmatter = psr.FrontMatter() + p.rawContent = psr.Content() + meta, err := psr.Metadata() if meta != 
nil { if err != nil { @@ -778,8 +793,6 @@ func (p *Page) parse(reader io.Reader) error { } } - p.rawContent = psr.Content() - return nil } diff --git a/hugolib/page_test.go b/hugolib/page_test.go index c3506d48d..9134ba6c6 100644 --- a/hugolib/page_test.go +++ b/hugolib/page_test.go @@ -146,16 +146,67 @@ Summary Same Line<!--more--> Some more text ` - SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES = `--- + SIMPLE_PAGE_WITH_ALL_CJK_RUNES = `--- title: Simple --- € € € € € +你好 +도형이 +カテゴリー ` + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES = `--- +title: Simple +--- + + +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +In Chinese, 好 means good. In Chinese, 好 means good. +More then 70 words. + + +` + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY = "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good. " + + "In Chinese, 好 means good. In Chinese, 好 means good." + + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE = `--- +title: Simple +isCJKLanguage: false +--- + +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. +In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough. +More then 70 words. 
+ + +` + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY = "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀 means good. " + + "In Chinese, 好的啊 means good. In Chinese, 好的呀呀 means good enough." + SIMPLE_PAGE_WITH_LONG_CONTENT = `--- title: Simple --- @@ -584,18 +635,86 @@ func TestPageWithDate(t *testing.T) { checkPageDate(t, p, d) } -func TestRuneCount(t *testing.T) { +func TestWordCountWithAllCJKRunesWithoutHasCJKLanguage(t *testing.T) { + viper.Reset() + p, _ := NewPage("simple.md") - _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_FIVE_MULTIBYTE_UFT8_RUNES)) + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)) p.Convert() p.analyzePage() if err != nil { t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) } - if p.RuneCount() != 5 { - t.Fatalf("incorrect rune count for content '%s'. expected %v, got %v", p.plain, 5, p.RuneCount()) + if p.WordCount != 8 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 8, p.WordCount) + } +} + +func TestWordCountWithAllCJKRunesHasCJKLanguage(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ALL_CJK_RUNES)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 15 { + t.Fatalf("incorrect word count for content '%s'. 
expected %v, got %v", p.plain, 15, p.WordCount) + } +} + +func TestWordCountWithMainEnglishWithCJKRunes(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 74 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 74, p.WordCount) + } + + if p.Summary != SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY { + t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain, + SIMPLE_PAGE_WITH_MAIN_ENGLISH_WITH_CJK_RUNES_SUMMARY, p.Summary) + } +} + +func TestWordCountWithIsCJKLanguageFalse(t *testing.T) { + viper.Reset() + defer viper.Reset() + + viper.Set("HasCJKLanguage", true) + + p, _ := NewPage("simple.md") + _, err := p.ReadFrom(strings.NewReader(SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE)) + p.Convert() + p.analyzePage() + if err != nil { + t.Fatalf("Unable to create a page with frontmatter and body content: %s", err) + } + + if p.WordCount != 75 { + t.Fatalf("incorrect word count for content '%s'. expected %v, got %v", p.plain, 75, p.WordCount) + } + if p.Summary != SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY { + t.Fatalf("incorrect Summary for content '%s'. expected %v, got %v", p.plain, + SIMPLE_PAGE_WITH_ISCJKLANGUAGE_FALSE_SUMMARY, p.Summary) } } |