summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBram Moolenaar <Bram@vim.org>2020-06-04 18:22:13 +0200
committerBram Moolenaar <Bram@vim.org>2020-06-04 18:22:13 +0200
commite52702f00322c8a8861efd0bd6a3775e685e5685 (patch)
treeffd498e52ee2b139f3a145147aca02afc5013d63
parent9155825b2428ef6bf654204a534b033a6879c90d (diff)
patch 8.2.0901: formatting CJK text isn't optimalv8.2.0901
Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes #3875)
-rw-r--r--runtime/doc/change.txt4
-rw-r--r--src/mbyte.c152
-rw-r--r--src/ops.c5
-rw-r--r--src/option.h3
-rw-r--r--src/proto/mbyte.pro3
-rw-r--r--src/testdir/Make_all.mak2
-rw-r--r--src/testdir/test_cjk_linebreak.vim91
-rw-r--r--src/textformat.c76
-rw-r--r--src/version.c2
9 files changed, 329 insertions, 9 deletions
diff --git a/runtime/doc/change.txt b/runtime/doc/change.txt
index 59004af14a..39e7b48ff9 100644
--- a/runtime/doc/change.txt
+++ b/runtime/doc/change.txt
@@ -1688,6 +1688,10 @@ B When joining lines, don't insert a space between two multi-byte
characters. Overruled by the 'M' flag.
1 Don't break a line after a one-letter word. It's broken before it
instead (if possible).
+] Respect textwidth rigorously. With this flag set, no line can be
+ longer than textwidth, unless line-break-prohibition rules make this
+ impossible. Mainly for CJK scripts and works only if 'encoding' is
+ "utf-8".
j Where it makes sense, remove a comment leader when joining lines. For
example, joining:
int i; // the index ~
diff --git a/src/mbyte.c b/src/mbyte.c
index de732c4fca..4f4210a532 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3843,6 +3843,158 @@ utf_head_off(char_u *base, char_u *p)
}
/*
+ * Whether space is NOT allowed before/after "cc".
+ * Returns non-zero when the character "cc" lies in one of the punctuation
+ * ranges tested below, zero otherwise.  The ranges are Unicode code points;
+ * the callers added to do_join() only consult this when enc_utf8 is set.
+ */
+ int
+utf_eat_space(int cc)
+{
+ return ((cc >= 0x2000 && cc <= 0x206F) // U+2000..U+206F: General Punctuation
+ || (cc >= 0x2e00 && cc <= 0x2e7f) // U+2E00..U+2E7F: Supplemental Punctuation
+ || (cc >= 0x3000 && cc <= 0x303f) // U+3000..U+303F: CJK Symbols and Punctuation
+ || (cc >= 0xff01 && cc <= 0xff0f) // U+FF01..U+FF0F: fullwidth ASCII punctuation (before the digits)
+ || (cc >= 0xff1a && cc <= 0xff20) // U+FF1A..U+FF20: fullwidth ASCII punctuation (between digits and letters)
+ || (cc >= 0xff3b && cc <= 0xff40) // U+FF3B..U+FF40: fullwidth ASCII punctuation (between upper and lower case)
+ || (cc >= 0xff5b && cc <= 0xff65)); // U+FF5B..U+FF65: fullwidth ASCII punctuation after the letters, plus halfwidth CJK punctuation
+}
+
+/*
+ * Whether line break is allowed before "cc".
+ * Returns FALSE when "cc" is listed in BOL_prohibition_punct[] (characters
+ * that must not start a line), non-zero otherwise.
+ */
+ int
+utf_allow_break_before(int cc)
+{
+ static const int BOL_prohibition_punct[] = // must stay sorted ascending: it is binary-searched below
+ {
+ '!',
+ '%',
+ ')',
+ ',',
+ ':',
+ ';',
+ '>',
+ '?',
+ ']',
+ '}',
+ 0x2019, // ’ right single quotation mark
+ 0x201d, // ” right double quotation mark
+ 0x2020, // † dagger
+ 0x2021, // ‡ double dagger
+ 0x2026, // … horizontal ellipsis
+ 0x2030, // ‰ per mille sign
+ 0x2031, // ‱ per ten thousand sign
+ 0x203c, // ‼ double exclamation mark
+ 0x2047, // ⁇ double question mark
+ 0x2048, // ⁈ question exclamation mark
+ 0x2049, // ⁉ exclamation question mark
+ 0x2103, // ℃ degree celsius
+ 0x2109, // ℉ degree fahrenheit
+ 0x3001, // 、 ideographic comma
+ 0x3002, // 。 ideographic full stop
+ 0x3009, // 〉 right angle bracket
+ 0x300b, // 》 right double angle bracket
+ 0x300d, // 」 right corner bracket
+ 0x300f, // 』 right white corner bracket
+ 0x3011, // 】 right black lenticular bracket
+ 0x3015, // 〕 right tortoise shell bracket
+ 0x3017, // 〗 right white lenticular bracket
+ 0x3019, // 〙 right white tortoise shell bracket
+ 0x301b, // 〛 right white square bracket
+ 0xff01, // ！ fullwidth exclamation mark
+ 0xff09, // ） fullwidth right parenthesis
+ 0xff0c, // ， fullwidth comma
+ 0xff0e, // ． fullwidth full stop
+ 0xff1a, // ： fullwidth colon
+ 0xff1b, // ； fullwidth semicolon
+ 0xff1f, // ？ fullwidth question mark
+ 0xff3d, // ］ fullwidth right square bracket
+ 0xff5d, // ｝ fullwidth right curly bracket
+ };
+
+ int first = 0;
+ int last = sizeof(BOL_prohibition_punct)/sizeof(int) - 1; // index of the last table entry
+ int mid = 0;
+
+ while (first < last) // binary search for "cc" in the sorted table
+ {
+ mid = (first + last)/2;
+
+ if (cc == BOL_prohibition_punct[mid])
+ return FALSE; // listed: no break before "cc"
+ else if (cc > BOL_prohibition_punct[mid])
+ first = mid + 1;
+ else
+ last = mid - 1;
+ }
+
+ return cc != BOL_prohibition_punct[first]; // "first" is still a valid index here; test the one remaining candidate
+}
+
+/*
+ * Whether line break is allowed after "cc".
+ * Returns FALSE when "cc" is listed in EOL_prohibition_punct[] (characters
+ * that must not end a line), non-zero otherwise.
+ */
+ static int
+utf_allow_break_after(int cc)
+{
+ static const int EOL_prohibition_punct[] = // must stay sorted ascending: it is binary-searched below
+ {
+ '(',
+ '<',
+ '[',
+ '`',
+ '{',
+ //0x2014, // — em dash (entry disabled)
+ 0x2018, // ‘ left single quotation mark
+ 0x201c, // “ left double quotation mark
+ //0x2053, // ⁓ swung dash (entry disabled)
+ 0x3008, // 〈 left angle bracket
+ 0x300a, // 《 left double angle bracket
+ 0x300c, // 「 left corner bracket
+ 0x300e, // 『 left white corner bracket
+ 0x3010, // 【 left black lenticular bracket
+ 0x3014, // 〔 left tortoise shell bracket
+ 0x3016, // 〖 left white lenticular bracket
+ 0x3018, // 〘 left white tortoise shell bracket
+ 0x301a, // 〚 left white square bracket
+ 0xff08, // （ fullwidth left parenthesis
+ 0xff3b, // ［ fullwidth left square bracket
+ 0xff5b, // ｛ fullwidth left curly bracket
+ };
+
+ int first = 0;
+ int last = sizeof(EOL_prohibition_punct)/sizeof(int) - 1; // index of the last table entry
+ int mid = 0;
+
+ while (first < last) // binary search for "cc" in the sorted table
+ {
+ mid = (first + last)/2;
+
+ if (cc == EOL_prohibition_punct[mid])
+ return FALSE; // listed: no break after "cc"
+ else if (cc > EOL_prohibition_punct[mid])
+ first = mid + 1;
+ else
+ last = mid - 1;
+ }
+
+ return cc != EOL_prohibition_punct[first]; // "first" is still a valid index here; test the one remaining candidate
+}
+
+/*
+ * Whether line break is allowed between "cc" and "ncc".
+ * "cc" is the character that would end the line and "ncc" the character that
+ * would start the next one.  Returns FALSE when the break is prohibited.
+ */
+ int
+utf_allow_break(int cc, int ncc)
+{
+ // never split a doubled em dash or a doubled horizontal ellipsis
+ if (cc == ncc
+ && (cc == 0x2014 // em dash
+ || cc == 0x2026)) // horizontal ellipsis
+ return FALSE;
+
+ return utf_allow_break_after(cc) && utf_allow_break_before(ncc); // both sides must permit the break
+}
+
+/*
* Copy a character from "*fp" to "*tp" and advance the pointers.
*/
void
diff --git a/src/ops.c b/src/ops.c
index ea71edbf3c..ca00c252d7 100644
--- a/src/ops.c
+++ b/src/ops.c
@@ -1967,7 +1967,10 @@ do_join(
&& (!has_format_option(FO_MBYTE_JOIN)
|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
&& (!has_format_option(FO_MBYTE_JOIN2)
- || mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100)
+ || (mb_ptr2char(curr) < 0x100
+ && !(enc_utf8 && utf_eat_space(endcurr1)))
+ || (endcurr1 < 0x100
+ && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
)
{
// don't add a space if the line is ending in a space
diff --git a/src/option.h b/src/option.h
index 718553a908..4b6eadc460 100644
--- a/src/option.h
+++ b/src/option.h
@@ -141,12 +141,13 @@
#define FO_ONE_LETTER '1'
#define FO_WHITE_PAR 'w' // trailing white space continues paragr.
#define FO_AUTO 'a' // automatic formatting
+#define FO_RIGOROUS_TW ']' // respect textwidth rigorously
#define FO_REMOVE_COMS 'j' // remove comment leaders when joining lines
#define FO_PERIOD_ABBR 'p' // don't break a single space after a period
#define DFLT_FO_VI "vt"
#define DFLT_FO_VIM "tcq"
-#define FO_ALL "tcroq2vlb1mMBn,awjp" // for do_set()
+#define FO_ALL "tcroq2vlb1mMBn,aw]jp" // for do_set()
// characters for the p_cpo option:
#define CPO_ALTREAD 'a' // ":read" sets alternate file name
diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro
index 7b8c4fe679..79af974ec6 100644
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -52,6 +52,9 @@ void show_utf8(void);
int latin_head_off(char_u *base, char_u *p);
int dbcs_screen_head_off(char_u *base, char_u *p);
int utf_head_off(char_u *base, char_u *p);
+int utf_eat_space(int cc);
+int utf_allow_break_before(int cc);
+int utf_allow_break(int cc, int ncc);
void mb_copy_char(char_u **fp, char_u **tp);
int mb_off_next(char_u *base, char_u *p);
int mb_tail_off(char_u *base, char_u *p);
diff --git a/src/testdir/Make_all.mak b/src/testdir/Make_all.mak
index 6b151d07ca..7402067b45 100644
--- a/src/testdir/Make_all.mak
+++ b/src/testdir/Make_all.mak
@@ -85,6 +85,7 @@ NEW_TESTS = \
test_charsearch_utf8 \
test_checkpath \
test_cindent \
+ test_cjk_linebreak \
test_clientserver \
test_close_count \
test_cmdline \
@@ -333,6 +334,7 @@ NEW_TESTS_RES = \
test_charsearch.res \
test_checkpath.res \
test_cindent.res \
+ test_cjk_linebreak.res \
test_clientserver.res \
test_close_count.res \
test_cmdline.res \
diff --git a/src/testdir/test_cjk_linebreak.vim b/src/testdir/test_cjk_linebreak.vim
new file mode 100644
index 0000000000..7a1cedeeac
--- /dev/null
+++ b/src/testdir/test_cjk_linebreak.vim
@@ -0,0 +1,91 @@
+scriptencoding utf-8
+
+" For every character that must not start a line, format a line in which that
+" character directly follows six ideographs and check that the first line
+" still ends with it, i.e. the break comes after the punctuation.
+" The caller sets 'formatoptions'; 'textwidth' is set here.
+" The list mirrors BOL_prohibition_punct[] in src/mbyte.c; the last nine
+" entries are the fullwidth forms U+FF01 ... U+FF5D, not their ASCII
+" look-alikes (those are already at the start of the list).
+func Run_cjk_linebreak_after()
+ set textwidth=12
+ for punct in [
+ \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
+ \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
+ \ '」', '』', '】', '〕', '〗', '〙', '〛', '！', '）', '，', '．', '：',
+ \ '；', '？', '］', '｝']
+ call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
+ normal gqq
+ call assert_equal('这是一个测试'.punct, getline(1))
+ %d_
+ endfor
+endfunc
+
+" Run the break-after-punctuation checks without the ']' flag in
+" 'formatoptions'.
+func Test_cjk_linebreak_after()
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_after()
+endfunc
+
+" TODO: this test fails
+"func Test_cjk_linebreak_after_rigorous()
+" set formatoptions=croqn2mB1j]
+" call Run_cjk_linebreak_after()
+"endfunc
+
+" For every character that must not end a line, format a line in which that
+" character directly follows five ideographs and check that the first line
+" holds only the ideographs, i.e. the break comes before the punctuation.
+" The caller sets 'formatoptions'; 'textwidth' is set here.
+" The list mirrors EOL_prohibition_punct[] in src/mbyte.c; the last three
+" entries are the fullwidth forms U+FF08, U+FF3B and U+FF5B, not their ASCII
+" look-alikes (those are already at the start of the list).
+func Run_cjk_linebreak_before()
+ set textwidth=12
+ for punct in [
+ \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
+ \ '〖', '〘', '〚', '（', '［', '｛']
+ call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
+ normal gqq
+ call assert_equal('这是个测试', getline(1))
+ %d_
+ endfor
+endfunc
+
+" Run the break-before-punctuation checks without the ']' flag in
+" 'formatoptions'.
+func Test_cjk_linebreak_before()
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_before()
+endfunc
+
+" Run the break-before-punctuation checks with the ']' flag (respect
+" 'textwidth' rigorously) added to 'formatoptions'.
+func Test_cjk_linebreak_before_rigorous()
+ set formatoptions=croqn2mB1j]
+ call Run_cjk_linebreak_before()
+endfunc
+
+" Check how doubled punctuation is treated when formatting: a doubled
+" ellipsis and a doubled em dash.  The caller sets 'formatoptions'.
+" 'ambiwidth' is set to "double" in each case; presumably so that the ellipsis
+" and em dash count as two cells each - confirm against the 'ambiwidth' help.
+func Run_cjk_linebreak_nobetween()
+ " …… must not start a line
+ call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ " TODO: this fails
+ " call assert_equal('这是个测试……', getline(1))
+ %d_
+
+ " With one more ideograph in front, the first line is expected to end
+ " before the last ideograph, which moves to the next line with the ……
+ call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ call assert_equal('这是一个测', getline(1))
+ %d_
+
+ " but —— can
+ call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
+ set textwidth=12 ambiwidth=double
+ normal gqq
+ call assert_equal('这是个测试', getline(1))
+endfunc
+
+" Run the doubled-punctuation checks without the ']' flag in 'formatoptions'.
+func Test_cjk_linebreak_nobetween()
+ set formatoptions=croqn2mB1j
+ call Run_cjk_linebreak_nobetween()
+endfunc
+
+" Run the doubled-punctuation checks with the ']' flag (respect 'textwidth'
+" rigorously) added to 'formatoptions'.
+func Test_cjk_linebreak_nobetween_rigorous()
+ set formatoptions=croqn2mB1j]
+ call Run_cjk_linebreak_nobetween()
+endfunc
+
+" With 'B' in 'formatoptions', joining a line that ends in CJK punctuation
+" with a line that starts with an ASCII word must not insert a space.
+" The comma in the list is the fullwidth U+FF0C: an ASCII ',' is below 0x100
+" on both sides of the join, so do_join() would insert a space after it.
+func Test_cjk_linebreak_join_punct()
+ for punct in ['——', '〗', '，', '。', '……']
+ call setline(1, '文本文本'.punct)
+ call setline(2, 'English')
+ set formatoptions=croqn2mB1j
+ normal ggJ
+ call assert_equal('文本文本'.punct.'English', getline(1))
+ %d_
+ endfor
+endfunc
diff --git a/src/textformat.c b/src/textformat.c
index c50d70f7c3..373ab59693 100644
--- a/src/textformat.c
+++ b/src/textformat.c
@@ -45,10 +45,12 @@ internal_format(
int c) // character to be inserted (can be NUL)
{
int cc;
+ int skip_pos;
int save_char = NUL;
int haveto_redraw = FALSE;
int fo_ins_blank = has_format_option(FO_INS_BLANK);
int fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+ int fo_rigor_tw = has_format_option(FO_RIGOROUS_TW);
int fo_white_par = has_format_option(FO_WHITE_PAR);
int first_line = TRUE;
colnr_T leader_len;
@@ -125,6 +127,7 @@ internal_format(
curwin->w_cursor.col = startcol;
foundcol = 0;
+ skip_pos = 0;
// Find position to break at.
// Stop at first entered white when 'formatoptions' has 'v'
@@ -189,8 +192,11 @@ internal_format(
if (curwin->w_cursor.col <= (colnr_T)wantcol)
break;
}
- else if (cc >= 0x100 && fo_multibyte)
+ else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
{
+ int ncc;
+ int allow_break;
+
// Break after or before a multi-byte character.
if (curwin->w_cursor.col != startcol)
{
@@ -199,8 +205,14 @@ internal_format(
break;
col = curwin->w_cursor.col;
inc_cursor();
- // Don't change end_foundcol if already set.
- if (foundcol != curwin->w_cursor.col)
+ ncc = gchar_cursor();
+
+ allow_break =
+ (enc_utf8 && utf_allow_break(cc, ncc))
+ || enc_dbcs;
+
+ // If we have already checked this position, skip!
+ if (curwin->w_cursor.col != skip_pos && allow_break)
{
foundcol = curwin->w_cursor.col;
end_foundcol = foundcol;
@@ -213,6 +225,7 @@ internal_format(
if (curwin->w_cursor.col == 0)
break;
+ ncc = cc;
col = curwin->w_cursor.col;
dec_cursor();
@@ -220,16 +233,65 @@ internal_format(
if (WHITECHAR(cc))
continue; // break with space
- // Don't break until after the comment leader
+ // Don't break until after the comment leader.
if (curwin->w_cursor.col < leader_len)
break;
curwin->w_cursor.col = col;
+ skip_pos = curwin->w_cursor.col;
- foundcol = curwin->w_cursor.col;
- end_foundcol = foundcol;
+ allow_break =
+ (enc_utf8 && utf_allow_break(cc, ncc))
+ || enc_dbcs;
+
+ // Must handle this to respect line break prohibition.
+ if (allow_break)
+ {
+ foundcol = curwin->w_cursor.col;
+ end_foundcol = foundcol;
+ }
if (curwin->w_cursor.col <= (colnr_T)wantcol)
- break;
+ {
+ int ncc_allow_break =
+ (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
+
+ if (allow_break)
+ break;
+ if (!ncc_allow_break && !fo_rigor_tw)
+ {
+ // Enable at most 1 punct hang outside of textwidth.
+ if (curwin->w_cursor.col == startcol)
+ {
+ // We are inserting a non-breakable char, postpone
+ // line break check to next insert.
+ end_foundcol = foundcol = 0;
+ break;
+ }
+
+ // Neither cc nor ncc is NUL if we are here, so
+ // it's safe to inc_cursor.
+ col = curwin->w_cursor.col;
+
+ inc_cursor();
+ cc = ncc;
+ ncc = gchar_cursor();
+ // handle insert
+ ncc = (ncc != NUL) ? ncc : c;
+
+ allow_break =
+ (enc_utf8 && utf_allow_break(cc, ncc))
+ || enc_dbcs;
+
+ if (allow_break)
+ {
+ // Break only when we are not at end of line.
+ end_foundcol = foundcol =
+ ncc == NUL? 0 : curwin->w_cursor.col;
+ break;
+ }
+ curwin->w_cursor.col = col;
+ }
+ }
}
if (curwin->w_cursor.col == 0)
break;
diff --git a/src/version.c b/src/version.c
index 2c22af411e..291744e4e6 100644
--- a/src/version.c
+++ b/src/version.c
@@ -747,6 +747,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
+ 901,
+/**/
900,
/**/
899,