patch 8.2.0901: formatting CJK text isn't optimal (tag: v8.2.0901)

Problem: Formatting CJK text isn't optimal. Solution: Properly break CJK lines. (closes #3875)
author: Bram Moolenaar <Bram@vim.org> 2020-06-04 18:22:13 +0200
committer: Bram Moolenaar <Bram@vim.org> 2020-06-04 18:22:13 +0200
commit: e52702f00322c8a8861efd0bd6a3775e685e5685 (patch)
tree: ffd498e52ee2b139f3a145147aca02afc5013d63
parent: 9155825b2428ef6bf654204a534b033a6879c90d (diff)
9 files changed, 329 insertions, 9 deletions
diff --git a/runtime/doc/change.txt b/runtime/doc/change.txt
index 59004af14a..39e7b48ff9 100644
--- a/runtime/doc/change.txt
+++ b/runtime/doc/change.txt
@@ -1688,6 +1688,10 @@ B	When joining lines, don't insert a space between two multi-byte
 	characters.  Overruled by the 'M' flag.
 1	Don't break a line after a one-letter word.  It's broken before it
 	instead (if possible).
+]	Respect textwidth rigorously. With this flag set, no line can be
+	longer than textwidth, unless line-break-prohibition rules make this
+	impossible.  Mainly for CJK scripts and works only if 'encoding' is
+	"utf-8".
 j	Where it makes sense, remove a comment leader when joining lines.  For
 	example, joining:
 		int i;   // the index ~
diff --git a/src/mbyte.c b/src/mbyte.c
index de732c4fca..4f4210a532 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3843,6 +3843,158 @@ utf_head_off(char_u *base, char_u *p)
 }
 
 /*
+ * Return non-zero if "cc" is punctuation that allows NO space before/after.
+ */
+    int
+utf_eat_space(int cc)
+{
+    return ((cc >= 0x2000 && cc <= 0x206F)	// General Punctuation block
+	 || (cc >= 0x2e00 && cc <= 0x2e7f)	// Supplemental Punctuation block
+	 || (cc >= 0x3000 && cc <= 0x303f)	// CJK Symbols and Punctuation
+	 || (cc >= 0xff01 && cc <= 0xff0f)	// fullwidth ASCII punct. ！ .. ／
+	 || (cc >= 0xff1a && cc <= 0xff20)	// fullwidth ASCII punct. ： .. ＠
+	 || (cc >= 0xff3b && cc <= 0xff40)	// fullwidth ASCII punct. ［ .. ｀
+	 || (cc >= 0xff5b && cc <= 0xff65));	// ｛ .. ･ (incl. halfwidth CJK)
+}
+
+/*
+ * Return FALSE if "cc" is listed in BOL_prohibition_punct, non-zero otherwise.
+ */
+    int
+utf_allow_break_before(int cc)
+{
+    static const int BOL_prohibition_punct[] = // keep sorted: searched below
+    {
+	'!',
+	'%',
+	')',
+	',',
+	':',
+	';',
+	'>',
+	'?',
+	']',
+	'}',
+	0x2019, // ’ right single quotation mark
+	0x201d, // ” right double quotation mark
+	0x2020, // † dagger
+	0x2021, // ‡ double dagger
+	0x2026, // … horizontal ellipsis
+	0x2030, // ‰ per mille sign
+	0x2031, // ‱ per ten thousand sign
+	0x203c, // ‼ double exclamation mark
+	0x2047, // ⁇ double question mark
+	0x2048, // ⁈ question exclamation mark
+	0x2049, // ⁉ exclamation question mark
+	0x2103, // ℃ degree celsius
+	0x2109, // ℉ degree fahrenheit
+	0x3001, // 、 ideographic comma
+	0x3002, // 。 ideographic full stop
+	0x3009, // 〉 right angle bracket
+	0x300b, // 》 right double angle bracket
+	0x300d, // 」 right corner bracket
+	0x300f, // 』 right white corner bracket
+	0x3011, // 】 right black lenticular bracket
+	0x3015, // 〕 right tortoise shell bracket
+	0x3017, // 〗 right white lenticular bracket
+	0x3019, // 〙 right white tortoise shell bracket
+	0x301b, // 〛 right white square bracket
+	0xff01, // ！ fullwidth exclamation mark
+	0xff09, // ） fullwidth right parenthesis
+	0xff0c, // ， fullwidth comma
+	0xff0e, // ． fullwidth full stop
+	0xff1a, // ： fullwidth colon
+	0xff1b, // ； fullwidth semicolon
+	0xff1f, // ？ fullwidth question mark
+	0xff3d, // ］ fullwidth right square bracket
+	0xff5d, // ｝ fullwidth right curly bracket
+    };
+
+    int first = 0;
+    int last  = sizeof(BOL_prohibition_punct)/sizeof(int) - 1; // last index
+    int mid   = 0;
+
+    while (first < last) // binary search; relies on the table being sorted
+    {
+	mid = (first + last)/2;
+
+	if (cc == BOL_prohibition_punct[mid])
+	    return FALSE; // listed: no line break before "cc"
+	else if (cc > BOL_prohibition_punct[mid])
+	    first = mid + 1;
+	else
+	    last = mid - 1;
+    }
+
+    return cc != BOL_prohibition_punct[first]; // test the entry we ended on
+}
+
+/*
+ * Return FALSE if "cc" is listed in EOL_prohibition_punct, non-zero otherwise.
+ */
+    static int
+utf_allow_break_after(int cc)
+{
+    static const int EOL_prohibition_punct[] = // keep sorted: searched below
+    {
+	'(',
+	'<',
+	'[',
+	'`',
+	'{',
+	//0x2014, // — em dash (disabled, so not treated as break-prohibited)
+	0x2018, // ‘ left single quotation mark
+	0x201c, // “ left double quotation mark
+	//0x2053, // ～ swung dash (disabled, so not treated as break-prohibited)
+	0x3008, // 〈 left angle bracket
+	0x300a, // 《 left double angle bracket
+	0x300c, // 「 left corner bracket
+	0x300e, // 『 left white corner bracket
+	0x3010, // 【 left black lenticular bracket
+	0x3014, // 〔 left tortoise shell bracket
+	0x3016, // 〖 left white lenticular bracket
+	0x3018, // 〘 left white tortoise shell bracket
+	0x301a, // 〚 left white square bracket
+	0xff08, // （ fullwidth left parenthesis
+	0xff3b, // ［ fullwidth left square bracket
+	0xff5b, // ｛ fullwidth left curly bracket
+    };
+
+    int first = 0;
+    int last  = sizeof(EOL_prohibition_punct)/sizeof(int) - 1; // last index
+    int mid   = 0;
+
+    while (first < last) // binary search; relies on the table being sorted
+    {
+	mid = (first + last)/2;
+
+	if (cc == EOL_prohibition_punct[mid])
+	    return FALSE; // listed: no line break after "cc"
+	else if (cc > EOL_prohibition_punct[mid])
+	    first = mid + 1;
+	else
+	    last = mid - 1;
+    }
+
+    return cc != EOL_prohibition_punct[first]; // test the entry we ended on
+}
+
+/*
+ * Return non-zero if a line break is allowed between "cc" and following "ncc".
+ */
+    int
+utf_allow_break(int cc, int ncc)
+{
+    // Never split a doubled em dash or a doubled horizontal ellipsis.
+    if (cc == ncc
+	    && (cc == 0x2014 // em dash
+		|| cc == 0x2026)) // horizontal ellipsis
+	return FALSE;
+
+    return utf_allow_break_after(cc) && utf_allow_break_before(ncc); // both
+}
+
+/*
  * Copy a character from "*fp" to "*tp" and advance the pointers.
  */
     void
diff --git a/src/ops.c b/src/ops.c
index ea71edbf3c..ca00c252d7 100644
--- a/src/ops.c
+++ b/src/ops.c
@@ -1967,7 +1967,10 @@ do_join(
 		    && (!has_format_option(FO_MBYTE_JOIN)
 			|| (mb_ptr2char(curr) < 0x100 && endcurr1 < 0x100))
 		    && (!has_format_option(FO_MBYTE_JOIN2)
-			|| mb_ptr2char(curr) < 0x100 || endcurr1 < 0x100)
+			|| (mb_ptr2char(curr) < 0x100
+			    && !(enc_utf8 && utf_eat_space(endcurr1)))
+			|| (endcurr1 < 0x100
+			    && !(enc_utf8 && utf_eat_space(mb_ptr2char(curr)))))
 	       )
 	    {
 		// don't add a space if the line is ending in a space
diff --git a/src/option.h b/src/option.h
index 718553a908..4b6eadc460 100644
--- a/src/option.h
+++ b/src/option.h
@@ -141,12 +141,13 @@
 #define FO_ONE_LETTER	'1'
 #define FO_WHITE_PAR	'w'	// trailing white space continues paragr.
 #define FO_AUTO		'a'	// automatic formatting
+#define FO_RIGOROUS_TW	']'     // respect textwidth rigorously
 #define FO_REMOVE_COMS	'j'	// remove comment leaders when joining lines
 #define FO_PERIOD_ABBR	'p'	// don't break a single space after a period
 
 #define DFLT_FO_VI	"vt"
 #define DFLT_FO_VIM	"tcq"
-#define FO_ALL		"tcroq2vlb1mMBn,awjp"	// for do_set()
+#define FO_ALL		"tcroq2vlb1mMBn,aw]jp"	// for do_set()
 
 // characters for the p_cpo option:
 #define CPO_ALTREAD	'a'	// ":read" sets alternate file name
diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro
index 7b8c4fe679..79af974ec6 100644
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -52,6 +52,9 @@ void show_utf8(void);
 int latin_head_off(char_u *base, char_u *p);
 int dbcs_screen_head_off(char_u *base, char_u *p);
 int utf_head_off(char_u *base, char_u *p);
+int utf_eat_space(int cc);
+int utf_allow_break_before(int cc);
+int utf_allow_break(int cc, int ncc);
 void mb_copy_char(char_u **fp, char_u **tp);
 int mb_off_next(char_u *base, char_u *p);
 int mb_tail_off(char_u *base, char_u *p);
diff --git a/src/testdir/Make_all.mak b/src/testdir/Make_all.mak
index 6b151d07ca..7402067b45 100644
--- a/src/testdir/Make_all.mak
+++ b/src/testdir/Make_all.mak
@@ -85,6 +85,7 @@ NEW_TESTS = \
 	test_charsearch_utf8 \
 	test_checkpath \
 	test_cindent \
+	test_cjk_linebreak \
 	test_clientserver \
 	test_close_count \
 	test_cmdline \
@@ -333,6 +334,7 @@ NEW_TESTS_RES = \
 	test_charsearch.res \
 	test_checkpath.res \
 	test_cindent.res \
+	test_cjk_linebreak.res \
 	test_clientserver.res \
 	test_close_count.res \
 	test_cmdline.res \
diff --git a/src/testdir/test_cjk_linebreak.vim b/src/testdir/test_cjk_linebreak.vim
new file mode 100644
index 0000000000..7a1cedeeac
--- /dev/null
+++ b/src/testdir/test_cjk_linebreak.vim
@@ -0,0 +1,91 @@
+scriptencoding utf-8
+
+func Run_cjk_linebreak_after()
+  set textwidth=12
+  for punct in [
+        \ '!', '%', ')', ',', ':', ';', '>', '?', ']', '}', '’', '”', '†', '‡',
+        \ '…', '‰', '‱', '‼', '⁇', '⁈', '⁉', '℃', '℉', '、', '。', '〉', '》',
+        \ '」', '』', '】', '〕', '〗', '〙', '〛', '！', '）', '，', '．', '：',
+        \ '；', '？', '］', '｝']
+    call setline('.', '这是一个测试'.punct.'试试 CJK 行禁则补丁。')
+    normal gqq
+    call assert_equal('这是一个测试'.punct, getline(1))
+    %d_
+  endfor
+endfunc
+
+func Test_cjk_linebreak_after()
+  set formatoptions=croqn2mB1j
+  call Run_cjk_linebreak_after()
+endfunc
+
+" TODO: this test fails
+"func Test_cjk_linebreak_after_rigorous()
+"  set formatoptions=croqn2mB1j]
+"  call Run_cjk_linebreak_after()
+"endfunc
+
+func Run_cjk_linebreak_before()
+  set textwidth=12
+  for punct in [
+        \ '(', '<', '[', '`', '{', '‘', '“', '〈', '《', '「', '『', '【', '〔',
+        \ '〖', '〘', '〚', '（', '［', '｛']
+    call setline('.', '这是个测试'.punct.'试试 CJK 行禁则补丁。')
+    normal gqq
+    call assert_equal('这是个测试', getline(1))
+    %d_
+  endfor
+endfunc
+
+func Test_cjk_linebreak_before()
+  set formatoptions=croqn2mB1j
+  call Run_cjk_linebreak_before()
+endfunc
+
+func Test_cjk_linebreak_before_rigorous()
+  set formatoptions=croqn2mB1j]
+  call Run_cjk_linebreak_before()
+endfunc
+
+func Run_cjk_linebreak_nobetween()
+  " A doubled ellipsis …… must not be split and must not start a line.
+  call setline('.', '这是个测试……试试 CJK 行禁则补丁。')
+  set textwidth=12 ambiwidth=double
+  normal gqq
+  " TODO: this fails: the ellipsis pair should stay on the first line.
+  " call assert_equal('这是个测试……', getline(1))
+  %d_
+
+  call setline('.', '这是一个测试……试试 CJK 行禁则补丁。')
+  set textwidth=12 ambiwidth=double
+  normal gqq
+  call assert_equal('这是一个测', getline(1))
+  %d_
+
+  " But a doubled em dash —— may start a line (it is still never split).
+  call setline('.', '这是个测试——试试 CJK 行禁则补丁。')
+  set textwidth=12 ambiwidth=double
+  normal gqq
+  call assert_equal('这是个测试', getline(1))
+endfunc
+
+func Test_cjk_linebreak_nobetween()
+  set formatoptions=croqn2mB1j
+  call Run_cjk_linebreak_nobetween()
+endfunc
+
+func Test_cjk_linebreak_nobetween_rigorous()
+  set formatoptions=croqn2mB1j]
+  call Run_cjk_linebreak_nobetween()
+endfunc
+
+func Test_cjk_linebreak_join_punct()
+  for punct in ['——', '〗', '，', '。', '……']
+    call setline(1, '文本文本'.punct)
+    call setline(2, 'English')
+    set formatoptions=croqn2mB1j
+    normal ggJ
+    call assert_equal('文本文本'.punct.'English', getline(1))
+    %d_
+  endfor
+endfunc
diff --git a/src/textformat.c b/src/textformat.c
index c50d70f7c3..373ab59693 100644
--- a/src/textformat.c
+++ b/src/textformat.c
@@ -45,10 +45,12 @@ internal_format(
     int		c) // character to be inserted (can be NUL)
 {
     int		cc;
+    int		skip_pos;
     int		save_char = NUL;
     int		haveto_redraw = FALSE;
     int		fo_ins_blank = has_format_option(FO_INS_BLANK);
     int		fo_multibyte = has_format_option(FO_MBYTE_BREAK);
+    int		fo_rigor_tw  = has_format_option(FO_RIGOROUS_TW);
     int		fo_white_par = has_format_option(FO_WHITE_PAR);
     int		first_line = TRUE;
     colnr_T	leader_len;
@@ -125,6 +127,7 @@ internal_format(
 
 	curwin->w_cursor.col = startcol;
 	foundcol = 0;
+	skip_pos = 0;
 
 	// Find position to break at.
 	// Stop at first entered white when 'formatoptions' has 'v'
@@ -189,8 +192,11 @@ internal_format(
 		if (curwin->w_cursor.col <= (colnr_T)wantcol)
 		    break;
 	    }
-	    else if (cc >= 0x100 && fo_multibyte)
+	    else if ((cc >= 0x100 || !utf_allow_break_before(cc)) && fo_multibyte)
 	    {
+		int ncc;
+		int allow_break;
+
 		// Break after or before a multi-byte character.
 		if (curwin->w_cursor.col != startcol)
 		{
@@ -199,8 +205,14 @@ internal_format(
 			break;
 		    col = curwin->w_cursor.col;
 		    inc_cursor();
-		    // Don't change end_foundcol if already set.
-		    if (foundcol != curwin->w_cursor.col)
+		    ncc = gchar_cursor();
+
+		    allow_break =
+			(enc_utf8 && utf_allow_break(cc, ncc))
+			|| enc_dbcs;
+
+		    // If we have already checked this position, skip!
+		    if (curwin->w_cursor.col != skip_pos && allow_break)
 		    {
 			foundcol = curwin->w_cursor.col;
 			end_foundcol = foundcol;
@@ -213,6 +225,7 @@ internal_format(
 		if (curwin->w_cursor.col == 0)
 		    break;
 
+		ncc = cc;
 		col = curwin->w_cursor.col;
 
 		dec_cursor();
@@ -220,16 +233,65 @@ internal_format(
 
 		if (WHITECHAR(cc))
 		    continue;		// break with space
-		// Don't break until after the comment leader
+		// Don't break until after the comment leader.
 		if (curwin->w_cursor.col < leader_len)
 		    break;
 
 		curwin->w_cursor.col = col;
+		skip_pos = curwin->w_cursor.col;
 
-		foundcol = curwin->w_cursor.col;
-		end_foundcol = foundcol;
+		allow_break =
+		    (enc_utf8 && utf_allow_break(cc, ncc))
+		    || enc_dbcs;
+
+		// Must handle this to respect line break prohibition.
+		if (allow_break)
+		{
+		    foundcol = curwin->w_cursor.col;
+		    end_foundcol = foundcol;
+		}
 		if (curwin->w_cursor.col <= (colnr_T)wantcol)
-		    break;
+		{
+		    int ncc_allow_break =
+			 (enc_utf8 && utf_allow_break_before(ncc)) || enc_dbcs;
+
+		    if (allow_break)
+			break;
+		    if (!ncc_allow_break && !fo_rigor_tw)
+		    {
+			// Enable at most 1 punct hang outside of textwidth.
+			if (curwin->w_cursor.col == startcol)
+			{
+			    // We are inserting a non-breakable char, postpone
+			    // line break check to next insert.
+			    end_foundcol = foundcol = 0;
+			    break;
+			}
+
+			// Neither cc nor ncc is NUL if we are here, so
+			// it's safe to inc_cursor.
+			col = curwin->w_cursor.col;
+
+			inc_cursor();
+			cc  = ncc;
+			ncc = gchar_cursor();
+			// handle insert
+			ncc = (ncc != NUL) ? ncc : c;
+
+			allow_break =
+				(enc_utf8 && utf_allow_break(cc, ncc))
+				|| enc_dbcs;
+
+			if (allow_break)
+			{
+			    // Break only when we are not at end of line.
+			    end_foundcol = foundcol =
+				      ncc == NUL? 0 : curwin->w_cursor.col;
+			    break;
+			}
+			curwin->w_cursor.col = col;
+		    }
+		}
 	    }
 	    if (curwin->w_cursor.col == 0)
 		break;
diff --git a/src/version.c b/src/version.c
index 2c22af411e..291744e4e6 100644
--- a/src/version.c
+++ b/src/version.c
@@ -747,6 +747,8 @@ static char *(features[]) =
 static int included_patches[] =
 {   /* Add new patch number below this line */
 /**/
+    901,
+/**/
     900,
 /**/
     899,
author	Bram Moolenaar <Bram@vim.org>	2020-06-04 18:22:13 +0200
committer	Bram Moolenaar <Bram@vim.org>	2020-06-04 18:22:13 +0200
commit	e52702f00322c8a8861efd0bd6a3775e685e5685 (patch)
tree	ffd498e52ee2b139f3a145147aca02afc5013d63
parent	9155825b2428ef6bf654204a534b033a6879c90d (diff)