1 files changed, 5833 insertions, 0 deletions
diff --git a/src/mbyte.c b/src/mbyte.c
new file mode 100644
index 0000000000..699316d0e3
--- /dev/null
+++ b/src/mbyte.c
@@ -0,0 +1,5833 @@
+/* vi:set ts=8 sts=4 sw=4:
+ *
+ * VIM - Vi IMproved	by Bram Moolenaar
+ * Multibyte extensions partly by Sung-Hoon Baek
+ *
+ * Do ":help uganda"  in Vim to read copying and usage conditions.
+ * Do ":help credits" in Vim to see a list of people who contributed.
+ * See README.txt for an overview of the Vim source code.
+ */
+/*
+ * mbyte.c: Code specifically for handling multi-byte characters.
+ *
+ * The encoding used in the core is set with 'encoding'.  When 'encoding' is
+ * changed, the following four variables are set (for speed).
+ * Currently these types of character encodings are supported:
+ *
+ * "enc_dbcs"	    When non-zero it tells the type of double byte character
+ *		    encoding (Chinese, Korean, Japanese, etc.).
+ *		    The cell width on the display is equal to the number of
+ *		    bytes.  (exception: DBCS_JPNU with first byte 0x8e)
+ *		    Recognizing the first or second byte is difficult, it
+ *		    requires checking a byte sequence from the start.
+ * "enc_utf8"	    When TRUE use Unicode characters in UTF-8 encoding.
+ *		    The cell width on the display needs to be determined from
+ *		    the character value.
+ *		    Recognizing bytes is easy: 0xxx.xxxx is a single-byte
+ *		    char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
+ *		    byte of a multi-byte character.
+ *		    To make things complicated, up to two composing characters
+ *		    are allowed.  These are drawn on top of the first char.
+ *		    For most editing the sequence of bytes with composing
+ *		    characters included is considered to be one character.
+ * "enc_unicode"    When 2 use 16-bit Unicode characters (or UTF-16).
+ *		    When 4 use 32-bit Unicode characters.
+ *		    Internally characters are stored in UTF-8 encoding to
+ *		    avoid NUL bytes.  Conversion happens when doing I/O.
+ *		    "enc_utf8" will also be TRUE.
+ *
+ * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
+ *
+ * If none of these is TRUE, 8-bit bytes are used for a character.  The
+ * encoding isn't currently specified (TODO).
+ *
+ * 'encoding' specifies the encoding used in the core.  This is in registers,
+ * text manipulation, buffers, etc.  Conversion has to be done when characters
+ * in another encoding are received or sent:
+ *
+ *		       clipboard
+ *			   ^
+ *			   | (2)
+ *			   V
+ *		   +---------------+
+ *	      (1)  |		   | (3)
+ *  keyboard ----->|	 core	   |-----> display
+ *		   |		   |
+ *		   +---------------+
+ *			   ^
+ *			   | (4)
+ *			   V
+ *			 file
+ *
+ * (1) Typed characters arrive in the current locale.  Conversion is to be
+ *     done when 'encoding' is different from 'termencoding'.
+ * (2) Text will be made available with the encoding specified with
+ *     'encoding'.  If this is not sufficient, system-specific conversion
+ *     might be required.
+ * (3) For the GUI the correct font must be selected, no conversion done.
+ *     Otherwise, conversion is to be done when 'encoding' differs from
+ *     'termencoding'.  (Different in the GTK+ 2 port -- 'termencoding'
+ *     is always used for both input and output and must always be set to
+ *     "utf-8".  gui_mch_init() does this automatically.)
+ * (4) The encoding of the file is specified with 'fileencoding'.  Conversion
+ *     is to be done when it's different from 'encoding'.
+ *
+ * The viminfo file is a special case: Only text is converted, not file names.
+ * Vim scripts may contain an ":encoding" command.  This has an effect for
+ * some commands, like ":menutrans"
+ */
+
+#include "vim.h"
+
+#ifdef WIN32UNIX
+# ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+# endif
+# include <windows.h>
+# ifdef WIN32
+#  undef WIN32	    /* Some windows.h define WIN32, we don't want that here. */
+# endif
+#endif
+
+#if (defined(WIN3264) || defined(WIN32UNIX)) && !defined(__MINGW32__)
+# include <winnls.h>
+#endif
+
+#ifdef FEAT_GUI_X11
+# include <X11/Intrinsic.h>
+#endif
+#ifdef X_LOCALE
+#include <X11/Xlocale.h>
+#endif
+
+#if defined(FEAT_XIM) && defined(HAVE_GTK2)
+# include <gdk/gdkkeysyms.h>
+# ifdef WIN3264
+#  include <gdk/gdkwin32.h>
+# else
+#  include <gdk/gdkx.h>
+# endif
+#endif
+
+#ifdef HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
+#if 0
+/* This has been disabled, because several people reported problems with the
+ * wcwidth() and iswprint() library functions, esp. for Hebrew. */
+# ifdef __STDC_ISO_10646__
+#  define USE_WCHAR_FUNCTIONS
+# endif
+#endif
+
+#if defined(FEAT_MBYTE) || defined(PROTO)
+
+static int enc_canon_search __ARGS((char_u *name));
+static int dbcs_char2len __ARGS((int c));
+static int dbcs_char2bytes __ARGS((int c, char_u *buf));
+static int dbcs_ptr2len_check __ARGS((char_u *p));
+static int dbcs_char2cells __ARGS((int c));
+static int dbcs_ptr2char __ARGS((char_u *p));
+
+/* Lookup table to quickly get the length in bytes of a UTF-8 character from
+ * its first byte; each row covers 32 byte values.  Bytes which are illegal as
+ * a first byte have a one, because these will be used separately. */
+static char utf8len_tab[256] =
+{
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 00-1f */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 20-3f */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 40-5f */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 60-7f */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 80-9f bogus */
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* a0-bf bogus */
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0-df */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, /* e0-ff */
+};
+
+/*
+ * XIM often causes trouble.  Define XIM_DEBUG to get a log of XIM callbacks
+ * in the "xim.log" file.
+ */
+/* #define XIM_DEBUG */
+#ifdef XIM_DEBUG
+/* Write a printf-style message to the file "xim.log".  The file is opened
+ * for writing on the first call and is not closed in this function. */
+    static void
+xim_log(char *s, ...)
+{
+    static FILE *logfile = NULL;    /* (FILE *)-1 once opening has failed */
+    va_list	ap;
+
+    /* Open the log on the first call; a failure is remembered so that the
+     * error message is given only once. */
+    if (logfile == NULL && (logfile = fopen("xim.log", "w")) == NULL)
+    {
+	EMSG("Cannot open xim.log");
+	logfile = (FILE *)-1;
+    }
+    if (logfile == (FILE *)-1)
+	return;
+
+    /* Append the formatted message. */
+    va_start(ap, s);
+    vfprintf(logfile, s, ap);
+    va_end(ap);
+}
+#endif
+
+#endif
+
+#if defined(FEAT_MBYTE) || defined(FEAT_POSTSCRIPT) || defined(PROTO)
+/*
+ * Canonical encoding names and their properties.  Each IDX_ define is the
+ * index of the entry below it; "codepage" holds a DBCS_ value for ENC_DBCS
+ * entries.  "iso-8859-n" is handled by enc_canonize() directly. */
+static struct
+{   char *name;		int prop;		int codepage;}
+enc_canon_table[] =
+{
+#define IDX_LATIN_1	0
+    {"latin1",		ENC_8BIT + ENC_LATIN1,	1252},
+#define IDX_ISO_2	1
+    {"iso-8859-2",	ENC_8BIT,		0},
+#define IDX_ISO_3	2
+    {"iso-8859-3",	ENC_8BIT,		0},
+#define IDX_ISO_4	3
+    {"iso-8859-4",	ENC_8BIT,		0},
+#define IDX_ISO_5	4
+    {"iso-8859-5",	ENC_8BIT,		0},
+#define IDX_ISO_6	5
+    {"iso-8859-6",	ENC_8BIT,		0},
+#define IDX_ISO_7	6
+    {"iso-8859-7",	ENC_8BIT,		0},
+#define IDX_CP1255	7
+    {"cp1255",		ENC_8BIT,		1255}, /* close to iso-8859-8 */
+#define IDX_ISO_8	8
+    {"iso-8859-8",	ENC_8BIT,		0},
+#define IDX_ISO_9	9
+    {"iso-8859-9",	ENC_8BIT,		0},
+#define IDX_ISO_10	10
+    {"iso-8859-10",	ENC_8BIT,		0},
+#define IDX_ISO_11	11
+    {"iso-8859-11",	ENC_8BIT,		0},
+#define IDX_ISO_13	12
+    {"iso-8859-13",	ENC_8BIT,		0},
+#define IDX_ISO_14	13
+    {"iso-8859-14",	ENC_8BIT,		0},
+#define IDX_ISO_15	14
+    {"iso-8859-15",	ENC_8BIT,		0},
+#define IDX_KOI8_R	15
+    {"koi8-r",		ENC_8BIT,		0},
+#define IDX_KOI8_U	16
+    {"koi8-u",		ENC_8BIT,		0},
+#define IDX_UTF8	17
+    {"utf-8",		ENC_UNICODE,		0},
+#define IDX_UCS2	18
+    {"ucs-2",		ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0},
+#define IDX_UCS2LE	19
+    {"ucs-2le",		ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0},
+#define IDX_UTF16	20
+    {"utf-16",		ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0},
+#define IDX_UTF16LE	21
+    {"utf-16le",	ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0},
+#define IDX_UCS4	22
+    {"ucs-4",		ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0},
+#define IDX_UCS4LE	23
+    {"ucs-4le",		ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0},
+#define IDX_DEBUG	24
+    {"debug",		ENC_DBCS,		DBCS_DEBUG},
+#define IDX_CP932	25
+    {"cp932",		ENC_DBCS,		DBCS_JPN},
+#define IDX_CP949	26
+    {"cp949",		ENC_DBCS,		DBCS_KOR},
+#define IDX_CP936	27
+    {"cp936",		ENC_DBCS,		DBCS_CHS},
+#define IDX_CP950	28
+    {"cp950",		ENC_DBCS,		DBCS_CHT},
+#define IDX_EUC_JP	29
+    {"euc-jp",		ENC_DBCS,		DBCS_JPNU},
+#define IDX_SJIS	30
+    {"sjis",		ENC_DBCS,		DBCS_JPN},
+#define IDX_EUC_KR	31
+    {"euc-kr",		ENC_DBCS,		DBCS_KORU},
+#define IDX_EUC_CN	32
+    {"euc-cn",		ENC_DBCS,		DBCS_CHSU},
+#define IDX_EUC_TW	33
+    {"euc-tw",		ENC_DBCS,		DBCS_CHTU},
+#define IDX_BIG5	34
+    {"big5",		ENC_DBCS,		DBCS_CHT},
+#define IDX_CP1251	35
+    {"cp1251",		ENC_8BIT,		1251},
+#define IDX_MACROMAN	36
+    {"macroman",	ENC_8BIT + ENC_MACROMAN, 0},
+#define IDX_COUNT	37
+};
+
+/*
+ * Aliases for encoding names.  "canon" is the IDX_ index of the canonical
+ * name in enc_canon_table[]; the list ends with a NULL name. */
+static struct
+{   char *name;		int canon;}
+enc_alias_table[] =
+{
+    {"ansi",		IDX_LATIN_1},
+    {"iso-8859-1",	IDX_LATIN_1},
+    {"latin2",		IDX_ISO_2},
+    {"latin3",		IDX_ISO_3},
+    {"latin4",		IDX_ISO_4},
+    {"cyrillic",	IDX_ISO_5},
+    {"arabic",		IDX_ISO_6},
+    {"greek",		IDX_ISO_7},
+#ifdef WIN3264
+    {"hebrew",		IDX_CP1255},
+#else
+    {"hebrew",		IDX_ISO_8},
+#endif
+    {"latin5",		IDX_ISO_9},
+    {"turkish",		IDX_ISO_9}, /* ? */
+    {"latin6",		IDX_ISO_10},
+    {"nordic",		IDX_ISO_10}, /* ? */
+    {"thai",		IDX_ISO_11}, /* ? */
+    {"latin7",		IDX_ISO_13},
+    {"latin8",		IDX_ISO_14},
+    {"latin9",		IDX_ISO_15},
+    {"utf8",		IDX_UTF8},
+    {"unicode",		IDX_UCS2},
+    {"ucs2",		IDX_UCS2},
+    {"ucs2be",		IDX_UCS2},
+    {"ucs-2be",		IDX_UCS2},
+    {"ucs2le",		IDX_UCS2LE},
+    {"utf16",		IDX_UTF16},
+    {"utf16be",		IDX_UTF16},
+    {"utf-16be",	IDX_UTF16},
+    {"utf16le",		IDX_UTF16LE},
+    {"ucs4",		IDX_UCS4},
+    {"ucs4be",		IDX_UCS4},
+    {"ucs-4be",		IDX_UCS4},
+    {"ucs4le",		IDX_UCS4LE},
+    {"932",		IDX_CP932},
+    {"949",		IDX_CP949},
+    {"936",		IDX_CP936},
+    {"950",		IDX_CP950},
+    {"eucjp",		IDX_EUC_JP},
+    {"unix-jis",	IDX_EUC_JP},
+    {"ujis",		IDX_EUC_JP},
+    {"shift-jis",	IDX_SJIS},
+    {"euckr",		IDX_EUC_KR},
+    {"5601",		IDX_EUC_KR},	/* Sun: KS C 5601 */
+    {"euccn",		IDX_EUC_CN},
+    {"gb2312",		IDX_EUC_CN},
+    {"euctw",		IDX_EUC_TW},
+#if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS)
+    {"japan",		IDX_CP932},
+    {"korea",		IDX_CP949},
+    {"prc",		IDX_CP936},
+    {"chinese",		IDX_CP936},
+    {"taiwan",		IDX_CP950},
+    {"big5",		IDX_CP950},
+#else
+    {"japan",		IDX_EUC_JP},
+    {"korea",		IDX_EUC_KR},
+    {"prc",		IDX_EUC_CN},
+    {"chinese",		IDX_EUC_CN},
+    {"taiwan",		IDX_EUC_TW},
+    {"cp950",		IDX_BIG5},
+    {"950",		IDX_BIG5},
+#endif
+    {"mac",		IDX_MACROMAN},
+    {NULL,		0}
+};
+
+#ifndef CP_UTF8
+# define CP_UTF8 65001	/* magic number from winnls.h */
+#endif
+
+/*
+ * Look up "name" in enc_canon_table[], comparing with STRCMP().
+ * Returns the index of the matching entry, -1 when there is none.
+ */
+    static int
+enc_canon_search(name)
+    char_u	*name;
+{
+    int		idx = 0;
+
+    while (idx < IDX_COUNT
+	    && STRCMP(name, enc_canon_table[idx].name) != 0)
+	++idx;
+    return idx < IDX_COUNT ? idx : -1;
+}
+
+#endif
+
+#if defined(FEAT_MBYTE) || defined(PROTO)
+
+/*
+ * Return the ENC_ property flags for the canonical encoding "name".
+ * Returns 0 when the encoding is not recognized.
+ */
+    int
+enc_canon_props(name)
+    char_u	*name;
+{
+    int		idx = enc_canon_search(name);
+
+    /* A name from the table has its properties stored there. */
+    if (idx >= 0)
+	return enc_canon_table[idx].prop;
+#ifdef WIN3264
+    /* "cp" followed by a digit: ask Windows what kind of codepage it is. */
+    if (name[0] == 'c' && name[1] == 'p' && VIM_ISDIGIT(name[2]))
+    {
+	CPINFO	info;
+
+	if (GetCPInfo(atoi(name + 2), &info) == 0)
+	    return 0;
+	if (info.MaxCharSize == 1)
+	    return ENC_8BIT;		/* some single-byte encoding */
+	if (info.MaxCharSize == 2
+		&& (info.LeadByte[0] != 0 || info.LeadByte[1] != 0))
+	    return ENC_DBCS;		/* must be a DBCS encoding */
+	return 0;
+    }
+#endif
+    /* Otherwise only the generic name prefixes tell something. */
+    if (STRNCMP(name, "2byte-", 6) == 0)
+	return ENC_DBCS;
+    if (STRNCMP(name, "8bit-", 5) == 0
+	    || STRNCMP(name, "iso-8859-", 9) == 0)
+	return ENC_8BIT;
+    return 0;
+}
+
+/*
+ * Set up for using multi-byte characters.
+ * Called in three cases:
+ * - by main() to initialize (p_enc == NULL)
+ * - by set_init_1() after 'encoding' was set to its default.
+ * - by do_set() when 'encoding' has been set.
+ * p_enc must have been passed through enc_canonize() already.
+ * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags, the
+ * mb_*() function pointers and fills mb_bytelen_tab[].
+ * Returns NULL when there are no problems.  When there is something wrong:
+ * Returns an error message and doesn't change anything.
+ */
+    char_u *
+mb_init()
+{
+    int		i;
+    int		idx;
+    int		n;
+    int		enc_dbcs_new = 0;   /* new value for enc_dbcs, 0: not DBCS */
+#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
+	&& !defined(MACOS)
+# define LEN_FROM_CONV
+    vimconv_T	vimconv;
+    char_u	*p;
+#endif
+
+    if (p_enc == NULL)
+    {
+	/* Just starting up: set the whole table to one's. */
+	for (i = 0; i < 256; ++i)
+	    mb_bytelen_tab[i] = 1;
+	input_conv.vc_type = CONV_NONE;
+	input_conv.vc_factor = 1;
+	output_conv.vc_type = CONV_NONE;
+	return NULL;
+    }
+
+#ifdef WIN3264
+    if (p_enc[0] == 'c' && p_enc[1] == 'p' && VIM_ISDIGIT(p_enc[2]))
+    {
+	CPINFO	cpinfo;
+
+	/* Get info on this codepage to find out what it is. */
+	if (GetCPInfo(atoi(p_enc + 2), &cpinfo) != 0)
+	{
+	    if (cpinfo.MaxCharSize == 1)
+	    {
+		/* some single-byte encoding */
+		enc_unicode = 0;
+		enc_utf8 = FALSE;
+	    }
+	    else if (cpinfo.MaxCharSize == 2
+		    && (cpinfo.LeadByte[0] != 0 || cpinfo.LeadByte[1] != 0))
+	    {
+		/* must be a DBCS encoding, check below */
+		enc_dbcs_new = atoi(p_enc + 2);
+	    }
+	    else
+		goto codepage_invalid;
+	}
+	else if (GetLastError() == ERROR_INVALID_PARAMETER)
+	{
+codepage_invalid:
+	    return (char_u *)N_("E543: Not a valid codepage");
+	}
+    }
+#endif /* without WIN3264 the "else" below pairs with "if (p_enc == NULL)" */
+    else if (STRNCMP(p_enc, "8bit-", 5) == 0
+	    || STRNCMP(p_enc, "iso-8859-", 9) == 0)
+    {
+	/* Accept any "8bit-" or "iso-8859-" name. */
+	enc_unicode = 0;
+	enc_utf8 = FALSE;
+    }
+    else if (STRNCMP(p_enc, "2byte-", 6) == 0)
+    {
+#ifdef WIN3264
+	/* Windows: accept only valid codepage numbers, check below. */
+	if (p_enc[6] != 'c' || p_enc[7] != 'p'
+				      || (enc_dbcs_new = atoi(p_enc + 8)) == 0)
+	    return e_invarg;
+#else
+	/* Unix: accept any "2byte-" name, assume current locale. */
+	enc_dbcs_new = DBCS_2BYTE;
+#endif
+    }
+    else if ((idx = enc_canon_search(p_enc)) >= 0)
+    {
+	i = enc_canon_table[idx].prop;
+	if (i & ENC_UNICODE)
+	{
+	    /* Unicode */
+	    enc_utf8 = TRUE;
+	    if (i & (ENC_2BYTE | ENC_2WORD))
+		enc_unicode = 2;
+	    else if (i & ENC_4BYTE)
+		enc_unicode = 4;
+	    else
+		enc_unicode = 0;
+	}
+	else if (i & ENC_DBCS)
+	{
+	    /* 2byte, handle below */
+	    enc_dbcs_new = enc_canon_table[idx].codepage;
+	}
+	else
+	{
+	    /* Must be 8-bit. */
+	    enc_unicode = 0;
+	    enc_utf8 = FALSE;
+	}
+    }
+    else    /* Don't know what encoding this is, reject it. */
+	return e_invarg;
+
+    if (enc_dbcs_new != 0)
+    {
+#ifdef WIN3264
+	/* Check if the DBCS code page is OK. */
+	if (!IsValidCodePage(enc_dbcs_new))
+	    goto codepage_invalid;
+#endif
+	enc_unicode = 0;
+	enc_utf8 = FALSE;
+    }
+    enc_dbcs = enc_dbcs_new;	/* no error return below this point */
+    has_mbyte = (enc_dbcs != 0 || enc_utf8);
+
+#ifdef WIN3264
+    enc_codepage = encname2codepage(p_enc);
+#endif
+
+    /*
+     * Set the function pointers.
+     */
+    if (enc_utf8)
+    {
+	mb_ptr2len_check = utfc_ptr2len_check;
+	mb_char2len = utf_char2len;
+	mb_char2bytes = utf_char2bytes;
+	mb_ptr2cells = utf_ptr2cells;
+	mb_char2cells = utf_char2cells;
+	mb_off2cells = utf_off2cells;
+	mb_ptr2char = utf_ptr2char;
+	mb_head_off = utf_head_off;
+    }
+    else if (enc_dbcs != 0)
+    {
+	mb_ptr2len_check = dbcs_ptr2len_check;
+	mb_char2len = dbcs_char2len;
+	mb_char2bytes = dbcs_char2bytes;
+	mb_ptr2cells = dbcs_ptr2cells;
+	mb_char2cells = dbcs_char2cells;
+	mb_off2cells = dbcs_off2cells;
+	mb_ptr2char = dbcs_ptr2char;
+	mb_head_off = dbcs_head_off;
+    }
+    else
+    {
+	mb_ptr2len_check = latin_ptr2len_check;
+	mb_char2len = latin_char2len;
+	mb_char2bytes = latin_char2bytes;
+	mb_ptr2cells = latin_ptr2cells;
+	mb_char2cells = latin_char2cells;
+	mb_off2cells = latin_off2cells;
+	mb_ptr2char = latin_ptr2char;
+	mb_head_off = latin_head_off;
+    }
+
+    /*
+     * Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
+     */
+#ifdef LEN_FROM_CONV
+    /* When 'encoding' is different from the current locale mblen() won't
+     * work.  Use conversion to "utf-8" instead. */
+    vimconv.vc_type = CONV_NONE;
+    if (enc_dbcs)
+    {
+	p = enc_locale();
+	if (p == NULL || STRCMP(p, p_enc) != 0)
+	{
+	    convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
+	    vimconv.vc_fail = TRUE;
+	}
+	vim_free(p);
+    }
+#endif
+
+    for (i = 0; i < 256; ++i)
+    {
+	/* Our own function to reliably check the length of UTF-8 characters,
+	 * independent of mblen(). */
+	if (enc_utf8)
+	    n = utf8len_tab[i];
+	else if (enc_dbcs == 0)
+	    n = 1;		/* single-byte encoding */
+	else
+	{
+#if defined(WIN3264) || defined(WIN32UNIX)
+	    /* enc_dbcs is set by setting 'fileencoding'.  It becomes a Windows
+	     * CodePage identifier, which we can pass directly in to Windows
+	     * API */
+	    n = IsDBCSLeadByteEx(enc_dbcs, (BYTE)i) ? 2 : 1;
+#else
+# ifdef MACOS
+	    /*
+	     * if mblen() is not available, character which MSB is turned on
+	     * are treated as leading byte character. (note : This assumption
+	     * is not always true.)
+	     */
+	    n = (i & 0x80) ? 2 : 1;
+# else
+	    char buf[MB_MAXBYTES];
+# ifdef X_LOCALE
+#  ifndef mblen
+#   define mblen _Xmblen
+#  endif
+# endif
+	    if (i == NUL)	/* just in case mblen() can't handle "" */
+		n = 1;
+	    else
+	    {
+		buf[0] = i;	/* one-byte string holding only byte "i" */
+		buf[1] = 0;
+#ifdef LEN_FROM_CONV
+		if (vimconv.vc_type != CONV_NONE)
+		{
+		    /*
+		     * string_convert() should fail when converting the first
+		     * byte of a double-byte character.
+		     */
+		    p = string_convert(&vimconv, (char_u *)buf, NULL);
+		    if (p != NULL)
+		    {
+			vim_free(p);
+			n = 1;
+		    }
+		    else
+			n = 2;
+		}
+		else
+#endif
+		{
+		    /*
+		     * mblen() should return -1 for invalid (means the leading
+		     * multibyte) character.  However there are some platforms
+		     * where mblen() returns 0 for invalid character.
+		     * Therefore, following condition includes 0.
+		     */
+		    if (mblen(buf, (size_t)1) <= 0)
+			n = 2;
+		    else
+			n = 1;
+		}
+	    }
+# endif
+#endif
+	}
+
+	mb_bytelen_tab[i] = n;
+    }
+
+#ifdef LEN_FROM_CONV
+    convert_setup(&vimconv, NULL, NULL); /* presumably frees it -- confirm */
+#endif
+
+    /* The cell width depends on the type of multi-byte characters. */
+    (void)init_chartab();
+
+    /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
+    screenalloc(FALSE);
+
+    /* When using Unicode, set default for 'fileencodings'. */
+    if (enc_utf8 && !option_was_set((char_u *)"fencs"))
+	set_string_option_direct((char_u *)"fencs", -1,
+				 (char_u *)"ucs-bom,utf-8,latin1", OPT_FREE);
+#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(FEAT_GETTEXT)
+    /* GNU gettext 0.10.37 supports this feature: set the codeset used for
+     * translated messages independently from the current locale. */
+    (void)bind_textdomain_codeset(VIMPACKAGE,
+					  enc_utf8 ? "utf-8" : (char *)p_enc);
+#endif
+
+#ifdef FEAT_AUTOCMD
+    /* Fire an autocommand to let people do custom font setup. This must be
+     * after Vim has been setup for the new encoding. */
+    apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
+#endif
+
+    return NULL;
+}
+
+/*
+ * Return the size of the BOM for the current buffer, derived from its
+ * "b_p_fenc" or, when that is empty, from enc_utf8 and enc_unicode:
+ * 0 - no BOM
+ * 2 - UCS-2 or UTF-16 BOM
+ * 3 - UTF-8 BOM
+ * 4 - UCS-4 BOM
+ */
+    int
+bomb_size()
+{
+    /* There is no BOM unless "b_p_bomb" is set and "b_p_bin" is not set
+     * for the current buffer. */
+    if (!curbuf->b_p_bomb || curbuf->b_p_bin)
+	return 0;
+
+    if (*curbuf->b_p_fenc == NUL)
+    {
+	/* No name given: use the flags of the internal encoding. */
+	if (!enc_utf8)
+	    return 0;
+	return enc_unicode != 0 ? enc_unicode : 3;
+    }
+
+    /* Otherwise the size follows from the encoding name. */
+    if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
+	return 3;
+    if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
+	    || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
+	return 2;
+    if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
+	return 4;
+    return 0;
+}
+
+/*
+ * Return the class of the character at "p":
+ * 0 for blank or NUL
+ * 1 for punctuation
+ * 2 for an (ASCII) word character
+ * >2 for other word characters
+ */
+    int
+mb_get_class(p)
+    char_u	*p;
+{
+    if (MB_BYTE2LEN(p[0]) != 1)
+    {
+	/* Multi-byte character: the class depends on the encoding.  A DBCS
+	 * lead byte that is not followed by a trail byte is not classified. */
+	if (enc_dbcs != 0 && p[0] != NUL && p[1] != NUL)
+	    return dbcs_class(p[0], p[1]);
+	return enc_utf8 ? utf_class(utf_ptr2char(p)) : 0;
+    }
+
+    /* Single byte: blank, word character or punctuation. */
+    if (p[0] == NUL || vim_iswhite(p[0]))
+	return 0;
+    return vim_iswordc(p[0]) ? 2 : 1;
+}
+
+/*
+ * Get class of a double-byte character.  Returns 3 or bigger, except for
+ * Japanese: 0 for ZENKAKU space, 1 for some punctuation (TODO: other langs).
+ */
+    int
+dbcs_class(lead, trail)
+    unsigned	lead;
+    unsigned	trail;
+{
+    switch (enc_dbcs)
+    {
+	/* please add classify routine for your language in here */
+
+	case DBCS_JPNU:	/* ? */
+	case DBCS_JPN:
+	    {
+		/* JIS code classification */
+		unsigned char lb = lead;
+		unsigned char tb = trail;
+
+		/* convert process code to JIS */
+# if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS)
+		/* process code is SJIS */
+		if (lb <= 0x9f)
+		    lb = (lb - 0x81) * 2 + 0x21;
+		else
+		    lb = (lb - 0xc1) * 2 + 0x21;
+		if (tb <= 0x7e)
+		    tb -= 0x1f;
+		else if (tb <= 0x9e)
+		    tb -= 0x20;
+		else
+		{
+		    tb -= 0x7e;
+		    lb += 1;
+		}
+# else
+		/*
+		 * XXX: Code page identification can not use with all
+		 *	    system! So, some other encoding information
+		 *	    will be needed.
+		 *	    In japanese: SJIS,EUC,UNICODE,(JIS)
+		 *	    Note that JIS-code system don't use as
+		 *	    process code in most system because it uses
+		 *	    escape sequences(JIS is context depend encoding).
+		 */
+		/* assume process code is JAPANESE-EUC */
+		lb &= 0x7f;
+		tb &= 0x7f;
+# endif
+		/* exceptions */
+		switch (lb << 8 | tb)
+		{
+		    case 0x2121: /* ZENKAKU space */
+			return 0;
+		    case 0x2122: /* KU-TEN (Japanese comma) */
+		    case 0x2123: /* TOU-TEN (Japanese period) */
+		    case 0x2124: /* ZENKAKU comma */
+		    case 0x2125: /* ZENKAKU period */
+			return 1;
+		    case 0x213c: /* prolongedsound handled as KATAKANA */
+			return 13;
+		}
+		/* sieved by KU code */
+		switch (lb)
+		{
+		    case 0x21:
+		    case 0x22:
+			/* special symbols */
+			return 10;
+		    case 0x23:
+			/* alpha-numeric */
+			return 11;
+		    case 0x24:
+			/* hiragana */
+			return 12;
+		    case 0x25:
+			/* katakana */
+			return 13;
+		    case 0x26:
+			/* greek */
+			return 14;
+		    case 0x27:
+			/* russian */
+			return 15;
+		    case 0x28:
+			/* lines */
+			return 16;
+		    default:
+			/* kanji */
+			return 17;
+		}
+	    }
+
+	case DBCS_KORU:	/* ? */
+	case DBCS_KOR:
+	    {
+		/* KS code classification */
+		unsigned char c1 = lead;
+		unsigned char c2 = trail;
+
+		/*
+		 * 20 : Hangul
+		 * 21 : Hanja
+		 * 22 : Symbols
+		 * 23 : Alpha-numeric/Roman Letter (Full width)
+		 * 24 : Hangul Letter(Alphabet)
+		 * 25 : Roman Numeral/Greek Letter
+		 * 26 : Box Drawings
+		 * 27 : Unit Symbols
+		 * 28 : Circled/Parenthesized Letter
+		 * 29 : Hiragana/Katakana
+		 * 30 : Cyrillic Letter
+		 */
+
+		if (c1 >= 0xB0 && c1 <= 0xC8)
+		    /* Hangul */
+		    return 20;
+#if defined(WIN3264) || defined(WIN32UNIX)
+		else if (c1 <= 0xA0 || c2 <= 0xA0)
+		    /* Extended Hangul Region : MS UHC(Unified Hangul Code) */
+		    /* c1: 0x81-0xA0 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xFE
+		     * c1: 0xA1-0xC6 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xA0
+		     */
+		    return 20;
+#endif
+
+		else if (c1 >= 0xCA && c1 <= 0xFD)
+		    /* Hanja */
+		    return 21;
+		else switch (c1)
+		{
+		    case 0xA1:
+		    case 0xA2:
+			/* Symbols */
+			return 22;
+		    case 0xA3:
+			/* Alpha-numeric */
+			return 23;
+		    case 0xA4:
+			/* Hangul Letter(Alphabet) */
+			return 24;
+		    case 0xA5:
+			/* Roman Numeral/Greek Letter */
+			return 25;
+		    case 0xA6:
+			/* Box Drawings */
+			return 26;
+		    case 0xA7:
+			/* Unit Symbols */
+			return 27;
+		    case 0xA8:
+		    case 0xA9:
+			if (c2 <= 0xAF)
+			    return 25;  /* Roman Letter */
+			else if (c2 >= 0xF6)
+			    return 22;  /* Symbols */
+			else
+			    /* Circled/Parenthesized Letter */
+			    return 28;
+		    case 0xAA:
+		    case 0xAB:
+			/* Hiragana/Katakana */
+			return 29;
+		    case 0xAC:
+			/* Cyrillic Letter */
+			return 30;
+		}
+	    }	/* other lead bytes fall through to "default" and return 3 */
+	default:
+	    break;
+    }
+    return 3;
+}
+
+/*
+ * mb_char2len() function pointer.
+ * Return length in bytes of character "c".
+ * The single-byte (latin) version ignores "c" and always returns 1.
+ */
+/* ARGSUSED */
+    int
+latin_char2len(c)
+    int		c;		/* not used */
+{
+    return 1;			/* every character is one byte */
+}
+
+    static int
+dbcs_char2len(c)
+    int		c;
+{
+    /* Values of 0x100 and up do not fit in one byte and take two bytes. */
+    int		nbytes = (c >= 0x100) ? 2 : 1;
+    return nbytes;
+}
+
+/*
+ * mb_char2bytes() function pointer.
+ * Convert character "c" to its bytes, stored in "buf" without a NUL.
+ * Returns the length in bytes.
+ */
+    int
+latin_char2bytes(c, buf)
+    int		c;
+    char_u	*buf;
+{
+    buf[0] = c;			/* the character is its own single byte */
+    return 1;
+}
+
+    static int
+dbcs_char2bytes(c, buf)
+    int		c;
+    char_u	*buf;
+{
+    int		len = 0;
+
+    /* A value of 0x100 or more has a lead byte above the low 8 bits. */
+    if (c >= 0x100)
+	buf[len++] = (unsigned)c >> 8;
+    /* The low byte always comes last. */
+    buf[len++] = c;
+    return len;
+}
+
+/*
+ * mb_ptr2len_check() function pointer.
+ * Get byte length of character at "*p" but stop at a NUL.
+ * For UTF-8 this includes following composing characters.
+ * Returns 0 when *p is NUL.  NOTE(review): this latin version returns
+ * MB_BYTE2LEN(NUL), and mb_init() stores 1 for byte 0 -- confirm.
+ */
+    int
+latin_ptr2len_check(p)
+    char_u	*p;
+{
+    return MB_BYTE2LEN(*p);
+}
+
+    static int
+dbcs_ptr2len_check(p)
+    char_u	*p;
+{
+    int		nbytes = MB_BYTE2LEN(*p);
+
+    /* A lead byte directly followed by the terminating NUL counts as a
+     * single byte, the trail byte is missing. */
+    if (nbytes != 2 || p[1] != NUL)
+	return nbytes;
+    return 1;
+}
+
+struct interval			/* one range of character values */
+{
+    unsigned short first;	/* lowest value in the range (inclusive) */
+    unsigned short last;	/* highest value in the range (inclusive) */
+};
+static int intable __ARGS((struct interval *table, size_t size, int c));
+
+/*
+ * Return TRUE if "c" is in the sorted interval "table" of "size" bytes.
+ */
+    static int
+intable(table, size, c)
+    struct interval	*table;
+    size_t		size;
+    int			c;
+{
+    int		lo = 0;
+    int		hi = size / sizeof(struct interval) - 1;
+    int		pivot;
+
+    /* Quick exit for characters before the first interval (Latin1 etc.). */
+    if (c < table[0].first)
+	return FALSE;
+
+    /* Binary search: narrow [lo, hi] until it is empty or holds "c". */
+    while (lo <= hi)
+    {
+	pivot = (lo + hi) / 2;
+	if (c > table[pivot].last)
+	    lo = pivot + 1;
+	else if (c < table[pivot].first)
+	    hi = pivot - 1;
+	else
+	    return TRUE;
+    }
+    return FALSE;
+}
+
+/*
+ * For UTF-8 character "c" return 2 for a double-width character, 1 for others.
+ * Returns 4 (<xx>, below 0x100) or 6 (<xxxx>) for an unprintable character.
+ * Is only correct for characters >= 0x80.
+ * When p_ambw is "double", return 2 for a character with East Asian Width
+ * class 'A'(mbiguous).
+ */
+    int
+utf_char2cells(c)
+    int		c;
+{
+    /* Sorted list of non-overlapping intervals of East Asian Ambiguous
+     * characters, as needed for the binary search in intable().  Generated
+     * with: "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
+    static struct interval ambiguous[] = {
+	{0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8},
+	{0x00AA, 0x00AA}, {0x00AE, 0x00AE}, {0x00B0, 0x00B4},
+	{0x00B6, 0x00BA}, {0x00BC, 0x00BF}, {0x00C6, 0x00C6},
+	{0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1},
+	{0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED},
+	{0x00F0, 0x00F0}, {0x00F2, 0x00F3}, {0x00F7, 0x00FA},
+	{0x00FC, 0x00FC}, {0x00FE, 0x00FE}, {0x0101, 0x0101},
+	{0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B},
+	{0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133},
+	{0x0138, 0x0138}, {0x013F, 0x0142}, {0x0144, 0x0144},
+	{0x0148, 0x014B}, {0x014D, 0x014D}, {0x0152, 0x0153},
+	{0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE},
+	{0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4},
+	{0x01D6, 0x01D6}, {0x01D8, 0x01D8}, {0x01DA, 0x01DA},
+	{0x01DC, 0x01DC}, {0x0251, 0x0251}, {0x0261, 0x0261},
+	{0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB},
+	{0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB},
+	{0x02DD, 0x02DD}, {0x02DF, 0x02DF}, {0x0391, 0x03A1},
+	{0x03A3, 0x03A9}, {0x03B1, 0x03C1}, {0x03C3, 0x03C9},
+	{0x0401, 0x0401}, {0x0410, 0x044F}, {0x0451, 0x0451},
+	{0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019},
+	{0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027},
+	{0x2030, 0x2030}, {0x2032, 0x2033}, {0x2035, 0x2035},
+	{0x203B, 0x203B}, {0x203E, 0x203E}, {0x2074, 0x2074},
+	{0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC},
+	{0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109},
+	{0x2113, 0x2113}, {0x2116, 0x2116}, {0x2121, 0x2122},
+	{0x2126, 0x2126}, {0x212B, 0x212B}, {0x2153, 0x2154},
+	{0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179},
+	{0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2},
+	{0x21D4, 0x21D4}, {0x21E7, 0x21E7}, {0x2200, 0x2200},
+	{0x2202, 0x2203}, {0x2207, 0x2208}, {0x220B, 0x220B},
+	{0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215},
+	{0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223},
+	{0x2225, 0x2225}, {0x2227, 0x222C}, {0x222E, 0x222E},
+	{0x2234, 0x2237}, {0x223C, 0x223D}, {0x2248, 0x2248},
+	{0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261},
+	{0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F},
+	{0x2282, 0x2283}, {0x2286, 0x2287}, {0x2295, 0x2295},
+	{0x2299, 0x2299}, {0x22A5, 0x22A5}, {0x22BF, 0x22BF},
+	{0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B},
+	{0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595},
+	{0x25A0, 0x25A1}, {0x25A3, 0x25A9}, {0x25B2, 0x25B3},
+	{0x25B6, 0x25B7}, {0x25BC, 0x25BD}, {0x25C0, 0x25C1},
+	{0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1},
+	{0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606},
+	{0x2609, 0x2609}, {0x260E, 0x260F}, {0x2614, 0x2615},
+	{0x261C, 0x261C}, {0x261E, 0x261E}, {0x2640, 0x2640},
+	{0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665},
+	{0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F},
+	{0x273D, 0x273D}, {0x2776, 0x277F}, {0xE000, 0xF8FF},
+	{0xFFFD, 0xFFFD}, /* {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD} */
+    };
+
+    if (c >= 0x100)
+    {
+#ifdef USE_WCHAR_FUNCTIONS
+	/*
+	 * Assume the library function wcwidth() works better than our own
+	 * stuff.  It should return 1 for ambiguous width chars!
+	 */
+	int	n = wcwidth(c);
+
+	if (n < 0)
+	    return 6;		/* unprintable, displays <xxxx> */
+	if (n > 1)
+	    return n;
+#else
+	if (!utf_printable(c))
+	    return 6;		/* unprintable, displays <xxxx> */
+	if (c >= 0x1100
+	    && (c <= 0x115f			/* Hangul Jamo */
+		|| c == 0x2329
+		|| c == 0x232a
+		|| (c >= 0x2e80 && c <= 0xa4cf
+		    && c != 0x303f)		/* CJK ... Yi */
+		|| (c >= 0xac00 && c <= 0xd7a3)	/* Hangul Syllables */
+		|| (c >= 0xf900 && c <= 0xfaff)	/* CJK Compatibility
+						   Ideographs */
+		|| (c >= 0xfe30 && c <= 0xfe6f)	/* CJK Compatibility Forms */
+		|| (c >= 0xff00 && c <= 0xff60)	/* Fullwidth Forms */
+		|| (c >= 0xffe0 && c <= 0xffe6)
+		|| (c >= 0x20000 && c <= 0x2fffd)
+		|| (c >= 0x30000 && c <= 0x3fffd)))
+	    return 2;		/* in one of the double-width ranges */
+#endif
+    }
+
+    /* Characters below 0x100 are influenced by 'isprint' option */
+    else if (c >= 0x80 && !vim_isprintc(c))
+	return 4;		/* unprintable, displays <xx> */
+
+    if (c >= 0x80 && *p_ambw == 'd' && intable(ambiguous, sizeof(ambiguous), c))
+	return 2;		/* ambiguous width and p_ambw starts with 'd' */
+
+    return 1;
+}
+
+/*
+ * mb_ptr2cells() function pointer.
+ * Return the number of display cells character at "*p" occupies.
+ * This doesn't take care of unprintable characters, use ptr2cells() for that.
+ */
+/*ARGSUSED*/
+    int
+latin_ptr2cells(p)
+    char_u	*p;		/* not used */
+{
+    return 1;			/* the latin version always reports one cell */
+}
+
+    int
+utf_ptr2cells(p)
+    char_u	*p;
+{
+    int		wc;
+
+    /* An ASCII byte always takes one cell here. */
+    if (*p < 0x80)
+	return 1;
+
+    /* Decode the sequence to find the width of the character. */
+    wc = utf_ptr2char(p);
+    if (utf_ptr2len_check(p) == 1 || wc == NUL)
+	return 4;		/* illegal byte, displayed as <xx> */
+
+    /* An ASCII value here must come from an overlong sequence. */
+    if (wc >= 0x80)
+	return utf_char2cells(wc);
+    return char2cells(wc);
+}
+
+    int
+dbcs_ptr2cells(p)
+    char_u	*p;
+{
+    /* In euc-jp a character starting with 0x8e takes a single cell; in
+     * all other cases the cell count is equal to the byte count. */
+    if (enc_dbcs != DBCS_JPNU || *p != 0x8e)
+	return MB_BYTE2LEN(*p);
+    return 1;
+}
+
+/*
+ * mb_char2cells() function pointer.
+ * Return the number of display cells character "c" occupies.
+ * Only takes care of multi-byte chars, not "^C" and such.
+ */
+/*ARGSUSED*/
+    int
+latin_char2cells(c)