summaryrefslogtreecommitdiffstats
path: root/src/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mbyte.c')
-rw-r--r--src/mbyte.c5833
1 files changed, 5833 insertions, 0 deletions
diff --git a/src/mbyte.c b/src/mbyte.c
new file mode 100644
index 0000000000..699316d0e3
--- /dev/null
+++ b/src/mbyte.c
@@ -0,0 +1,5833 @@
+/* vi:set ts=8 sts=4 sw=4:
+ *
+ * VIM - Vi IMproved by Bram Moolenaar
+ * Multibyte extensions partly by Sung-Hoon Baek
+ *
+ * Do ":help uganda" in Vim to read copying and usage conditions.
+ * Do ":help credits" in Vim to see a list of people who contributed.
+ * See README.txt for an overview of the Vim source code.
+ */
+/*
+ * mbyte.c: Code specifically for handling multi-byte characters.
+ *
+ * The encoding used in the core is set with 'encoding'. When 'encoding' is
+ * changed, the following four variables are set (for speed).
+ * Currently these types of character encodings are supported:
+ *
+ * "enc_dbcs" When non-zero it tells the type of double byte character
+ * encoding (Chinese, Korean, Japanese, etc.).
+ * The cell width on the display is equal to the number of
+ * bytes. (exception: DBCS_JPNU with first byte 0x8e)
+ * Recognizing the first or second byte is difficult, it
+ * requires checking a byte sequence from the start.
+ * "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding.
+ * The cell width on the display needs to be determined from
+ * the character value.
+ * Recognizing bytes is easy: 0xxx.xxxx is a single-byte
+ * char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
+ * byte of a multi-byte character.
+ * To make things complicated, up to two composing characters
+ * are allowed. These are drawn on top of the first char.
+ * For most editing the sequence of bytes with composing
+ * characters included is considered to be one character.
+ * "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16).
+ * When 4 use 32-but Unicode characters.
+ * Internally characters are stored in UTF-8 encoding to
+ * avoid NUL bytes. Conversion happens when doing I/O.
+ * "enc_utf8" will also be TRUE.
+ *
+ * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
+ *
+ * If none of these is TRUE, 8-bit bytes are used for a character. The
+ * encoding isn't currently specified (TODO).
+ *
+ * 'encoding' specifies the encoding used in the core. This is in registers,
+ * text manipulation, buffers, etc. Conversion has to be done when characters
+ * in another encoding are received or send:
+ *
+ * clipboard
+ * ^
+ * | (2)
+ * V
+ * +---------------+
+ * (1) | | (3)
+ * keyboard ----->| core |-----> display
+ * | |
+ * +---------------+
+ * ^
+ * | (4)
+ * V
+ * file
+ *
+ * (1) Typed characters arrive in the current locale. Conversion is to be
+ * done when 'encoding' is different from 'termencoding'.
+ * (2) Text will be made available with the encoding specified with
+ * 'encoding'. If this is not sufficient, system-specific conversion
+ * might be required.
+ * (3) For the GUI the correct font must be selected, no conversion done.
+ * Otherwise, conversion is to be done when 'encoding' differs from
+ * 'termencoding'. (Different in the GTK+ 2 port -- 'termencoding'
+ * is always used for both input and output and must always be set to
+ * "utf-8". gui_mch_init() does this automatically.)
+ * (4) The encoding of the file is specified with 'fileencoding'. Conversion
+ * is to be done when it's different from 'encoding'.
+ *
+ * The viminfo file is a special case: Only text is converted, not file names.
+ * Vim scripts may contain an ":encoding" command. This has an effect for
+ * some commands, like ":menutrans"
+ */
+
+#include "vim.h"
+
+#ifdef WIN32UNIX
+# ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+# endif
+# include <windows.h>
+# ifdef WIN32
+# undef WIN32 /* Some windows.h define WIN32, we don't want that here. */
+# endif
+#endif
+
+#if (defined(WIN3264) || defined(WIN32UNIX)) && !defined(__MINGW32__)
+# include <winnls.h>
+#endif
+
+#ifdef FEAT_GUI_X11
+# include <X11/Intrinsic.h>
+#endif
+#ifdef X_LOCALE
+#include <X11/Xlocale.h>
+#endif
+
+#if defined(FEAT_XIM) && defined(HAVE_GTK2)
+# include <gdk/gdkkeysyms.h>
+# ifdef WIN3264
+# include <gdk/gdkwin32.h>
+# else
+# include <gdk/gdkx.h>
+# endif
+#endif
+
+#ifdef HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
+#if 0
+/* This has been disabled, because several people reported problems with the
+ * wcwidth() and iswprint() library functions, esp. for Hebrew. */
+# ifdef __STDC_ISO_10646__
+# define USE_WCHAR_FUNCTIONS
+# endif
+#endif
+
+#if defined(FEAT_MBYTE) || defined(PROTO)
+
+static int enc_canon_search __ARGS((char_u *name));
+static int dbcs_char2len __ARGS((int c));
+static int dbcs_char2bytes __ARGS((int c, char_u *buf));
+static int dbcs_ptr2len_check __ARGS((char_u *p));
+static int dbcs_char2cells __ARGS((int c));
+static int dbcs_ptr2char __ARGS((char_u *p));
+
+/* Lookup table to quickly get the length in bytes of a UTF-8 character from
+ * the first byte of a UTF-8 string. Bytes which are illegal when used as the
+ * first byte have a one, because these will be used separately. */
+static char utf8len_tab[256] =
+{
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
+};
+
+/*
+ * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks
+ * in the "xim.log" file.
+ */
+/* #define XIM_DEBUG */
+#ifdef XIM_DEBUG
+ static void
+xim_log(char *s, ...)
+{
+ va_list arglist;
+ static FILE *fd = NULL;
+
+ if (fd == (FILE *)-1)
+ return;
+ if (fd == NULL)
+ {
+ fd = fopen("xim.log", "w");
+ if (fd == NULL)
+ {
+ EMSG("Cannot open xim.log");
+ fd = (FILE *)-1;
+ return;
+ }
+ }
+
+ va_start(arglist, s);
+ vfprintf(fd, s, arglist);
+ va_end(arglist);
+}
+#endif
+
+#endif
+
+#if defined(FEAT_MBYTE) || defined(FEAT_POSTSCRIPT) || defined(PROTO)
+/*
+ * Canonical encoding names and their properties.
+ * "iso-8859-n" is handled by enc_canonize() directly.
+ */
+static struct
+{ char *name; int prop; int codepage;}
+enc_canon_table[] =
+{
+#define IDX_LATIN_1 0
+ {"latin1", ENC_8BIT + ENC_LATIN1, 1252},
+#define IDX_ISO_2 1
+ {"iso-8859-2", ENC_8BIT, 0},
+#define IDX_ISO_3 2
+ {"iso-8859-3", ENC_8BIT, 0},
+#define IDX_ISO_4 3
+ {"iso-8859-4", ENC_8BIT, 0},
+#define IDX_ISO_5 4
+ {"iso-8859-5", ENC_8BIT, 0},
+#define IDX_ISO_6 5
+ {"iso-8859-6", ENC_8BIT, 0},
+#define IDX_ISO_7 6
+ {"iso-8859-7", ENC_8BIT, 0},
+#define IDX_CP1255 7
+ {"cp1255", ENC_8BIT, 1255}, /* close to iso-8859-8 */
+#define IDX_ISO_8 8
+ {"iso-8859-8", ENC_8BIT, 0},
+#define IDX_ISO_9 9
+ {"iso-8859-9", ENC_8BIT, 0},
+#define IDX_ISO_10 10
+ {"iso-8859-10", ENC_8BIT, 0},
+#define IDX_ISO_11 11
+ {"iso-8859-11", ENC_8BIT, 0},
+#define IDX_ISO_13 12
+ {"iso-8859-13", ENC_8BIT, 0},
+#define IDX_ISO_14 13
+ {"iso-8859-14", ENC_8BIT, 0},
+#define IDX_ISO_15 14
+ {"iso-8859-15", ENC_8BIT, 0},
+#define IDX_KOI8_R 15
+ {"koi8-r", ENC_8BIT, 0},
+#define IDX_KOI8_U 16
+ {"koi8-u", ENC_8BIT, 0},
+#define IDX_UTF8 17
+ {"utf-8", ENC_UNICODE, 0},
+#define IDX_UCS2 18
+ {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0},
+#define IDX_UCS2LE 19
+ {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0},
+#define IDX_UTF16 20
+ {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0},
+#define IDX_UTF16LE 21
+ {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0},
+#define IDX_UCS4 22
+ {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0},
+#define IDX_UCS4LE 23
+ {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0},
+#define IDX_DEBUG 24
+ {"debug", ENC_DBCS, DBCS_DEBUG},
+#define IDX_CP932 25
+ {"cp932", ENC_DBCS, DBCS_JPN},
+#define IDX_CP949 26
+ {"cp949", ENC_DBCS, DBCS_KOR},
+#define IDX_CP936 27
+ {"cp936", ENC_DBCS, DBCS_CHS},
+#define IDX_CP950 28
+ {"cp950", ENC_DBCS, DBCS_CHT},
+#define IDX_EUC_JP 29
+ {"euc-jp", ENC_DBCS, DBCS_JPNU},
+#define IDX_SJIS 30
+ {"sjis", ENC_DBCS, DBCS_JPN},
+#define IDX_EUC_KR 31
+ {"euc-kr", ENC_DBCS, DBCS_KORU},
+#define IDX_EUC_CN 32
+ {"euc-cn", ENC_DBCS, DBCS_CHSU},
+#define IDX_EUC_TW 33
+ {"euc-tw", ENC_DBCS, DBCS_CHTU},
+#define IDX_BIG5 34
+ {"big5", ENC_DBCS, DBCS_CHT},
+#define IDX_CP1251 35
+ {"cp1251", ENC_8BIT, 1251},
+#define IDX_MACROMAN 36
+ {"macroman", ENC_8BIT + ENC_MACROMAN, 0},
+#define IDX_COUNT 37
+};
+
+/*
+ * Aliases for encoding names.
+ */
+static struct
+{ char *name; int canon;}
+enc_alias_table[] =
+{
+ {"ansi", IDX_LATIN_1},
+ {"iso-8859-1", IDX_LATIN_1},
+ {"latin2", IDX_ISO_2},
+ {"latin3", IDX_ISO_3},
+ {"latin4", IDX_ISO_4},
+ {"cyrillic", IDX_ISO_5},
+ {"arabic", IDX_ISO_6},
+ {"greek", IDX_ISO_7},
+#ifdef WIN3264
+ {"hebrew", IDX_CP1255},
+#else
+ {"hebrew", IDX_ISO_8},
+#endif
+ {"latin5", IDX_ISO_9},
+ {"turkish", IDX_ISO_9}, /* ? */
+ {"latin6", IDX_ISO_10},
+ {"nordic", IDX_ISO_10}, /* ? */
+ {"thai", IDX_ISO_11}, /* ? */
+ {"latin7", IDX_ISO_13},
+ {"latin8", IDX_ISO_14},
+ {"latin9", IDX_ISO_15},
+ {"utf8", IDX_UTF8},
+ {"unicode", IDX_UCS2},
+ {"ucs2", IDX_UCS2},
+ {"ucs2be", IDX_UCS2},
+ {"ucs-2be", IDX_UCS2},
+ {"ucs2le", IDX_UCS2LE},
+ {"utf16", IDX_UTF16},
+ {"utf16be", IDX_UTF16},
+ {"utf-16be", IDX_UTF16},
+ {"utf16le", IDX_UTF16LE},
+ {"ucs4", IDX_UCS4},
+ {"ucs4be", IDX_UCS4},
+ {"ucs-4be", IDX_UCS4},
+ {"ucs4le", IDX_UCS4LE},
+ {"932", IDX_CP932},
+ {"949", IDX_CP949},
+ {"936", IDX_CP936},
+ {"950", IDX_CP950},
+ {"eucjp", IDX_EUC_JP},
+ {"unix-jis", IDX_EUC_JP},
+ {"ujis", IDX_EUC_JP},
+ {"shift-jis", IDX_SJIS},
+ {"euckr", IDX_EUC_KR},
+ {"5601", IDX_EUC_KR}, /* Sun: KS C 5601 */
+ {"euccn", IDX_EUC_CN},
+ {"gb2312", IDX_EUC_CN},
+ {"euctw", IDX_EUC_TW},
+#if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS)
+ {"japan", IDX_CP932},
+ {"korea", IDX_CP949},
+ {"prc", IDX_CP936},
+ {"chinese", IDX_CP936},
+ {"taiwan", IDX_CP950},
+ {"big5", IDX_CP950},
+#else
+ {"japan", IDX_EUC_JP},
+ {"korea", IDX_EUC_KR},
+ {"prc", IDX_EUC_CN},
+ {"chinese", IDX_EUC_CN},
+ {"taiwan", IDX_EUC_TW},
+ {"cp950", IDX_BIG5},
+ {"950", IDX_BIG5},
+#endif
+ {"mac", IDX_MACROMAN},
+ {NULL, 0}
+};
+
+#ifndef CP_UTF8
+# define CP_UTF8 65001 /* magic number from winnls.h */
+#endif
+
+/*
+ * Find encoding "name" in the list of canonical encoding names.
+ * Returns -1 if not found.
+ */
+ static int
+enc_canon_search(name)
+ char_u *name;
+{
+ int i;
+
+ for (i = 0; i < IDX_COUNT; ++i)
+ if (STRCMP(name, enc_canon_table[i].name) == 0)
+ return i;
+ return -1;
+}
+
+#endif
+
+#if defined(FEAT_MBYTE) || defined(PROTO)
+
+/*
+ * Find canonical encoding "name" in the list and return its properties.
+ * Returns 0 if not found.
+ */
+ int
+enc_canon_props(name)
+ char_u *name;
+{
+ int i;
+
+ i = enc_canon_search(name);
+ if (i >= 0)
+ return enc_canon_table[i].prop;
+#ifdef WIN3264
+ if (name[0] == 'c' && name[1] == 'p' && VIM_ISDIGIT(name[2]))
+ {
+ CPINFO cpinfo;
+
+ /* Get info on this codepage to find out what it is. */
+ if (GetCPInfo(atoi(name + 2), &cpinfo) != 0)
+ {
+ if (cpinfo.MaxCharSize == 1) /* some single-byte encoding */
+ return ENC_8BIT;
+ if (cpinfo.MaxCharSize == 2
+ && (cpinfo.LeadByte[0] != 0 || cpinfo.LeadByte[1] != 0))
+ /* must be a DBCS encoding */
+ return ENC_DBCS;
+ }
+ return 0;
+ }
+#endif
+ if (STRNCMP(name, "2byte-", 6) == 0)
+ return ENC_DBCS;
+ if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0)
+ return ENC_8BIT;
+ return 0;
+}
+
+/*
+ * Set up for using multi-byte characters.
+ * Called in three cases:
+ * - by main() to initialize (p_enc == NULL)
+ * - by set_init_1() after 'encoding' was set to its default.
+ * - by do_set() when 'encoding' has been set.
+ * p_enc must have been passed through enc_canonize() already.
+ * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
+ * Fills mb_bytelen_tab[] and returns NULL when there are no problems.
+ * When there is something wrong: Returns an error message and doesn't change
+ * anything.
+ */
+ char_u *
+mb_init()
+{
+ int i;
+ int idx;
+ int n;
+ int enc_dbcs_new = 0;
+#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
+ && !defined(MACOS)
+# define LEN_FROM_CONV
+ vimconv_T vimconv;
+ char_u *p;
+#endif
+
+ if (p_enc == NULL)
+ {
+ /* Just starting up: set the whole table to one's. */
+ for (i = 0; i < 256; ++i)
+ mb_bytelen_tab[i] = 1;
+ input_conv.vc_type = CONV_NONE;
+ input_conv.vc_factor = 1;
+ output_conv.vc_type = CONV_NONE;
+ return NULL;
+ }
+
+#ifdef WIN3264
+ if (p_enc[0] == 'c' && p_enc[1] == 'p' && VIM_ISDIGIT(p_enc[2]))
+ {
+ CPINFO cpinfo;
+
+ /* Get info on this codepage to find out what it is. */
+ if (GetCPInfo(atoi(p_enc + 2), &cpinfo) != 0)
+ {
+ if (cpinfo.MaxCharSize == 1)
+ {
+ /* some single-byte encoding */
+ enc_unicode = 0;
+ enc_utf8 = FALSE;
+ }
+ else if (cpinfo.MaxCharSize == 2
+ && (cpinfo.LeadByte[0] != 0 || cpinfo.LeadByte[1] != 0))
+ {
+ /* must be a DBCS encoding, check below */
+ enc_dbcs_new = atoi(p_enc + 2);
+ }
+ else
+ goto codepage_invalid;
+ }
+ else if (GetLastError() == ERROR_INVALID_PARAMETER)
+ {
+codepage_invalid:
+ return (char_u *)N_("E543: Not a valid codepage");
+ }
+ }
+#endif
+ else if (STRNCMP(p_enc, "8bit-", 5) == 0
+ || STRNCMP(p_enc, "iso-8859-", 9) == 0)
+ {
+ /* Accept any "8bit-" or "iso-8859-" name. */
+ enc_unicode = 0;
+ enc_utf8 = FALSE;
+ }
+ else if (STRNCMP(p_enc, "2byte-", 6) == 0)
+ {
+#ifdef WIN3264
+ /* Windows: accept only valid codepage numbers, check below. */
+ if (p_enc[6] != 'c' || p_enc[7] != 'p'
+ || (enc_dbcs_new = atoi(p_enc + 8)) == 0)
+ return e_invarg;
+#else
+ /* Unix: accept any "2byte-" name, assume current locale. */
+ enc_dbcs_new = DBCS_2BYTE;
+#endif
+ }
+ else if ((idx = enc_canon_search(p_enc)) >= 0)
+ {
+ i = enc_canon_table[idx].prop;
+ if (i & ENC_UNICODE)
+ {
+ /* Unicode */
+ enc_utf8 = TRUE;
+ if (i & (ENC_2BYTE | ENC_2WORD))
+ enc_unicode = 2;
+ else if (i & ENC_4BYTE)
+ enc_unicode = 4;
+ else
+ enc_unicode = 0;
+ }
+ else if (i & ENC_DBCS)
+ {
+ /* 2byte, handle below */
+ enc_dbcs_new = enc_canon_table[idx].codepage;
+ }
+ else
+ {
+ /* Must be 8-bit. */
+ enc_unicode = 0;
+ enc_utf8 = FALSE;
+ }
+ }
+ else /* Don't know what encoding this is, reject it. */
+ return e_invarg;
+
+ if (enc_dbcs_new != 0)
+ {
+#ifdef WIN3264
+ /* Check if the DBCS code page is OK. */
+ if (!IsValidCodePage(enc_dbcs_new))
+ goto codepage_invalid;
+#endif
+ enc_unicode = 0;
+ enc_utf8 = FALSE;
+ }
+ enc_dbcs = enc_dbcs_new;
+ has_mbyte = (enc_dbcs != 0 || enc_utf8);
+
+#ifdef WIN3264
+ enc_codepage = encname2codepage(p_enc);
+#endif
+
+ /*
+ * Set the function pointers.
+ */
+ if (enc_utf8)
+ {
+ mb_ptr2len_check = utfc_ptr2len_check;
+ mb_char2len = utf_char2len;
+ mb_char2bytes = utf_char2bytes;
+ mb_ptr2cells = utf_ptr2cells;
+ mb_char2cells = utf_char2cells;
+ mb_off2cells = utf_off2cells;
+ mb_ptr2char = utf_ptr2char;
+ mb_head_off = utf_head_off;
+ }
+ else if (enc_dbcs != 0)
+ {
+ mb_ptr2len_check = dbcs_ptr2len_check;
+ mb_char2len = dbcs_char2len;
+ mb_char2bytes = dbcs_char2bytes;
+ mb_ptr2cells = dbcs_ptr2cells;
+ mb_char2cells = dbcs_char2cells;
+ mb_off2cells = dbcs_off2cells;
+ mb_ptr2char = dbcs_ptr2char;
+ mb_head_off = dbcs_head_off;
+ }
+ else
+ {
+ mb_ptr2len_check = latin_ptr2len_check;
+ mb_char2len = latin_char2len;
+ mb_char2bytes = latin_char2bytes;
+ mb_ptr2cells = latin_ptr2cells;
+ mb_char2cells = latin_char2cells;
+ mb_off2cells = latin_off2cells;
+ mb_ptr2char = latin_ptr2char;
+ mb_head_off = latin_head_off;
+ }
+
+ /*
+ * Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
+ */
+#ifdef LEN_FROM_CONV
+ /* When 'encoding' is different from the current locale mblen() won't
+ * work. Use conversion to "utf-8" instead. */
+ vimconv.vc_type = CONV_NONE;
+ if (enc_dbcs)
+ {
+ p = enc_locale();
+ if (p == NULL || STRCMP(p, p_enc) != 0)
+ {
+ convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
+ vimconv.vc_fail = TRUE;
+ }
+ vim_free(p);
+ }
+#endif
+
+ for (i = 0; i < 256; ++i)
+ {
+ /* Our own function to reliably check the length of UTF-8 characters,
+ * independent of mblen(). */
+ if (enc_utf8)
+ n = utf8len_tab[i];
+ else if (enc_dbcs == 0)
+ n = 1;
+ else
+ {
+#if defined(WIN3264) || defined(WIN32UNIX)
+ /* enc_dbcs is set by setting 'fileencoding'. It becomes a Windows
+ * CodePage identifier, which we can pass directly in to Windows
+ * API */
+ n = IsDBCSLeadByteEx(enc_dbcs, (BYTE)i) ? 2 : 1;
+#else
+# ifdef MACOS
+ /*
+ * if mblen() is not available, character which MSB is turned on
+ * are treated as leading byte character. (note : This assumption
+ * is not always true.)
+ */
+ n = (i & 0x80) ? 2 : 1;
+# else
+ char buf[MB_MAXBYTES];
+# ifdef X_LOCALE
+# ifndef mblen
+# define mblen _Xmblen
+# endif
+# endif
+ if (i == NUL) /* just in case mblen() can't handle "" */
+ n = 1;
+ else
+ {
+ buf[0] = i;
+ buf[1] = 0;
+#ifdef LEN_FROM_CONV
+ if (vimconv.vc_type != CONV_NONE)
+ {
+ /*
+ * string_convert() should fail when converting the first
+ * byte of a double-byte character.
+ */
+ p = string_convert(&vimconv, (char_u *)buf, NULL);
+ if (p != NULL)
+ {
+ vim_free(p);
+ n = 1;
+ }
+ else
+ n = 2;
+ }
+ else
+#endif
+ {
+ /*
+ * mblen() should return -1 for invalid (means the leading
+ * multibyte) character. However there are some platforms
+ * where mblen() returns 0 for invalid character.
+ * Therefore, following condition includes 0.
+ */
+ if (mblen(buf, (size_t)1) <= 0)
+ n = 2;
+ else
+ n = 1;
+ }
+ }
+# endif
+#endif
+ }
+
+ mb_bytelen_tab[i] = n;
+ }
+
+#ifdef LEN_FROM_CONV
+ convert_setup(&vimconv, NULL, NULL);
+#endif
+
+ /* The cell width depends on the type of multi-byte characters. */
+ (void)init_chartab();
+
+ /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
+ screenalloc(FALSE);
+
+ /* When using Unicode, set default for 'fileencodings'. */
+ if (enc_utf8 && !option_was_set((char_u *)"fencs"))
+ set_string_option_direct((char_u *)"fencs", -1,
+ (char_u *)"ucs-bom,utf-8,latin1", OPT_FREE);
+#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(FEAT_GETTEXT)
+ /* GNU gettext 0.10.37 supports this feature: set the codeset used for
+ * translated messages independently from the current locale. */
+ (void)bind_textdomain_codeset(VIMPACKAGE,
+ enc_utf8 ? "utf-8" : (char *)p_enc);
+#endif
+
+#ifdef FEAT_AUTOCMD
+ /* Fire an autocommand to let people do custom font setup. This must be
+ * after Vim has been setup for the new encoding. */
+ apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
+#endif
+
+ return NULL;
+}
+
+/*
+ * Return the size of the BOM for the current buffer:
+ * 0 - no BOM
+ * 2 - UCS-2 or UTF-16 BOM
+ * 4 - UCS-4 BOM
+ * 3 - UTF-8 BOM
+ */
+ int
+bomb_size()
+{
+ int n = 0;
+
+ if (curbuf->b_p_bomb && !curbuf->b_p_bin)
+ {
+ if (*curbuf->b_p_fenc == NUL)
+ {
+ if (enc_utf8)
+ {
+ if (enc_unicode != 0)
+ n = enc_unicode;
+ else
+ n = 3;
+ }
+ }
+ else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
+ n = 3;
+ else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
+ || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
+ n = 2;
+ else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
+ n = 4;
+ }
+ return n;
+}
+
+/*
+ * Get class of pointer:
+ * 0 for blank or NUL
+ * 1 for punctuation
+ * 2 for an (ASCII) word character
+ * >2 for other word characters
+ */
+ int
+mb_get_class(p)
+ char_u *p;
+{
+ if (MB_BYTE2LEN(p[0]) == 1)
+ {
+ if (p[0] == NUL || vim_iswhite(p[0]))
+ return 0;
+ if (vim_iswordc(p[0]))
+ return 2;
+ return 1;
+ }
+ if (enc_dbcs != 0 && p[0] != NUL && p[1] != NUL)
+ return dbcs_class(p[0], p[1]);
+ if (enc_utf8)
+ return utf_class(utf_ptr2char(p));
+ return 0;
+}
+
+/*
+ * Get class of a double-byte character. This always returns 3 or bigger.
+ * TODO: Should return 1 for punctuation.
+ */
+ int
+dbcs_class(lead, trail)
+ unsigned lead;
+ unsigned trail;
+{
+ switch (enc_dbcs)
+ {
+ /* please add classfy routine for your language in here */
+
+ case DBCS_JPNU: /* ? */
+ case DBCS_JPN:
+ {
+ /* JIS code classification */
+ unsigned char lb = lead;
+ unsigned char tb = trail;
+
+ /* convert process code to JIS */
+# if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS)
+ /* process code is SJIS */
+ if (lb <= 0x9f)
+ lb = (lb - 0x81) * 2 + 0x21;
+ else
+ lb = (lb - 0xc1) * 2 + 0x21;
+ if (tb <= 0x7e)
+ tb -= 0x1f;
+ else if (tb <= 0x9e)
+ tb -= 0x20;
+ else
+ {
+ tb -= 0x7e;
+ lb += 1;
+ }
+# else
+ /*
+ * XXX: Code page identification can not use with all
+ * system! So, some other encoding information
+ * will be needed.
+ * In japanese: SJIS,EUC,UNICODE,(JIS)
+ * Note that JIS-code system don't use as
+ * process code in most system because it uses
+ * escape sequences(JIS is context depend encoding).
+ */
+ /* assume process code is JAPANESE-EUC */
+ lb &= 0x7f;
+ tb &= 0x7f;
+# endif
+ /* exceptions */
+ switch (lb << 8 | tb)
+ {
+ case 0x2121: /* ZENKAKU space */
+ return 0;
+ case 0x2122: /* KU-TEN (Japanese comma) */
+ case 0x2123: /* TOU-TEN (Japanese period) */
+ case 0x2124: /* ZENKAKU comma */
+ case 0x2125: /* ZENKAKU period */
+ return 1;
+ case 0x213c: /* prolongedsound handled as KATAKANA */
+ return 13;
+ }
+ /* sieved by KU code */
+ switch (lb)
+ {
+ case 0x21:
+ case 0x22:
+ /* special symbols */
+ return 10;
+ case 0x23:
+ /* alpha-numeric */
+ return 11;
+ case 0x24:
+ /* hiragana */
+ return 12;
+ case 0x25:
+ /* katakana */
+ return 13;
+ case 0x26:
+ /* greek */
+ return 14;
+ case 0x27:
+ /* russian */
+ return 15;
+ case 0x28:
+ /* lines */
+ return 16;
+ default:
+ /* kanji */
+ return 17;
+ }
+ }
+
+ case DBCS_KORU: /* ? */
+ case DBCS_KOR:
+ {
+ /* KS code classification */
+ unsigned char c1 = lead;
+ unsigned char c2 = trail;
+
+ /*
+ * 20 : Hangul
+ * 21 : Hanja
+ * 22 : Symbols
+ * 23 : Alpha-numeric/Roman Letter (Full width)
+ * 24 : Hangul Letter(Alphabet)
+ * 25 : Roman Numeral/Greek Letter
+ * 26 : Box Drawings
+ * 27 : Unit Symbols
+ * 28 : Circled/Parenthesized Letter
+ * 29 : Hirigana/Katakana
+ * 30 : Cyrillic Letter
+ */
+
+ if (c1 >= 0xB0 && c1 <= 0xC8)
+ /* Hangul */
+ return 20;
+#if defined(WIN3264) || defined(WIN32UNIX)
+ else if (c1 <= 0xA0 || c2 <= 0xA0)
+ /* Extended Hangul Region : MS UHC(Unified Hangul Code) */
+ /* c1: 0x81-0xA0 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xFE
+ * c1: 0xA1-0xC6 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xA0
+ */
+ return 20;
+#endif
+
+ else if (c1 >= 0xCA && c1 <= 0xFD)
+ /* Hanja */
+ return 21;
+ else switch (c1)
+ {
+ case 0xA1:
+ case 0xA2:
+ /* Symbols */
+ return 22;
+ case 0xA3:
+ /* Alpha-numeric */
+ return 23;
+ case 0xA4:
+ /* Hangul Letter(Alphabet) */
+ return 24;
+ case 0xA5:
+ /* Roman Numeral/Greek Letter */
+ return 25;
+ case 0xA6:
+ /* Box Drawings */
+ return 26;
+ case 0xA7:
+ /* Unit Symbols */
+ return 27;
+ case 0xA8:
+ case 0xA9:
+ if (c2 <= 0xAF)
+ return 25; /* Roman Letter */
+ else if (c2 >= 0xF6)
+ return 22; /* Symbols */
+ else
+ /* Circled/Parenthesized Letter */
+ return 28;
+ case 0xAA:
+ case 0xAB:
+ /* Hirigana/Katakana */
+ return 29;
+ case 0xAC:
+ /* Cyrillic Letter */
+ return 30;
+ }
+ }
+ default:
+ break;
+ }
+ return 3;
+}
+
+/*
+ * mb_char2len() function pointer.
+ * Return length in bytes of character "c".
+ * Returns 1 for a single-byte character.
+ */
+/* ARGSUSED */
+ int
+latin_char2len(c)
+ int c;
+{
+ return 1;
+}
+
+ static int
+dbcs_char2len(c)
+ int c;
+{
+ if (c >= 0x100)
+ return 2;
+ return 1;
+}
+
+/*
+ * mb_char2bytes() function pointer.
+ * Convert a character to its bytes.
+ * Returns the length in bytes.
+ */
+ int
+latin_char2bytes(c, buf)
+ int c;
+ char_u *buf;
+{
+ buf[0] = c;
+ return 1;
+}
+
+ static int
+dbcs_char2bytes(c, buf)
+ int c;
+ char_u *buf;
+{
+ if (c >= 0x100)
+ {
+ buf[0] = (unsigned)c >> 8;
+ buf[1] = c;
+ return 2;
+ }
+ buf[0] = c;
+ return 1;
+}
+
+/*
+ * mb_ptr2len_check() function pointer.
+ * Get byte length of character at "*p" but stop at a NUL.
+ * For UTF-8 this includes following composing characters.
+ * Returns 0 when *p is NUL.
+ *
+ */
+ int
+latin_ptr2len_check(p)
+ char_u *p;
+{
+ return MB_BYTE2LEN(*p);
+}
+
+ static int
+dbcs_ptr2len_check(p)
+ char_u *p;
+{
+ int len;
+
+ /* Check if second byte is not missing. */
+ len = MB_BYTE2LEN(*p);
+ if (len == 2 && p[1] == NUL)
+ len = 1;
+ return len;
+}
+
+struct interval
+{
+ unsigned short first;
+ unsigned short last;
+};
+static int intable __ARGS((struct interval *table, size_t size, int c));
+
+/*
+ * Return TRUE if "c" is in "table[size / sizeof(struct interval)]".
+ */
+ static int
+intable(table, size, c)
+ struct interval *table;
+ size_t size;
+ int c;
+{
+ int mid, bot, top;
+
+ /* first quick check for Latin1 etc. characters */
+ if (c < table[0].first)
+ return FALSE;
+
+ /* binary search in table */
+ bot = 0;
+ top = size / sizeof(struct interval) - 1;
+ while (top >= bot)
+ {
+ mid = (bot + top) / 2;
+ if (table[mid].last < c)
+ bot = mid + 1;
+ else if (table[mid].first > c)
+ top = mid - 1;
+ else
+ return TRUE;
+ }
+ return FALSE;
+}
+
+/*
+ * For UTF-8 character "c" return 2 for a double-width character, 1 for others.
+ * Returns 4 or 6 for an unprintable character.
+ * Is only correct for characters >= 0x80.
+ * When p_ambw is "double", return 2 for a character with East Asian Width
+ * class 'A'(mbiguous).
+ */
+ int
+utf_char2cells(c)
+ int c;
+{
+ /* sorted list of non-overlapping intervals of East Asian Ambiguous
+ * characters, generated with:
+ * "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
+ static struct interval ambiguous[] = {
+ {0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8},
+ {0x00AA, 0x00AA}, {0x00AE, 0x00AE}, {0x00B0, 0x00B4},
+ {0x00B6, 0x00BA}, {0x00BC, 0x00BF}, {0x00C6, 0x00C6},
+ {0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1},
+ {0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED},
+ {0x00F0, 0x00F0}, {0x00F2, 0x00F3}, {0x00F7, 0x00FA},
+ {0x00FC, 0x00FC}, {0x00FE, 0x00FE}, {0x0101, 0x0101},
+ {0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B},
+ {0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133},
+ {0x0138, 0x0138}, {0x013F, 0x0142}, {0x0144, 0x0144},
+ {0x0148, 0x014B}, {0x014D, 0x014D}, {0x0152, 0x0153},
+ {0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE},
+ {0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4},
+ {0x01D6, 0x01D6}, {0x01D8, 0x01D8}, {0x01DA, 0x01DA},
+ {0x01DC, 0x01DC}, {0x0251, 0x0251}, {0x0261, 0x0261},
+ {0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB},
+ {0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB},
+ {0x02DD, 0x02DD}, {0x02DF, 0x02DF}, {0x0391, 0x03A1},
+ {0x03A3, 0x03A9}, {0x03B1, 0x03C1}, {0x03C3, 0x03C9},
+ {0x0401, 0x0401}, {0x0410, 0x044F}, {0x0451, 0x0451},
+ {0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019},
+ {0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027},
+ {0x2030, 0x2030}, {0x2032, 0x2033}, {0x2035, 0x2035},
+ {0x203B, 0x203B}, {0x203E, 0x203E}, {0x2074, 0x2074},
+ {0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC},
+ {0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109},
+ {0x2113, 0x2113}, {0x2116, 0x2116}, {0x2121, 0x2122},
+ {0x2126, 0x2126}, {0x212B, 0x212B}, {0x2153, 0x2154},
+ {0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179},
+ {0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2},
+ {0x21D4, 0x21D4}, {0x21E7, 0x21E7}, {0x2200, 0x2200},
+ {0x2202, 0x2203}, {0x2207, 0x2208}, {0x220B, 0x220B},
+ {0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215},
+ {0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223},
+ {0x2225, 0x2225}, {0x2227, 0x222C}, {0x222E, 0x222E},
+ {0x2234, 0x2237}, {0x223C, 0x223D}, {0x2248, 0x2248},
+ {0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261},
+ {0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F},
+ {0x2282, 0x2283}, {0x2286, 0x2287}, {0x2295, 0x2295},
+ {0x2299, 0x2299}, {0x22A5, 0x22A5}, {0x22BF, 0x22BF},
+ {0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B},
+ {0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595},
+ {0x25A0, 0x25A1}, {0x25A3, 0x25A9}, {0x25B2, 0x25B3},
+ {0x25B6, 0x25B7}, {0x25BC, 0x25BD}, {0x25C0, 0x25C1},
+ {0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1},
+ {0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606},
+ {0x2609, 0x2609}, {0x260E, 0x260F}, {0x2614, 0x2615},
+ {0x261C, 0x261C}, {0x261E, 0x261E}, {0x2640, 0x2640},
+ {0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665},
+ {0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F},
+ {0x273D, 0x273D}, {0x2776, 0x277F}, {0xE000, 0xF8FF},
+ {0xFFFD, 0xFFFD}, /* {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD} */
+ };
+
+ if (c >= 0x100)
+ {
+#ifdef USE_WCHAR_FUNCTIONS
+ /*
+ * Assume the library function wcwidth() works better than our own
+ * stuff. It should return 1 for ambiguous width chars!
+ */
+ int n = wcwidth(c);
+
+ if (n < 0)
+ return 6; /* unprintable, displays <xxxx> */
+ if (n > 1)
+ return n;
+#else
+ if (!utf_printable(c))
+ return 6; /* unprintable, displays <xxxx> */
+ if (c >= 0x1100
+ && (c <= 0x115f /* Hangul Jamo */
+ || c == 0x2329
+ || c == 0x232a
+ || (c >= 0x2e80 && c <= 0xa4cf
+ && c != 0x303f) /* CJK ... Yi */
+ || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
+ || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
+ Ideographs */
+ || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
+ || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
+ || (c >= 0xffe0 && c <= 0xffe6)
+ || (c >= 0x20000 && c <= 0x2fffd)
+ || (c >= 0x30000 && c <= 0x3fffd)))
+ return 2;
+#endif
+ }
+
+ /* Characters below 0x100 are influenced by 'isprint' option */
+ else if (c >= 0x80 && !vim_isprintc(c))
+ return 4; /* unprintable, displays <xx> */
+
+ if (c >= 0x80 && *p_ambw == 'd' && intable(ambiguous, sizeof(ambiguous), c))
+ return 2;
+
+ return 1;
+}
+
+/*
+ * mb_ptr2cells() function pointer.
+ * Return the number of display cells character at "*p" occupies.
+ * This doesn't take care of unprintable characters, use ptr2cells() for that.
+ */
+/*ARGSUSED*/
+ int
+latin_ptr2cells(p)
+ char_u *p;
+{
+ return 1;
+}
+
+ int
+utf_ptr2cells(p)
+ char_u *p;
+{
+ int c;
+
+ /* Need to convert to a wide character. */
+ if (*p >= 0x80)
+ {
+ c = utf_ptr2char(p);
+ /* An illegal byte is displayed as <xx>. */
+ if (utf_ptr2len_check(p) == 1 || c == NUL)
+ return 4;
+ /* If the char is ASCII it must be an overlong sequence. */
+ if (c < 0x80)
+ return char2cells(c);
+ return utf_char2cells(c);
+ }
+ return 1;
+}
+
+ int
+dbcs_ptr2cells(p)
+ char_u *p;
+{
+ /* Number of cells is equal to number of bytes, except for euc-jp when
+ * the first byte is 0x8e. */
+ if (enc_dbcs == DBCS_JPNU && *p == 0x8e)
+ return 1;
+ return MB_BYTE2LEN(*p);
+}
+
+/*
+ * mb_char2cells() function pointer.
+ * Return the number of display cells character "c" occupies.
+ * Only takes care of multi-byte chars, not "^C" and such.
+ */
+/*ARGSUSED*/
+ int
+latin_char2cells(c)