diff options
Diffstat (limited to 'src/mbyte.c')
-rw-r--r-- | src/mbyte.c | 5833 |
1 files changed, 5833 insertions, 0 deletions
diff --git a/src/mbyte.c b/src/mbyte.c new file mode 100644 index 0000000000..699316d0e3 --- /dev/null +++ b/src/mbyte.c @@ -0,0 +1,5833 @@ +/* vi:set ts=8 sts=4 sw=4: + * + * VIM - Vi IMproved by Bram Moolenaar + * Multibyte extensions partly by Sung-Hoon Baek + * + * Do ":help uganda" in Vim to read copying and usage conditions. + * Do ":help credits" in Vim to see a list of people who contributed. + * See README.txt for an overview of the Vim source code. + */ +/* + * mbyte.c: Code specifically for handling multi-byte characters. + * + * The encoding used in the core is set with 'encoding'. When 'encoding' is + * changed, the following four variables are set (for speed). + * Currently these types of character encodings are supported: + * + * "enc_dbcs" When non-zero it tells the type of double byte character + * encoding (Chinese, Korean, Japanese, etc.). + * The cell width on the display is equal to the number of + * bytes. (exception: DBCS_JPNU with first byte 0x8e) + * Recognizing the first or second byte is difficult, it + * requires checking a byte sequence from the start. + * "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding. + * The cell width on the display needs to be determined from + * the character value. + * Recognizing bytes is easy: 0xxx.xxxx is a single-byte + * char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading + * byte of a multi-byte character. + * To make things complicated, up to two composing characters + * are allowed. These are drawn on top of the first char. + * For most editing the sequence of bytes with composing + * characters included is considered to be one character. + * "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16). + * When 4 use 32-bit Unicode characters. + * Internally characters are stored in UTF-8 encoding to + * avoid NUL bytes. Conversion happens when doing I/O. + * "enc_utf8" will also be TRUE. + * + * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero. 
+ * + * If none of these is TRUE, 8-bit bytes are used for a character. The + * encoding isn't currently specified (TODO). + * + * 'encoding' specifies the encoding used in the core. This is in registers, + * text manipulation, buffers, etc. Conversion has to be done when characters + * in another encoding are received or sent: + * + * clipboard + * ^ + * | (2) + * V + * +---------------+ + * (1) | | (3) + * keyboard ----->| core |-----> display + * | | + * +---------------+ + * ^ + * | (4) + * V + * file + * + * (1) Typed characters arrive in the current locale. Conversion is to be + * done when 'encoding' is different from 'termencoding'. + * (2) Text will be made available with the encoding specified with + * 'encoding'. If this is not sufficient, system-specific conversion + * might be required. + * (3) For the GUI the correct font must be selected, no conversion done. + * Otherwise, conversion is to be done when 'encoding' differs from + * 'termencoding'. (Different in the GTK+ 2 port -- 'termencoding' + * is always used for both input and output and must always be set to + * "utf-8". gui_mch_init() does this automatically.) + * (4) The encoding of the file is specified with 'fileencoding'. Conversion + * is to be done when it's different from 'encoding'. + * + * The viminfo file is a special case: Only text is converted, not file names. + * Vim scripts may contain an ":encoding" command. This has an effect for + * some commands, like ":menutrans" + */ + +#include "vim.h" + +#ifdef WIN32UNIX +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# include <windows.h> +# ifdef WIN32 +# undef WIN32 /* Some windows.h define WIN32, we don't want that here. 
*/ +# endif +#endif + +#if (defined(WIN3264) || defined(WIN32UNIX)) && !defined(__MINGW32__) +# include <winnls.h> +#endif + +#ifdef FEAT_GUI_X11 +# include <X11/Intrinsic.h> +#endif +#ifdef X_LOCALE +#include <X11/Xlocale.h> +#endif + +#if defined(FEAT_XIM) && defined(HAVE_GTK2) +# include <gdk/gdkkeysyms.h> +# ifdef WIN3264 +# include <gdk/gdkwin32.h> +# else +# include <gdk/gdkx.h> +# endif +#endif + +#ifdef HAVE_WCHAR_H +# include <wchar.h> +#endif + +#if 0 +/* This has been disabled, because several people reported problems with the + * wcwidth() and iswprint() library functions, esp. for Hebrew. */ +# ifdef __STDC_ISO_10646__ +# define USE_WCHAR_FUNCTIONS +# endif +#endif + +#if defined(FEAT_MBYTE) || defined(PROTO) +/* Prototypes for functions that are local to this file. */ +static int enc_canon_search __ARGS((char_u *name)); +static int dbcs_char2len __ARGS((int c)); +static int dbcs_char2bytes __ARGS((int c, char_u *buf)); +static int dbcs_ptr2len_check __ARGS((char_u *p)); +static int dbcs_char2cells __ARGS((int c)); +static int dbcs_ptr2char __ARGS((char_u *p)); + +/* Lookup table to quickly get the length in bytes of a UTF-8 character from + * the first byte of a UTF-8 string. Bytes which are illegal when used as the + * first byte have a one, because these will be used separately. + * Layout by first byte: 0x00-0x7f: 1, 0x80-0xbf: 1 (the rows marked "bogus"), + * 0xc0-0xdf: 2, 0xe0-0xef: 3, 0xf0-0xf7: 4, 0xf8-0xfb: 5, 0xfc-0xfd: 6, + * 0xfe-0xff: 1. */ +static char utf8len_tab[256] = +{ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /*bogus*/ + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, +}; + +/* + * XIM often causes trouble. Define XIM_DEBUG to get a log of XIM callbacks + * in the "xim.log" file. 
+ */ +/* #define XIM_DEBUG */ +#ifdef XIM_DEBUG +/* + * Write a printf() style message to the "xim.log" file. The file is opened + * on the first call. When that fails an error message is given and all + * later calls return without doing anything. + */ static void +xim_log(char *s, ...) +{ + va_list arglist; + static FILE *fd = NULL; + /* (FILE *)-1 marks a failed fopen(): don't try again. */ + if (fd == (FILE *)-1) + return; + if (fd == NULL) + { + fd = fopen("xim.log", "w"); + if (fd == NULL) + { + EMSG("Cannot open xim.log"); + fd = (FILE *)-1; + return; + } + } + + va_start(arglist, s); + vfprintf(fd, s, arglist); + va_end(arglist); +} +#endif + +#endif + +#if defined(FEAT_MBYTE) || defined(FEAT_POSTSCRIPT) || defined(PROTO) +/* + * Canonical encoding names and their properties. + * "iso-8859-n" is handled by enc_canonize() directly. + * Each IDX_xxx define gives the index of the entry that follows it; + * enc_alias_table[] refers to the entries with these values and IDX_COUNT + * is the number of entries. + * For ENC_DBCS entries "codepage" holds the DBCS_xxx type, which mb_init() + * stores in enc_dbcs. + */ +static struct +{ char *name; int prop; int codepage;} +enc_canon_table[] = +{ +#define IDX_LATIN_1 0 + {"latin1", ENC_8BIT + ENC_LATIN1, 1252}, +#define IDX_ISO_2 1 + {"iso-8859-2", ENC_8BIT, 0}, +#define IDX_ISO_3 2 + {"iso-8859-3", ENC_8BIT, 0}, +#define IDX_ISO_4 3 + {"iso-8859-4", ENC_8BIT, 0}, +#define IDX_ISO_5 4 + {"iso-8859-5", ENC_8BIT, 0}, +#define IDX_ISO_6 5 + {"iso-8859-6", ENC_8BIT, 0}, +#define IDX_ISO_7 6 + {"iso-8859-7", ENC_8BIT, 0}, +#define IDX_CP1255 7 + {"cp1255", ENC_8BIT, 1255}, /* close to iso-8859-8 */ +#define IDX_ISO_8 8 + {"iso-8859-8", ENC_8BIT, 0}, +#define IDX_ISO_9 9 + {"iso-8859-9", ENC_8BIT, 0}, +#define IDX_ISO_10 10 + {"iso-8859-10", ENC_8BIT, 0}, +#define IDX_ISO_11 11 + {"iso-8859-11", ENC_8BIT, 0}, +#define IDX_ISO_13 12 + {"iso-8859-13", ENC_8BIT, 0}, +#define IDX_ISO_14 13 + {"iso-8859-14", ENC_8BIT, 0}, +#define IDX_ISO_15 14 + {"iso-8859-15", ENC_8BIT, 0}, +#define IDX_KOI8_R 15 + {"koi8-r", ENC_8BIT, 0}, +#define IDX_KOI8_U 16 + {"koi8-u", ENC_8BIT, 0}, +#define IDX_UTF8 17 + {"utf-8", ENC_UNICODE, 0}, +#define IDX_UCS2 18 + {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0}, +#define IDX_UCS2LE 19 + {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0}, +#define IDX_UTF16 20 + {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0}, +#define IDX_UTF16LE 21 + {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0}, +#define IDX_UCS4 22 
+ {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0}, +#define IDX_UCS4LE 23 + {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0}, +#define IDX_DEBUG 24 + {"debug", ENC_DBCS, DBCS_DEBUG}, +#define IDX_CP932 25 + {"cp932", ENC_DBCS, DBCS_JPN}, +#define IDX_CP949 26 + {"cp949", ENC_DBCS, DBCS_KOR}, +#define IDX_CP936 27 + {"cp936", ENC_DBCS, DBCS_CHS}, +#define IDX_CP950 28 + {"cp950", ENC_DBCS, DBCS_CHT}, +#define IDX_EUC_JP 29 + {"euc-jp", ENC_DBCS, DBCS_JPNU}, +#define IDX_SJIS 30 + {"sjis", ENC_DBCS, DBCS_JPN}, +#define IDX_EUC_KR 31 + {"euc-kr", ENC_DBCS, DBCS_KORU}, +#define IDX_EUC_CN 32 + {"euc-cn", ENC_DBCS, DBCS_CHSU}, +#define IDX_EUC_TW 33 + {"euc-tw", ENC_DBCS, DBCS_CHTU}, +#define IDX_BIG5 34 + {"big5", ENC_DBCS, DBCS_CHT}, +#define IDX_CP1251 35 + {"cp1251", ENC_8BIT, 1251}, +#define IDX_MACROMAN 36 + {"macroman", ENC_8BIT + ENC_MACROMAN, 0}, +#define IDX_COUNT 37 +}; + +/* + * Aliases for encoding names. + * "canon" is the IDX_xxx value of the canonical encoding. The list ends + * with an entry that has a NULL name. + */ +static struct +{ char *name; int canon;} +enc_alias_table[] = +{ + {"ansi", IDX_LATIN_1}, + {"iso-8859-1", IDX_LATIN_1}, + {"latin2", IDX_ISO_2}, + {"latin3", IDX_ISO_3}, + {"latin4", IDX_ISO_4}, + {"cyrillic", IDX_ISO_5}, + {"arabic", IDX_ISO_6}, + {"greek", IDX_ISO_7}, +#ifdef WIN3264 + {"hebrew", IDX_CP1255}, +#else + {"hebrew", IDX_ISO_8}, +#endif + {"latin5", IDX_ISO_9}, + {"turkish", IDX_ISO_9}, /* ? */ + {"latin6", IDX_ISO_10}, + {"nordic", IDX_ISO_10}, /* ? */ + {"thai", IDX_ISO_11}, /* ? 
*/ + {"latin7", IDX_ISO_13}, + {"latin8", IDX_ISO_14}, + {"latin9", IDX_ISO_15}, + {"utf8", IDX_UTF8}, + {"unicode", IDX_UCS2}, + {"ucs2", IDX_UCS2}, + {"ucs2be", IDX_UCS2}, + {"ucs-2be", IDX_UCS2}, + {"ucs2le", IDX_UCS2LE}, + {"utf16", IDX_UTF16}, + {"utf16be", IDX_UTF16}, + {"utf-16be", IDX_UTF16}, + {"utf16le", IDX_UTF16LE}, + {"ucs4", IDX_UCS4}, + {"ucs4be", IDX_UCS4}, + {"ucs-4be", IDX_UCS4}, + {"ucs4le", IDX_UCS4LE}, + {"932", IDX_CP932}, + {"949", IDX_CP949}, + {"936", IDX_CP936}, + {"950", IDX_CP950}, + {"eucjp", IDX_EUC_JP}, + {"unix-jis", IDX_EUC_JP}, + {"ujis", IDX_EUC_JP}, + {"shift-jis", IDX_SJIS}, + {"euckr", IDX_EUC_KR}, + {"5601", IDX_EUC_KR}, /* Sun: KS C 5601 */ + {"euccn", IDX_EUC_CN}, + {"gb2312", IDX_EUC_CN}, + {"euctw", IDX_EUC_TW}, +#if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS) + {"japan", IDX_CP932}, + {"korea", IDX_CP949}, + {"prc", IDX_CP936}, + {"chinese", IDX_CP936}, + {"taiwan", IDX_CP950}, + {"big5", IDX_CP950}, +#else + {"japan", IDX_EUC_JP}, + {"korea", IDX_EUC_KR}, + {"prc", IDX_EUC_CN}, + {"chinese", IDX_EUC_CN}, + {"taiwan", IDX_EUC_TW}, + {"cp950", IDX_BIG5}, + {"950", IDX_BIG5}, +#endif + {"mac", IDX_MACROMAN}, + {NULL, 0} +}; + +#ifndef CP_UTF8 +# define CP_UTF8 65001 /* magic number from winnls.h */ +#endif + +/* + * Find encoding "name" in the list of canonical encoding names. + * Returns -1 if not found. + */ + static int +enc_canon_search(name) + char_u *name; +{ + int i; + + for (i = 0; i < IDX_COUNT; ++i) + if (STRCMP(name, enc_canon_table[i].name) == 0) + return i; + return -1; +} + +#endif + +#if defined(FEAT_MBYTE) || defined(PROTO) + +/* + * Find canonical encoding "name" in the list and return its properties. + * Returns 0 if not found. 
+ */ + int +enc_canon_props(name) + char_u *name; +{ + int i; + + i = enc_canon_search(name); + if (i >= 0) + return enc_canon_table[i].prop; +#ifdef WIN3264 + if (name[0] == 'c' && name[1] == 'p' && VIM_ISDIGIT(name[2])) + { + CPINFO cpinfo; + + /* Get info on this codepage to find out what it is. */ + if (GetCPInfo(atoi(name + 2), &cpinfo) != 0) + { + if (cpinfo.MaxCharSize == 1) /* some single-byte encoding */ + return ENC_8BIT; + if (cpinfo.MaxCharSize == 2 + && (cpinfo.LeadByte[0] != 0 || cpinfo.LeadByte[1] != 0)) + /* must be a DBCS encoding */ + return ENC_DBCS; + } + return 0; + } +#endif + if (STRNCMP(name, "2byte-", 6) == 0) + return ENC_DBCS; + if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) + return ENC_8BIT; + return 0; +} + +/* + * Set up for using multi-byte characters. + * Called in three cases: + * - by main() to initialize (p_enc == NULL) + * - by set_init_1() after 'encoding' was set to its default. + * - by do_set() when 'encoding' has been set. + * p_enc must have been passed through enc_canonize() already. + * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags. + * Fills mb_bytelen_tab[] and returns NULL when there are no problems. + * When there is something wrong: Returns an error message and doesn't change + * anything. + */ + char_u * +mb_init() +{ + int i; + int idx; + int n; + int enc_dbcs_new = 0; +#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \ + && !defined(MACOS) +# define LEN_FROM_CONV + vimconv_T vimconv; + char_u *p; +#endif + + if (p_enc == NULL) + { + /* Just starting up: set the whole table to one's. */ + for (i = 0; i < 256; ++i) + mb_bytelen_tab[i] = 1; + input_conv.vc_type = CONV_NONE; + input_conv.vc_factor = 1; + output_conv.vc_type = CONV_NONE; + return NULL; + } + +#ifdef WIN3264 + if (p_enc[0] == 'c' && p_enc[1] == 'p' && VIM_ISDIGIT(p_enc[2])) + { + CPINFO cpinfo; + + /* Get info on this codepage to find out what it is. 
*/ + if (GetCPInfo(atoi(p_enc + 2), &cpinfo) != 0) + { + if (cpinfo.MaxCharSize == 1) + { + /* some single-byte encoding */ + enc_unicode = 0; + enc_utf8 = FALSE; + } + else if (cpinfo.MaxCharSize == 2 + && (cpinfo.LeadByte[0] != 0 || cpinfo.LeadByte[1] != 0)) + { + /* must be a DBCS encoding, check below */ + enc_dbcs_new = atoi(p_enc + 2); + } + else + goto codepage_invalid; + } + else if (GetLastError() == ERROR_INVALID_PARAMETER) + { +codepage_invalid: + return (char_u *)N_("E543: Not a valid codepage"); + } + } +#endif + else if (STRNCMP(p_enc, "8bit-", 5) == 0 + || STRNCMP(p_enc, "iso-8859-", 9) == 0) + { + /* Accept any "8bit-" or "iso-8859-" name. + * Note: when WIN3264 is not defined the "else" above pairs with the + * "if (p_enc == NULL)" at the start of this function. */ + enc_unicode = 0; + enc_utf8 = FALSE; + } + else if (STRNCMP(p_enc, "2byte-", 6) == 0) + { +#ifdef WIN3264 + /* Windows: accept only valid codepage numbers, check below. */ + if (p_enc[6] != 'c' || p_enc[7] != 'p' + || (enc_dbcs_new = atoi(p_enc + 8)) == 0) + return e_invarg; +#else + /* Unix: accept any "2byte-" name, assume current locale. */ + enc_dbcs_new = DBCS_2BYTE; +#endif + } + else if ((idx = enc_canon_search(p_enc)) >= 0) + { + i = enc_canon_table[idx].prop; + if (i & ENC_UNICODE) + { + /* Unicode */ + enc_utf8 = TRUE; + if (i & (ENC_2BYTE | ENC_2WORD)) + enc_unicode = 2; + else if (i & ENC_4BYTE) + enc_unicode = 4; + else + enc_unicode = 0; + } + else if (i & ENC_DBCS) + { + /* 2byte, handle below */ + enc_dbcs_new = enc_canon_table[idx].codepage; + } + else + { + /* Must be 8-bit. */ + enc_unicode = 0; + enc_utf8 = FALSE; + } + } + else /* Don't know what encoding this is, reject it. */ + return e_invarg; + + if (enc_dbcs_new != 0) + { +#ifdef WIN3264 + /* Check if the DBCS code page is OK. */ + if (!IsValidCodePage(enc_dbcs_new)) + goto codepage_invalid; +#endif + enc_unicode = 0; + enc_utf8 = FALSE; + } + enc_dbcs = enc_dbcs_new; + has_mbyte = (enc_dbcs != 0 || enc_utf8); + +#ifdef WIN3264 + enc_codepage = encname2codepage(p_enc); +#endif + + /* + * Set the function pointers. 
+ */ + if (enc_utf8) + { + mb_ptr2len_check = utfc_ptr2len_check; + mb_char2len = utf_char2len; + mb_char2bytes = utf_char2bytes; + mb_ptr2cells = utf_ptr2cells; + mb_char2cells = utf_char2cells; + mb_off2cells = utf_off2cells; + mb_ptr2char = utf_ptr2char; + mb_head_off = utf_head_off; + } + else if (enc_dbcs != 0) + { + mb_ptr2len_check = dbcs_ptr2len_check; + mb_char2len = dbcs_char2len; + mb_char2bytes = dbcs_char2bytes; + mb_ptr2cells = dbcs_ptr2cells; + mb_char2cells = dbcs_char2cells; + mb_off2cells = dbcs_off2cells; + mb_ptr2char = dbcs_ptr2char; + mb_head_off = dbcs_head_off; + } + else + { + mb_ptr2len_check = latin_ptr2len_check; + mb_char2len = latin_char2len; + mb_char2bytes = latin_char2bytes; + mb_ptr2cells = latin_ptr2cells; + mb_char2cells = latin_char2cells; + mb_off2cells = latin_off2cells; + mb_ptr2char = latin_ptr2char; + mb_head_off = latin_head_off; + } + + /* + * Fill the mb_bytelen_tab[] for MB_BYTE2LEN(). + */ +#ifdef LEN_FROM_CONV + /* When 'encoding' is different from the current locale mblen() won't + * work. Use conversion to "utf-8" instead. */ + vimconv.vc_type = CONV_NONE; + if (enc_dbcs) + { + p = enc_locale(); + if (p == NULL || STRCMP(p, p_enc) != 0) + { + convert_setup(&vimconv, p_enc, (char_u *)"utf-8"); + vimconv.vc_fail = TRUE; + } + vim_free(p); + } +#endif + + for (i = 0; i < 256; ++i) + { + /* Our own function to reliably check the length of UTF-8 characters, + * independent of mblen(). */ + if (enc_utf8) + n = utf8len_tab[i]; + else if (enc_dbcs == 0) + n = 1; + else + { +#if defined(WIN3264) || defined(WIN32UNIX) + /* enc_dbcs is set by setting 'fileencoding'. It becomes a Windows + * CodePage identifier, which we can pass directly in to Windows + * API */ + n = IsDBCSLeadByteEx(enc_dbcs, (BYTE)i) ? 2 : 1; +#else +# ifdef MACOS + /* + * if mblen() is not available, character which MSB is turned on + * are treated as leading byte character. (note : This assumption + * is not always true.) + */ + n = (i & 0x80) ? 
2 : 1; +# else + char buf[MB_MAXBYTES]; +# ifdef X_LOCALE +# ifndef mblen +# define mblen _Xmblen +# endif +# endif + if (i == NUL) /* just in case mblen() can't handle "" */ + n = 1; + else + { + buf[0] = i; + buf[1] = 0; +#ifdef LEN_FROM_CONV + if (vimconv.vc_type != CONV_NONE) + { + /* + * string_convert() should fail when converting the first + * byte of a double-byte character. + */ + p = string_convert(&vimconv, (char_u *)buf, NULL); + if (p != NULL) + { + vim_free(p); + n = 1; + } + else + n = 2; + } + else +#endif + { + /* + * mblen() should return -1 for invalid (means the leading + * multibyte) character. However there are some platforms + * where mblen() returns 0 for invalid character. + * Therefore, following condition includes 0. + */ + if (mblen(buf, (size_t)1) <= 0) + n = 2; + else + n = 1; + } + } +# endif +#endif + } + + mb_bytelen_tab[i] = n; + } + +#ifdef LEN_FROM_CONV + convert_setup(&vimconv, NULL, NULL); +#endif + + /* The cell width depends on the type of multi-byte characters. */ + (void)init_chartab(); + + /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */ + screenalloc(FALSE); + + /* When using Unicode, set default for 'fileencodings'. */ + if (enc_utf8 && !option_was_set((char_u *)"fencs")) + set_string_option_direct((char_u *)"fencs", -1, + (char_u *)"ucs-bom,utf-8,latin1", OPT_FREE); +#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(FEAT_GETTEXT) + /* GNU gettext 0.10.37 supports this feature: set the codeset used for + * translated messages independently from the current locale. */ + (void)bind_textdomain_codeset(VIMPACKAGE, + enc_utf8 ? "utf-8" : (char *)p_enc); +#endif + +#ifdef FEAT_AUTOCMD + /* Fire an autocommand to let people do custom font setup. This must be + * after Vim has been setup for the new encoding. 
*/ + apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf); +#endif + + return NULL; +} + +/* + * Return the size of the BOM for the current buffer: + * 0 - no BOM + * 2 - UCS-2 or UTF-16 BOM + * 4 - UCS-4 BOM + * 3 - UTF-8 BOM + */ + int +bomb_size() +{ + int n = 0; + + if (curbuf->b_p_bomb && !curbuf->b_p_bin) + { + if (*curbuf->b_p_fenc == NUL) + { + if (enc_utf8) + { + if (enc_unicode != 0) + n = enc_unicode; + else + n = 3; + } + } + else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0) + n = 3; + else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0 + || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) + n = 2; + else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) + n = 4; + } + return n; +} + +/* + * Get class of pointer: + * 0 for blank or NUL + * 1 for punctuation + * 2 for an (ASCII) word character + * >2 for other word characters + */ + int +mb_get_class(p) + char_u *p; +{ + if (MB_BYTE2LEN(p[0]) == 1) + { + if (p[0] == NUL || vim_iswhite(p[0])) + return 0; + if (vim_iswordc(p[0])) + return 2; + return 1; + } + if (enc_dbcs != 0 && p[0] != NUL && p[1] != NUL) + return dbcs_class(p[0], p[1]); + if (enc_utf8) + return utf_class(utf_ptr2char(p)); + return 0; +} + +/* + * Get class of a double-byte character. Returns 3 or bigger, except that + * the Japanese code returns 0 for the ZENKAKU space and 1 for punctuation. + * TODO: Should return 1 for punctuation in the other encodings as well. + */ + int +dbcs_class(lead, trail) + unsigned lead; + unsigned trail; +{ + switch (enc_dbcs) + { + /* please add a classify routine for your language here */ + + case DBCS_JPNU: /* ? 
*/ + case DBCS_JPN: + { + /* JIS code classification */ + unsigned char lb = lead; + unsigned char tb = trail; + + /* convert process code to JIS */ +# if defined(WIN3264) || defined(WIN32UNIX) || defined(MACOS) + /* process code is SJIS */ + if (lb <= 0x9f) + lb = (lb - 0x81) * 2 + 0x21; + else + lb = (lb - 0xc1) * 2 + 0x21; + if (tb <= 0x7e) + tb -= 0x1f; + else if (tb <= 0x9e) + tb -= 0x20; + else + { + tb -= 0x7e; + lb += 1; + } +# else + /* + * XXX: Code page identification can not use with all + * system! So, some other encoding information + * will be needed. + * In japanese: SJIS,EUC,UNICODE,(JIS) + * Note that JIS-code system don't use as + * process code in most system because it uses + * escape sequences(JIS is context depend encoding). + */ + /* assume process code is JAPANESE-EUC */ + lb &= 0x7f; + tb &= 0x7f; +# endif + /* exceptions */ + switch (lb << 8 | tb) + { + case 0x2121: /* ZENKAKU space */ + return 0; + case 0x2122: /* KU-TEN (Japanese comma) */ + case 0x2123: /* TOU-TEN (Japanese period) */ + case 0x2124: /* ZENKAKU comma */ + case 0x2125: /* ZENKAKU period */ + return 1; + case 0x213c: /* prolonged sound handled as KATAKANA */ + return 13; + } + /* sieved by KU code */ + switch (lb) + { + case 0x21: + case 0x22: + /* special symbols */ + return 10; + case 0x23: + /* alpha-numeric */ + return 11; + case 0x24: + /* hiragana */ + return 12; + case 0x25: + /* katakana */ + return 13; + case 0x26: + /* greek */ + return 14; + case 0x27: + /* russian */ + return 15; + case 0x28: + /* lines */ + return 16; + default: + /* kanji */ + return 17; + } + } + + case DBCS_KORU: /* ? 
*/ + case DBCS_KOR: + { + /* KS code classification */ + unsigned char c1 = lead; + unsigned char c2 = trail; + + /* + * 20 : Hangul + * 21 : Hanja + * 22 : Symbols + * 23 : Alpha-numeric/Roman Letter (Full width) + * 24 : Hangul Letter(Alphabet) + * 25 : Roman Numeral/Greek Letter + * 26 : Box Drawings + * 27 : Unit Symbols + * 28 : Circled/Parenthesized Letter + * 29 : Hiragana/Katakana + * 30 : Cyrillic Letter + */ + + if (c1 >= 0xB0 && c1 <= 0xC8) + /* Hangul */ + return 20; +#if defined(WIN3264) || defined(WIN32UNIX) + else if (c1 <= 0xA0 || c2 <= 0xA0) + /* Extended Hangul Region : MS UHC(Unified Hangul Code) */ + /* c1: 0x81-0xA0 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xFE + * c1: 0xA1-0xC6 with c2: 0x41-0x5A, 0x61-0x7A, 0x81-0xA0 + */ + return 20; +#endif + + else if (c1 >= 0xCA && c1 <= 0xFD) + /* Hanja */ + return 21; + else switch (c1) + { + case 0xA1: + case 0xA2: + /* Symbols */ + return 22; + case 0xA3: + /* Alpha-numeric */ + return 23; + case 0xA4: + /* Hangul Letter(Alphabet) */ + return 24; + case 0xA5: + /* Roman Numeral/Greek Letter */ + return 25; + case 0xA6: + /* Box Drawings */ + return 26; + case 0xA7: + /* Unit Symbols */ + return 27; + case 0xA8: + case 0xA9: + if (c2 <= 0xAF) + return 25; /* Roman Letter */ + else if (c2 >= 0xF6) + return 22; /* Symbols */ + else + /* Circled/Parenthesized Letter */ + return 28; + case 0xAA: + case 0xAB: + /* Hiragana/Katakana */ + return 29; + case 0xAC: + /* Cyrillic Letter */ + return 30; + } + } + default: + break; + } + return 3; +} + +/* + * mb_char2len() function pointer. + * Return length in bytes of character "c". + * Returns 1 for a single-byte character. + */ +/* ARGSUSED */ + int +latin_char2len(c) + int c; +{ + return 1; +} +/* DBCS version: a character value of 0x100 or above takes two bytes. */ + static int +dbcs_char2len(c) + int c; +{ + if (c >= 0x100) + return 2; + return 1; +} + +/* + * mb_char2bytes() function pointer. + * Convert a character to its bytes. + * Returns the length in bytes. 
+ */ + int +latin_char2bytes(c, buf) + int c; + char_u *buf; +{ + buf[0] = c; + return 1; +} +/* DBCS version: a value of 0x100 or above is stored as two bytes, with the + * high byte of "c" first. */ + static int +dbcs_char2bytes(c, buf) + int c; + char_u *buf; +{ + if (c >= 0x100) + { + buf[0] = (unsigned)c >> 8; + buf[1] = c; + return 2; + } + buf[0] = c; + return 1; +} + +/* + * mb_ptr2len_check() function pointer. + * Get byte length of character at "*p" but stop at a NUL. + * For UTF-8 this includes following composing characters. + * Returns 0 when *p is NUL. + * NOTE(review): the latin and DBCS versions below return MB_BYTE2LEN(*p); + * mb_init() appears to store 1 for byte 0, so they may return 1 for a + * NUL -- confirm. + */ + int +latin_ptr2len_check(p) + char_u *p; +{ + return MB_BYTE2LEN(*p); +} + + static int +dbcs_ptr2len_check(p) + char_u *p; +{ + int len; + + /* Check if second byte is not missing. */ + len = MB_BYTE2LEN(*p); + if (len == 2 && p[1] == NUL) + len = 1; + return len; +} +/* A range of character values: "first" up to and including "last". */ +struct interval +{ + unsigned short first; + unsigned short last; +}; +static int intable __ARGS((struct interval *table, size_t size, int c)); + +/* + * Return TRUE if "c" is in "table[size / sizeof(struct interval)]". + * "size" is the size of the table in bytes. The intervals must be sorted + * and must not overlap, because a binary search is used. + */ + static int +intable(table, size, c) + struct interval *table; + size_t size; + int c; +{ + int mid, bot, top; + + /* first quick check for Latin1 etc. characters */ + if (c < table[0].first) + return FALSE; + + /* binary search in table */ + bot = 0; + top = size / sizeof(struct interval) - 1; + while (top >= bot) + { + mid = (bot + top) / 2; + if (table[mid].last < c) + bot = mid + 1; + else if (table[mid].first > c) + top = mid - 1; + else + return TRUE; + } + return FALSE; +} + +/* + * For UTF-8 character "c" return 2 for a double-width character, 1 for others. + * Returns 4 or 6 for an unprintable character. + * Is only correct for characters >= 0x80. + * When p_ambw is "double", return 2 for a character with East Asian Width + * class 'A'(mbiguous). 
+ */ + int +utf_char2cells(c) + int c; +{ + /* sorted list of non-overlapping intervals of East Asian Ambiguous + * characters, generated with: + * "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */ + static struct interval ambiguous[] = { + {0x00A1, 0x00A1}, {0x00A4, 0x00A4}, {0x00A7, 0x00A8}, + {0x00AA, 0x00AA}, {0x00AE, 0x00AE}, {0x00B0, 0x00B4}, + {0x00B6, 0x00BA}, {0x00BC, 0x00BF}, {0x00C6, 0x00C6}, + {0x00D0, 0x00D0}, {0x00D7, 0x00D8}, {0x00DE, 0x00E1}, + {0x00E6, 0x00E6}, {0x00E8, 0x00EA}, {0x00EC, 0x00ED}, + {0x00F0, 0x00F0}, {0x00F2, 0x00F3}, {0x00F7, 0x00FA}, + {0x00FC, 0x00FC}, {0x00FE, 0x00FE}, {0x0101, 0x0101}, + {0x0111, 0x0111}, {0x0113, 0x0113}, {0x011B, 0x011B}, + {0x0126, 0x0127}, {0x012B, 0x012B}, {0x0131, 0x0133}, + {0x0138, 0x0138}, {0x013F, 0x0142}, {0x0144, 0x0144}, + {0x0148, 0x014B}, {0x014D, 0x014D}, {0x0152, 0x0153}, + {0x0166, 0x0167}, {0x016B, 0x016B}, {0x01CE, 0x01CE}, + {0x01D0, 0x01D0}, {0x01D2, 0x01D2}, {0x01D4, 0x01D4}, + {0x01D6, 0x01D6}, {0x01D8, 0x01D8}, {0x01DA, 0x01DA}, + {0x01DC, 0x01DC}, {0x0251, 0x0251}, {0x0261, 0x0261}, + {0x02C4, 0x02C4}, {0x02C7, 0x02C7}, {0x02C9, 0x02CB}, + {0x02CD, 0x02CD}, {0x02D0, 0x02D0}, {0x02D8, 0x02DB}, + {0x02DD, 0x02DD}, {0x02DF, 0x02DF}, {0x0391, 0x03A1}, + {0x03A3, 0x03A9}, {0x03B1, 0x03C1}, {0x03C3, 0x03C9}, + {0x0401, 0x0401}, {0x0410, 0x044F}, {0x0451, 0x0451}, + {0x2010, 0x2010}, {0x2013, 0x2016}, {0x2018, 0x2019}, + {0x201C, 0x201D}, {0x2020, 0x2022}, {0x2024, 0x2027}, + {0x2030, 0x2030}, {0x2032, 0x2033}, {0x2035, 0x2035}, + {0x203B, 0x203B}, {0x203E, 0x203E}, {0x2074, 0x2074}, + {0x207F, 0x207F}, {0x2081, 0x2084}, {0x20AC, 0x20AC}, + {0x2103, 0x2103}, {0x2105, 0x2105}, {0x2109, 0x2109}, + {0x2113, 0x2113}, {0x2116, 0x2116}, {0x2121, 0x2122}, + {0x2126, 0x2126}, {0x212B, 0x212B}, {0x2153, 0x2154}, + {0x215B, 0x215E}, {0x2160, 0x216B}, {0x2170, 0x2179}, + {0x2190, 0x2199}, {0x21B8, 0x21B9}, {0x21D2, 0x21D2}, + {0x21D4, 0x21D4}, {0x21E7, 0x21E7}, {0x2200, 0x2200}, + {0x2202, 0x2203}, 
{0x2207, 0x2208}, {0x220B, 0x220B}, + {0x220F, 0x220F}, {0x2211, 0x2211}, {0x2215, 0x2215}, + {0x221A, 0x221A}, {0x221D, 0x2220}, {0x2223, 0x2223}, + {0x2225, 0x2225}, {0x2227, 0x222C}, {0x222E, 0x222E}, + {0x2234, 0x2237}, {0x223C, 0x223D}, {0x2248, 0x2248}, + {0x224C, 0x224C}, {0x2252, 0x2252}, {0x2260, 0x2261}, + {0x2264, 0x2267}, {0x226A, 0x226B}, {0x226E, 0x226F}, + {0x2282, 0x2283}, {0x2286, 0x2287}, {0x2295, 0x2295}, + {0x2299, 0x2299}, {0x22A5, 0x22A5}, {0x22BF, 0x22BF}, + {0x2312, 0x2312}, {0x2460, 0x24E9}, {0x24EB, 0x254B}, + {0x2550, 0x2573}, {0x2580, 0x258F}, {0x2592, 0x2595}, + {0x25A0, 0x25A1}, {0x25A3, 0x25A9}, {0x25B2, 0x25B3}, + {0x25B6, 0x25B7}, {0x25BC, 0x25BD}, {0x25C0, 0x25C1}, + {0x25C6, 0x25C8}, {0x25CB, 0x25CB}, {0x25CE, 0x25D1}, + {0x25E2, 0x25E5}, {0x25EF, 0x25EF}, {0x2605, 0x2606}, + {0x2609, 0x2609}, {0x260E, 0x260F}, {0x2614, 0x2615}, + {0x261C, 0x261C}, {0x261E, 0x261E}, {0x2640, 0x2640}, + {0x2642, 0x2642}, {0x2660, 0x2661}, {0x2663, 0x2665}, + {0x2667, 0x266A}, {0x266C, 0x266D}, {0x266F, 0x266F}, + {0x273D, 0x273D}, {0x2776, 0x277F}, {0xE000, 0xF8FF}, + {0xFFFD, 0xFFFD}, /* {0xF0000, 0xFFFFD}, {0x100000, 0x10FFFD} */ + }; + + if (c >= 0x100) + { +#ifdef USE_WCHAR_FUNCTIONS + /* + * Assume the library function wcwidth() works better than our own + * stuff. It should return 1 for ambiguous width chars! + */ + int n = wcwidth(c); + + if (n < 0) + return 6; /* unprintable, displays <xxxx> */ + if (n > 1) + return n; +#else + if (!utf_printable(c)) + return 6; /* unprintable, displays <xxxx> */ + if (c >= 0x1100 + && (c <= 0x115f /* Hangul Jamo */ + || c == 0x2329 + || c == 0x232a + || (c >= 0x2e80 && c <= 0xa4cf + && c != 0x303f) /* CJK ... 
Yi */ + || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */ + || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility + Ideographs */ + || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */ + || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */ + || (c >= 0xffe0 && c <= 0xffe6) + || (c >= 0x20000 && c <= 0x2fffd) + || (c >= 0x30000 && c <= 0x3fffd))) + return 2; +#endif + } + + /* Characters below 0x100 are influenced by 'isprint' option */ + else if (c >= 0x80 && !vim_isprintc(c)) + return 4; /* unprintable, displays <xx> */ + /* p_ambw starts with 'd' ("double"): an ambiguous width character takes + * two cells. */ + if (c >= 0x80 && *p_ambw == 'd' && intable(ambiguous, sizeof(ambiguous), c)) + return 2; + + return 1; +} + +/* + * mb_ptr2cells() function pointer. + * Return the number of display cells character at "*p" occupies. + * This doesn't take care of unprintable characters, use ptr2cells() for that. + */ +/*ARGSUSED*/ + int +latin_ptr2cells(p) + char_u *p; +{ + return 1; +} +/* UTF-8 version: a byte below 0x80 takes one cell, an illegal byte or an + * encoded NUL is counted as four cells. */ + int +utf_ptr2cells(p) + char_u *p; +{ + int c; + + /* Need to convert to a wide character. */ + if (*p >= 0x80) + { + c = utf_ptr2char(p); + /* An illegal byte is displayed as <xx>. */ + if (utf_ptr2len_check(p) == 1 || c == NUL) + return 4; + /* If the char is ASCII it must be an overlong sequence. */ + if (c < 0x80) + return char2cells(c); + return utf_char2cells(c); + } + return 1; +} +/* DBCS version, installed as mb_ptr2cells by mb_init(). */ + int +dbcs_ptr2cells(p) + char_u *p; +{ + /* Number of cells is equal to number of bytes, except for euc-jp when + * the first byte is 0x8e. */ + if (enc_dbcs == DBCS_JPNU && *p == 0x8e) + return 1; + return MB_BYTE2LEN(*p); +} + +/* + * mb_char2cells() function pointer. + * Return the number of display cells character "c" occupies. + * Only takes care of multi-byte chars, not "^C" and such. + */ +/*ARGSUSED*/ + int +latin_char2cells(c) |