diff options
author | Bram Moolenaar <Bram@vim.org> | 2019-03-22 16:33:15 +0100 |
---|---|---|
committer | Bram Moolenaar <Bram@vim.org> | 2019-03-22 16:33:15 +0100 |
commit | dc4fa190e7b9d6ba49416ce875d2192c4444d3eb (patch) | |
tree | bd25a63a1ba370df4e7dd8e1213497d6fed1817d /src/arabic.c | |
parent | 6b6f7aae4a3329d685e512699287605540257b40 (diff) |
patch 8.1.1038: Arabic support excludes Farsi (tag: v8.1.1038)
Problem: Arabic support excludes Farsi.
Solution: Add Farsi support to the Arabic support. (Ali Gholami Rudi,
Ameretat Reith)
Diffstat (limited to 'src/arabic.c')
-rw-r--r-- | src/arabic.c | 849 |
1 file changed, 265 insertions, 584 deletions
diff --git a/src/arabic.c b/src/arabic.c index adf414712c..d323abf644 100644 --- a/src/arabic.c +++ b/src/arabic.c @@ -11,541 +11,311 @@ * arabic.c: functions for Arabic language * * Author: Nadim Shaikli & Isam Bayazidi + * Farsi support and restructuring to make adding new letters easier by Ali + * Gholami Rudi. Further work by Ameretat Reith. + */ + +/* + * Sorted list of unicode Arabic characters. Each entry holds the + * presentation forms of a letter. + * + * Arabic characters are categorized into following types: + * + * Isolated - iso-8859-6 form + * Initial - unicode form-B start + * Medial - unicode form-B middle + * Final - unicode form-B final + * Stand-Alone - unicode form-B isolated */ #include "vim.h" #if defined(FEAT_ARABIC) || defined(PROTO) -static int A_firstc_laa(int c1, int c); -static int A_is_harakat(int c); -static int A_is_iso(int c); -static int A_is_formb(int c); -static int A_is_ok(int c); -static int A_is_valid(int c); -static int A_is_special(int c); - +// Unicode values for Arabic characters. 
+#define a_HAMZA 0x0621 +#define a_ALEF_MADDA 0x0622 +#define a_ALEF_HAMZA_ABOVE 0x0623 +#define a_WAW_HAMZA 0x0624 +#define a_ALEF_HAMZA_BELOW 0x0625 +#define a_YEH_HAMZA 0x0626 +#define a_ALEF 0x0627 +#define a_BEH 0x0628 +#define a_TEH_MARBUTA 0x0629 +#define a_TEH 0x062a +#define a_THEH 0x062b +#define a_JEEM 0x062c +#define a_HAH 0x062d +#define a_KHAH 0x062e +#define a_DAL 0x062f +#define a_THAL 0x0630 +#define a_REH 0x0631 +#define a_ZAIN 0x0632 +#define a_SEEN 0x0633 +#define a_SHEEN 0x0634 +#define a_SAD 0x0635 +#define a_DAD 0x0636 +#define a_TAH 0x0637 +#define a_ZAH 0x0638 +#define a_AIN 0x0639 +#define a_GHAIN 0x063a +#define a_TATWEEL 0x0640 +#define a_FEH 0x0641 +#define a_QAF 0x0642 +#define a_KAF 0x0643 +#define a_LAM 0x0644 +#define a_MEEM 0x0645 +#define a_NOON 0x0646 +#define a_HEH 0x0647 +#define a_WAW 0x0648 +#define a_ALEF_MAKSURA 0x0649 +#define a_YEH 0x064a +#define a_FATHATAN 0x064b +#define a_DAMMATAN 0x064c +#define a_KASRATAN 0x064d +#define a_FATHA 0x064e +#define a_DAMMA 0x064f +#define a_KASRA 0x0650 +#define a_SHADDA 0x0651 +#define a_SUKUN 0x0652 +#define a_MADDA_ABOVE 0x0653 +#define a_HAMZA_ABOVE 0x0654 +#define a_HAMZA_BELOW 0x0655 + +#define a_PEH 0x067e +#define a_TCHEH 0x0686 +#define a_JEH 0x0698 +#define a_FKAF 0x06a9 +#define a_GAF 0x06af +#define a_FYEH 0x06cc + +#define a_s_LAM_ALEF_MADDA_ABOVE 0xfef5 +#define a_f_LAM_ALEF_MADDA_ABOVE 0xfef6 +#define a_s_LAM_ALEF_HAMZA_ABOVE 0xfef7 +#define a_f_LAM_ALEF_HAMZA_ABOVE 0xfef8 +#define a_s_LAM_ALEF_HAMZA_BELOW 0xfef9 +#define a_f_LAM_ALEF_HAMZA_BELOW 0xfefa +#define a_s_LAM_ALEF 0xfefb +#define a_f_LAM_ALEF 0xfefc + +static struct achar { + unsigned c; + unsigned isolated; + unsigned initial; + unsigned medial; + unsigned final; +} achars[] = { + {a_HAMZA, 0xfe80, 0, 0, 0}, + {a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82}, + {a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84}, + {a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86}, + {a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88}, + {a_YEH_HAMZA, 0xfe89, 0xfe8b, 
0xfe8c, 0xfe8a}, + {a_ALEF, 0xfe8d, 0, 0, 0xfe8e}, + {a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90}, + {a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94}, + {a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96}, + {a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a}, + {a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e}, + {a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2}, + {a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6}, + {a_DAL, 0xfea9, 0, 0, 0xfeaa}, + {a_THAL, 0xfeab, 0, 0, 0xfeac}, + {a_REH, 0xfead, 0, 0, 0xfeae}, + {a_ZAIN, 0xfeaf, 0, 0, 0xfeb0}, + {a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2}, + {a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6}, + {a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba}, + {a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe}, + {a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2}, + {a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6}, + {a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca}, + {a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece}, + {a_TATWEEL, 0, 0x0640, 0x0640, 0x0640}, + {a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2}, + {a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6}, + {a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda}, + {a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede}, + {a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2}, + {a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6}, + {a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea}, + {a_WAW, 0xfeed, 0, 0, 0xfeee}, + {a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0}, + {a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2}, + {a_FATHATAN, 0xfe70, 0, 0, 0}, + {a_DAMMATAN, 0xfe72, 0, 0, 0}, + {a_KASRATAN, 0xfe74, 0, 0, 0}, + {a_FATHA, 0xfe76, 0, 0xfe77, 0}, + {a_DAMMA, 0xfe78, 0, 0xfe79, 0}, + {a_KASRA, 0xfe7a, 0, 0xfe7b, 0}, + {a_SHADDA, 0xfe7c, 0, 0xfe7c, 0}, + {a_SUKUN, 0xfe7e, 0, 0xfe7f, 0}, + {a_MADDA_ABOVE, 0, 0, 0, 0}, + {a_HAMZA_ABOVE, 0, 0, 0, 0}, + {a_HAMZA_BELOW, 0, 0, 0, 0}, + {a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57}, + {a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b}, + {a_JEH, 0xfb8a, 0, 0, 0xfb8b}, + {a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f}, + {a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93}, + {a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd}, +}; + +#define a_BYTE_ORDER_MARK 0xfeff + +#define ARRAY_SIZE(a) (sizeof(a) / 
sizeof((a)[0])) /* - * Returns True if c is an ISO-8859-6 shaped ARABIC letter (user entered) + * Find the struct achar pointer to the given Arabic char. + * Returns NULL if not found. */ - static int -A_is_a(int cur_c) + static struct achar * +find_achar(int c) { - switch (cur_c) + int h, m, l; + + // using binary search to find c + h = ARRAY_SIZE(achars); + l = 0; + while (l < h) { - case a_HAMZA: - case a_ALEF_MADDA: - case a_ALEF_HAMZA_ABOVE: - case a_WAW_HAMZA: - case a_ALEF_HAMZA_BELOW: - case a_YEH_HAMZA: - case a_ALEF: - case a_BEH: - case a_TEH_MARBUTA: - case a_TEH: - case a_THEH: - case a_JEEM: - case a_HAH: - case a_KHAH: - case a_DAL: - case a_THAL: - case a_REH: - case a_ZAIN: - case a_SEEN: - case a_SHEEN: - case a_SAD: - case a_DAD: - case a_TAH: - case a_ZAH: - case a_AIN: - case a_GHAIN: - case a_TATWEEL: - case a_FEH: - case a_QAF: - case a_KAF: - case a_LAM: - case a_MEEM: - case a_NOON: - case a_HEH: - case a_WAW: - case a_ALEF_MAKSURA: - case a_YEH: - return TRUE; + m = (h + l) / 2; + if (achars[m].c == (unsigned)c) + return &achars[m]; + if ((unsigned)c < achars[m].c) + h = m; + else + l = m + 1; } - - return FALSE; + return NULL; } - /* - * Returns True if c is an Isolated Form-B ARABIC letter + * Change shape - from Combination (2 char) to an Isolated */ static int -A_is_s(int cur_c) +chg_c_laa2i(int hid_c) { - switch (cur_c) + int tempc; + + switch (hid_c) { - case a_s_HAMZA: - case a_s_ALEF_MADDA: - case a_s_ALEF_HAMZA_ABOVE: - case a_s_WAW_HAMZA: - case a_s_ALEF_HAMZA_BELOW: - case a_s_YEH_HAMZA: - case a_s_ALEF: - case a_s_BEH: - case a_s_TEH_MARBUTA: - case a_s_TEH: - case a_s_THEH: - case a_s_JEEM: - case a_s_HAH: - case a_s_KHAH: - case a_s_DAL: - case a_s_THAL: - case a_s_REH: - case a_s_ZAIN: - case a_s_SEEN: - case a_s_SHEEN: - case a_s_SAD: - case a_s_DAD: - case a_s_TAH: - case a_s_ZAH: - case a_s_AIN: - case a_s_GHAIN: - case a_s_FEH: - case a_s_QAF: - case a_s_KAF: - case a_s_LAM: - case a_s_MEEM: - case a_s_NOON: - case 
a_s_HEH: - case a_s_WAW: - case a_s_ALEF_MAKSURA: - case a_s_YEH: - return TRUE; + case a_ALEF_MADDA: + tempc = a_s_LAM_ALEF_MADDA_ABOVE; + break; + case a_ALEF_HAMZA_ABOVE: + tempc = a_s_LAM_ALEF_HAMZA_ABOVE; + break; + case a_ALEF_HAMZA_BELOW: + tempc = a_s_LAM_ALEF_HAMZA_BELOW; + break; + case a_ALEF: + tempc = a_s_LAM_ALEF; + break; + default: + tempc = 0; } - return FALSE; + return tempc; } - /* - * Returns True if c is a Final shape of an ARABIC letter + * Change shape - from Combination-Isolated to Final */ static int -A_is_f(int cur_c) +chg_c_laa2f(int hid_c) { - switch (cur_c) - { - case a_f_ALEF_MADDA: - case a_f_ALEF_HAMZA_ABOVE: - case a_f_WAW_HAMZA: - case a_f_ALEF_HAMZA_BELOW: - case a_f_YEH_HAMZA: - case a_f_ALEF: - case a_f_BEH: - case a_f_TEH_MARBUTA: - case a_f_TEH: - case a_f_THEH: - case a_f_JEEM: - case a_f_HAH: - case a_f_KHAH: - case a_f_DAL: - case a_f_THAL: - case a_f_REH: - case a_f_ZAIN: - case a_f_SEEN: - case a_f_SHEEN: - case a_f_SAD: - case a_f_DAD: - case a_f_TAH: - case a_f_ZAH: - case a_f_AIN: - case a_f_GHAIN: - case a_f_FEH: - case a_f_QAF: - case a_f_KAF: - case a_f_LAM: - case a_f_MEEM: - case a_f_NOON: - case a_f_HEH: - case a_f_WAW: - case a_f_ALEF_MAKSURA: - case a_f_YEH: - case a_f_LAM_ALEF_MADDA_ABOVE: - case a_f_LAM_ALEF_HAMZA_ABOVE: - case a_f_LAM_ALEF_HAMZA_BELOW: - case a_f_LAM_ALEF: - return TRUE; - } - return FALSE; -} + int tempc; - -/* - * Change shape - from ISO-8859-6/Isolated to Form-B Isolated - */ - static int -chg_c_a2s(int cur_c) -{ - switch (cur_c) + switch (hid_c) { - case a_HAMZA: return a_s_HAMZA; - case a_ALEF_MADDA: return a_s_ALEF_MADDA; - case a_ALEF_HAMZA_ABOVE: return a_s_ALEF_HAMZA_ABOVE; - case a_WAW_HAMZA: return a_s_WAW_HAMZA; - case a_ALEF_HAMZA_BELOW: return a_s_ALEF_HAMZA_BELOW; - case a_YEH_HAMZA: return a_s_YEH_HAMZA; - case a_ALEF: return a_s_ALEF; - case a_TEH_MARBUTA: return a_s_TEH_MARBUTA; - case a_DAL: return a_s_DAL; - case a_THAL: return a_s_THAL; - case a_REH: return a_s_REH; - 
case a_ZAIN: return a_s_ZAIN; - case a_TATWEEL: return cur_c; /* exceptions */ - case a_WAW: return a_s_WAW; - case a_ALEF_MAKSURA: return a_s_ALEF_MAKSURA; - case a_BEH: return a_s_BEH; - case a_TEH: return a_s_TEH; - case a_THEH: return a_s_THEH; - case a_JEEM: return a_s_JEEM; - case a_HAH: return a_s_HAH; - case a_KHAH: return a_s_KHAH; - case a_SEEN: return a_s_SEEN; - case a_SHEEN: return a_s_SHEEN; - case a_SAD: return a_s_SAD; - case a_DAD: return a_s_DAD; - case a_TAH: return a_s_TAH; - case a_ZAH: return a_s_ZAH; - case a_AIN: return a_s_AIN; - case a_GHAIN: return a_s_GHAIN; - case a_FEH: return a_s_FEH; - case a_QAF: return a_s_QAF; - case a_KAF: return a_s_KAF; - case a_LAM: return a_s_LAM; - case a_MEEM: return a_s_MEEM; - case a_NOON: return a_s_NOON; - case a_HEH: return a_s_HEH; - case a_YEH: return a_s_YEH; + case a_ALEF_MADDA: + tempc = a_f_LAM_ALEF_MADDA_ABOVE; + break; + case a_ALEF_HAMZA_ABOVE: + tempc = a_f_LAM_ALEF_HAMZA_ABOVE; + break; + case a_ALEF_HAMZA_BELOW: + tempc = a_f_LAM_ALEF_HAMZA_BELOW; + break; + case a_ALEF: + tempc = a_f_LAM_ALEF; + break; + default: + tempc = 0; } - return 0; -} - -/* - * Change shape - from ISO-8859-6/Isolated to Initial - */ - static int -chg_c_a2i(int cur_c) -{ - switch (cur_c) - { - case a_YEH_HAMZA: return a_i_YEH_HAMZA; - case a_HAMZA: /* exceptions */ - return a_s_HAMZA; - case a_ALEF_MADDA: /* exceptions */ - return a_s_ALEF_MADDA; - case a_ALEF_HAMZA_ABOVE: /* exceptions */ - return a_s_ALEF_HAMZA_ABOVE; - case a_WAW_HAMZA: /* exceptions */ - return a_s_WAW_HAMZA; - case a_ALEF_HAMZA_BELOW: /* exceptions */ - return a_s_ALEF_HAMZA_BELOW; - case a_ALEF: /* exceptions */ - return a_s_ALEF; - case a_TEH_MARBUTA: /* exceptions */ - return a_s_TEH_MARBUTA; - case a_DAL: /* exceptions */ - return a_s_DAL; - case a_THAL: /* exceptions */ - return a_s_THAL; - case a_REH: /* exceptions */ - return a_s_REH; - case a_ZAIN: /* exceptions */ - return a_s_ZAIN; - case a_TATWEEL: /* exceptions */ - return cur_c; - 
case a_WAW: /* exceptions */ - return a_s_WAW; - case a_ALEF_MAKSURA: /* exceptions */ - return a_s_ALEF_MAKSURA; - case a_BEH: return a_i_BEH; - case a_TEH: return a_i_TEH; - case a_THEH: return a_i_THEH; - case a_JEEM: return a_i_JEEM; - case a_HAH: return a_i_HAH; - case a_KHAH: return a_i_KHAH; - case a_SEEN: return a_i_SEEN; - case a_SHEEN: return a_i_SHEEN; - case a_SAD: return a_i_SAD; - case a_DAD: return a_i_DAD; - case a_TAH: return a_i_TAH; - case a_ZAH: return a_i_ZAH; - case a_AIN: return a_i_AIN; - case a_GHAIN: return a_i_GHAIN; - case a_FEH: return a_i_FEH; - case a_QAF: return a_i_QAF; - case a_KAF: return a_i_KAF; - case a_LAM: return a_i_LAM; - case a_MEEM: return a_i_MEEM; - case a_NOON: return a_i_NOON; - case a_HEH: return a_i_HEH; - case a_YEH: return a_i_YEH; - } - return 0; + return tempc; } - /* - * Change shape - from ISO-8859-6/Isolated to Medial + * Returns whether it is possible to join the given letters */ static int -chg_c_a2m(int cur_c) +can_join(int c1, int c2) { - switch (cur_c) - { - case a_HAMZA: return a_s_HAMZA; /* exception */ - case a_ALEF_MADDA: return a_f_ALEF_MADDA; /* exception */ - case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE; /* exception */ - case a_WAW_HAMZA: return a_f_WAW_HAMZA; /* exception */ - case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW; /* exception */ - case a_YEH_HAMZA: return a_m_YEH_HAMZA; - case a_ALEF: return a_f_ALEF; /* exception */ - case a_BEH: return a_m_BEH; - case a_TEH_MARBUTA: return a_f_TEH_MARBUTA; /* exception */ - case a_TEH: return a_m_TEH; - case a_THEH: return a_m_THEH; - case a_JEEM: return a_m_JEEM; - case a_HAH: return a_m_HAH; - case a_KHAH: return a_m_KHAH; - case a_DAL: return a_f_DAL; /* exception */ - case a_THAL: return a_f_THAL; /* exception */ - case a_REH: return a_f_REH; /* exception */ - case a_ZAIN: return a_f_ZAIN; /* exception */ - case a_SEEN: return a_m_SEEN; - case a_SHEEN: return a_m_SHEEN; - case a_SAD: return a_m_SAD; - case a_DAD: return a_m_DAD; - 
case a_TAH: return a_m_TAH; - case a_ZAH: return a_m_ZAH; - case a_AIN: return a_m_AIN; - case a_GHAIN: return a_m_GHAIN; - case a_TATWEEL: return cur_c; /* exception */ - case a_FEH: return a_m_FEH; - case a_QAF: return a_m_QAF; - case a_KAF: return a_m_KAF; - case a_LAM: return a_m_LAM; - case a_MEEM: return a_m_MEEM; - case a_NOON: return a_m_NOON; - case a_HEH: return a_m_HEH; - case a_WAW: return a_f_WAW; /* exception */ - case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA; /* exception */ - case a_YEH: return a_m_YEH; - } - return 0; -} - + struct achar *a1 = find_achar(c1); + struct achar *a2 = find_achar(c2); -/* - * Change shape - from ISO-8859-6/Isolated to final - */ - static int -chg_c_a2f(int cur_c) -{ - /* NOTE: these encodings need to be accounted for - * a_f_ALEF_MADDA; - * a_f_ALEF_HAMZA_ABOVE; - * a_f_ALEF_HAMZA_BELOW; - * a_f_LAM_ALEF_MADDA_ABOVE; - * a_f_LAM_ALEF_HAMZA_ABOVE; - * a_f_LAM_ALEF_HAMZA_BELOW; - */ - switch (cur_c) - { - case a_HAMZA: return a_s_HAMZA; /* exception */ - case a_ALEF_MADDA: return a_f_ALEF_MADDA; - case a_ALEF_HAMZA_ABOVE: return a_f_ALEF_HAMZA_ABOVE; - case a_WAW_HAMZA: return a_f_WAW_HAMZA; - case a_ALEF_HAMZA_BELOW: return a_f_ALEF_HAMZA_BELOW; - case a_YEH_HAMZA: return a_f_YEH_HAMZA; - case a_ALEF: return a_f_ALEF; - case a_BEH: return a_f_BEH; - case a_TEH_MARBUTA: return a_f_TEH_MARBUTA; - case a_TEH: return a_f_TEH; - case a_THEH: return a_f_THEH; - case a_JEEM: return a_f_JEEM; - case a_HAH: return a_f_HAH; - case a_KHAH: return a_f_KHAH; - case a_DAL: return a_f_DAL; - case a_THAL: return a_f_THAL; - case a_REH: return a_f_REH; - case a_ZAIN: return a_f_ZAIN; - case a_SEEN: return a_f_SEEN; - case a_SHEEN: return a_f_SHEEN; - case a_SAD: return a_f_SAD; - case a_DAD: return a_f_DAD; - case a_TAH: return a_f_TAH; - case a_ZAH: return a_f_ZAH; - case a_AIN: return a_f_AIN; - case a_GHAIN: return a_f_GHAIN; - case a_TATWEEL: return cur_c; /* exception */ - case a_FEH: return a_f_FEH; - case a_QAF: return a_f_QAF; - 
case a_KAF: return a_f_KAF; - case a_LAM: return a_f_LAM; - case a_MEEM: return a_f_MEEM; - case a_NOON: return a_f_NOON; - case a_HEH: return a_f_HEH; - case a_WAW: return a_f_WAW; - case a_ALEF_MAKSURA: return a_f_ALEF_MAKSURA; - case a_YEH: return a_f_YEH; - } - return 0; + return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial); } - /* - * Change shape - from Initial to Medial - * This code is unreachable, because for the relevant characters ARABIC_CHAR() - * is FALSE; + * Check whether we are dealing with a character that could be regarded as an + * Arabic combining character, need to check the character before this. */ -#if 0 - static int -chg_c_i2m(int cur_c) + int +arabic_maycombine(int two) { - switch (cur_c) - { - case a_i_YEH_HAMZA: return a_m_YEH_HAMZA; - case a_i_BEH: return a_m_BEH; - case a_i_TEH: return a_m_TEH; - case a_i_THEH: return a_m_THEH; - case a_i_JEEM: return a_m_JEEM; - case a_i_HAH: return a_m_HAH; - case a_i_KHAH: return a_m_KHAH; - case a_i_SEEN: return a_m_SEEN; - case a_i_SHEEN: return a_m_SHEEN; - case a_i_SAD: return a_m_SAD; - case a_i_DAD: return a_m_DAD; - case a_i_TAH: return a_m_TAH; - case a_i_ZAH: return a_m_ZAH; - case a_i_AIN: return a_m_AIN; - case a_i_GHAIN: return a_m_GHAIN; - case a_i_FEH: return a_m_FEH; - case a_i_QAF: return a_m_QAF; - case a_i_KAF: return a_m_KAF; - case a_i_LAM: return a_m_LAM; - case a_i_MEEM: return a_m_MEEM; - case a_i_NOON: return a_m_NOON; - case a_i_HEH: return a_m_HEH; - case a_i_YEH: return a_m_YEH; - } - return 0; + if (p_arshape && !p_tbidi) + return (two == a_ALEF_MADDA + || two == a_ALEF_HAMZA_ABOVE + || two == a_ALEF_HAMZA_BELOW + || two == a_ALEF); + return FALSE; } -#endif - /* - * Change shape - from Final to Medial + * Check whether we are dealing with Arabic combining characters. + * Note: these are NOT really composing characters! 
*/ - static int -chg_c_f2m(int cur_c) + int +arabic_combine( + int one, // first character + int two) // character just after "one" { - switch (cur_c) - { - /* NOTE: these encodings are multi-positional, no ? - * case a_f_ALEF_MADDA: - * case a_f_ALEF_HAMZA_ABOVE: - * case a_f_ALEF_HAMZA_BELOW: - */ - case a_f_YEH_HAMZA: return a_m_YEH_HAMZA; - case a_f_WAW_HAMZA: /* exceptions */ - case a_f_ALEF: - case a_f_TEH_MARBUTA: - case a_f_DAL: - case a_f_THAL: - case a_f_REH: - case a_f_ZAIN: - case a_f_WAW: - case a_f_ALEF_MAKSURA: - return cur_c; - case a_f_BEH: return a_m_BEH; - case a_f_TEH: return a_m_TEH; - case a_f_THEH: return a_m_THEH; - case a_f_JEEM: return a_m_JEEM; - case a_f_HAH: return a_m_HAH; - case a_f_KHAH: return a_m_KHAH; - case a_f_SEEN: return a_m_SEEN; - case a_f_SHEEN: return a_m_SHEEN; - case a_f_SAD: return a_m_SAD; - case a_f_DAD: return a_m_DAD; - case a_f_TAH: return a_m_TAH; - case a_f_ZAH: return a_m_ZAH; - case a_f_AIN: return a_m_AIN; - case a_f_GHAIN: return a_m_GHAIN; - case a_f_FEH: return a_m_FEH; - case a_f_QAF: return a_m_QAF; - case a_f_KAF: return a_m_KAF; - case a_f_LAM: return a_m_LAM; - case a_f_MEEM: return a_m_MEEM; - case a_f_NOON: return a_m_NOON; - case a_f_HEH: return a_m_HEH; - case a_f_YEH: return a_m_YEH; - - /* NOTE: these encodings are multi-positional, no ? 
- * case a_f_LAM_ALEF_MADDA_ABOVE: - * case a_f_LAM_ALEF_HAMZA_ABOVE: - * case a_f_LAM_ALEF_HAMZA_BELOW: - * case a_f_LAM_ALEF: - */ - } - return 0; + if (one == a_LAM) + return arabic_maycombine(two); + return FALSE; } - /* - * Change shape - from Combination (2 char) to an Isolated + * A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character + * (alphabet/number/punctuation) */ static int -chg_c_laa2i(int hid_c) +A_is_iso(int c) { - switch (hid_c) - { - case a_ALEF_MADDA: return a_s_LAM_ALEF_MADDA_ABOVE; - case a_ALEF_HAMZA_ABOVE: return a_s_LAM_ALEF_HAMZA_ABOVE; - case a_ALEF_HAMZA_BELOW: return a_s_LAM_ALEF_HAMZA_BELOW; - case a_ALEF: return a_s_LAM_ALEF; - } - return 0; + return find_achar(c) != NULL; } - /* - * Change shape - from Combination-Isolated to Final + * A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B) */ static int -chg_c_laa2f(int hid_c) +A_is_ok(int c) { - switch (hid_c) - { - case a_ALEF_MADDA: return a_f_LAM_ALEF_MADDA_ABOVE; - case a_ALEF_HAMZA_ABOVE: return a_f_LAM_ALEF_HAMZA_ABOVE; - case a_ALEF_HAMZA_BELOW: return a_f_LAM_ALEF_HAMZA_BELOW; - case a_ALEF: return a_f_LAM_ALEF; - } - return 0; + return (A_is_iso(c) || c == a_BYTE_ORDER_MARK); } /* - * Do "half-shaping" on character "c". Return zero if no shaping. 
+ * A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B) + * with some exceptions/exclusions */ static int -half_shape(int c) +A_is_valid(int c) { - if (A_is_a(c)) - return chg_c_a2i(c); - if (A_is_valid(c) && A_is_f(c)) - return chg_c_f2m(c); - return 0; + return (A_is_ok(c) && c != a_HAMZA); } /* @@ -567,48 +337,44 @@ arabic_shape( int next_c) { int curr_c; - int shape_c; int curr_laa; int prev_laa; - /* Deal only with Arabic character, pass back all others */ + // Deal only with Arabic characters, pass back all others if (!A_is_ok(c)) return c; - /* half-shape current and previous character */ - shape_c = half_shape(prev_c); - - curr_laa = A_firstc_laa(c, *c1p); - prev_laa = A_firstc_laa(prev_c, prev_c1); + curr_laa = arabic_combine(c, *c1p); + prev_laa = arabic_combine(prev_c, prev_c1); if (curr_laa) { - if (A_is_valid(prev_c) && !A_is_f(shape_c) - && !A_is_s(shape_c) && !prev_laa) - curr_c = chg_c_laa2f(curr_laa); + if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa) + curr_c = chg_c_laa2f(*c1p); else - curr_c = chg_c_laa2i(curr_laa); + curr_c = chg_c_laa2i(*c1p); - /* Remove the composing character */ + // Remove the composing character *c1p = 0; } - else if (!A_is_valid(prev_c) && A_is_valid(next_c)) - curr_c = chg_c_a2i(c); - else if (!shape_c || A_is_f(shape_c) || A_is_s(shape_c) || prev_laa) - curr_c = A_is_valid(next_c) ? chg_c_a2i(c) : chg_c_a2s(c); - else if (A_is_valid(next_c)) -#if 0 - curr_c = A_is_iso(c) ? chg_c_a2m(c) : chg_c_i2m(c); -#else - curr_c = A_is_iso(c) ? 
chg_c_a2m(c) : 0; -#endif - else if (A_is_valid(prev_c)) - curr_c = chg_c_a2f(c); else - curr_c = chg_c_a2s(c); + { + struct achar *curr_a = find_achar(c); + int backward_combine = !prev_laa && can_join(prev_c, c); + int forward_combine = can_join(c, next_c); + + if (backward_combine && forward_combine) + curr_c = curr_a->medial; + if (backward_combine && !forward_combine) + curr_c = curr_a->final; + if (!backward_combine && forward_combine) + curr_c = curr_a->initial; + if (!backward_combine && !forward_combine) + curr_c = curr_a->isolated; + } - /* Sanity check -- curr_c should, in the future, never be 0. - * We should, in the future, insert a fatal error here. */ + // Sanity check -- curr_c should, in the future, never be 0. + // We should, in the future, insert a fatal error here. if (curr_c == NUL) curr_c = c; @@ -616,97 +382,12 @@ arabic_shape( { char_u buf[MB_MAXBYTES + 1]; - /* Update the first byte of the character. */ + // Update the first byte of the character. (*mb_char2bytes)(curr_c, buf); *ccp = buf[0]; } - /* Return the shaped character */ + // Return the shaped character return curr_c; } - - -/* - * A_firstc_laa returns first character of LAA combination if it exists - */ - static int -A_firstc_laa( - int c, /* base character */ - int c1) /* first composing character */ -{ - if (c1 != NUL && c == a_LAM && !A_is_harakat(c1)) - return c1; - return 0; -} - - -/* - * A_is_harakat returns TRUE if 'c' is an Arabic Harakat character - * (harakat/tanween) - */ - static int -A_is_harakat(int c) -{ - return (c >= a_FATHATAN && c <= a_SUKUN); -} - - -/* - * A_is_iso returns TRUE if 'c' is an Arabic ISO-8859-6 character - * (alphabet/number/punctuation) - */ - static int -A_is_iso(int c) -{ - return ((c >= a_HAMZA && c <= a_GHAIN) - || (c >= a_TATWEEL && c <= a_HAMZA_BELOW) - || c == a_MINI_ALEF); -} - - -/* - * A_is_formb returns TRUE if 'c' is an Arabic 10646-1 FormB character - * (alphabet/number/punctuation) - */ - static int -A_is_formb(int c) -{ - return 
((c >= a_s_FATHATAN && c <= a_s_DAMMATAN) - || c == a_s_KASRATAN - || (c >= a_s_FATHA && c <= a_f_LAM_ALEF) - || c == a_BYTE_ORDER_MARK); -} - - -/* - * A_is_ok returns TRUE if 'c' is an Arabic 10646 (8859-6 or Form-B) - */ - static int -A_is_ok(int c) -{ - return (A_is_iso(c) || A_is_formb(c)); -} - - -/* - * A_is_valid returns TRUE if 'c' is an Arabic 10646 (8859-6 or Form-B) - * with some exceptions/exclusions - */ - static int -A_is_valid(int c) -{ - return (A_is_ok(c) && !A_is_special(c)); -} - - -/* - * A_is_special returns TRUE if 'c' is not a special Arabic character. - * Specials don't adhere to most of the rules. - */ - static int -A_is_special(int c) -{ - return (c == a_HAMZA || c == a_s_HAMZA); -} - -#endif /* FEAT_ARABIC */ +#endif // FEAT_ARABIC |