diff options
Diffstat (limited to 'src/spell.c')
-rw-r--r-- | src/spell.c | 408 |
1 files changed, 334 insertions, 74 deletions
diff --git a/src/spell.c b/src/spell.c index 10390ece6d..114e3f2aa8 100644 --- a/src/spell.c +++ b/src/spell.c @@ -35,7 +35,7 @@ * original case. The second one is only used for keep-case words and is * usually small. * - * There is one additional tree for when prefixes are not applied when + * There is one additional tree for when not all prefixes are applied when * generating the .spl file. This tree stores all the possible prefixes, as * if they were words. At each word (prefix) end the prefix nr is stored, the * following word must support this prefix nr. And the condition nr is @@ -72,21 +72,6 @@ #define RESCORE(word_score, sound_score) ((3 * word_score + sound_score) / 4) /* - * The double scoring mechanism is based on the principle that there are two - * kinds of spelling mistakes: - * 1. You know how to spell the word, but mistype something. This results in - * a small editing distance (character swapped/omitted/inserted) and - * possibly a word that sounds completely different. - * 2. You don't know how to spell the word and type something that sounds - * right. The edit distance can be big but the word is similar after - * sound-folding. - * Since scores for these two mistakes will be very different we use a list - * for each. - * The sound-folding is slow, only do double scoring when 'spellsuggest' is - * "double". - */ - -/* * Vim spell file format: <HEADER> * <SUGGEST> * <LWORDTREE> @@ -98,9 +83,10 @@ * <charflagslen> <charflags> * <fcharslen> <fchars> * <midwordlen> <midword> + * <compoundlen> <compoundtype> <compoundinfo> * <prefcondcnt> <prefcond> ... * - * <fileID> 10 bytes "VIMspell09" + * <fileID> 10 bytes "VIMspell10" * <regioncnt> 1 byte number of regions following (8 supported) * <regionname> 2 bytes Region name: ca, au, etc. Lower case. * First <regionname> is region 1. @@ -116,6 +102,17 @@ * <midword> N bytes Characters that are word characters only when used * in the middle of a word. 
* + * <compoundlen> 2 bytes Number of bytes following for compound info (can + * be used to skip it when it's not understood). + * + * <compoundtype> 1 byte 1: compound words using <comp1minlen> and + * <comp1flags> + * + * <comp1minlen> 1 byte minimal word length for compounding + * + * <comp1flags> N bytes flags used for compounding words + * + * * <prefcondcnt> 2 bytes Number of <prefcond> items following. * * <prefcond> : <condlen> <condstr> @@ -182,16 +179,16 @@ * follow in sorted order. * * <sibling>: <byte> [ <nodeidx> <xbyte> - * | <flags> [<flags2>] [<region>] [<prefixID>] - * | [<pflags>] <prefixID> <prefcondnr> ] + * | <flags> [<flags2>] [<region>] [<affixID>] + * | [<pflags>] <affixID> <prefcondnr> ] * * <byte> 1 byte Byte value of the sibling. Special cases: * BY_NOFLAGS: End of word without flags and for all * regions. - * For PREFIXTREE <prefixID> and + * For PREFIXTREE <affixID> and * <prefcondnr> follow. * BY_FLAGS: End of word, <flags> follow. - * For PREFIXTREE <pflags>, <prefixID> + * For PREFIXTREE <pflags>, <affixID> * and <prefcondnr> follow. * BY_FLAGS2: End of word, <flags> and <flags2> * follow. Not used in PREFIXTREE. @@ -210,7 +207,7 @@ * WF_RARE rare word * WF_BANNED bad word * WF_REGION <region> follows - * WF_PFX <prefixID> follows + * WF_AFX <affixID> follows * * <flags2> 1 byte Only used when there are postponed prefixes. * Bitmask of: @@ -225,7 +222,7 @@ * omitted it's valid in all regions. * Lowest bit is for region 1. * - * <prefixID> 1 byte ID of prefix that can be used with this word. For + * <affixID> 1 byte ID of affix that can be used with this word. In * PREFIXTREE used for the required prefix ID. 
* * <prefcondnr> 2 bytes Prefix condition number, index in <prefcond> list @@ -265,7 +262,7 @@ typedef long idx_T; #define WF_ALLCAP 0x04 /* word must be all capitals */ #define WF_RARE 0x08 /* rare word */ #define WF_BANNED 0x10 /* bad word */ -#define WF_PFX 0x20 /* prefix ID follows */ +#define WF_AFX 0x20 /* affix ID follows */ #define WF_FIXCAP 0x40 /* keep-case word, allcap not allowed */ #define WF_KEEPCAP 0x80 /* keep-case word */ @@ -279,7 +276,7 @@ typedef long idx_T; #define WFP_NC 0x02 /* prefix is not combining */ #define WFP_UP 0x04 /* to-upper prefix */ -/* flags for postponed prefixes. Must be above prefixID (one byte) +/* Flags for postponed prefixes. Must be above affixID (one byte) * and prefcondnr (two bytes). */ #define WF_RAREPFX (WFP_RARE << 24) /* in sl_pidxs: flag for rare * postponed prefix */ @@ -343,8 +340,8 @@ typedef short salfirst_T; * The "idxs" array stores the index of the child node corresponding to the * byte in "byts". * Exception: when the byte is zero, the word may end here and "idxs" holds - * the flags, region mask and prefixID for the word. There may be several - * zeros in sequence for alternative flag/region combinations. + * the flags, region mask and affixID for the word. There may be several + * zeros in sequence for alternative flag/region/affixID combinations. 
*/ typedef struct slang_S slang_T; struct slang_S @@ -365,6 +362,9 @@ struct slang_S char_u *sl_midword; /* MIDWORD string or NULL */ + int sl_compminlen; /* COMPOUNDMIN */ + char_u *sl_compflags; /* COMPOUNDFLAGS (NULL when no compounding) */ + int sl_prefixcnt; /* number of items in "sl_prefprog" */ regprog_T **sl_prefprog; /* table with regprogs for prefixes */ @@ -419,7 +419,7 @@ typedef struct langp_S #define SP_LOCAL 2 #define SP_BAD 3 -#define VIMSPELLMAGIC "VIMspell09" /* string at start of Vim spell file */ +#define VIMSPELLMAGIC "VIMspell10" /* string at start of Vim spell file */ #define VIMSPELLMAGICL 10 /* file used for "zG" and "zW" */ @@ -510,7 +510,7 @@ typedef struct matchinf_S /* for when checking word after a prefix */ int mi_prefarridx; /* index in sl_pidxs with list of - prefixID/condition */ + affixID/condition */ int mi_prefcnt; /* number of entries at mi_prefarridx */ int mi_prefixlen; /* byte length of prefix */ #ifdef FEAT_MBYTE @@ -520,6 +520,9 @@ typedef struct matchinf_S # define mi_cprefixlen mi_prefixlen /* it's the same value */ #endif + /* for when checking a compound word */ + int mi_compoff; /* start of following word offset */ + /* others */ int mi_result; /* result so far: SP_BAD, SP_OK, etc. 
*/ int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */ @@ -614,9 +617,11 @@ typedef struct trystate_S #define NOPREFIX 0xff /* not using prefixes */ /* mode values for find_word */ -#define FIND_FOLDWORD 0 /* find word case-folded */ -#define FIND_KEEPWORD 1 /* find keep-case word */ -#define FIND_PREFIX 2 /* find word after prefix */ +#define FIND_FOLDWORD 0 /* find word case-folded */ +#define FIND_KEEPWORD 1 /* find keep-case word */ +#define FIND_PREFIX 2 /* find word after prefix */ +#define FIND_COMPOUND 3 /* find case-folded compound word */ +#define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */ static slang_T *slang_alloc __ARGS((char_u *lang)); static void slang_free __ARGS((slang_T *lp)); @@ -928,14 +933,19 @@ find_word(mip, mode) unsigned flags; char_u *byts; idx_T *idxs; + int word_ends; - if (mode == FIND_KEEPWORD) + if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND) { /* Check for word with matching case in keep-case tree. */ ptr = mip->mi_word; flen = 9999; /* no case folding, always enough bytes */ byts = slang->sl_kbyts; idxs = slang->sl_kidxs; + + if (mode == FIND_KEEPCOMPOUND) + /* Skip over the previously found word(s). */ + wlen += mip->mi_compoff; } else { @@ -951,6 +961,13 @@ find_word(mip, mode) wlen = mip->mi_prefixlen; flen -= mip->mi_prefixlen; } + else if (mode == FIND_COMPOUND) + { + /* Skip over the previously found word(s). 
*/ + wlen = mip->mi_compoff; + flen -= mip->mi_compoff; + } + } if (byts == NULL) @@ -1058,7 +1075,13 @@ find_word(mip, mode) continue; /* not at first byte of character */ #endif if (spell_iswordp(ptr + wlen, mip->mi_buf)) - continue; /* next char is a word character */ + { + if (slang->sl_compflags == NULL) + continue; /* next char is a word character */ + word_ends = FALSE; + } + else + word_ends = TRUE; #ifdef FEAT_MBYTE if (mode != FIND_KEEPWORD && has_mbyte) @@ -1108,9 +1131,8 @@ find_word(mip, mode) /* When mode is FIND_PREFIX the word must support the prefix: * check the prefix ID and the condition. Do that for the list at * mip->mi_prefarridx that find_prefix() filled. */ - if (mode == FIND_PREFIX) + else if (mode == FIND_PREFIX) { - /* The prefix ID is stored two bytes above the flags. */ c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx, flags, mip->mi_word + mip->mi_cprefixlen, slang, @@ -1123,6 +1145,58 @@ find_word(mip, mode) flags |= WF_RARE; } + if (mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND + || !word_ends) + { + /* Makes you wonder why someone puts a compound flag on a word + * that's too short... Myspell compatibility requires this + * anyway. */ + if (wlen < slang->sl_compminlen) + continue; + + /* The word doesn't end or it comes after another: it must + * have a compound flag. */ + /* TODO: check more flags */ + if (*slang->sl_compflags != ((unsigned)flags >> 24)) + continue; + } + + if (!word_ends) + { + /* Check that a valid word follows. If there is one, it will + * set "mi_result", thus we are always finished here. + * Recursive! */ + + /* Find following word in case-folded tree. */ + mip->mi_compoff = endlen[endidxcnt]; +#ifdef FEAT_MBYTE + if (has_mbyte && mode == FIND_KEEPWORD) + { + /* Compute byte length in case-folded word from "wlen": + * byte length in keep-case word. Length may change when + * folding case. This can be slow, take a shortcut when + * the case-folded word is equal to the keep-case word. 
*/ + p = mip->mi_fword; + if (STRNCMP(ptr, p, wlen) != 0) + { + for (s = ptr; s < ptr + wlen; mb_ptr_adv(s)) + mb_ptr_adv(p); + mip->mi_compoff = p - mip->mi_fword; + } + } +#endif + find_word(mip, FIND_COMPOUND); + if (mip->mi_result == SP_OK) + break; + + /* Find following word in keep-case tree. */ + mip->mi_compoff = wlen; + find_word(mip, FIND_KEEPCOMPOUND); + if (mip->mi_result == SP_OK) + break; + continue; + } + if (flags & WF_BANNED) res = SP_BANNED; else if (flags & WF_REGION) @@ -1758,6 +1832,9 @@ slang_clear(lp) vim_free(lp->sl_midword); lp->sl_midword = NULL; + vim_free(lp->sl_compflags); + lp->sl_compflags = NULL; + #ifdef FEAT_MBYTE { int todo = lp->sl_map_hash.ht_used; @@ -1870,6 +1947,7 @@ spell_load_file(fname, lang, old_lp, silent) * <charflagslen> <charflags> * <fcharslen> <fchars> * <midwordlen> <midword> + * <compoundlen> <compoundtype> <compoundinfo> * <prefcondcnt> <prefcond> ... */ for (i = 0; i < VIMSPELLMAGICL; ++i) @@ -1929,6 +2007,41 @@ formerr: if (cnt < 0) goto endFAIL; + /* <compoundlen> <compoundtype> <compoundinfo> */ + cnt = (getc(fd) << 8) + getc(fd); /* <compoundlen> */ + if (cnt < 0) + goto endFAIL; + if (cnt > 0) + { + --cnt; + c = getc(fd); /* <compoundtype> */ + if (c != 1) + { + /* Unknown kind of compound words, skip the info. */ + while (cnt-- > 0) + getc(fd); + } + else if (cnt < 2) + goto formerr; + else + { + --cnt; + c = getc(fd); /* <comp1minlen> */ + if (c < 1 || c > 50) + c = 3; + lp->sl_compminlen = c; + + p = alloc(cnt + 1); + if (p == NULL) + goto endFAIL; + lp->sl_compflags = p; + while (cnt-- > 0) + *p++ = getc(fd); /* <comp1flags> */ + *p = NUL; + } + } + + /* <prefcondcnt> <prefcond> ... */ cnt = (getc(fd) << 8) + getc(fd); /* <prefcondcnt> */ if (cnt > 0) @@ -1943,7 +2056,7 @@ formerr: { /* <prefcond> : <condlen> <condstr> */ n = getc(fd); /* <condlen> */ - if (n < 0) + if (n < 0 || n >= MAXWLEN) goto formerr; /* When <condlen> is zero we have an empty condition. 
Otherwise * compile the regexp program used to check for the condition. */ @@ -2518,7 +2631,7 @@ read_tree(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr) else c = 0; - c |= getc(fd); /* <prefixID> */ + c |= getc(fd); /* <affixID> */ n = (getc(fd) << 8) + getc(fd); /* <prefcondnr> */ if (n >= maxprefcondnr) @@ -2536,8 +2649,8 @@ read_tree(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr) c = (getc(fd) << 8) + c; /* <flags2> */ if (c & WF_REGION) c = (getc(fd) << 16) + c; /* <region> */ - if (c & WF_PFX) - c = (getc(fd) << 24) + c; /* <prefixID> */ + if (c & WF_AFX) + c = (getc(fd) << 24) + c; /* <affixID> */ } idxs[idx] = c; @@ -3110,9 +3223,12 @@ spell_reload_one(fname, added_word) typedef struct afffile_S { char_u *af_enc; /* "SET", normalized, alloc'ed string or NULL */ + int af_slash; /* character used in word for slash */ int af_rar; /* RAR ID for rare word */ int af_kep; /* KEP ID for keep-case word */ int af_bad; /* BAD ID for banned word */ + char_u *af_compflags; /* COMPOUNDFLAG or COMPOUNDFLAGS */ + int af_compminlen; /* COMPOUNDMIN */ int af_pfxpostpone; /* postpone prefixes without chop string */ hashtab_T af_pref; /* hashtable for prefixes, affheader_T */ hashtab_T af_suff; /* hashtable for suffixes, affheader_T */ @@ -3187,7 +3303,7 @@ struct wordnode_S siblings, in following siblings it is always one. */ char_u wn_byte; /* Byte for this node. 
NUL for word end */ - char_u wn_prefixID; /* when "wn_byte" is NUL: supported/required + char_u wn_affixID; /* when "wn_byte" is NUL: supported/required prefix ID or 0 */ short_u wn_flags; /* when "wn_byte" is NUL: WF_ flags */ short wn_region; /* when "wn_byte" is NUL: region mask; for @@ -3245,6 +3361,8 @@ typedef struct spellinfo_S int si_rem_accents; /* soundsalike: remove accents */ garray_T si_map; /* MAP info concatenated */ char_u *si_midword; /* MIDWORD chars, alloc'ed string or NULL */ + int si_compminlen; /* minimal length for compounding */ + char_u *si_compflags; /* flags used for compounding */ garray_T si_prefcond; /* table with conditions for postponed * prefixes, each stored as a string */ int si_newID; /* current value for ah_newID */ @@ -3258,6 +3376,7 @@ static int has_non_ascii __ARGS((char_u *s)); static void spell_free_aff __ARGS((afffile_T *aff)); static int spell_read_dic __ARGS((spellinfo_T *spin, char_u *fname, afffile_T *affile)); static char_u *get_pfxlist __ARGS((spellinfo_T *spin, afffile_T *affile, char_u *afflist)); +static char_u *get_compflags __ARGS((spellinfo_T *spin, char_u *afflist)); static int store_aff_word __ARGS((spellinfo_T *spin, char_u *word, char_u *afflist, afffile_T *affile, hashtab_T *ht, hashtab_T *xht, int comb, int flags, char_u *pfxlist)); static int spell_read_wordfile __ARGS((spellinfo_T *spin, char_u *fname)); static void *getroom __ARGS((spellinfo_T *spin, size_t len, int align)); @@ -3265,7 +3384,7 @@ static char_u *getroom_save __ARGS((spellinfo_T *spin, char_u *s)); static void free_blocks __ARGS((sblock_T *bl)); static wordnode_T *wordtree_alloc __ARGS((spellinfo_T *spin)); static int store_word __ARGS((spellinfo_T *spin, char_u *word, int flags, int region, char_u *pfxlist)); -static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int region, int prefixID)); +static int tree_add_word __ARGS((spellinfo_T *spin, char_u *word, wordnode_T *tree, int flags, int 
region, int affixID)); static wordnode_T *get_wordnode __ARGS((spellinfo_T *spin)); static void deref_wordnode __ARGS((spellinfo_T *spin, wordnode_T *node)); static void free_wordnode __ARGS((spellinfo_T *spin, wordnode_T *n)); @@ -3547,6 +3666,14 @@ spell_read_aff(spin, fname) { /* ignored, we look in the tree for what chars may appear */ } + else if (STRCMP(items[0], "SLASH") == 0 && itemcnt == 2 + && aff->af_slash == 0) + { + aff->af_slash = items[1][0]; + if (items[1][1] != NUL) + smsg((char_u *)_("Character used for SLASH must be ASCII; in %s line %d: %s"), + fname, lnum, items[1]); + } else if (STRCMP(items[0], "RAR") == 0 && itemcnt == 2 && aff->af_rar == 0) { @@ -3568,6 +3695,26 @@ spell_read_aff(spin, fname) if (items[1][1] != NUL) smsg((char_u *)_(e_affname), fname, lnum, items[1]); } + else if (STRCMP(items[0], "COMPOUNDFLAG") == 0 && itemcnt == 2 + && aff->af_compflags == 0) + { + aff->af_compflags = getroom_save(spin, items[1]); + if (items[1][1] != NUL) + smsg((char_u *)_(e_affname), fname, lnum, items[1]); + } + else if (STRCMP(items[0], "COMPOUNDFLAGS") == 0 && itemcnt == 2 + && aff->af_compflags == 0) + { + aff->af_compflags = getroom_save(spin, items[1]); + } + else if (STRCMP(items[0], "COMPOUNDMIN") == 0 && itemcnt == 2 + && aff->af_compminlen == 0) + { + aff->af_compminlen = atoi((char *)items[1]); + if (aff->af_compminlen == 0) + smsg((char_u *)_("Wrong COMPOUNDMIN value in %s line %d: %s"), + fname, lnum, items[1]); + } else if (STRCMP(items[0], "PFXPOSTPONE") == 0 && itemcnt == 1) { aff->af_pfxpostpone = TRUE; @@ -3688,7 +3835,10 @@ spell_read_aff(spin, fname) else sprintf((char *)buf, "%s$", items[4]); aff_entry->ae_prog = vim_regcomp(buf, - RE_MAGIC + RE_STRING); + RE_MAGIC + RE_STRING + RE_STRICT); + if (aff_entry->ae_prog == NULL) + smsg((char_u *)_("Broken condition in %s line %d: %s"), + fname, lnum, items[4]); } /* For postponed prefixes we need an entry in si_prefcond @@ -3908,7 +4058,7 @@ spell_read_aff(spin, fname) spin->si_sofoto 
= vim_strsave(items[1]); } else - smsg((char_u *)_("Unrecognized item in %s line %d: %s"), + smsg((char_u *)_("Unrecognized or duplicate item in %s line %d: %s"), fname, lnum, items[0]); } } @@ -3952,6 +4102,28 @@ spell_read_aff(spin, fname) vim_free(upp); } + /* Use compound specifications of the .aff file for the spell info. */ + if (aff->af_compminlen != 0) + { + if (spin->si_compminlen != 0 + && spin->si_compminlen != aff->af_compminlen) + smsg((char_u *)_("COMPOUNDMIN value differs from what is used in another .aff file")); + else + spin->si_compminlen = aff->af_compminlen; + } + + if (aff->af_compflags != NULL) + { + if (spin->si_compflags != NULL + && STRCMP(spin->si_compflags, aff->af_compflags) != 0) + smsg((char_u *)_("COMPOUNDFLAG(S) value differs from what is used in another .aff file")); + else + spin->si_compflags = aff->af_compflags; + + if (aff->af_pfxpostpone) + smsg((char_u *)_("Cannot use both PFXPOSTPONE and COMPOUNDFLAG(S)")); + } + vim_free(pc); fclose(fd); return aff; @@ -4072,8 +4244,9 @@ spell_read_dic(spin, fname, affile) { hashtab_T ht; char_u line[MAXLINELEN]; + char_u *p; char_u *afflist; - char_u *pfxlist; + char_u *store_afflist; char_u *dw; char_u *pc; char_u *w; @@ -4086,6 +4259,7 @@ spell_read_dic(spin, fname, affile) int retval = OK; char_u message[MAXLINELEN + MAXWLEN]; int flags; + int duplicate = 0; /* * Open the file. @@ -4139,10 +4313,20 @@ spell_read_dic(spin, fname, affile) continue; /* empty line */ line[l] = NUL; - /* Find the optional affix names. */ - afflist = vim_strchr(line, '/'); - if (afflist != NULL) - *afflist++ = NUL; + /* Find the optional affix names. Replace the SLASH character by a + * slash. */ + afflist = NULL; + for (p = line; *p != NUL; mb_ptr_adv(p)) + { + if (*p == affile->af_slash) + *p = '/'; + else if (*p == '/') + { + *p = NUL; + afflist = p + 1; + break; + } + } /* Skip non-ASCII words when "spin->si_ascii" is TRUE. 
*/ if (spin->si_ascii && has_non_ascii(line)) @@ -4197,13 +4381,20 @@ spell_read_dic(spin, fname, affile) hash = hash_hash(dw); hi = hash_lookup(&ht, dw, hash); if (!HASHITEM_EMPTY(hi)) - smsg((char_u *)_("Duplicate word in %s line %d: %s"), + { + if (p_verbose > 0) + smsg((char_u *)_("Duplicate word in %s line %d: %s"), + fname, lnum, dw); + else if (duplicate == 0) + smsg((char_u *)_("First duplicate word in %s line %d: %s"), fname, lnum, dw); + ++duplicate; + } else hash_add_item(&ht, hi, dw, hash); flags = 0; - pfxlist = NULL; + store_afflist = NULL; if (afflist != NULL) { /* Check for affix name that stands for keep-case word and stands @@ -4220,11 +4411,15 @@ spell_read_dic(spin, fname, affile) if (affile->af_pfxpostpone) /* Need to store the list of prefix IDs with the word. */ - pfxlist = get_pfxlist(spin, affile, afflist); + store_afflist = get_pfxlist(spin, affile, afflist); + else if (spin->si_compflags) + /* Need to store the list of affix IDs for compounding with + * the word. */ + store_afflist = get_compflags(spin, afflist); } /* Add the word to the word tree(s). */ - if (store_word(spin, dw, flags, spin->si_region, pfxlist) == FAIL) + if (store_word(spin, dw, flags, spin->si_region, store_afflist) == FAIL) retval = FAIL; if (afflist != NULL) @@ -4233,20 +4428,22 @@ spell_read_dic(spin, fname, affile) * Additionally do matching prefixes that combine. */ if (store_aff_word(spin, dw, afflist, affile, &affile->af_suff, &affile->af_pref, - FALSE, flags, pfxlist) == FAIL) + FALSE, flags, store_afflist) == FAIL) retval = FAIL; /* Find all matching prefixes and add the resulting words. 
*/ if (store_aff_word(spin, dw, afflist, affile, &affile->af_pref, NULL, - FALSE, flags, pfxlist) == FAIL) + FALSE, flags, store_afflist) == FAIL) retval = FAIL; } } + if (duplicate > 0) + smsg((char_u *)_("%d duplicate word(s) in %s"), duplicate, fname); if (spin->si_ascii && non_ascii > 0) - smsg((char_u *)_("Ignored %d words with non-ASCII characters"), - non_ascii); + smsg((char_u *)_("Ignored %d word(s) with non-ASCII characters in %s"), + non_ascii, fname); hash_clear(&ht); fclose(fd); @@ -4303,6 +4500,49 @@ get_pfxlist(spin, affile, afflist) } /* + * Get the list of affix IDs from the affix list "afflist" that are used for + * compound words. + * Returns a string allocated with getroom(). NULL when there are no relevant + * affixes or when out of memory. + */ + static char_u * +get_compflags(spin, afflist) + spellinfo_T *spin; + char_u *afflist; +{ + char_u *p; + int cnt; + int round; + char_u *res = NULL; + + /* round 1: count the number of affix IDs. + * round 2: move affix IDs to "res" */ + for (round = 1; round <= 2; ++round) + { + cnt = 0; + for (p = afflist; *p != NUL; ++p) + { + if (*p != ',' && *p != '-' + && vim_strchr(spin->si_compflags, *p) != NULL) + { + /* This is a compound affix ID. */ + if (round == 2) + res[cnt] = *p; + ++cnt; + } + } + if (round == 1 && cnt > 0) + res = getroom(spin, cnt + 1, FALSE); + if (res == NULL) + break; + } + + if (res != NULL) + res[cnt] = NUL; + return res; +} + +/* * Apply affixes to a word and store the resulting words. * "ht" is the hashtable with affentry_T that need to be applied, either * prefixes or suffixes. @@ -4335,6 +4575,7 @@ store_aff_word(spin, word, afflist, affile, ht, xht, comb, flags, pfxlist) int use_flags; char_u *use_pfxlist; int c; + int wordlen = STRLEN(word); todo = ht->ht_used; for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi) @@ -4355,12 +4596,16 @@ store_aff_word(spin, word, afflist, affile, ht, xht, comb, flags, pfxlist) /* Check the condition. 
It's not logical to match case * here, but it is required for compatibility with * Myspell. + * Another requirement from Myspell is that the chop + * string is shorter than the word itself. * For prefixes, when "PFXPOSTPONE" was used, only do * prefixes with a chop string. */ regmatch.regprog = ae->ae_prog; regmatch.rm_ic = FALSE; if ((xht != NULL || !affile->af_pfxpostpone || ae->ae_chop != NULL) + && (ae->ae_chop == NULL + || STRLEN(ae->ae_chop) < wordlen) && (ae->ae_prog == NULL || vim_regexec(&regmatch, word, (colnr_T)0))) { @@ -4798,13 +5043,13 @@ store_word(spin, word, flags, region, pfxlist) * Returns FAIL when out of memory. */ static int -tree_add_word(spin, word, root, flags, region, prefixID) +tree_add_word(spin, word, root, flags, region, affixID) spellinfo_T *spin; char_u *word; wordnode_T *root; int flags; int region; - int prefixID; + int affixID; { wordnode_T *node = root; wordnode_T *np; @@ -4836,7 +5081,7 @@ tree_add_word(spin, word, root, flags, region, prefixID) { np->wn_flags = copyp->wn_flags; np->wn_region = copyp->wn_region; - np->wn_prefixID = copyp->wn_prefixID; + np->wn_affixID = copyp->wn_affixID; } /* Link the new node in the list, there will be one ref. */ @@ -4853,15 +5098,15 @@ tree_add_word(spin, word, root, flags, region, prefixID) /* Look for the sibling that has the same character. They are sorted * on byte value, thus stop searching when a sibling is found with a * higher byte value. For zero bytes (end of word) the sorting is - * done on flags and then on prefixID. */ + * done on flags and then on affixID. */ while (node != NULL && (node->wn_byte < word[i] || (node->wn_byte == NUL && (flags < 0 - ? node->wn_prefixID < prefixID + ? 
node->wn_affixID < affixID : node->wn_flags < (flags & WN_MASK) || (node->wn_flags == (flags & WN_MASK) - && node->wn_prefixID < prefixID))))) + && node->wn_affixID < affixID))))) { prev = &node->wn_sibling; node = *prev; @@ -4871,7 +5116,7 @@ tree_add_word(spin, word, root, flags, region, prefixID) || (word[i] == NUL && (flags < 0 || node->wn_flags != (flags & WN_MASK) - || node->wn_prefixID != prefixID))) + || node->wn_affixID != affixID))) { /* Allocate a new node. */ np = get_wordnode(spin); @@ -4899,7 +5144,7 @@ tree_add_word(spin, word, root, flags, region, prefixID) { node->wn_flags = flags; node->wn_region |= region; - node->wn_prefixID = prefixID; + node->wn_affixID = affixID; break; } prev = &node->wn_child; @@ -5134,8 +5379,8 @@ node_compress(spin, node, ht, tot) for (np = node; np != NULL; np = np->wn_sibling) { if (np->wn_byte == NUL) - /* end node: use wn_flags, wn_region and wn_prefixID */ - n = np->wn_flags + (np->wn_region << 8) + (np->wn_prefixID << 16); + /* end node: use wn_flags, wn_region and wn_affixID */ + n = np->wn_flags + (np->wn_region << 8) + (np->wn_affixID << 16); else /* byte node: use the byte value and the child pointer */ n = np->wn_byte + ((long_u)np->wn_child << 8); @@ -5173,7 +5418,7 @@ node_equal(n1, n2) || (p1->wn_byte == NUL ? (p1->wn_flags != p2->wn_flags || p1->wn_region != p2->wn_region - || p1->wn_prefixID != p2->wn_prefixID) + || p1->wn_affixID != p2->wn_affixID) : (p1->wn_child != p2->wn_child))) break; @@ -5249,6 +5494,7 @@ write_vim_spell(spin, fname) * <charflagslen> <charflags> * <fcharslen> <fchars> * <midwordlen> <midword> + * <compoundlen> <compoundtype> <compoundinfo> * <prefcondcnt> <prefcond> ... */ /* <fileID> */ @@ -5297,6 +5543,20 @@ write_vim_spell(spin, fname) } + /* Write the compound info. 
*/ + if (spin->si_compflags == NULL) + put_bytes(fd, 0L, 2); /* <compoundlen> */ + else + { + l = STRLEN(spin->si_compflags); + put_bytes(fd, (long_u)(l + 2), 2); /* <compoundlen> */ + putc(1, fd); /* <compoundtype> */ + putc(spin->si_compminlen, fd); /* <comp1minlen> */ + fwrite(spin->si_compflags, (size_t)l, (size_t)1, fd); + /* <comp1flags> */ + } + + /* Write the prefix conditions. */ write_spell_prefcond(fd, &spin->si_prefcond); @@ -5472,7 +5732,7 @@ put_node(fd, node, index, regionmask, prefixtree) /* For a NUL byte (end of word) write the flags etc. */ if (prefixtree) { - /* In PREFIXTREE write the required prefixID and the + /* In PREFIXTREE write the required affixID and the * associated condition nr (stored in wn_region). The * byte value is misused to store the "rare" and "not * combining" flags */ @@ -5483,7 +5743,7 @@ put_node(fd, node, index, regionmask, prefixtree) putc(BY_FLAGS, fd); /* <byte> */ putc(np->wn_flags, fd); /* <pflags> */ } - putc(np->wn_prefixID, fd); /* <prefixID> */ + putc(np->wn_affixID, fd); /* <affixID> */ put_bytes(fd, (long_u)np->wn_region, 2); /* <prefcondnr> */ } else @@ -5492,8 +5752,8 @@ put_node(fd, node, index, regionmask, prefixtree) flags = np->wn_flags; if (regionmask != 0 && np->wn_region != regionmask) flags |= WF_REGION; - if (np->wn_prefixID != 0) - flags |= WF_PFX; + if (np->wn_affixID != 0) + flags |= WF_AFX; if (flags == 0) { /* word without flags or region */ @@ -5514,8 +5774,8 @@ put_node(fd, node, index, regionmask, prefixtree) } if (flags & WF_REGION) putc(np->wn_region, fd); /* <region> */ - if (flags & WF_PFX) - putc(np->wn_prefixID, fd); /* <prefixID> */ + if (flags & WF_AFX) + putc(np->wn_affixID, fd); /* <affixID> */ } } } |