updated for version 7.0085

author: Bram Moolenaar <Bram@vim.org> 2005-06-14 22:01:04 +0000
committer: Bram Moolenaar <Bram@vim.org> 2005-06-14 22:01:04 +0000
commit: 9f30f50471678a0a986c30b50dce705bdcc991dc (patch)
tree: f4db0c82382dd5997a02e406c91acef45907f9d5 /src
parent: 9ba0eb850c0f4c94df3b7f7461610bf0b073f712 (diff)
1 files changed, 535 insertions, 193 deletions
diff --git a/src/spell.c b/src/spell.c
index 2b37f632fc..54ca1036bd 100644
--- a/src/spell.c
+++ b/src/spell.c
@@ -13,7 +13,23 @@
  * The spell checking mechanism uses a tree (aka trie).  Each node in the tree
  * has a list of bytes that can appear (siblings).  For each byte there is a
  * pointer to the node with the byte that follows in the word (child).
- * A NUL byte is used where the word may end.
+ *
+ * A NUL byte is used where the word may end.  The bytes are sorted, so that
+ * binary searching can be used and the NUL bytes are at the start.  The
+ * number of possible bytes is stored before the list of bytes.
+ *
+ * The tree uses two arrays: "byts" stores the characters, "idxs" stores
+ * either the next index or flags.  The tree starts at index 0.  For example,
+ * to lookup "vi" this sequence is followed:
+ *	i = 0
+ *	len = byts[i]
+ *	n = where "v" appears in byts[i + 1] to byts[i + len]
+ *	i = idxs[n]
+ *	len = byts[i]
+ *	n = where "i" appears in byts[i + 1] to byts[i + len]
+ *	i = idxs[n]
+ *	len = byts[i]
+ *	find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
  *
  * There are two trees: one with case-folded words and one with words in
  * original case.  The second one is only used for keep-case words and is
@@ -30,8 +46,17 @@
 
 /*
  * Use this to let the score depend in how much a suggestion sounds like the
- * bad word.  It's quite slow and doesn't make the sorting much better....
- * #define SOUNDFOLD_SCORE
+ * bad word.  It's quite slow and only occasionally makes the sorting better.
+#define SOUNDFOLD_SCORE
+ */
+
+/*
+ * Use this to adjust the score after finding suggestions, based on the
+ * suggested word sounding like the bad word.  This is much faster than doing
+ * it for every possible suggestion.
+ * Disadvantage: When "the" is typed as "hte" it sounds different and goes
+ * down in the list.
+#define RESCORE(word_score, sound_score) ((2 * word_score + sound_score) / 3)
  */
 
 /*
@@ -47,8 +72,8 @@
  *
  * <charflagslen> 1 byte    Number of bytes in <charflags> (should be 128).
  * <charflags>  N bytes     List of flags (first one is for character 128):
- *			    0x01  word character
- *			    0x02  upper-case character
+ *			    0x01  word character	CF_WORD
+ *			    0x02  upper-case character	CF_UPPER
  * <fcharslen>  2 bytes     Number of bytes in <fchars>.
  * <fchars>     N bytes	    Folded characters, first one is for character 128.
  *
@@ -145,7 +170,16 @@
 				   Some places assume a word length fits in a
 				   byte, thus it can't be above 255. */
 
-/* Flags used for a word. */
+/* Type used for indexes in the word tree need to be at least 3 bytes.  If int
+ * is 8 bytes we could use something smaller, but what? */
+#if SIZEOF_INT > 2
+typedef int idx_T;
+#else
+typedef long idx_T;
+#endif
+
+/* Flags used for a word.  Only the lowest byte can be used, the region byte
+ * comes above it. */
 #define WF_REGION   0x01	/* region byte follows */
 #define WF_ONECAP   0x02	/* word with one capital (or all capitals) */
 #define WF_ALLCAP   0x04	/* word must be all capitals */
@@ -155,6 +189,9 @@
 
 #define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP)
 
+#define WF_USED	    0x10000	/* Word was found in text. Must be in separate
+				   byte before region and flags. */
+
 #define BY_NOFLAGS  0		/* end of word without flags or region */
 #define BY_FLAGS    1		/* end of word, flag byte follows */
 #define BY_INDEX    2		/* child is shared, index follows */
@@ -192,9 +229,9 @@ struct slang_S
     char_u	*sl_fname;	/* name of .spl file */
     int		sl_add;		/* TRUE if it's a .add file. */
     char_u	*sl_fbyts;	/* case-folded word bytes */
-    int		*sl_fidxs;	/* case-folded word indexes */
+    idx_T	*sl_fidxs;	/* case-folded word indexes */
     char_u	*sl_kbyts;	/* keep-case word bytes */
-    int		*sl_kidxs;	/* keep-case word indexes */
+    idx_T	*sl_kidxs;	/* keep-case word indexes */
     char_u	sl_regions[17];	/* table with up to 8 region names plus NUL */
 
     garray_T	sl_rep;		/* list of fromto_T entries from REP lines */
@@ -267,6 +304,9 @@ typedef struct suggest_S
     char_u	*st_word;	/* suggested word, allocated string */
     int		st_orglen;	/* length of replaced text */
     int		st_score;	/* lower is better */
+#ifdef RESCORE
+    int		st_had_bonus;	/* bonus already included in score */
+#endif
 } suggest_T;
 
 #define SUG(sup, i) (((suggest_T *)(sup)->su_ga.ga_data)[i])
@@ -274,8 +314,14 @@ typedef struct suggest_S
 /* Number of suggestions displayed. */
 #define SUG_PROMPT_COUNT    ((int)Rows - 2)
 
-/* Threshold for sorting and cleaning up suggestions. */
-#define SUG_CLEANUP_COUNT   (SUG_PROMPT_COUNT + 50)
+/* Number of suggestions kept when cleaning up.  When rescore_suggestions() is
+ * called the score may change, thus we need to keep more than what is
+ * displayed. */
+#define SUG_CLEAN_COUNT	    (SUG_PROMPT_COUNT < 25 ? 25 : SUG_PROMPT_COUNT)
+
+/* Threshold for sorting and cleaning up suggestions.  Don't want to keep lots
+ * of suggestions that are not going to be displayed. */
+#define SUG_MAX_COUNT	    (SUG_PROMPT_COUNT + 50)
 
 /* score for various changes */
 #define SCORE_SPLIT	99	/* split bad word */
@@ -283,6 +329,7 @@ typedef struct suggest_S
 #define SCORE_ALLCAP	120	/* need all-cap case */
 #define SCORE_REGION	70	/* word is for different region */
 #define SCORE_RARE	180	/* rare word */
+#define SCORE_NOTUSED	11	/* word not found in text yet */
 
 /* score for edit distance */
 #define SCORE_SWAP	90	/* swap two characters */
@@ -290,8 +337,8 @@ typedef struct suggest_S
 #define SCORE_REP	87	/* REP replacement */
 #define SCORE_SUBST	93	/* substitute a character */
 #define SCORE_SIMILAR	33	/* substitute a similar character */
-#define SCORE_DEL	96	/* delete a character */
-#define SCORE_INS	94	/* insert a character */
+#define SCORE_DEL	94	/* delete a character */
+#define SCORE_INS	96	/* insert a character */
 
 #define SCORE_MAXINIT	350	/* Initial maximum score: higher == slower.
 				 * 350 allows for about three changes. */
@@ -329,13 +376,14 @@ typedef struct spelltab_S
     char_u  st_isw[256];	/* flags: is word char */
     char_u  st_isu[256];	/* flags: is uppercase char */
     char_u  st_fold[256];	/* chars: folded case */
+    char_u  st_upper[256];	/* chars: upper case */
 } spelltab_T;
 
 static spelltab_T   spelltab;
 static int	    did_set_spelltab;
 
-#define SPELL_ISWORD	1
-#define SPELL_ISUPPER	2
+#define CF_WORD		0x01
+#define CF_UPPER	0x02
 
 static void clear_spell_chartab __ARGS((spelltab_T *sp));
 static int set_spell_finish __ARGS((spelltab_T	*new_st));
@@ -364,7 +412,7 @@ typedef struct trystate_S
     int		ts_fidx;	/* index in fword[], case-folded bad word */
     int		ts_fidxtry;	/* ts_fidx at which bytes may be changed */
     int		ts_twordlen;	/* valid length of tword[] */
-    int		ts_arridx;	/* index in tree array, start of node */
+    idx_T	ts_arridx;	/* index in tree array, start of node */
     char_u	ts_save_prewordlen; /* saved "prewordlen" */
     int		ts_save_splitoff;   /* su_splitoff saved here */
     int		ts_save_badflags;   /* badflags saved here */
@@ -379,30 +427,80 @@ static void spell_load_lang __ARGS((char_u *lang));
 static char_u *spell_enc __ARGS((void));
 static void spell_load_cb __ARGS((char_u *fname, void *cookie));
 static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
-static int read_tree __ARGS((FILE *fd, char_u *byts, int *idxs, int maxidx, int startidx));
+static idx_T read_tree __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, int startidx));
 static int find_region __ARGS((char_u *rp, char_u *region));
 static int captype __ARGS((char_u *word, char_u *end));
 static void spell_reload_one __ARGS((char_u *fname, int added_word));
 static int set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
 static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
 static void write_spell_chartab __ARGS((FILE *fd));
-static int spell_isupper __ARGS((int c));
 static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
-static void onecap_copy __ARGS((char_u *word, int len, char_u *wcopy, int upper));
+static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
 static void spell_try_change __ARGS((suginfo_T *su));
 static int try_deeper __ARGS((suginfo_T *su, trystate_T *stack, int depth, int score_add));
 static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
 static void spell_try_soundalike __ARGS((suginfo_T *su));
 static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
+#if 0
 static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
+#endif
+#ifdef RESCORE
+static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score, int had_bonus));
+#else
 static void add_suggestion __ARGS((suginfo_T *su, char_u *goodword, int use_score));
+#endif
 static void add_banned __ARGS((suginfo_T *su, char_u *word));
 static int was_banned __ARGS((suginfo_T *su, char_u *word));
 static void free_banned __ARGS((suginfo_T *su));
-static void cleanup_suggestions __ARGS((suginfo_T *su));
+#ifdef RESCORE
+static void rescore_suggestions __ARGS((suginfo_T *su));
+#endif
+static void cleanup_suggestions __ARGS((suginfo_T *su, int keep));
 static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, char_u *res));
+#if defined(RESCORE) || defined(SOUNDFOLD_SCORE)
+static int spell_sound_score __ARGS((slang_T *slang, char_u *goodword, char_u	*badsound));
+#endif
 static int spell_edit_score __ARGS((char_u *badword, char_u *goodword));
 
+/*
+ * Use our own character-case definitions, because the current locale may
+ * differ from what the .spl file uses.
+ * These must not be called with negative number!
+ */
+#ifndef FEAT_MBYTE
+/* Non-multi-byte implementation. */
+# define SPELL_TOFOLD(c) ((c) < 256 ? spelltab.st_fold[c] : (c))
+# define SPELL_TOUPPER(c) ((c) < 256 ? spelltab.st_upper[c] : (c))
+# define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
+#else
+/* Multi-byte implementation.  For Unicode we can call utf_*(), but don't do
+ * that for ASCII, because we don't want to use 'casemap' here.  Otherwise use
+ * the "w" library function for characters above 255 if available. */
+# ifdef HAVE_TOWLOWER
+#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
+	    : (c) < 256 ? spelltab.st_fold[c] : towlower(c))
+# else
+#  define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
+	    : (c) < 256 ? spelltab.st_fold[c] : (c))
+# endif
+
+# ifdef HAVE_TOWUPPER
+#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
+	    : (c) < 256 ? spelltab.st_upper[c] : towupper(c))
+# else
+#  define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
+	    : (c) < 256 ? spelltab.st_upper[c] : (c))
+# endif
+
+# ifdef HAVE_ISWUPPER
+#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
+	    : (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
+# else
+#  define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
+	    : (c) < 256 ? spelltab.st_isu[c] : (c))
+# endif
+#endif
+
 
 static char *e_format = N_("E759: Format error in spell file");
 
@@ -489,6 +587,7 @@ spell_check(wp, ptr, attrp)
 	    /* Check for a matching word in case-folded words. */
 	    find_word(&mi, FALSE);
 
+	    /* Check for a matching word in keep-case words. */
 	    find_word(&mi, TRUE);
 	}
 
@@ -528,16 +627,16 @@ find_word(mip, keepcap)
     matchinf_T	*mip;
     int		keepcap;
 {
-    int		arridx = 0;
+    idx_T	arridx = 0;
     int		endlen[MAXWLEN];    /* length at possible word endings */
-    int		endidx[MAXWLEN];    /* possible word endings */
+    idx_T	endidx[MAXWLEN];    /* possible word endings */
     int		endidxcnt = 0;
     int		len;
     int		wlen = 0;
     int		flen;
     int		c;
     char_u	*ptr;
-    unsigned	lo, hi, m;
+    idx_T	lo, hi, m;
 #ifdef FEAT_MBYTE
     char_u	*s;
 #endif
@@ -547,7 +646,7 @@ find_word(mip, keepcap)
     slang_T	*slang = mip->mi_lp->lp_slang;
     unsigned	flags;
     char_u	*byts;
-    int		*idxs;
+    idx_T	*idxs;
 
     if (keepcap)
     {
@@ -692,6 +791,11 @@ find_word(mip, keepcap)
 	for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0; --len)
 	{
 	    flags = idxs[arridx];
+
+	    /* Set a flag for words that were used.  The region and case
+	     * doesn't matter here, it's only used to rate the suggestions. */
+	    idxs[arridx] = flags | WF_USED;
+
 	    if (keepcap)
 	    {
 		/* For "keepcap" tree the case is always right. */
@@ -823,7 +927,6 @@ spell_move_to(dir, allwords, curline)
 	    if (attr != 0)
 	    {
 		/* We found a bad word.  Check the attribute. */
-		/* TODO: check for syntax @Spell cluster. */
 		if (allwords || attr == highlight_attr[HLF_SPB])
 		{
 		    /* When searching forward only accept a bad word after
@@ -1073,6 +1176,7 @@ spell_load_file(fname, lang, old_lp, silent)
     fromto_T	*ftp;
     int		rr;
     short	*first;
+    idx_T	idx;
 
     fd = mch_fopen((char *)fname, "r");
     if (fd == NULL)
@@ -1170,7 +1274,7 @@ formerr:
 	    fol[i] = getc(fd);				/* <fchars> */
 	fol[i] = NUL;
 
-	/* Set the word-char flags and fill spell_isupper() table. */
+	/* Set the word-char flags and fill SPELL_ISUPPER() table. */
 	i = set_spell_charflags(p, cnt, fol);
 	vim_free(p);
 	vim_free(fol);
@@ -1293,19 +1397,19 @@ formerr:
 	    if (p == NULL)
 		goto endFAIL;
 	    if (round == 1)
-		lp->sl_fidxs = (int *)p;
+		lp->sl_fidxs = (idx_T *)p;
 	    else
-		lp->sl_kidxs = (int *)p;
+		lp->sl_kidxs = (idx_T *)p;
 
 
 	    /* Read the tree and store it in the array. */
-	    i = read_tree(fd,
+	    idx = read_tree(fd,
 			round == 1 ? lp->sl_fbyts : lp->sl_kbyts,
 			round == 1 ? lp->sl_fidxs : lp->sl_kidxs,
 			len, 0);
-	    if (i == -1)
+	    if (idx == -1)
 		goto truncerr;
-	    if (i < 0)
+	    if (idx < 0)
 		goto formerr;
 	}
     }
@@ -1348,18 +1452,18 @@ endOK:
  * Returns -1 if the file is shorter than expected.
  * Returns -2 if there is a format error.
  */
-    static int
+    static idx_T
 read_tree(fd, byts, idxs, maxidx, startidx)
     FILE	*fd;
     char_u	*byts;
-    int		*idxs;
+    idx_T	*idxs;
     int		maxidx;		    /* size of arrays */
-    int		startidx;	    /* current index in "byts" and "idxs" */
+    idx_T	startidx;	    /* current index in "byts" and "idxs" */
 {
     int		len;
     int		i;
     int		n;
-    int		idx = startidx;
+    idx_T	idx = startidx;
     int		c;
 #define SHARED_MASK	0x8000000
 
@@ -1619,7 +1723,7 @@ captype(word, end)
     else
 #endif
 	c = *p++;
-    firstcap = allcap = spell_isupper(c);
+    firstcap = allcap = SPELL_ISUPPER(c);
 
     /*
      * Need to check all letters to find a word with mixed upper/lower.
@@ -1633,7 +1737,7 @@ captype(word, end)
 #else
 	    c = *p;
 #endif
-	    if (!spell_isupper(c))
+	    if (!SPELL_ISUPPER(c))
 	    {
 		/* UUl -> KEEPCAP */
 		if (past_second && allcap)
@@ -1876,6 +1980,7 @@ spell_read_aff(fname, spin)
     int		do_sal;
     int		do_map;
     int		found_map = FALSE;
+    hashitem_T	*hi;
 
     /*
      * Open the file.
@@ -2031,7 +2136,8 @@ spell_read_aff(fname, spin)
 		else
 		    tp = &aff->af_suff;
 		aff_todo = atoi((char *)items[3]);
-		if (!HASHITEM_EMPTY(hash_find(tp, cur_aff->ah_key)))
+		hi = hash_find(tp, cur_aff->ah_key);
+		if (!HASHITEM_EMPTY(hi))
 		{
 		    smsg((char_u *)_("Duplicate affix in %s line %d: %s"),
 						       fname, lnum, items[1]);
@@ -2171,7 +2277,7 @@ spell_read_aff(fname, spin)
 	/*
 	 * Don't write a word table for an ASCII file, so that we don't check
 	 * for conflicts with a word table that matches 'encoding'.
-	 * Don't write one for utf-8 either, we use utf_isupper() and
+	 * Don't write one for utf-8 either, we use utf_*() and
 	 * mb_get_class(), the list of chars in the file will be incomplete.
 	 */
 	if (!spin->si_ascii
@@ -2336,7 +2442,7 @@ spell_read_dic(fname, spin, affile)
 
     /* Read and ignore the first line: word count. */
     (void)vim_fgets(line, MAXLINELEN, fd);
-    if (!isdigit(*skipwhite(line)))
+    if (!vim_isdigit(*skipwhite(line)))
 	EMSG2(_("E760: No word count in %s"), fname);
 
     /*
@@ -2528,12 +2634,14 @@ store_aff_word(word, spin, afflist, ht, xht, comb, flags)
 				/* Skip chop string. */
 #ifdef FEAT_MBYTE
 				if (has_mbyte)
+				{
 				    i = mb_charlen(ae->ae_chop);
+				    for ( ; i > 0; --i)
+					mb_ptr_adv(p);
+				}
 				else
 #endif
-				    i = STRLEN(ae->ae_chop);
-				for ( ; i > 0; --i)
-				    mb_ptr_adv(p);
+				    p += STRLEN(ae->ae_chop);
 			    }
 			    STRCAT(newword, p);
 			}
@@ -3754,13 +3862,16 @@ init_spellfile()
 clear_spell_chartab(sp)
     spelltab_T	*sp;
 {
-    int	    i;
+    int		i;
 
     /* Init everything to FALSE. */
     vim_memset(sp->st_isw, FALSE, sizeof(sp->st_isw));
     vim_memset(sp->st_isu, FALSE, sizeof(sp->st_isu));
     for (i = 0; i < 256; ++i)
+    {
 	sp->st_fold[i] = i;
+	sp->st_upper[i] = i;
+    }
 
     /* We include digits.  A word shouldn't start with a digit, but handling
      * that is done separately. */
@@ -3773,7 +3884,10 @@ clear_spell_chartab(sp)
 	sp->st_fold[i] = i + 0x20;
     }
     for (i = 'a'; i <= 'z'; ++i)
+    {
 	sp->st_isw[i] = TRUE;
+	sp->st_upper[i] = i - 0x20;
+    }
 }
 
 /*
@@ -3799,18 +3913,33 @@ init_spell_chartab()
 	    if (MB_BYTE2LEN(i) == 2)
 		spelltab.st_isw[i] = TRUE;
     }
+    else if (enc_utf8)
+    {
+	for (i = 128; i < 256; ++i)
+	{
+	    spelltab.st_isu[i] = utf_isupper(i);
+	    spelltab.st_isw[i] = spelltab.st_isu[i] || utf_islower(i);
+	    spelltab.st_fold[i] = utf_fold(i);
+	    spelltab.st_upper[i] = utf_toupper(i);
+	}
+    }
     else
 #endif
     {
-	/* Rough guess: use isalpha() and isupper() for characters above 128. */
+	/* Rough guess: use locale-dependent library functions. */
 	for (i = 128; i < 256; ++i)
 	{
-	    spelltab.st_isw[i] = MB_ISUPPER(i) || MB_ISLOWER(i);
 	    if (MB_ISUPPER(i))
 	    {
+		spelltab.st_isw[i] = TRUE;
 		spelltab.st_isu[i] = TRUE;
 		spelltab.st_fold[i] = MB_TOLOWER(i);
 	    }
+	    else if (MB_ISLOWER(i))
+	    {
+		spelltab.st_isw[i] = TRUE;
+		spelltab.st_upper[i] = MB_TOUPPER(i);
+	    }
 	}
     }
 }
@@ -3872,7 +4001,8 @@ set_spell_chartab(fol, low, upp)
 	}
 
 	/* if "UPP" and "FOL" are not the same the "UPP" char needs
-	 * case-folding and it's upper case. */
+	 * case-folding, it's upper case and the "UPP" is the upper case of
+	 * "FOL" . */
 	if (u < 256 && u != f)
 	{
 	    if (f >= 256)
@@ -3882,6 +4012,7 @@ set_spell_chartab(fol, low, upp)
 	    }
 	    new_st.st_fold[u] = f;
 	    new_st.st_isu[u] = TRUE;
+	    new_st.st_upper[f] = u;
 	}
     }
 
@@ -3908,21 +4039,25 @@ set_spell_charflags(flags, cnt, upp)
     spelltab_T	new_st;
     int		i;
     char_u	*p = upp;
+    int		c;
 
     clear_spell_chartab(&new_st);
 
     for (i = 0; i < cnt; ++i)
     {
-	new_st.st_isw[i + 128] = (flags[i] & SPELL_ISWORD) != 0;
-	new_st.st_isu[i + 128] = (flags[i] & SPELL_ISUPPER) != 0;
+	new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
+	new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
 
 	if (*p == NUL)
 	    return FAIL;
 #ifdef FEAT_MBYTE
-	new_st.st_fold[i + 128] = mb_ptr2char_adv(&p);
+	c = mb_ptr2char_adv(&p);
 #else
-	new_st.st_fold[i + 128] = *p++;
+	c = *p++;
 #endif
+	new_st.st_fold[i + 128] = c;
+	if (i + 128 != c && new_st.st_isu[i + 128] && c < 256)
+	    new_st.st_upper[c] = i + 128;
     }
 
     return set_spell_finish(&new_st);
@@ -3941,7 +4076,8 @@ set_spell_finish(new_st)
 	{
 	    if (spelltab.st_isw[i] != new_st->st_isw[i]
 		    || spelltab.st_isu[i] != new_st->st_isu[i]
-		    || spelltab.st_fold[i] != new_st->st_fold[i])
+		    || spelltab.st_fold[i] != new_st->st_fold[i]
+		    || spelltab.st_upper[i] != new_st->st_upper[i])
 	    {
 		EMSG(_("E763: Word characters differ between spell files"));
 		return FAIL;
@@ -3977,9 +4113,9 @@ write_spell_chartab(fd)
     {
 	flags = 0;
 	if (spelltab.st_isw[i])
-	    flags |= SPELL_ISWORD;
+	    flags |= CF_WORD;
 	if (spelltab.st_isu[i])
-	    flags |= SPELL_ISUPPER;
+	    flags |= CF_UPPER;
 	fputc(flags, fd);			    /* <charflags> */
 
 #ifdef FEAT_MBYTE
@@ -3995,43 +4131,14 @@ write_spell_chartab(fd)
 }
 
 /*
- * Return TRUE if "c" is an upper-case character for spelling.
- */
-    static int
-spell_isupper(c)
-    int		c;
-{
-# ifdef FEAT_MBYTE
-    if (enc_utf8)
-    {
-	/* For Unicode we can call utf_isupper(), but don't do that for ASCII,
-	 * because we don't want to use 'casemap' here. */
-	if (c >= 128)
-	    return utf_isupper(c);
-    }
-    else if (has_mbyte && c > 256)
-    {
-	/* For characters above 255 we don't have something specfied.
-	 * Fall back to locale-dependent iswupper().  If not available
-	 * simply return FALSE. */
-#  ifdef HAVE_ISWUPPER
-	return iswupper(c);
-#  else
-	return FALSE;
-#  endif
-    }
-# endif
-    return spelltab.st_isu[c];
-}
-
-/*
- * Case-fold "p[len]" into "buf[buflen]".  Used for spell checking.
+ * Case-fold "str[len]" into "buf[buflen]".  The result is NUL terminated.
+ * Uses the character definitions from the .spl file.
  * When using a multi-byte 'encoding' the length may change!
  * Returns FAIL when something wrong.
  */
     static int
-spell_casefold(p, len, buf, buflen)
-    char_u	*p;
+spell_casefold(str, len, buf, buflen)
+    char_u	*str;
     int		len;
     char_u	*buf;
     int		buflen;
@@ -4047,32 +4154,20 @@ spell_casefold(p, len, buf, buflen)
 #ifdef FEAT_MBYTE
     if (has_mbyte)
     {
-	int	c;
 	int	outi = 0;
+	char_u	*p;
+	int	c;
 
 	/* Fold one character at a time. */
-	for (i = 0; i < len; i += mb_ptr2len_check(p + i))
+	for (p = str; p < str + len; )
 	{
-	    c = mb_ptr2char(p + i);
-	    if (enc_utf8)
-		/* For Unicode case folding is always the same, no need to use
-		 * the table from the spell file. */
-		c = utf_fold(c);
-	    else if (c < 256)
-		/* Use the table from the spell file. */
-		c = spelltab.st_fold[c];
-# ifdef HAVE_TOWLOWER
-	    else
-		/* We don't know what to do, fall back to towlower(), it
-		 * depends on the current locale. */
-		c = towlower(c);
-# endif
 	    if (outi + MB_MAXBYTES > buflen)
 	    {
 		buf[outi] = NUL;
 		return FAIL;
 	    }
-	    outi += mb_char2bytes(c, buf + outi);
+	    c = mb_ptr2char_adv(&p);
+	    outi += mb_char2bytes(SPELL_TOFOLD(c), buf + outi);
 	}
 	buf[outi] = NUL;
     }
@@ -4081,7 +4176,7 @@ spell_casefold(p, len, buf, buflen)
     {
 	/* Be quick for non-multibyte encodings. */
 	for (i = 0; i < len; ++i)
-	    buf[i] = spelltab.st_fold[p[i]];
+	    buf[i] = spelltab.st_fold[str[i]];
 	buf[i] = NUL;
     }
 
@@ -4136,22 +4231,28 @@ spell_suggest()
     /*
      * 1. Try inserting/deleting/swapping/changing a letter, use REP entries
      *    from the .aff file and inserting a space (split the word).
+     *
+     * Set a maximum score to limit the combination of operations that is
+     * tried.
      */
-    /* Set a maximum score to limit the combination of operations that is
-     * tried. */
     sug.su_maxscore = SCORE_MAXINIT;
     spell_try_change(&sug);
-    cleanup_suggestions(&sug);
 
     /*
      * 2. Try finding sound-a-like words.
+     *
+     * Only do this when we don't have a lot of suggestions yet, because it's
+     * very slow and often doesn't find new suggestions.
      */
-    /* Allow a higher score if we don't have many suggestions yet. */
-    if (sug.su_maxscore == SCORE_MAXINIT)
+    if (sug.su_ga.ga_len < SUG_CLEAN_COUNT)
+    {
+	/* Allow a higher score now. */
 	sug.su_maxscore = SCORE_MAXMAX;
-    spell_try_soundalike(&sug);
+	spell_try_soundalike(&sug);
+    }
 
     /* When CTRL-C was hit while searching do show the results. */
+    ui_breakcheck();
     if (got_int)
     {
 	(void)vgetc();
@@ -4162,8 +4263,13 @@ spell_suggest()
 	MSG(_("Sorry, no suggestions"));
     else
     {
-	/* Cleanup, sort the suggestions and truncate at SUG_PROMPT_COUNT. */
-	cleanup_suggestions(&sug);
+#ifdef RESCORE
+	/* Do slow but more accurate computation of the word score. */
+	rescore_suggestions(&sug);
+#endif
+
+	/* Sort the suggestions and truncate at SUG_PROMPT_COUNT. */
+	cleanup_suggestions(&sug, SUG_PROMPT_COUNT);
 
 	/* List the suggestions. */
 	msg_start();
@@ -4184,9 +4290,12 @@ spell_suggest()
 		vim_strncpy(wcopy + STRLEN(wcopy),
 					       sug.su_badptr + stp->st_orglen,
 					      sug.su_badlen - stp->st_orglen);
-	    /* TODO: remove score */
-	    vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\"  (%d)"),
+	    if (p_verbose > 0)
+		vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\"  (%d)"),
 						 i + 1, wcopy, stp->st_score);
+	    else
+		vim_snprintf((char *)IObuff, IOSIZE, _("%2d \"%s\""),
+								i + 1, wcopy);
 	    msg_puts(IObuff);
 	    lines_left = 3;		/* avoid more prompt */
 	    msg_putchar('\n');
@@ -4224,13 +4333,13 @@ spell_suggest()
 }
 
 /*
- * Make a copy of "word[len]", with the first letter upper or lower cased,
- * to "wcopy[MAXWLEN]".
+ * Make a copy of "word", with the first letter upper or lower cased, to
+ * "wcopy[MAXWLEN]".  "word" must not be empty.
+ * The result is NUL terminated.
  */
     static void
-onecap_copy(word, len, wcopy, upper)
+onecap_copy(word, wcopy, upper)
     char_u	*word;
-    int		len;
     char_u	*wcopy;
     int		upper;	    /* TRUE: first letter made upper case */
 {
@@ -4246,9 +4355,9 @@ onecap_copy(word, len, wcopy, upper)
 #endif
 	c = *p++;
     if (upper)
-	c = MB_TOUPPER(c);
+	c = SPELL_TOUPPER(c);
     else
-	c = MB_TOLOWER(c);
+	c = SPELL_TOFOLD(c);
 #ifdef FEAT_MBYTE
     if (has_mbyte)
 	l = mb_char2bytes(c, wcopy);
@@ -4258,12 +4367,12 @@ onecap_copy(word, len, wcopy, upper)
 	l = 1;
 	wcopy[0] = c;
     }
-    vim_strncpy(wcopy + l, p, len - (p - word));
+    vim_strncpy(wcopy + l, p, MAXWLEN - l);
 }
 
 /*
- * Make a copy of "word[len]" with all the letters upper cased into
- * "wcopy[MAXWLEN]".
+ * Make a copy of "word" with all the letters upper cased into
+ * "wcopy[MAXWLEN]".  The result is NUL terminated.
  */
     static void
 allcap_copy(word, wcopy)
@@ -4283,8 +4392,7 @@ allcap_copy(word, wcopy)
 	else
 #endif
 	    c = *s++;
-
-	c = MB_TOUPPER(c);	/* TODO: use spell toupper */
+	c = SPELL_TOUPPER(c);
 
 #ifdef FEAT_MBYTE
 	if (has_mbyte)
@@ -4322,14 +4430,14 @@ spell_try_change(su)
     int		newscore;
     langp_T	*lp;
     char_u	*byts;
-    int		*idxs;
+    idx_T	*idxs;
     int		depth;
     int		c;
     int		n;
     int		flags;
     int		badflags;
     garray_T	*gap;
-    int		arridx;
+    idx_T	arridx;
     int		len;
     char_u	*p;
     fromto_T	*ftp;
@@ -4417,7 +4525,7 @@ spell_try_change(su)
 		 */
 		++sp->ts_curi;		/* eat one NUL byte */
 
-		flags = idxs[arridx];
+		flags = (int)idxs[arridx];
 
 		/*
 		 * Form the word with proper case in preword.
@@ -4451,6 +4559,10 @@ spell_try_change(su)
 		if (flags & WF_RARE)
 		    newscore += SCORE_RARE;
 
+		/* Words that were not found in the text get a penalty. */
+		if ((flags & WF_USED) == 0)
+		    newscore += SCORE_NOTUSED;
+
 		if (!spell_valid_case(badflags,
 					 captype(preword + prewordlen, NULL)))
 		    newscore += SCORE_ICASE;
@@ -4458,7 +4570,11 @@ spell_try_change(su)
 		if (fword[sp->ts_fidx] == 0)
 		{
 		    /* The badword also ends: add suggestions, */
-		    add_suggestion(su, preword, sp->ts_score + newscore);
+		    add_suggestion(su, preword, sp->ts_score + newscore
+#ifdef RESCORE
+			    , FALSE
+#endif
+			    );
 		}
 		else if (sp->ts_fidx >= sp->ts_fidxtry)
 		{
@@ -4476,10 +4592,24 @@ spell_try_change(su)
 			STRCAT(preword, " ");
 			prewordlen = STRLEN(preword);
 			splitoff = sp->ts_twordlen;
-			/* TODO: when case-folding changed the number of bytes
-			 * this doesn't work... */
-			badflags = captype(su->su_badptr + sp->ts_fidx,
-					       su->su_badptr + su->su_badlen);
+#ifdef FEAT_MBYTE
+			if (has_mbyte)
+			{
+			    int		i = 0;
+
+			    /* Case-folding may change the number of bytes:
+			     * Count nr of chars in fword[sp->ts_fidx] and
+			     * advance that many chars in su->su_badptr. */
+			    for (p = fword; p < fword + sp->ts_fidx;
+								mb_ptr_adv(p))
+				++i;
+			    for (p = su->su_badptr; i > 0; mb_ptr_adv(p))
+				--i;
+			}
+			else
+#endif
+			    p = su->su_badptr + sp->ts_fidx;
+			badflags = captype(p, su->su_badptr + su->su_badlen);
 
 			sp->ts_state = STATE_SPLITUNDO;
 			++depth;
@@ -4535,11 +4665,15 @@ spell_try_change(su)
 		     * even try when the byte was already changed. */
 		    if (c == fword[sp->ts_fidx])
 			newscore = 0;
-			/* TODO: multi-byte characters */
+
+		    /* TODO: this is too slow and comparing bytes isn't right
+		     * for multi-byte characters. */
+#if 0
 		    else if (lp->lp_slang->sl_map != NULL
-			    && similar_chars(lp->lp_slang,
+					&& similar_chars(lp->lp_slang,
 						       c, fword[sp->ts_fidx]))
 			newscore = SCORE_SIMILAR;
+#endif
 		    else
 			newscore = SCORE_SUBST;
 		    if ((newscore == 0 || sp->ts_fidx >= sp->ts_fidxtry)
@@ -4818,10 +4952,10 @@ find_keepcap_word(slang, fword, kword)
 {
     char_u	uword[MAXWLEN];		/* "fword" in upper-case */
     int		depth;
-    int		tryidx;
+    idx_T	tryidx;
 
     /* The following arrays are used at each depth in the tree. */
-    int		arridx[MAXWLEN];
+    idx_T	arridx[MAXWLEN];
     int		round[MAXWLEN];
     int		fwordidx[MAXWLEN];
     int		uwordidx[MAXWLEN];
@@ -4831,10 +4965,10 @@ find_keepcap_word(slang, fword, kword)
     int		l;
     int		len;
     int		c;
-    unsigned	lo, hi, m;
+    idx_T	lo, hi, m;
     char_u	*p;
     char_u	*byts = slang->sl_kbyts;    /* array with bytes of the words */
-    int		*idxs = slang->sl_kidxs;    /* array with indexes */
+    idx_T	*idxs = slang->sl_kidxs;    /* array with indexes */
 
     if (byts == NULL)
     {
@@ -4976,16 +5110,18 @@ spell_try_soundalike(su)
     char_u	tword[MAXWLEN];
     char_u	tfword[MAXWLEN];
     char_u	tsalword[MAXWLEN];
-    int		arridx[MAXWLEN];
+    idx_T	arridx[MAXWLEN];
     int		curi[MAXWLEN];
     langp_T	*lp;
     char_u	*byts;
-    int		*idxs;
+    idx_T	*idxs;
     int		depth;
     int		c;
-    int		n;
+    idx_T	n;
     int		round;
     int		flags;
+    int		score, sound_score;
+    char_u	*bp, *sp;
 
     for (lp = LANGP_ENTRY(curwin->w_buffer->b_langp, 0);
 						   lp->lp_slang != NULL; ++lp)
@@ -5030,7 +5166,7 @@ spell_try_soundalike(su)
 			if (c == 0)
 			{
 			    /* End of word, deal with the word. */
-			    flags = idxs[n];
+			    flags = (int)idxs[n];
 			    if (round == 2 || (flags & WF_KEEPCAP) == 0)
 			    {
 				tword[depth] = NUL;
@@ -5047,19 +5183,63 @@ spell_try_soundalike(su)
 							    tfword, tsalword);
 				}
 
-				/* TODO: also compare with small changes
-				 * (insert char, swap char, etc.) */
-				if (STRCMP(salword, tsalword) == 0)
+				/*
+				 * Accept the word if the sound-folded words
+				 * are (almost) equal.
+				 */
+				for (bp = salword, sp = tsalword; *bp == *sp;
+								   ++bp, ++sp)
+				    if (*bp == NUL)
+					break;
+
+				if (*bp == *sp)
+				    /* equal */
+				    sound_score = 0;
+				else if (*bp != NUL && bp[1] != NUL
+					&& *bp == sp[1] && bp[1] == *sp
+					       && STRCMP(bp + 2, sp + 2) == 0)
+				    /* swap two bytes */
+				    sound_score = SCORE_SWAP;
+				else if (STRCMP(bp + 1, sp) == 0)
+				    /* delete byte */
+				    sound_score = SCORE_DEL;
+				else if (STRCMP(bp, sp + 1) == 0)
+				    /* insert byte */
+				    sound_score = SCORE_INS;
+				else if (STRCMP(bp + 1, sp + 1) == 0)
+				    /* skip one byte */
+				    sound_score = SCORE_SUBST;
+				else
+				    /* not equal or similar */
+				    sound_score = SCORE_MAXMAX;
+
+				if (sound_score < SCORE_MAXMAX)
 				{
+				    char_u	cword[MAXWLEN];
+				    char_u	*p;
+
 				    if (round == 1 && flags != 0)
 				    {
-					char_u	cword[MAXWLEN];
-
+					/* Need to fix case according to
+					 * "flags". */
 					make_case_word(tword, cword, flags);
-					add_suggestion(su, cword, 0);
+					p = cword;
 				    }
 				    else
-					add_suggestion(su, tword, 0);
+					p = tword;
+
+				    /* Compute the score. */
+				    score = spell_edit_score(su->su_badword, p);
+#ifdef RESCORE
+				    /* give a bonus for the good word sounding
+				     * the same as the bad word */
+				    add_suggestion(su, tword,
+						 RESCORE(score, sound_score),
+									TRUE);
+#else
+				    add_suggestion(su, tword,
+							 score + sound_score);
+#endif
 				}
 			    }
 
@@ -5078,15 +5258,16 @@ spell_try_soundalike(su)
 			    curi[depth] = 1;
 			}
 		    }
+
+		    line_breakcheck();
 		}
-		line_breakcheck();
 	    }
 	}
     }
 }
 
 /*
- * Copy "fword" to "cword", fixing according to "flags".
+ * Copy "fword" to "cword", fixing case according to "flags".
  */
     static void
 make_case_word(fword, cword, flags)
@@ -5099,12 +5280,13 @@ make_case_word(fword, cword, flags)
 	allcap_copy(fword, cword);
     else if (flags & WF_ONECAP)
 	/* Make the first letter upper-case */
-	onecap_copy(fword, STRLEN(fword), cword, TRUE);
+	onecap_copy(fword, cword, TRUE);
     else
 	/* Use goodword as-is. */
 	STRCPY(cword, fword);
 }
 
+#if 0
 /*
  * Return TRUE if "c1" and "c2" are similar characters according to the MAP
  * lines in the .aff file.
@@ -5129,6 +5311,7 @@ similar_chars(slang, c1, c2)
 	return FALSE;
     return vim_strchr(p1, '/') == vim_strchr(p2, '/');
 }
+#endif
 
 /*
  * Add a suggestion to the list of suggestions.
@@ -5137,13 +5320,19 @@ similar_chars(slang, c1, c2)
  * with spell_edit_score().
  */
     static void
-add_suggestion(su, goodword, use_score)
+add_suggestion(su, goodword, score
+#ifdef RESCORE
+	    , had_bonus
+#endif
+	    )
     suginfo_T	*su;
     char_u	*goodword;
-    int		use_score;
+    int		score;
+#ifdef RESCORE
+    int		had_bonus;	/* set st_had_bonus */
+#endif
 {
     suggest_T   *stp;
-    int		score;
     int		i;
 #ifdef SOUNDFOLD_SCORE
     char_u	fword[MAXWLEN];
@@ -5154,23 +5343,13 @@ add_suggestion(su, goodword, use_score)
     if (was_banned(su, goodword))
 	return;
 
-    /* Compute the score and add the suggestion if it's good enough. */
-    if (use_score != 0)
-	score = use_score;
-    else
-	score = spell_edit_score(su->su_badword, goodword);
-
     if (score <=
author	Bram Moolenaar <Bram@vim.org>	2005-06-14 22:01:04 +0000
committer	Bram Moolenaar <Bram@vim.org>	2005-06-14 22:01:04 +0000
commit	9f30f50471678a0a986c30b50dce705bdcc991dc (patch)
tree	f4db0c82382dd5997a02e406c91acef45907f9d5 /src
parent	9ba0eb850c0f4c94df3b7f7461610bf0b073f712 (diff)