summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Bram Moolenaar <Bram@vim.org> 2019-09-07 23:16:33 +0200
committer: Bram Moolenaar <Bram@vim.org> 2019-09-07 23:16:33 +0200
commit: 6d7d7cf750bca5d641e464f6a3af5ee5b99a5ac8 (patch)
tree: 1b35d750cc6c6613afc5af3ed18c907c186634a2 /src
parent: f6ed61e1489e40eada55a4f1782e1ed82bcad7d9 (diff)
patch 8.1.2005: the regexp.c file is too big (v8.1.2005)
Problem: The regexp.c file is too big.
Solution: Move the backtracking engine to a separate file. (Yegappan Lakshmanan, closes #4905)
Diffstat (limited to 'src')
-rw-r--r-- src/Make_cyg_ming.mak 2
-rw-r--r-- src/Make_mvc.mak 2
-rw-r--r-- src/Makefile 4
-rw-r--r-- src/regexp.c 5476
-rw-r--r-- src/regexp_bt.c 5381
-rw-r--r-- src/version.c 2
6 files changed, 5417 insertions, 5450 deletions
diff --git a/src/Make_cyg_ming.mak b/src/Make_cyg_ming.mak
index bd2dba26cd..9f5f5e0e8b 100644
--- a/src/Make_cyg_ming.mak
+++ b/src/Make_cyg_ming.mak
@@ -1179,7 +1179,7 @@ $(OUTDIR)/os_w32exeg.o: os_w32exe.c $(INCL)
$(OUTDIR)/os_win32.o: os_win32.c $(INCL) $(MZSCHEME_INCL)
$(CC) -c $(CFLAGS) os_win32.c -o $@
-$(OUTDIR)/regexp.o: regexp.c regexp_nfa.c $(INCL)
+$(OUTDIR)/regexp.o: regexp.c regexp_bt.c regexp_nfa.c $(INCL)
$(CC) -c $(CFLAGS) regexp.c -o $@
$(OUTDIR)/terminal.o: terminal.c $(INCL) $(TERM_DEPS)
diff --git a/src/Make_mvc.mak b/src/Make_mvc.mak
index 0401902f22..ac02e809ac 100644
--- a/src/Make_mvc.mak
+++ b/src/Make_mvc.mak
@@ -1637,7 +1637,7 @@ $(OUTDIR)/profiler.obj: $(OUTDIR) profiler.c $(INCL)
$(OUTDIR)/quickfix.obj: $(OUTDIR) quickfix.c $(INCL)
-$(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c regexp_nfa.c $(INCL)
+$(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c regexp_bt.c regexp_nfa.c $(INCL)
$(OUTDIR)/scriptfile.obj: $(OUTDIR) scriptfile.c $(INCL)
diff --git a/src/Makefile b/src/Makefile
index 34d1c145af..34198ea007 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -3326,7 +3326,7 @@ objects/pty.o: pty.c
objects/quickfix.o: quickfix.c
$(CCC) -o $@ quickfix.c
-objects/regexp.o: regexp.c regexp_nfa.c
+objects/regexp.o: regexp.c regexp_bt.c regexp_nfa.c
$(CCC) -o $@ regexp.c
objects/scriptfile.o: scriptfile.c
@@ -3794,7 +3794,7 @@ objects/quickfix.o: quickfix.c vim.h protodef.h auto/config.h feature.h os_unix.
objects/regexp.o: regexp.c vim.h protodef.h auto/config.h feature.h os_unix.h \
auto/osdef.h ascii.h keymap.h term.h macros.h option.h beval.h \
proto/gui_beval.pro structs.h regexp.h gui.h alloc.h ex_cmds.h spell.h \
- proto.h globals.h regexp_nfa.c
+ proto.h globals.h regexp_bt.c regexp_nfa.c
objects/scriptfile.o: scriptfile.c vim.h protodef.h auto/config.h feature.h os_unix.h \
auto/osdef.h ascii.h keymap.h term.h macros.h option.h beval.h \
proto/gui_beval.pro structs.h regexp.h gui.h alloc.h ex_cmds.h spell.h \
diff --git a/src/regexp.c b/src/regexp.c
index c21e0e62f0..b952315b21 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1,41 +1,6 @@
/* vi:set ts=8 sts=4 sw=4 noet:
*
* Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub()
- *
- * NOTICE:
- *
- * This is NOT the original regular expression code as written by Henry
- * Spencer. This code has been modified specifically for use with the VIM
- * editor, and should not be used separately from Vim. If you want a good
- * regular expression library, get the original code. The copyright notice
- * that follows is from the original.
- *
- * END NOTICE
- *
- * Copyright (c) 1986 by University of Toronto.
- * Written by Henry Spencer. Not derived from licensed software.
- *
- * Permission is granted to anyone to use this software for any
- * purpose on any computer system, and to redistribute it freely,
- * subject to the following restrictions:
- *
- * 1. The author is not responsible for the consequences of use of
- * this software, no matter how awful, even if they arise
- * from defects in it.
- *
- * 2. The origin of this software must not be misrepresented, either
- * by explicit claim or by omission.
- *
- * 3. Altered versions must be plainly marked as such, and must not
- * be misrepresented as being the original software.
- *
- * Beware that some of this code is subtly aware of the way operator
- * precedence is structured in regular expressions. Serious changes in
- * regular-expression syntax might require a total rethink.
- *
- * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert
- * Webb, Ciaran McCreesh and Bram Moolenaar.
- * Named character class support added by Walter Briscoe (1998 Jul 01)
*/
// By default: do not create debugging logs or files related to regular
@@ -56,197 +21,6 @@
#endif
/*
- * The "internal use only" fields in regexp.h are present to pass info from
- * compile to execute that permits the execute phase to run lots faster on
- * simple cases. They are:
- *
- * regstart char that must begin a match; NUL if none obvious; Can be a
- * multi-byte character.
- * reganch is the match anchored (at beginning-of-line only)?
- * regmust string (pointer into program) that match must include, or NULL
- * regmlen length of regmust string
- * regflags RF_ values or'ed together
- *
- * Regstart and reganch permit very fast decisions on suitable starting points
- * for a match, cutting down the work a lot. Regmust permits fast rejection
- * of lines that cannot possibly match. The regmust tests are costly enough
- * that vim_regcomp() supplies a regmust only if the r.e. contains something
- * potentially expensive (at present, the only such thing detected is * or +
- * at the start of the r.e., which can involve a lot of backup). Regmlen is
- * supplied because the test in vim_regexec() needs it and vim_regcomp() is
- * computing it anyway.
- */
-
-/*
- * Structure for regexp "program". This is essentially a linear encoding
- * of a nondeterministic finite-state machine (aka syntax charts or
- * "railroad normal form" in parsing technology). Each node is an opcode
- * plus a "next" pointer, possibly plus an operand. "Next" pointers of
- * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next"
- * pointer with a BRANCH on both ends of it is connecting two alternatives.
- * (Here we have one of the subtle syntax dependencies: an individual BRANCH
- * (as opposed to a collection of them) is never concatenated with anything
- * because of operator precedence). The "next" pointer of a BRACES_COMPLEX
- * node points to the node after the stuff to be repeated.
- * The operand of some types of node is a literal string; for others, it is a
- * node leading into a sub-FSM. In particular, the operand of a BRANCH node
- * is the first node of the branch.
- * (NB this is *not* a tree structure: the tail of the branch connects to the
- * thing following the set of BRANCHes.)
- *
- * pattern is coded like:
- *
- * +-----------------+
- * | V
- * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END
- * | ^ | ^
- * +------+ +----------+
- *
- *
- * +------------------+
- * V |
- * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END
- * | | ^ ^
- * | +---------------+ |
- * +---------------------------------------------+
- *
- *
- * +----------------------+
- * V |
- * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END
- * | | ^ ^
- * | +-----------+ |
- * +--------------------------------------------------+
- *
- *
- * +-------------------------+
- * V |
- * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END
- * | | ^
- * | +----------------+
- * +-----------------------------------------------+
- *
- *
- * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END
- * | | ^ ^
- * | +----------------+ |
- * +--------------------------------+
- *
- * +---------+
- * | V
- * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END
- * | | | | ^ ^
- * | | | +-----+ |
- * | | +----------------+ |
- * | +---------------------------+ |
- * +------------------------------------------------------+
- *
- * They all start with a BRANCH for "\|" alternatives, even when there is only
- * one alternative.
- */
-
-/*
- * The opcodes are:
- */
-
-/* definition number opnd? meaning */
-#define END 0 /* End of program or NOMATCH operand. */
-#define BOL 1 /* Match "" at beginning of line. */
-#define EOL 2 /* Match "" at end of line. */
-#define BRANCH 3 /* node Match this alternative, or the
- * next... */
-#define BACK 4 /* Match "", "next" ptr points backward. */
-#define EXACTLY 5 /* str Match this string. */
-#define NOTHING 6 /* Match empty string. */
-#define STAR 7 /* node Match this (simple) thing 0 or more
- * times. */
-#define PLUS 8 /* node Match this (simple) thing 1 or more
- * times. */
-#define MATCH 9 /* node match the operand zero-width */
-#define NOMATCH 10 /* node check for no match with operand */
-#define BEHIND 11 /* node look behind for a match with operand */
-#define NOBEHIND 12 /* node look behind for no match with operand */
-#define SUBPAT 13 /* node match the operand here */
-#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and
- * n times (\{m,n\}). */
-#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */
-#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */
-#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE
- * and BRACE_COMPLEX. */
-#define NEWL 18 /* Match line-break */
-#define BHPOS 19 /* End position for BEHIND or NOBEHIND */
-
-
-/* character classes: 20-48 normal, 50-78 include a line-break */
-#define ADD_NL 30
-#define FIRST_NL ANY + ADD_NL
-#define ANY 20 /* Match any one character. */
-#define ANYOF 21 /* str Match any character in this string. */
-#define ANYBUT 22 /* str Match any character not in this
- * string. */
-#define IDENT 23 /* Match identifier char */
-#define SIDENT 24 /* Match identifier char but no digit */
-#define KWORD 25 /* Match keyword char */
-#define SKWORD 26 /* Match word char but no digit */
-#define FNAME 27 /* Match file name char */
-#define SFNAME 28 /* Match file name char but no digit */
-#define PRINT 29 /* Match printable char */
-#define SPRINT 30 /* Match printable char but no digit */
-#define WHITE 31 /* Match whitespace char */
-#define NWHITE 32 /* Match non-whitespace char */
-#define DIGIT 33 /* Match digit char */
-#define NDIGIT 34 /* Match non-digit char */
-#define HEX 35 /* Match hex char */
-#define NHEX 36 /* Match non-hex char */
-#define OCTAL 37 /* Match octal char */
-#define NOCTAL 38 /* Match non-octal char */
-#define WORD 39 /* Match word char */
-#define NWORD 40 /* Match non-word char */
-#define HEAD 41 /* Match head char */
-#define NHEAD 42 /* Match non-head char */
-#define ALPHA 43 /* Match alpha char */
-#define NALPHA 44 /* Match non-alpha char */
-#define LOWER 45 /* Match lowercase char */
-#define NLOWER 46 /* Match non-lowercase char */
-#define UPPER 47 /* Match uppercase char */
-#define NUPPER 48 /* Match non-uppercase char */
-#define LAST_NL NUPPER + ADD_NL
-#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL)
-
-#define MOPEN 80 /* -89 Mark this point in input as start of
- * \( subexpr. MOPEN + 0 marks start of
- * match. */
-#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks
- * end of match. */
-#define BACKREF 100 /* -109 node Match same string again \1-\9 */
-
-#ifdef FEAT_SYN_HL
-# define ZOPEN 110 /* -119 Mark this point in input as start of
- * \z( subexpr. */
-# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */
-# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */
-#endif
-
-#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */
-
-#define NOPEN 150 /* Mark this point in input as start of
- \%( subexpr. */
-#define NCLOSE 151 /* Analogous to NOPEN. */
-
-#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */
-#define RE_BOF 201 /* Match "" at beginning of file. */
-#define RE_EOF 202 /* Match "" at end of file. */
-#define CURSOR 203 /* Match location of cursor. */
-
-#define RE_LNUM 204 /* nr cmp Match line number */
-#define RE_COL 205 /* nr cmp Match column number */
-#define RE_VCOL 206 /* nr cmp Match virtual column number */
-
-#define RE_MARK 207 /* mark cmp Match mark position */
-#define RE_VISUAL 208 /* Match Visual area */
-#define RE_COMPOSING 209 /* any composing characters */
-
-/*
* Magic characters have a special meaning, they don't match literally.
* Magic characters are negative. This separates them from literal characters
* (possibly multi-byte). Only ASCII characters can be Magic.
@@ -272,7 +46,7 @@ toggle_Magic(int x)
}
/*
- * The first byte of the regexp internal "program" is actually this magic
+ * The first byte of the BT regexp internal "program" is actually this magic
* number; the start node begins in the second byte. It's used to catch the
* most severe mutilation of the program by the caller.
*/
@@ -280,54 +54,6 @@ toggle_Magic(int x)
#define REGMAGIC 0234
/*
- * Opcode notes:
- *
- * BRANCH The set of branches constituting a single choice are hooked
- * together with their "next" pointers, since precedence prevents
- * anything being concatenated to any individual branch. The
- * "next" pointer of the last BRANCH in a choice points to the
- * thing following the whole choice. This is also where the
- * final "next" pointer of each individual branch points; each
- * branch starts with the operand node of a BRANCH node.
- *
- * BACK Normal "next" pointers all implicitly point forward; BACK
- * exists to make loop structures possible.
- *
- * STAR,PLUS '=', and complex '*' and '+', are implemented as circular
- * BRANCH structures using BACK. Simple cases (one character
- * per match) are implemented with STAR and PLUS for speed
- * and to minimize recursive plunges.
- *
- * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX
- * node, and defines the min and max limits to be used for that
- * node.
- *
- * MOPEN,MCLOSE ...are numbered at compile time.
- * ZOPEN,ZCLOSE ...ditto
- */
-
-/*
- * A node is one char of opcode followed by two chars of "next" pointer.
- * "Next" pointers are stored as two 8-bit bytes, high order first. The
- * value is a positive offset from the opcode of the node containing it.
- * An operand, if any, simply follows the node. (Note that much of the
- * code generation knows about this implicit relationship.)
- *
- * Using two bytes for the "next" pointer is vast overkill for most things,
- * but allows patterns to get big without disasters.
- */
-#define OP(p) ((int)*(p))
-#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
-#define OPERAND(p) ((p) + 3)
-/* Obtain an operand that was stored as four bytes, MSB first. */
-#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \
- + ((long)(p)[5] << 8) + (long)(p)[6])
-/* Obtain a second operand stored as four bytes. */
-#define OPERAND_MAX(p) OPERAND_MIN((p) + 4)
-/* Obtain a second single-byte operand stored after a four bytes operand. */
-#define OPERAND_CMP(p) (p)[7]
-
-/*
* Utility definitions.
*/
#define UCHARAT(p) ((int)*(char_u *)(p))
@@ -345,18 +71,6 @@ toggle_Magic(int x)
#define MAX_LIMIT (32767L << 16L)
-static int cstrncmp(char_u *s1, char_u *s2, int *n);
-static char_u *cstrchr(char_u *, int);
-
-#ifdef BT_REGEXP_DUMP
-static void regdump(char_u *, bt_regprog_T *);
-#endif
-#ifdef DEBUG
-static char_u *regprop(char_u *);
-#endif
-
-static int re_mult_next(char *what);
-
static char_u e_missingbracket[] = N_("E769: Missing ] after %s[");
static char_u e_reverse_range[] = N_("E944: Reverse range in character class");
static char_u e_large_class[] = N_("E945: Range too large in character class");
@@ -374,6 +88,14 @@ static char_u e_recursive[] = N_("E956: Cannot use pattern recursively");
#define NOT_MULTI 0
#define MULTI_ONE 1
#define MULTI_MULT 2
+
+// return values for regmatch()
+#define RA_FAIL 1 /* something failed, abort */
+#define RA_CONT 2 /* continue in inner loop */
+#define RA_BREAK 3 /* break inner loop */
+#define RA_MATCH 4 /* successful match */
+#define RA_NOMATCH 5 /* didn't match */
+
/*
* Return NOT_MULTI if c is not a "multi" operator.
* Return MULTI_ONE if c is a single "multi" operator.
@@ -389,22 +111,6 @@ re_multi_type(int c)
return NOT_MULTI;
}
-/*
- * Flags to be passed up and down.
- */
-#define HASWIDTH 0x1 /* Known never to match null string. */
-#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
-#define SPSTART 0x4 /* Starts with * or +. */
-#define HASNL 0x8 /* Contains some \n. */
-#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */
-#define WORST 0 /* Worst case. */
-
-/*
- * When regcode is set to this value, code is not emitted and size is computed
- * instead.
- */
-#define JUST_CALC_SIZE ((char_u *) -1)
-
static char_u *reg_prev_sub = NULL;
/*
@@ -587,25 +293,15 @@ init_class_tab(void)
*/
static char_u *regparse; /* Input-scan pointer. */
-static int prevchr_len; /* byte length of previous char */
-static int num_complex_braces; /* Complex \{...} count */
static int regnpar; /* () count. */
#ifdef FEAT_SYN_HL
static int regnzpar; /* \z() count. */
static int re_has_z; /* \z item detected */
#endif
-static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */
-static long regsize; /* Code size. */
-static int reg_toolong; /* TRUE when offset out of range */
-static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */
static unsigned regflags; /* RF_ flags for prog */
-static long brace_min[10]; /* Minimums for complex brace repeats */
-static long brace_max[10]; /* Maximums for complex brace repeats */
-static int brace_count[10]; /* Current counts for complex brace repeats */
#if defined(FEAT_SYN_HL) || defined(PROTO)
static int had_eol; /* TRUE when EOL found by vim_regcomp() */
#endif
-static int one_exactly = FALSE; /* only do one char for EXACTLY */
static int reg_magic; /* magicness of the pattern: */
#define MAGIC_NONE 1 /* "\V" very unmagic */
@@ -670,9 +366,6 @@ typedef struct
int regnpar;
} parse_state_T;
-/*
- * Forward declarations for vim_regcomp()'s friends.
- */
static void initchr(char_u *);
static int getchr(void);
static void skipchr_keepstart(void);
@@ -683,27 +376,10 @@ static long gethexchrs(int maxinputlen);
static long getoctchrs(void);
static long getdecchrs(void);
static int coll_get_char(void);
-static void regcomp_start(char_u *expr, int flags);
-static char_u *reg(int, int *);
-static char_u *regbranch(int *flagp);
-static char_u *regconcat(int *flagp);
-static char_u *regpiece(int *);
-static char_u *regatom(int *);
-static char_u *regnode(int);
-static int use_multibytecode(int c);
static int prog_magic_wrong(void);
-static char_u *regnext(char_u *);
-static void regc(int b);
-static void regmbc(int c);
-#define REGMBC(x) regmbc(x);
-#define CASEMBC(x) case x:
-static void reginsert(int, char_u *);
-static void reginsert_nr(int op, long val, char_u *opnd);
-static void reginsert_limits(int, long, long, char_u *);
-static char_u *re_put_long(char_u *pr, long_u val);
-static int read_limits(long *, long *);
-static void regtail(char_u *, char_u *);
-static void regoptail(char_u *, char_u *);
+static int cstrncmp(char_u *s1, char_u *s2, int *n);
+static char_u *cstrchr(char_u *, int);
+static int re_mult_next(char *what);
static int reg_iswordc(int);
static regengine_T bt_regengine;
@@ -772,333 +448,6 @@ static char *EQUIVAL_CLASS_C[16] = {
#endif
/*
- * Produce the bytes for equivalence class "c".
- * Currently only handles latin1, latin9 and utf-8.
- * NOTE: When changing this function, also change nfa_emit_equi_class()
- */
- static void
-reg_equi_class(int c)
-{
- if (enc_utf8 || STRCMP(p_enc, "latin1") == 0
- || STRCMP(p_enc, "iso-8859-15") == 0)
- {
-#ifdef EBCDIC
- int i;
-
- /* This might be slower than switch/case below. */
- for (i = 0; i < 16; i++)
- {
- if (vim_strchr(EQUIVAL_CLASS_C[i], c) != NULL)
- {
- char *p = EQUIVAL_CLASS_C[i];
-
- while (*p != 0)
- regmbc(*p++);
- return;
- }
- }
-#else
- switch (c)
- {
- /* Do not use '\300' style, it results in a negative number. */
- case 'A': case 0xc0: case 0xc1: case 0xc2:
- case 0xc3: case 0xc4: case 0xc5:
- CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd)
- CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2)
- regmbc('A'); regmbc(0xc0); regmbc(0xc1);
- regmbc(0xc2); regmbc(0xc3); regmbc(0xc4);
- regmbc(0xc5);
- REGMBC(0x100) REGMBC(0x102) REGMBC(0x104)
- REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0)
- REGMBC(0x1ea2)
- return;
- case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06)
- regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06)
- return;
- case 'C': case 0xc7:
- CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c)
- regmbc('C'); regmbc(0xc7);
- REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a)
- REGMBC(0x10c)
- return;
- case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a)
- CASEMBC(0x1e0e) CASEMBC(0x1e10)
- regmbc('D'); REGMBC(0x10e) REGMBC(0x110)
- REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10)
- return;
- case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb:
- CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118)
- CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc)
- regmbc('E'); regmbc(0xc8); regmbc(0xc9);
- regmbc(0xca); regmbc(0xcb);
- REGMBC(0x112) REGMBC(0x114) REGMBC(0x116)
- REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba)
- REGMBC(0x1ebc)
- return;
- case 'F': CASEMBC(0x1e1e)
- regmbc('F'); REGMBC(0x1e1e)
- return;
- case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120)
- CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4)
- CASEMBC(0x1e20)
- regmbc('G'); REGMBC(0x11c) REGMBC(0x11e)
- REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4)
- REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20)
- return;
- case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22)
- CASEMBC(0x1e26) CASEMBC(0x1e28)
- regmbc('H'); REGMBC(0x124) REGMBC(0x126)
- REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28)
- return;
- case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf:
- CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e)
- CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8)
- regmbc('I'); regmbc(0xcc); regmbc(0xcd);
- regmbc(0xce); regmbc(0xcf);
- REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c)
- REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf)
- REGMBC(0x1ec8)
- return;
- case 'J': CASEMBC(0x134)
- regmbc('J'); REGMBC(0x134)
- return;
- case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30)
- CASEMBC(0x1e34)
- regmbc('K'); REGMBC(0x136) REGMBC(0x1e8)
- REGMBC(0x1e30) REGMBC(0x1e34)
- return;
- case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d)
- CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a)
- regmbc('L'); REGMBC(0x139) REGMBC(0x13b)
- REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141)
- REGMBC(0x1e3a)
- return;
- case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40)
- regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40)
- return;
- case 'N': case 0xd1:
- CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44)
- CASEMBC(0x1e48)
- regmbc('N'); regmbc(0xd1);
- REGMBC(0x143) REGMBC(0x145) REGMBC(0x147)
- REGMBC(0x1e44) REGMBC(0x1e48)
- return;
- case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5:
- case 0xd6: case 0xd8:
- CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0)
- CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece)
- regmbc('O'); regmbc(0xd2); regmbc(0xd3);
- regmbc(0xd4); regmbc(0xd5); regmbc(0xd6);
- regmbc(0xd8);
- REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150)
- REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea)
- REGMBC(0x1ec) REGMBC(0x1ece)
- return;
- case 'P': case 0x1e54: case 0x1e56:
- regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56)
- return;
- case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158)
- CASEMBC(0x1e58) CASEMBC(0x1e5e)
- regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158)
- REGMBC(0x1e58) REGMBC(0x1e5e)
- return;
- case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e)
- CASEMBC(0x160) CASEMBC(0x1e60)
- regmbc('S'); REGMBC(0x15a) REGMBC(0x15c)
- REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60)
- return;
- case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166)
- CASEMBC(0x1e6a) CASEMBC(0x1e6e)
- regmbc('T'); REGMBC(0x162) REGMBC(0x164)
- REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e)
- return;
- case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc:
- CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e)
- CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3)
- CASEMBC(0x1ee6)
- regmbc('U'); regmbc(0xd9); regmbc(0xda);
- regmbc(0xdb); regmbc(0xdc);
- REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c)
- REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172)
- REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6)
- return;
- case 'V': CASEMBC(0x1e7c)
- regmbc('V'); REGMBC(0x1e7c)
- return;
- case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82)
- CASEMBC(0x1e84) CASEMBC(0x1e86)
- regmbc('W'); REGMBC(0x174) REGMBC(0x1e80)
- REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86)
- return;
- case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c)
- regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c)
- return;
- case 'Y': case 0xdd:
- CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2)
- CASEMBC(0x1ef6) CASEMBC(0x1ef8)
- regmbc('Y'); regmbc(0xdd);
- REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e)
- REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8)
- return;
- case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d)
- CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94)
- regmbc('Z'); REGMBC(0x179) REGMBC(0x17b)
- REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90)
- REGMBC(0x1e94)
- return;
- case 'a': case 0xe0: case 0xe1: case 0xe2:
- case 0xe3: case 0xe4: case 0xe5:
- CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce)
- CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3)
- regmbc('a'); regmbc(0xe0); regmbc(0xe1);
- regmbc(0xe2); regmbc(0xe3); regmbc(0xe4);
- regmbc(0xe5);
- REGMBC(0x101) REGMBC(0x103) REGMBC(0x105)
- REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1)
- REGMBC(0x1ea3)
- return;
- case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07)
- regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07)
- return;
- case 'c': case 0xe7:
- CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d)
- regmbc('c'); regmbc(0xe7);
- REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b)
- REGMBC(0x10d)
- return;
- case 'd': CASEMBC(0x10f) CASEMBC(0x111) CASEMBC(0x1e0b)
- CASEMBC(0x1e0f) CASEMBC(0x1e11)
- regmbc('d'); REGMBC(0x10f) REGMBC(0x111)
- REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11)
- return;
- case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb:
- CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119)
- CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd)
- regmbc('e'); regmbc(0xe8); regmbc(0xe9);
- regmbc(0xea); regmbc(0xeb);
- REGMBC(0x113) REGMBC(0x115) REGMBC(0x117)
- REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb)
- REGMBC(0x1ebd)
- return;
- case 'f': CASEMBC(0x1e1f)
- regmbc('f'); REGMBC(0x1e1f)
- return;
- case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121)
- CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5)
- CASEMBC(0x1e21)
- regmbc('g'); REGMBC(0x11d) REGMBC(0x11f)
- REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5)
- REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21)
- return;
- case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23)
- CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96)
- regmbc('h'); REGMBC(0x125) REGMBC(0x127)
- REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29)
- REGMBC(0x1e96)
- return;
- case 'i': case 0xec: case 0xed: case 0xee: case 0xef:
- CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f)
- CASEMBC(0x1d0) CASEMBC(0x1ec9)
- regmbc('i'); regmbc(0xec); regmbc(0xed);
- regmbc(0xee); regmbc(0xef);
- REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d)
- REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9)
- return;
- case 'j': CASEMBC(0x135) CASEMBC(0x1f0)
- regmbc('j'); REGMBC(0x135) REGMBC(0x1f0)
- return;
- case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31)
- CASEMBC(0x1e35)
- regmbc('k'); REGMBC(0x137) REGMBC(0x1e9)
- REGMBC(0x1e31) REGMBC(0x1e35)
- return;
- case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e)
- CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b)
- regmbc('l'); REGMBC(0x13a) REGMBC(0x13c)
- REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142)
- REGMBC(0x1e3b)
- return;
- case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41)
- regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41)
- return;
- case 'n': case 0xf1:
- CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149)
- CASEMBC(0x1e45) CASEMBC(0x1e49)
- regmbc('n'); regmbc(0xf1);
- REGMBC(0x144) REGMBC(0x146) REGMBC(0x148)
- REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49)
- return;
- case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5:
- case 0xf6: case 0xf8:
- CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1)
- CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf)
- regmbc('o'); regmbc(0xf2); regmbc(0xf3);
- regmbc(0xf4); regmbc(0xf5); regmbc(0xf6);
- regmbc(0xf8);
- REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151)
- REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb)
- REGMBC(0x1ed) REGMBC(0x1ecf)
- return;
- case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57)
- regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57)
- return;
- case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159)
- CASEMBC(0x1e59) CASEMBC(0x1e5f)
- regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159)
- REGMBC(0x1e59) REGMBC(0x1e5f)
- return;
- case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f)
- CASEMBC(0x161) CASEMBC(0x1e61)
- regmbc('s'); REGMBC(0x15b) REGMBC(0x15d)
- REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61)
- return;
- case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167)
- CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97)
- regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167)
- REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97)
- return;
- case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc:
- CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f)
- CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4)
- CASEMBC(0x1ee7)
- regmbc('u'); regmbc(0xf9); regmbc(0xfa);
- regmbc(0xfb); regmbc(0xfc);
- REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d)
- REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173)
- REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7)
- return;
- case 'v': CASEMBC(0x1e7d)
- regmbc('v'); REGMBC(0x1e7d)
- return;
- case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83)
- CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98)
- regmbc('w'); REGMBC(0x175) REGMBC(0x1e81)
- REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87)
- REGMBC(0x1e98)
- return;
- case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d)
- regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d)
- return;
- case 'y': case 0xfd: case 0xff:
- CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99)
- CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9)
- regmbc('y'); regmbc(0xfd); regmbc(0xff);
- REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99)
- REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9)
- return;
- case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e)
- CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95)
- regmbc('z'); REGMBC(0x17a) REGMBC(0x17c)
- REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91)
- REGMBC(0x1e95)
- return;
- }
-#endif
- }
- regmbc(c);
-}
-
-/*
* Check for a collating element "[.a.]". "pp" points to the '['.
* Returns a character. Zero means that no item was recognized. Otherwise
* "pp" is advanced to after the item.
@@ -1247,1660 +596,13 @@ skip_regexp(
}
/*
- * Return TRUE if the back reference is legal. We must have seen the close
- * brace.
- * TODO: Should also check that we don't refer to something that is repeated
- * (+*=): what instance of the repetition should we match?
- */
- static int
-seen_endbrace(int refnum)
-{
- if (!had_endbrace[refnum])
- {
- char_u *p;
-
- /* Trick: check if "@<=" or "@<!" follows, in which case
- * the \1 can appear before the referenced match. */
- for (p = regparse; *p != NUL; ++p)
- if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '='))
- break;
- if (*p == NUL)
- {
- emsg(_("E65: Illegal back reference"));
- rc_did_emsg = TRUE;
- return FALSE;
- }
- }
- return TRUE;
-}
-
-/*
- * bt_regcomp() - compile a regular expression into internal code for the
- * traditional back track matcher.
- * Returns the program in allocated space. Returns NULL for an error.
- *
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code. So we cheat: we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it. (Note that it has to be in
- * one piece because vim_free() must be able to free it all.)
- *
- * Whether upper/lower case is to be ignored is decided when executing the
- * program, it does not matter here.
- *
- * Beware that the optimization-preparation code in here knows about some
- * of the structure of the compiled regexp.
- * "re_flags": RE_MAGIC and/or RE_STRING.
- */
- static regprog_T *
-bt_regcomp(char_u *expr, int re_flags)
-{
- bt_regprog_T *r;
- char_u *scan;
- char_u *longest;
- int len;
- int flags;
-
- if (expr == NULL)
- EMSG_RET_NULL(_(e_null));
-
- init_class_tab();
-
- /*
- * First pass: determine size, legality.
- */
- regcomp_start(expr, re_flags);
- regcode = JUST_CALC_SIZE;
- regc(REGMAGIC);
- if (reg(REG_NOPAREN, &flags) == NULL)
- return NULL;
-
- /* Allocate space. */
- r = alloc(offsetof(bt_regprog_T, program) + regsize);
- if (r == NULL)
- return NULL;
- r->re_in_use = FALSE;
-
- /*
- * Second pass: emit code.
- */
- regcomp_start(expr, re_flags);
- regcode = r->program;
- regc(REGMAGIC);
- if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong)
- {
- vim_free(r);
- if (reg_toolong)
- EMSG_RET_NULL(_("E339: Pattern too long"));
- return NULL;
- }
-
- /* Dig out information for optimizations. */
- r->regstart = NUL; /* Worst-case defaults. */
- r->reganch = 0;
- r->regmust = NULL;
- r->regmlen = 0;
- r->regflags = regflags;
- if (flags & HASNL)
- r->regflags |= RF_HASNL;
- if (flags & HASLOOKBH)
- r->regflags |= RF_LOOKBH;
-#ifdef FEAT_SYN_HL
- /* Remember whether this pattern has any \z specials in it. */
- r->reghasz = re_has_z;
-#endif
- scan = r->program + 1; /* First BRANCH. */
- if (OP(regnext(scan)) == END) /* Only one top-level choice. */
- {
- scan = OPERAND(scan);