diff options
author | Bram Moolenaar <Bram@vim.org> | 2019-09-07 23:16:33 +0200 |
---|---|---|
committer | Bram Moolenaar <Bram@vim.org> | 2019-09-07 23:16:33 +0200 |
commit | 6d7d7cf750bca5d641e464f6a3af5ee5b99a5ac8 (patch) | |
tree | 1b35d750cc6c6613afc5af3ed18c907c186634a2 /src | |
parent | f6ed61e1489e40eada55a4f1782e1ed82bcad7d9 (diff) |
patch 8.1.2005: the regexp.c file is too big (v8.1.2005)
Problem: The regexp.c file is too big.
Solution: Move the backtracking engine to a separate file. (Yegappan
Lakshmanan, closes #4905)
Diffstat (limited to 'src')
-rw-r--r-- | src/Make_cyg_ming.mak | 2 | ||||
-rw-r--r-- | src/Make_mvc.mak | 2 | ||||
-rw-r--r-- | src/Makefile | 4 | ||||
-rw-r--r-- | src/regexp.c | 5476 | ||||
-rw-r--r-- | src/regexp_bt.c | 5381 | ||||
-rw-r--r-- | src/version.c | 2 |
6 files changed, 5417 insertions, 5450 deletions
diff --git a/src/Make_cyg_ming.mak b/src/Make_cyg_ming.mak index bd2dba26cd..9f5f5e0e8b 100644 --- a/src/Make_cyg_ming.mak +++ b/src/Make_cyg_ming.mak @@ -1179,7 +1179,7 @@ $(OUTDIR)/os_w32exeg.o: os_w32exe.c $(INCL) $(OUTDIR)/os_win32.o: os_win32.c $(INCL) $(MZSCHEME_INCL) $(CC) -c $(CFLAGS) os_win32.c -o $@ -$(OUTDIR)/regexp.o: regexp.c regexp_nfa.c $(INCL) +$(OUTDIR)/regexp.o: regexp.c regexp_bt.c regexp_nfa.c $(INCL) $(CC) -c $(CFLAGS) regexp.c -o $@ $(OUTDIR)/terminal.o: terminal.c $(INCL) $(TERM_DEPS) diff --git a/src/Make_mvc.mak b/src/Make_mvc.mak index 0401902f22..ac02e809ac 100644 --- a/src/Make_mvc.mak +++ b/src/Make_mvc.mak @@ -1637,7 +1637,7 @@ $(OUTDIR)/profiler.obj: $(OUTDIR) profiler.c $(INCL) $(OUTDIR)/quickfix.obj: $(OUTDIR) quickfix.c $(INCL) -$(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c regexp_nfa.c $(INCL) +$(OUTDIR)/regexp.obj: $(OUTDIR) regexp.c regexp_bt.c regexp_nfa.c $(INCL) $(OUTDIR)/scriptfile.obj: $(OUTDIR) scriptfile.c $(INCL) diff --git a/src/Makefile b/src/Makefile index 34d1c145af..34198ea007 100644 --- a/src/Makefile +++ b/src/Makefile @@ -3326,7 +3326,7 @@ objects/pty.o: pty.c objects/quickfix.o: quickfix.c $(CCC) -o $@ quickfix.c -objects/regexp.o: regexp.c regexp_nfa.c +objects/regexp.o: regexp.c regexp_bt.c regexp_nfa.c $(CCC) -o $@ regexp.c objects/scriptfile.o: scriptfile.c @@ -3794,7 +3794,7 @@ objects/quickfix.o: quickfix.c vim.h protodef.h auto/config.h feature.h os_unix. 
objects/regexp.o: regexp.c vim.h protodef.h auto/config.h feature.h os_unix.h \ auto/osdef.h ascii.h keymap.h term.h macros.h option.h beval.h \ proto/gui_beval.pro structs.h regexp.h gui.h alloc.h ex_cmds.h spell.h \ - proto.h globals.h regexp_nfa.c + proto.h globals.h regexp_bt.c regexp_nfa.c objects/scriptfile.o: scriptfile.c vim.h protodef.h auto/config.h feature.h os_unix.h \ auto/osdef.h ascii.h keymap.h term.h macros.h option.h beval.h \ proto/gui_beval.pro structs.h regexp.h gui.h alloc.h ex_cmds.h spell.h \ diff --git a/src/regexp.c b/src/regexp.c index c21e0e62f0..b952315b21 100644 --- a/src/regexp.c +++ b/src/regexp.c @@ -1,41 +1,6 @@ /* vi:set ts=8 sts=4 sw=4 noet: * * Handling of regular expressions: vim_regcomp(), vim_regexec(), vim_regsub() - * - * NOTICE: - * - * This is NOT the original regular expression code as written by Henry - * Spencer. This code has been modified specifically for use with the VIM - * editor, and should not be used separately from Vim. If you want a good - * regular expression library, get the original code. The copyright notice - * that follows is from the original. - * - * END NOTICE - * - * Copyright (c) 1986 by University of Toronto. - * Written by Henry Spencer. Not derived from licensed software. - * - * Permission is granted to anyone to use this software for any - * purpose on any computer system, and to redistribute it freely, - * subject to the following restrictions: - * - * 1. The author is not responsible for the consequences of use of - * this software, no matter how awful, even if they arise - * from defects in it. - * - * 2. The origin of this software must not be misrepresented, either - * by explicit claim or by omission. - * - * 3. Altered versions must be plainly marked as such, and must not - * be misrepresented as being the original software. - * - * Beware that some of this code is subtly aware of the way operator - * precedence is structured in regular expressions. 
Serious changes in - * regular-expression syntax might require a total rethink. - * - * Changes have been made by Tony Andrews, Olaf 'Rhialto' Seibert, Robert - * Webb, Ciaran McCreesh and Bram Moolenaar. - * Named character class support added by Walter Briscoe (1998 Jul 01) */ // By default: do not create debugging logs or files related to regular @@ -56,197 +21,6 @@ #endif /* - * The "internal use only" fields in regexp.h are present to pass info from - * compile to execute that permits the execute phase to run lots faster on - * simple cases. They are: - * - * regstart char that must begin a match; NUL if none obvious; Can be a - * multi-byte character. - * reganch is the match anchored (at beginning-of-line only)? - * regmust string (pointer into program) that match must include, or NULL - * regmlen length of regmust string - * regflags RF_ values or'ed together - * - * Regstart and reganch permit very fast decisions on suitable starting points - * for a match, cutting down the work a lot. Regmust permits fast rejection - * of lines that cannot possibly match. The regmust tests are costly enough - * that vim_regcomp() supplies a regmust only if the r.e. contains something - * potentially expensive (at present, the only such thing detected is * or + - * at the start of the r.e., which can involve a lot of backup). Regmlen is - * supplied because the test in vim_regexec() needs it and vim_regcomp() is - * computing it anyway. - */ - -/* - * Structure for regexp "program". This is essentially a linear encoding - * of a nondeterministic finite-state machine (aka syntax charts or - * "railroad normal form" in parsing technology). Each node is an opcode - * plus a "next" pointer, possibly plus an operand. "Next" pointers of - * all nodes except BRANCH and BRACES_COMPLEX implement concatenation; a "next" - * pointer with a BRANCH on both ends of it is connecting two alternatives. 
- * (Here we have one of the subtle syntax dependencies: an individual BRANCH - * (as opposed to a collection of them) is never concatenated with anything - * because of operator precedence). The "next" pointer of a BRACES_COMPLEX - * node points to the node after the stuff to be repeated. - * The operand of some types of node is a literal string; for others, it is a - * node leading into a sub-FSM. In particular, the operand of a BRANCH node - * is the first node of the branch. - * (NB this is *not* a tree structure: the tail of the branch connects to the - * thing following the set of BRANCHes.) - * - * pattern is coded like: - * - * +-----------------+ - * | V - * <aa>\|<bb> BRANCH <aa> BRANCH <bb> --> END - * | ^ | ^ - * +------+ +----------+ - * - * - * +------------------+ - * V | - * <aa>* BRANCH BRANCH <aa> --> BACK BRANCH --> NOTHING --> END - * | | ^ ^ - * | +---------------+ | - * +---------------------------------------------+ - * - * - * +----------------------+ - * V | - * <aa>\+ BRANCH <aa> --> BRANCH --> BACK BRANCH --> NOTHING --> END - * | | ^ ^ - * | +-----------+ | - * +--------------------------------------------------+ - * - * - * +-------------------------+ - * V | - * <aa>\{} BRANCH BRACE_LIMITS --> BRACE_COMPLEX <aa> --> BACK END - * | | ^ - * | +----------------+ - * +-----------------------------------------------+ - * - * - * <aa>\@!<bb> BRANCH NOMATCH <aa> --> END <bb> --> END - * | | ^ ^ - * | +----------------+ | - * +--------------------------------+ - * - * +---------+ - * | V - * \z[abc] BRANCH BRANCH a BRANCH b BRANCH c BRANCH NOTHING --> END - * | | | | ^ ^ - * | | | +-----+ | - * | | +----------------+ | - * | +---------------------------+ | - * +------------------------------------------------------+ - * - * They all start with a BRANCH for "\|" alternatives, even when there is only - * one alternative. - */ - -/* - * The opcodes are: - */ - -/* definition number opnd? 
meaning */ -#define END 0 /* End of program or NOMATCH operand. */ -#define BOL 1 /* Match "" at beginning of line. */ -#define EOL 2 /* Match "" at end of line. */ -#define BRANCH 3 /* node Match this alternative, or the - * next... */ -#define BACK 4 /* Match "", "next" ptr points backward. */ -#define EXACTLY 5 /* str Match this string. */ -#define NOTHING 6 /* Match empty string. */ -#define STAR 7 /* node Match this (simple) thing 0 or more - * times. */ -#define PLUS 8 /* node Match this (simple) thing 1 or more - * times. */ -#define MATCH 9 /* node match the operand zero-width */ -#define NOMATCH 10 /* node check for no match with operand */ -#define BEHIND 11 /* node look behind for a match with operand */ -#define NOBEHIND 12 /* node look behind for no match with operand */ -#define SUBPAT 13 /* node match the operand here */ -#define BRACE_SIMPLE 14 /* node Match this (simple) thing between m and - * n times (\{m,n\}). */ -#define BOW 15 /* Match "" after [^a-zA-Z0-9_] */ -#define EOW 16 /* Match "" at [^a-zA-Z0-9_] */ -#define BRACE_LIMITS 17 /* nr nr define the min & max for BRACE_SIMPLE - * and BRACE_COMPLEX. */ -#define NEWL 18 /* Match line-break */ -#define BHPOS 19 /* End position for BEHIND or NOBEHIND */ - - -/* character classes: 20-48 normal, 50-78 include a line-break */ -#define ADD_NL 30 -#define FIRST_NL ANY + ADD_NL -#define ANY 20 /* Match any one character. */ -#define ANYOF 21 /* str Match any character in this string. */ -#define ANYBUT 22 /* str Match any character not in this - * string. 
*/ -#define IDENT 23 /* Match identifier char */ -#define SIDENT 24 /* Match identifier char but no digit */ -#define KWORD 25 /* Match keyword char */ -#define SKWORD 26 /* Match word char but no digit */ -#define FNAME 27 /* Match file name char */ -#define SFNAME 28 /* Match file name char but no digit */ -#define PRINT 29 /* Match printable char */ -#define SPRINT 30 /* Match printable char but no digit */ -#define WHITE 31 /* Match whitespace char */ -#define NWHITE 32 /* Match non-whitespace char */ -#define DIGIT 33 /* Match digit char */ -#define NDIGIT 34 /* Match non-digit char */ -#define HEX 35 /* Match hex char */ -#define NHEX 36 /* Match non-hex char */ -#define OCTAL 37 /* Match octal char */ -#define NOCTAL 38 /* Match non-octal char */ -#define WORD 39 /* Match word char */ -#define NWORD 40 /* Match non-word char */ -#define HEAD 41 /* Match head char */ -#define NHEAD 42 /* Match non-head char */ -#define ALPHA 43 /* Match alpha char */ -#define NALPHA 44 /* Match non-alpha char */ -#define LOWER 45 /* Match lowercase char */ -#define NLOWER 46 /* Match non-lowercase char */ -#define UPPER 47 /* Match uppercase char */ -#define NUPPER 48 /* Match non-uppercase char */ -#define LAST_NL NUPPER + ADD_NL -#define WITH_NL(op) ((op) >= FIRST_NL && (op) <= LAST_NL) - -#define MOPEN 80 /* -89 Mark this point in input as start of - * \( subexpr. MOPEN + 0 marks start of - * match. */ -#define MCLOSE 90 /* -99 Analogous to MOPEN. MCLOSE + 0 marks - * end of match. */ -#define BACKREF 100 /* -109 node Match same string again \1-\9 */ - -#ifdef FEAT_SYN_HL -# define ZOPEN 110 /* -119 Mark this point in input as start of - * \z( subexpr. */ -# define ZCLOSE 120 /* -129 Analogous to ZOPEN. */ -# define ZREF 130 /* -139 node Match external submatch \z1-\z9 */ -#endif - -#define BRACE_COMPLEX 140 /* -149 node Match nodes between m & n times */ - -#define NOPEN 150 /* Mark this point in input as start of - \%( subexpr. 
*/ -#define NCLOSE 151 /* Analogous to NOPEN. */ - -#define MULTIBYTECODE 200 /* mbc Match one multi-byte character */ -#define RE_BOF 201 /* Match "" at beginning of file. */ -#define RE_EOF 202 /* Match "" at end of file. */ -#define CURSOR 203 /* Match location of cursor. */ - -#define RE_LNUM 204 /* nr cmp Match line number */ -#define RE_COL 205 /* nr cmp Match column number */ -#define RE_VCOL 206 /* nr cmp Match virtual column number */ - -#define RE_MARK 207 /* mark cmp Match mark position */ -#define RE_VISUAL 208 /* Match Visual area */ -#define RE_COMPOSING 209 /* any composing characters */ - -/* * Magic characters have a special meaning, they don't match literally. * Magic characters are negative. This separates them from literal characters * (possibly multi-byte). Only ASCII characters can be Magic. @@ -272,7 +46,7 @@ toggle_Magic(int x) } /* - * The first byte of the regexp internal "program" is actually this magic + * The first byte of the BT regexp internal "program" is actually this magic * number; the start node begins in the second byte. It's used to catch the * most severe mutilation of the program by the caller. */ @@ -280,54 +54,6 @@ toggle_Magic(int x) #define REGMAGIC 0234 /* - * Opcode notes: - * - * BRANCH The set of branches constituting a single choice are hooked - * together with their "next" pointers, since precedence prevents - * anything being concatenated to any individual branch. The - * "next" pointer of the last BRANCH in a choice points to the - * thing following the whole choice. This is also where the - * final "next" pointer of each individual branch points; each - * branch starts with the operand node of a BRANCH node. - * - * BACK Normal "next" pointers all implicitly point forward; BACK - * exists to make loop structures possible. - * - * STAR,PLUS '=', and complex '*' and '+', are implemented as circular - * BRANCH structures using BACK. 
Simple cases (one character - * per match) are implemented with STAR and PLUS for speed - * and to minimize recursive plunges. - * - * BRACE_LIMITS This is always followed by a BRACE_SIMPLE or BRACE_COMPLEX - * node, and defines the min and max limits to be used for that - * node. - * - * MOPEN,MCLOSE ...are numbered at compile time. - * ZOPEN,ZCLOSE ...ditto - */ - -/* - * A node is one char of opcode followed by two chars of "next" pointer. - * "Next" pointers are stored as two 8-bit bytes, high order first. The - * value is a positive offset from the opcode of the node containing it. - * An operand, if any, simply follows the node. (Note that much of the - * code generation knows about this implicit relationship.) - * - * Using two bytes for the "next" pointer is vast overkill for most things, - * but allows patterns to get big without disasters. - */ -#define OP(p) ((int)*(p)) -#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377)) -#define OPERAND(p) ((p) + 3) -/* Obtain an operand that was stored as four bytes, MSB first. */ -#define OPERAND_MIN(p) (((long)(p)[3] << 24) + ((long)(p)[4] << 16) \ - + ((long)(p)[5] << 8) + (long)(p)[6]) -/* Obtain a second operand stored as four bytes. */ -#define OPERAND_MAX(p) OPERAND_MIN((p) + 4) -/* Obtain a second single-byte operand stored after a four bytes operand. */ -#define OPERAND_CMP(p) (p)[7] - -/* * Utility definitions. 
*/ #define UCHARAT(p) ((int)*(char_u *)(p)) @@ -345,18 +71,6 @@ toggle_Magic(int x) #define MAX_LIMIT (32767L << 16L) -static int cstrncmp(char_u *s1, char_u *s2, int *n); -static char_u *cstrchr(char_u *, int); - -#ifdef BT_REGEXP_DUMP -static void regdump(char_u *, bt_regprog_T *); -#endif -#ifdef DEBUG -static char_u *regprop(char_u *); -#endif - -static int re_mult_next(char *what); - static char_u e_missingbracket[] = N_("E769: Missing ] after %s["); static char_u e_reverse_range[] = N_("E944: Reverse range in character class"); static char_u e_large_class[] = N_("E945: Range too large in character class"); @@ -374,6 +88,14 @@ static char_u e_recursive[] = N_("E956: Cannot use pattern recursively"); #define NOT_MULTI 0 #define MULTI_ONE 1 #define MULTI_MULT 2 + +// return values for regmatch() +#define RA_FAIL 1 /* something failed, abort */ +#define RA_CONT 2 /* continue in inner loop */ +#define RA_BREAK 3 /* break inner loop */ +#define RA_MATCH 4 /* successful match */ +#define RA_NOMATCH 5 /* didn't match */ + /* * Return NOT_MULTI if c is not a "multi" operator. * Return MULTI_ONE if c is a single "multi" operator. @@ -389,22 +111,6 @@ re_multi_type(int c) return NOT_MULTI; } -/* - * Flags to be passed up and down. - */ -#define HASWIDTH 0x1 /* Known never to match null string. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define HASNL 0x8 /* Contains some \n. */ -#define HASLOOKBH 0x10 /* Contains "\@<=" or "\@<!". */ -#define WORST 0 /* Worst case. */ - -/* - * When regcode is set to this value, code is not emitted and size is computed - * instead. - */ -#define JUST_CALC_SIZE ((char_u *) -1) - static char_u *reg_prev_sub = NULL; /* @@ -587,25 +293,15 @@ init_class_tab(void) */ static char_u *regparse; /* Input-scan pointer. */ -static int prevchr_len; /* byte length of previous char */ -static int num_complex_braces; /* Complex \{...} count */ static int regnpar; /* () count. 
*/ #ifdef FEAT_SYN_HL static int regnzpar; /* \z() count. */ static int re_has_z; /* \z item detected */ #endif -static char_u *regcode; /* Code-emit pointer, or JUST_CALC_SIZE */ -static long regsize; /* Code size. */ -static int reg_toolong; /* TRUE when offset out of range */ -static char_u had_endbrace[NSUBEXP]; /* flags, TRUE if end of () found */ static unsigned regflags; /* RF_ flags for prog */ -static long brace_min[10]; /* Minimums for complex brace repeats */ -static long brace_max[10]; /* Maximums for complex brace repeats */ -static int brace_count[10]; /* Current counts for complex brace repeats */ #if defined(FEAT_SYN_HL) || defined(PROTO) static int had_eol; /* TRUE when EOL found by vim_regcomp() */ #endif -static int one_exactly = FALSE; /* only do one char for EXACTLY */ static int reg_magic; /* magicness of the pattern: */ #define MAGIC_NONE 1 /* "\V" very unmagic */ @@ -670,9 +366,6 @@ typedef struct int regnpar; } parse_state_T; -/* - * Forward declarations for vim_regcomp()'s friends. 
- */ static void initchr(char_u *); static int getchr(void); static void skipchr_keepstart(void); @@ -683,27 +376,10 @@ static long gethexchrs(int maxinputlen); static long getoctchrs(void); static long getdecchrs(void); static int coll_get_char(void); -static void regcomp_start(char_u *expr, int flags); -static char_u *reg(int, int *); -static char_u *regbranch(int *flagp); -static char_u *regconcat(int *flagp); -static char_u *regpiece(int *); -static char_u *regatom(int *); -static char_u *regnode(int); -static int use_multibytecode(int c); static int prog_magic_wrong(void); -static char_u *regnext(char_u *); -static void regc(int b); -static void regmbc(int c); -#define REGMBC(x) regmbc(x); -#define CASEMBC(x) case x: -static void reginsert(int, char_u *); -static void reginsert_nr(int op, long val, char_u *opnd); -static void reginsert_limits(int, long, long, char_u *); -static char_u *re_put_long(char_u *pr, long_u val); -static int read_limits(long *, long *); -static void regtail(char_u *, char_u *); -static void regoptail(char_u *, char_u *); +static int cstrncmp(char_u *s1, char_u *s2, int *n); +static char_u *cstrchr(char_u *, int); +static int re_mult_next(char *what); static int reg_iswordc(int); static regengine_T bt_regengine; @@ -772,333 +448,6 @@ static char *EQUIVAL_CLASS_C[16] = { #endif /* - * Produce the bytes for equivalence class "c". - * Currently only handles latin1, latin9 and utf-8. - * NOTE: When changing this function, also change nfa_emit_equi_class() - */ - static void -reg_equi_class(int c) -{ - if (enc_utf8 || STRCMP(p_enc, "latin1") == 0 - || STRCMP(p_enc, "iso-8859-15") == 0) - { -#ifdef EBCDIC - int i; - - /* This might be slower than switch/case below. */ - for (i = 0; i < 16; i++) - { - if (vim_strchr(EQUIVAL_CLASS_C[i], c) != NULL) - { - char *p = EQUIVAL_CLASS_C[i]; - - while (*p != 0) - regmbc(*p++); - return; - } - } -#else - switch (c) - { - /* Do not use '\300' style, it results in a negative number. 
*/ - case 'A': case 0xc0: case 0xc1: case 0xc2: - case 0xc3: case 0xc4: case 0xc5: - CASEMBC(0x100) CASEMBC(0x102) CASEMBC(0x104) CASEMBC(0x1cd) - CASEMBC(0x1de) CASEMBC(0x1e0) CASEMBC(0x1ea2) - regmbc('A'); regmbc(0xc0); regmbc(0xc1); - regmbc(0xc2); regmbc(0xc3); regmbc(0xc4); - regmbc(0xc5); - REGMBC(0x100) REGMBC(0x102) REGMBC(0x104) - REGMBC(0x1cd) REGMBC(0x1de) REGMBC(0x1e0) - REGMBC(0x1ea2) - return; - case 'B': CASEMBC(0x1e02) CASEMBC(0x1e06) - regmbc('B'); REGMBC(0x1e02) REGMBC(0x1e06) - return; - case 'C': case 0xc7: - CASEMBC(0x106) CASEMBC(0x108) CASEMBC(0x10a) CASEMBC(0x10c) - regmbc('C'); regmbc(0xc7); - REGMBC(0x106) REGMBC(0x108) REGMBC(0x10a) - REGMBC(0x10c) - return; - case 'D': CASEMBC(0x10e) CASEMBC(0x110) CASEMBC(0x1e0a) - CASEMBC(0x1e0e) CASEMBC(0x1e10) - regmbc('D'); REGMBC(0x10e) REGMBC(0x110) - REGMBC(0x1e0a) REGMBC(0x1e0e) REGMBC(0x1e10) - return; - case 'E': case 0xc8: case 0xc9: case 0xca: case 0xcb: - CASEMBC(0x112) CASEMBC(0x114) CASEMBC(0x116) CASEMBC(0x118) - CASEMBC(0x11a) CASEMBC(0x1eba) CASEMBC(0x1ebc) - regmbc('E'); regmbc(0xc8); regmbc(0xc9); - regmbc(0xca); regmbc(0xcb); - REGMBC(0x112) REGMBC(0x114) REGMBC(0x116) - REGMBC(0x118) REGMBC(0x11a) REGMBC(0x1eba) - REGMBC(0x1ebc) - return; - case 'F': CASEMBC(0x1e1e) - regmbc('F'); REGMBC(0x1e1e) - return; - case 'G': CASEMBC(0x11c) CASEMBC(0x11e) CASEMBC(0x120) - CASEMBC(0x122) CASEMBC(0x1e4) CASEMBC(0x1e6) CASEMBC(0x1f4) - CASEMBC(0x1e20) - regmbc('G'); REGMBC(0x11c) REGMBC(0x11e) - REGMBC(0x120) REGMBC(0x122) REGMBC(0x1e4) - REGMBC(0x1e6) REGMBC(0x1f4) REGMBC(0x1e20) - return; - case 'H': CASEMBC(0x124) CASEMBC(0x126) CASEMBC(0x1e22) - CASEMBC(0x1e26) CASEMBC(0x1e28) - regmbc('H'); REGMBC(0x124) REGMBC(0x126) - REGMBC(0x1e22) REGMBC(0x1e26) REGMBC(0x1e28) - return; - case 'I': case 0xcc: case 0xcd: case 0xce: case 0xcf: - CASEMBC(0x128) CASEMBC(0x12a) CASEMBC(0x12c) CASEMBC(0x12e) - CASEMBC(0x130) CASEMBC(0x1cf) CASEMBC(0x1ec8) - regmbc('I'); regmbc(0xcc); regmbc(0xcd); - 
regmbc(0xce); regmbc(0xcf); - REGMBC(0x128) REGMBC(0x12a) REGMBC(0x12c) - REGMBC(0x12e) REGMBC(0x130) REGMBC(0x1cf) - REGMBC(0x1ec8) - return; - case 'J': CASEMBC(0x134) - regmbc('J'); REGMBC(0x134) - return; - case 'K': CASEMBC(0x136) CASEMBC(0x1e8) CASEMBC(0x1e30) - CASEMBC(0x1e34) - regmbc('K'); REGMBC(0x136) REGMBC(0x1e8) - REGMBC(0x1e30) REGMBC(0x1e34) - return; - case 'L': CASEMBC(0x139) CASEMBC(0x13b) CASEMBC(0x13d) - CASEMBC(0x13f) CASEMBC(0x141) CASEMBC(0x1e3a) - regmbc('L'); REGMBC(0x139) REGMBC(0x13b) - REGMBC(0x13d) REGMBC(0x13f) REGMBC(0x141) - REGMBC(0x1e3a) - return; - case 'M': CASEMBC(0x1e3e) CASEMBC(0x1e40) - regmbc('M'); REGMBC(0x1e3e) REGMBC(0x1e40) - return; - case 'N': case 0xd1: - CASEMBC(0x143) CASEMBC(0x145) CASEMBC(0x147) CASEMBC(0x1e44) - CASEMBC(0x1e48) - regmbc('N'); regmbc(0xd1); - REGMBC(0x143) REGMBC(0x145) REGMBC(0x147) - REGMBC(0x1e44) REGMBC(0x1e48) - return; - case 'O': case 0xd2: case 0xd3: case 0xd4: case 0xd5: - case 0xd6: case 0xd8: - CASEMBC(0x14c) CASEMBC(0x14e) CASEMBC(0x150) CASEMBC(0x1a0) - CASEMBC(0x1d1) CASEMBC(0x1ea) CASEMBC(0x1ec) CASEMBC(0x1ece) - regmbc('O'); regmbc(0xd2); regmbc(0xd3); - regmbc(0xd4); regmbc(0xd5); regmbc(0xd6); - regmbc(0xd8); - REGMBC(0x14c) REGMBC(0x14e) REGMBC(0x150) - REGMBC(0x1a0) REGMBC(0x1d1) REGMBC(0x1ea) - REGMBC(0x1ec) REGMBC(0x1ece) - return; - case 'P': case 0x1e54: case 0x1e56: - regmbc('P'); REGMBC(0x1e54) REGMBC(0x1e56) - return; - case 'R': CASEMBC(0x154) CASEMBC(0x156) CASEMBC(0x158) - CASEMBC(0x1e58) CASEMBC(0x1e5e) - regmbc('R'); REGMBC(0x154) REGMBC(0x156) REGMBC(0x158) - REGMBC(0x1e58) REGMBC(0x1e5e) - return; - case 'S': CASEMBC(0x15a) CASEMBC(0x15c) CASEMBC(0x15e) - CASEMBC(0x160) CASEMBC(0x1e60) - regmbc('S'); REGMBC(0x15a) REGMBC(0x15c) - REGMBC(0x15e) REGMBC(0x160) REGMBC(0x1e60) - return; - case 'T': CASEMBC(0x162) CASEMBC(0x164) CASEMBC(0x166) - CASEMBC(0x1e6a) CASEMBC(0x1e6e) - regmbc('T'); REGMBC(0x162) REGMBC(0x164) - REGMBC(0x166) REGMBC(0x1e6a) REGMBC(0x1e6e) - 
return; - case 'U': case 0xd9: case 0xda: case 0xdb: case 0xdc: - CASEMBC(0x168) CASEMBC(0x16a) CASEMBC(0x16c) CASEMBC(0x16e) - CASEMBC(0x170) CASEMBC(0x172) CASEMBC(0x1af) CASEMBC(0x1d3) - CASEMBC(0x1ee6) - regmbc('U'); regmbc(0xd9); regmbc(0xda); - regmbc(0xdb); regmbc(0xdc); - REGMBC(0x168) REGMBC(0x16a) REGMBC(0x16c) - REGMBC(0x16e) REGMBC(0x170) REGMBC(0x172) - REGMBC(0x1af) REGMBC(0x1d3) REGMBC(0x1ee6) - return; - case 'V': CASEMBC(0x1e7c) - regmbc('V'); REGMBC(0x1e7c) - return; - case 'W': CASEMBC(0x174) CASEMBC(0x1e80) CASEMBC(0x1e82) - CASEMBC(0x1e84) CASEMBC(0x1e86) - regmbc('W'); REGMBC(0x174) REGMBC(0x1e80) - REGMBC(0x1e82) REGMBC(0x1e84) REGMBC(0x1e86) - return; - case 'X': CASEMBC(0x1e8a) CASEMBC(0x1e8c) - regmbc('X'); REGMBC(0x1e8a) REGMBC(0x1e8c) - return; - case 'Y': case 0xdd: - CASEMBC(0x176) CASEMBC(0x178) CASEMBC(0x1e8e) CASEMBC(0x1ef2) - CASEMBC(0x1ef6) CASEMBC(0x1ef8) - regmbc('Y'); regmbc(0xdd); - REGMBC(0x176) REGMBC(0x178) REGMBC(0x1e8e) - REGMBC(0x1ef2) REGMBC(0x1ef6) REGMBC(0x1ef8) - return; - case 'Z': CASEMBC(0x179) CASEMBC(0x17b) CASEMBC(0x17d) - CASEMBC(0x1b5) CASEMBC(0x1e90) CASEMBC(0x1e94) - regmbc('Z'); REGMBC(0x179) REGMBC(0x17b) - REGMBC(0x17d) REGMBC(0x1b5) REGMBC(0x1e90) - REGMBC(0x1e94) - return; - case 'a': case 0xe0: case 0xe1: case 0xe2: - case 0xe3: case 0xe4: case 0xe5: - CASEMBC(0x101) CASEMBC(0x103) CASEMBC(0x105) CASEMBC(0x1ce) - CASEMBC(0x1df) CASEMBC(0x1e1) CASEMBC(0x1ea3) - regmbc('a'); regmbc(0xe0); regmbc(0xe1); - regmbc(0xe2); regmbc(0xe3); regmbc(0xe4); - regmbc(0xe5); - REGMBC(0x101) REGMBC(0x103) REGMBC(0x105) - REGMBC(0x1ce) REGMBC(0x1df) REGMBC(0x1e1) - REGMBC(0x1ea3) - return; - case 'b': CASEMBC(0x1e03) CASEMBC(0x1e07) - regmbc('b'); REGMBC(0x1e03) REGMBC(0x1e07) - return; - case 'c': case 0xe7: - CASEMBC(0x107) CASEMBC(0x109) CASEMBC(0x10b) CASEMBC(0x10d) - regmbc('c'); regmbc(0xe7); - REGMBC(0x107) REGMBC(0x109) REGMBC(0x10b) - REGMBC(0x10d) - return; - case 'd': CASEMBC(0x10f) CASEMBC(0x111) 
CASEMBC(0x1e0b) - CASEMBC(0x1e0f) CASEMBC(0x1e11) - regmbc('d'); REGMBC(0x10f) REGMBC(0x111) - REGMBC(0x1e0b) REGMBC(0x1e0f) REGMBC(0x1e11) - return; - case 'e': case 0xe8: case 0xe9: case 0xea: case 0xeb: - CASEMBC(0x113) CASEMBC(0x115) CASEMBC(0x117) CASEMBC(0x119) - CASEMBC(0x11b) CASEMBC(0x1ebb) CASEMBC(0x1ebd) - regmbc('e'); regmbc(0xe8); regmbc(0xe9); - regmbc(0xea); regmbc(0xeb); - REGMBC(0x113) REGMBC(0x115) REGMBC(0x117) - REGMBC(0x119) REGMBC(0x11b) REGMBC(0x1ebb) - REGMBC(0x1ebd) - return; - case 'f': CASEMBC(0x1e1f) - regmbc('f'); REGMBC(0x1e1f) - return; - case 'g': CASEMBC(0x11d) CASEMBC(0x11f) CASEMBC(0x121) - CASEMBC(0x123) CASEMBC(0x1e5) CASEMBC(0x1e7) CASEMBC(0x1f5) - CASEMBC(0x1e21) - regmbc('g'); REGMBC(0x11d) REGMBC(0x11f) - REGMBC(0x121) REGMBC(0x123) REGMBC(0x1e5) - REGMBC(0x1e7) REGMBC(0x1f5) REGMBC(0x1e21) - return; - case 'h': CASEMBC(0x125) CASEMBC(0x127) CASEMBC(0x1e23) - CASEMBC(0x1e27) CASEMBC(0x1e29) CASEMBC(0x1e96) - regmbc('h'); REGMBC(0x125) REGMBC(0x127) - REGMBC(0x1e23) REGMBC(0x1e27) REGMBC(0x1e29) - REGMBC(0x1e96) - return; - case 'i': case 0xec: case 0xed: case 0xee: case 0xef: - CASEMBC(0x129) CASEMBC(0x12b) CASEMBC(0x12d) CASEMBC(0x12f) - CASEMBC(0x1d0) CASEMBC(0x1ec9) - regmbc('i'); regmbc(0xec); regmbc(0xed); - regmbc(0xee); regmbc(0xef); - REGMBC(0x129) REGMBC(0x12b) REGMBC(0x12d) - REGMBC(0x12f) REGMBC(0x1d0) REGMBC(0x1ec9) - return; - case 'j': CASEMBC(0x135) CASEMBC(0x1f0) - regmbc('j'); REGMBC(0x135) REGMBC(0x1f0) - return; - case 'k': CASEMBC(0x137) CASEMBC(0x1e9) CASEMBC(0x1e31) - CASEMBC(0x1e35) - regmbc('k'); REGMBC(0x137) REGMBC(0x1e9) - REGMBC(0x1e31) REGMBC(0x1e35) - return; - case 'l': CASEMBC(0x13a) CASEMBC(0x13c) CASEMBC(0x13e) - CASEMBC(0x140) CASEMBC(0x142) CASEMBC(0x1e3b) - regmbc('l'); REGMBC(0x13a) REGMBC(0x13c) - REGMBC(0x13e) REGMBC(0x140) REGMBC(0x142) - REGMBC(0x1e3b) - return; - case 'm': CASEMBC(0x1e3f) CASEMBC(0x1e41) - regmbc('m'); REGMBC(0x1e3f) REGMBC(0x1e41) - return; - case 'n': case 0xf1: - 
CASEMBC(0x144) CASEMBC(0x146) CASEMBC(0x148) CASEMBC(0x149) - CASEMBC(0x1e45) CASEMBC(0x1e49) - regmbc('n'); regmbc(0xf1); - REGMBC(0x144) REGMBC(0x146) REGMBC(0x148) - REGMBC(0x149) REGMBC(0x1e45) REGMBC(0x1e49) - return; - case 'o': case 0xf2: case 0xf3: case 0xf4: case 0xf5: - case 0xf6: case 0xf8: - CASEMBC(0x14d) CASEMBC(0x14f) CASEMBC(0x151) CASEMBC(0x1a1) - CASEMBC(0x1d2) CASEMBC(0x1eb) CASEMBC(0x1ed) CASEMBC(0x1ecf) - regmbc('o'); regmbc(0xf2); regmbc(0xf3); - regmbc(0xf4); regmbc(0xf5); regmbc(0xf6); - regmbc(0xf8); - REGMBC(0x14d) REGMBC(0x14f) REGMBC(0x151) - REGMBC(0x1a1) REGMBC(0x1d2) REGMBC(0x1eb) - REGMBC(0x1ed) REGMBC(0x1ecf) - return; - case 'p': CASEMBC(0x1e55) CASEMBC(0x1e57) - regmbc('p'); REGMBC(0x1e55) REGMBC(0x1e57) - return; - case 'r': CASEMBC(0x155) CASEMBC(0x157) CASEMBC(0x159) - CASEMBC(0x1e59) CASEMBC(0x1e5f) - regmbc('r'); REGMBC(0x155) REGMBC(0x157) REGMBC(0x159) - REGMBC(0x1e59) REGMBC(0x1e5f) - return; - case 's': CASEMBC(0x15b) CASEMBC(0x15d) CASEMBC(0x15f) - CASEMBC(0x161) CASEMBC(0x1e61) - regmbc('s'); REGMBC(0x15b) REGMBC(0x15d) - REGMBC(0x15f) REGMBC(0x161) REGMBC(0x1e61) - return; - case 't': CASEMBC(0x163) CASEMBC(0x165) CASEMBC(0x167) - CASEMBC(0x1e6b) CASEMBC(0x1e6f) CASEMBC(0x1e97) - regmbc('t'); REGMBC(0x163) REGMBC(0x165) REGMBC(0x167) - REGMBC(0x1e6b) REGMBC(0x1e6f) REGMBC(0x1e97) - return; - case 'u': case 0xf9: case 0xfa: case 0xfb: case 0xfc: - CASEMBC(0x169) CASEMBC(0x16b) CASEMBC(0x16d) CASEMBC(0x16f) - CASEMBC(0x171) CASEMBC(0x173) CASEMBC(0x1b0) CASEMBC(0x1d4) - CASEMBC(0x1ee7) - regmbc('u'); regmbc(0xf9); regmbc(0xfa); - regmbc(0xfb); regmbc(0xfc); - REGMBC(0x169) REGMBC(0x16b) REGMBC(0x16d) - REGMBC(0x16f) REGMBC(0x171) REGMBC(0x173) - REGMBC(0x1b0) REGMBC(0x1d4) REGMBC(0x1ee7) - return; - case 'v': CASEMBC(0x1e7d) - regmbc('v'); REGMBC(0x1e7d) - return; - case 'w': CASEMBC(0x175) CASEMBC(0x1e81) CASEMBC(0x1e83) - CASEMBC(0x1e85) CASEMBC(0x1e87) CASEMBC(0x1e98) - regmbc('w'); REGMBC(0x175) REGMBC(0x1e81) - 
REGMBC(0x1e83) REGMBC(0x1e85) REGMBC(0x1e87) - REGMBC(0x1e98) - return; - case 'x': CASEMBC(0x1e8b) CASEMBC(0x1e8d) - regmbc('x'); REGMBC(0x1e8b) REGMBC(0x1e8d) - return; - case 'y': case 0xfd: case 0xff: - CASEMBC(0x177) CASEMBC(0x1e8f) CASEMBC(0x1e99) - CASEMBC(0x1ef3) CASEMBC(0x1ef7) CASEMBC(0x1ef9) - regmbc('y'); regmbc(0xfd); regmbc(0xff); - REGMBC(0x177) REGMBC(0x1e8f) REGMBC(0x1e99) - REGMBC(0x1ef3) REGMBC(0x1ef7) REGMBC(0x1ef9) - return; - case 'z': CASEMBC(0x17a) CASEMBC(0x17c) CASEMBC(0x17e) - CASEMBC(0x1b6) CASEMBC(0x1e91) CASEMBC(0x1e95) - regmbc('z'); REGMBC(0x17a) REGMBC(0x17c) - REGMBC(0x17e) REGMBC(0x1b6) REGMBC(0x1e91) - REGMBC(0x1e95) - return; - } -#endif - } - regmbc(c); -} - -/* * Check for a collating element "[.a.]". "pp" points to the '['. * Returns a character. Zero means that no item was recognized. Otherwise * "pp" is advanced to after the item. @@ -1247,1660 +596,13 @@ skip_regexp( } /* - * Return TRUE if the back reference is legal. We must have seen the close - * brace. - * TODO: Should also check that we don't refer to something that is repeated - * (+*=): what instance of the repetition should we match? - */ - static int -seen_endbrace(int refnum) -{ - if (!had_endbrace[refnum]) - { - char_u *p; - - /* Trick: check if "@<=" or "@<!" follows, in which case - * the \1 can appear before the referenced match. */ - for (p = regparse; *p != NUL; ++p) - if (p[0] == '@' && p[1] == '<' && (p[2] == '!' || p[2] == '=')) - break; - if (*p == NUL) - { - emsg(_("E65: Illegal back reference")); - rc_did_emsg = TRUE; - return FALSE; - } - } - return TRUE; -} - -/* - * bt_regcomp() - compile a regular expression into internal code for the - * traditional back track matcher. - * Returns the program in allocated space. Returns NULL for an error. - * - * We can't allocate space until we know how big the compiled form will be, - * but we can't compile it (and thus know how big it is) until we've got a - * place to put the code. 
So we cheat: we compile it twice, once with code - * generation turned off and size counting turned on, and once "for real". - * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the - * code and thus invalidate pointers into it. (Note that it has to be in - * one piece because vim_free() must be able to free it all.) - * - * Whether upper/lower case is to be ignored is decided when executing the - * program, it does not matter here. - * - * Beware that the optimization-preparation code in here knows about some - * of the structure of the compiled regexp. - * "re_flags": RE_MAGIC and/or RE_STRING. - */ - static regprog_T * -bt_regcomp(char_u *expr, int re_flags) -{ - bt_regprog_T *r; - char_u *scan; - char_u *longest; - int len; - int flags; - - if (expr == NULL) - EMSG_RET_NULL(_(e_null)); - - init_class_tab(); - - /* - * First pass: determine size, legality. - */ - regcomp_start(expr, re_flags); - regcode = JUST_CALC_SIZE; - regc(REGMAGIC); - if (reg(REG_NOPAREN, &flags) == NULL) - return NULL; - - /* Allocate space. */ - r = alloc(offsetof(bt_regprog_T, program) + regsize); - if (r == NULL) - return NULL; - r->re_in_use = FALSE; - - /* - * Second pass: emit code. - */ - regcomp_start(expr, re_flags); - regcode = r->program; - regc(REGMAGIC); - if (reg(REG_NOPAREN, &flags) == NULL || reg_toolong) - { - vim_free(r); - if (reg_toolong) - EMSG_RET_NULL(_("E339: Pattern too long")); - return NULL; - } - - /* Dig out information for optimizations. */ - r->regstart = NUL; /* Worst-case defaults. */ - r->reganch = 0; - r->regmust = NULL; - r->regmlen = 0; - r->regflags = regflags; - if (flags & HASNL) - r->regflags |= RF_HASNL; - if (flags & HASLOOKBH) - r->regflags |= RF_LOOKBH; -#ifdef FEAT_SYN_HL - /* Remember whether this pattern has any \z specials in it. */ - r->reghasz = re_has_z; -#endif - scan = r->program + 1; /* First BRANCH. 
*/ - if (OP(regnext(scan)) == END) /* Only one top-level choice. */ - { - scan = OPERAND(scan); |