summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBram Moolenaar <Bram@vim.org>2017-03-29 15:31:20 +0200
committerBram Moolenaar <Bram@vim.org>2017-03-29 15:31:20 +0200
commit0c078fc7db2902d4ccba04506db082ddbef45a8c (patch)
tree7c142af9692ea6315986e3d2239e8d3f143f6881
parentc6cd8409c2993b1476e123fba11cb4b8d743b896 (diff)
patch 8.0.0519: character classes are not well testedv8.0.0519
Problem: Character classes are not well tested. They can differ between platforms. Solution: Add tests. In the documentation make clear which classes depend on what library function. Only use :cntrl: and :graph: for ASCII. (Kazunobu Kuriyama, Dominique Pelle, closes #1560) Update the documentation.
-rw-r--r--runtime/doc/pattern.txt43
-rw-r--r--src/regexp.c6
-rw-r--r--src/regexp_nfa.c4
-rw-r--r--src/testdir/test_regexp_utf8.vim67
-rw-r--r--src/version.c2
5 files changed, 90 insertions, 32 deletions
diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
index 1496604983..090ca6452e 100644
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -1085,25 +1085,27 @@ x A single character, with no special meaning, matches itself
- A character class expression is evaluated to the set of characters
belonging to that character class. The following character classes
are supported:
- Name Contents ~
-*[:alnum:]* [:alnum:] ASCII letters and digits
-*[:alpha:]* [:alpha:] ASCII letters
-*[:blank:]* [:blank:] space and tab characters
-*[:cntrl:]* [:cntrl:] control characters
-*[:digit:]* [:digit:] decimal digits
-*[:graph:]* [:graph:] printable characters excluding space
-*[:lower:]* [:lower:] lowercase letters (all letters when
+ Name Func Contents ~
+*[:alnum:]* [:alnum:] isalnum ASCII letters and digits
+*[:alpha:]* [:alpha:] isalpha ASCII letters
+*[:blank:]* [:blank:] space and tab
+*[:cntrl:]* [:cntrl:] iscntrl ASCII control characters
+*[:digit:]* [:digit:] decimal digits '0' to '9'
+*[:graph:]* [:graph:] isgraph ASCII printable characters excluding
+ space
+*[:lower:]* [:lower:] (1) lowercase letters (all letters when
'ignorecase' is used)
-*[:print:]* [:print:] printable characters including space
-*[:punct:]* [:punct:] ASCII punctuation characters
-*[:space:]* [:space:] whitespace characters
-*[:upper:]* [:upper:] uppercase letters (all letters when
+*[:print:]* [:print:] (2) printable characters including space
+*[:punct:]* [:punct:] ispunct ASCII punctuation characters
+*[:space:]* [:space:] whitespace characters: space, tab, CR,
+ NL, vertical tab, form feed
+*[:upper:]* [:upper:] (3) uppercase letters (all letters when
'ignorecase' is used)
-*[:xdigit:]* [:xdigit:] hexadecimal digits
-*[:return:]* [:return:] the <CR> character
-*[:tab:]* [:tab:] the <Tab> character
-*[:escape:]* [:escape:] the <Esc> character
-*[:backspace:]* [:backspace:] the <BS> character
+*[:xdigit:]* [:xdigit:] hexadecimal digits: 0-9, a-f, A-F
+*[:return:]* [:return:] the <CR> character
+*[:tab:]* [:tab:] the <Tab> character
+*[:escape:]* [:escape:] the <Esc> character
+*[:backspace:]* [:backspace:] the <BS> character
The brackets in character class expressions are additional to the
brackets delimiting a collection. For example, the following is a
plausible pattern for a UNIX filename: "[-./[:alnum:]_~]\+" That is,
@@ -1114,6 +1116,13 @@ x A single character, with no special meaning, matches itself
regexp engine. See |two-engines|. In the future these items may
work for multi-byte characters. For now, to get all "alpha"
characters you can use: [[:lower:][:upper:]].
+
+ The "Func" column shows what library function is used. The
+ implementation depends on the system. Otherwise:
+ (1) Uses islower() for ASCII and Vim builtin rules for other
+ characters when built with the |+multi_byte| feature.
+ (2) Uses Vim builtin rules
+ (3) As with (1) but using isupper()
*/[[=* *[==]*
- An equivalence class. This means that characters are matched that
have almost the same meaning, e.g., when ignoring accents. This
diff --git a/src/regexp.c b/src/regexp.c
index 91b8015bb8..b4fe7d7ebf 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -2555,17 +2555,17 @@ collection:
regc('\t');
break;
case CLASS_CNTRL:
- for (cu = 1; cu <= 255; cu++)
+ for (cu = 1; cu <= 127; cu++)
if (iscntrl(cu))
regmbc(cu);
break;
case CLASS_DIGIT:
- for (cu = 1; cu <= 255; cu++)
+ for (cu = 1; cu <= 127; cu++)
if (VIM_ISDIGIT(cu))
regmbc(cu);
break;
case CLASS_GRAPH:
- for (cu = 1; cu <= 255; cu++)
+ for (cu = 1; cu <= 127; cu++)
if (isgraph(cu))
regmbc(cu);
break;
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index 20ef1869e2..e6d8255e9c 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -4871,7 +4871,7 @@ check_char_class(int class, int c)
return OK;
break;
case NFA_CLASS_CNTRL:
- if (c >= 1 && c <= 255 && iscntrl(c))
+ if (c >= 1 && c <= 127 && iscntrl(c))
return OK;
break;
case NFA_CLASS_DIGIT:
@@ -4879,7 +4879,7 @@ check_char_class(int class, int c)
return OK;
break;
case NFA_CLASS_GRAPH:
- if (c >= 1 && c <= 255 && isgraph(c))
+ if (c >= 1 && c <= 127 && isgraph(c))
return OK;
break;
case NFA_CLASS_LOWER:
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index d2259835ca..47bd7014ab 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -38,12 +38,21 @@ func s:classes_test()
set isprint=@,161-255
call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+'))
+ let alnumchars = ''
let alphachars = ''
+ let backspacechar = ''
+ let blankchars = ''
+ let cntrlchars = ''
+ let digitchars = ''
+ let escapechar = ''
+ let graphchars = ''
let lowerchars = ''
- let upperchars = ''
- let alnumchars = ''
let printchars = ''
let punctchars = ''
+ let returnchar = ''
+ let spacechars = ''
+ let tabchar = ''
+ let upperchars = ''
let xdigitchars = ''
let i = 1
while i <= 255
@@ -51,21 +60,48 @@ func s:classes_test()
if c =~ '[[:alpha:]]'
let alphachars .= c
endif
- if c =~ '[[:lower:]]'
- let lowerchars .= c
- endif
- if c =~ '[[:upper:]]'
- let upperchars .= c
- endif
if c =~ '[[:alnum:]]'
let alnumchars .= c
endif
+ if c =~ '[[:backspace:]]'
+ let backspacechar .= c
+ endif
+ if c =~ '[[:blank:]]'
+ let blankchars .= c
+ endif
+ if c =~ '[[:cntrl:]]'
+ let cntrlchars .= c
+ endif
+ if c =~ '[[:digit:]]'
+ let digitchars .= c
+ endif
+ if c =~ '[[:escape:]]'
+ let escapechar .= c
+ endif
+ if c =~ '[[:graph:]]'
+ let graphchars .= c
+ endif
+ if c =~ '[[:lower:]]'
+ let lowerchars .= c
+ endif
if c =~ '[[:print:]]'
let printchars .= c
endif
if c =~ '[[:punct:]]'
let punctchars .= c
endif
+ if c =~ '[[:return:]]'
+ let returnchar .= c
+ endif
+ if c =~ '[[:space:]]'
+ let spacechars .= c
+ endif
+ if c =~ '[[:tab:]]'
+ let tabchar .= c
+ endif
+ if c =~ '[[:upper:]]'
+ let upperchars .= c
+ endif
if c =~ '[[:xdigit:]]'
let xdigitchars .= c
endif
@@ -73,11 +109,22 @@ func s:classes_test()
endwhile
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars)
- call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
- call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars)
+ call assert_equal("\b", backspacechar)
+ call assert_equal("\t ", blankchars)
+ " Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis.
+ " call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars)
+ call assert_equal("0123456789", digitchars)
+ call assert_equal("\<Esc>", escapechar)
+ " Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis.
+ " call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars)
+ call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars)
call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars)
+ call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
+ call assert_equal("\r", returnchar)
+ call assert_equal("\t\n\x0b\f\r ", spacechars)
+ call assert_equal("\t", tabchar)
call assert_equal('0123456789ABCDEFabcdef', xdigitchars)
endfunc
diff --git a/src/version.c b/src/version.c
index 45d6e8f730..36adf25a54 100644
--- a/src/version.c
+++ b/src/version.c
@@ -765,6 +765,8 @@ static char *(features[]) =
static int included_patches[] =
{ /* Add new patch number below this line */
/**/
+ 519,
+/**/
518,
/**/
517,