summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author pkoppstein <pkoppstein@gmail.com> 2023-07-03 18:46:29 -0400
committer GitHub <noreply@github.com> 2023-07-04 07:46:29 +0900
commit 83f375cc831039396167d4d2b5f901f4b33a8707 (patch)
tree 1ec0e167f315971aefeceed86b1d3d283a3528e3
parent edb0d88e3bb563bbc671c87d139d3c051987e77c (diff)
Revamp sub/3 to resolve most issues with gsub (and sub with "g") (#2641)
The primary purpose of this commit is to rectify most problems with `gsub` (and also `sub` with the `g` option), in particular fix #1425 ('\b'), fix #2354 (lookahead), and fix #2532 (regex == `"^(?!cd ).*$|^cd "`). This commit also partly resolves #2148 and resolves #1206 in that `gsub` no longer loops infinitely; however, because the new `gsub` depends critically on `match/2`, the behavior when regex == `""` is sometimes non-standard. The documentation has been updated to reflect the fact that `sub` and `gsub` are intended to be regular in the second argument. Also, `_nwise/1` has been tweaked to take advantage of TCO.
-rw-r--r-- docs/content/manual/manual.yml 37
-rw-r--r-- src/builtin.c 3
-rw-r--r-- src/builtin.jq 62
-rw-r--r-- tests/jq.test 1
-rw-r--r-- tests/onig.test 65
5 files changed, 113 insertions, 55 deletions
diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index f9bdb102..d5918be8 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -2481,32 +2481,43 @@ sections:
input: '("ab,cd", "ef, gh")'
output: ['"ab"', '"cd"', '"ef"', '"gh"']
- - title: "`sub(regex; tostring)`, `sub(regex; string; flags)`"
+ - title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`"
body: |
- Emit the string obtained by replacing the first match of regex in the
- input string with `tostring`, after interpolation. `tostring` should
- be a jq string, and may contain references to named captures. The
- named captures are, in effect, presented as a JSON object (as
- constructed by `capture`) to `tostring`, so a reference to a captured
- variable named "x" would take the form: `"\(.x)"`.
+ Emit the string obtained by replacing the first match of
+ regex in the input string with `tostring`, after
+ interpolation. `tostring` should be a jq string or a stream
+ of such strings, each of which may contain references to
+ named captures. The named captures are, in effect, presented
+ as a JSON object (as constructed by `capture`) to
+ `tostring`, so a reference to a captured variable named "x"
+ would take the form: `"\(.x)"`.
example:
- - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")'
- input: '"123abc456"'
- output: '"ZabcZabc"'
+ - program: 'sub("[^a-z]*(?<x>[a-z]*)"; "Z\(.x)"; "g")'
+ input: '"123abc456def"'
+ output: ['"ZabcZdef"']
- - title: "`gsub(regex; string)`, `gsub(regex; string; flags)`"
+ - program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]'
+ input: '"aB"'
+ output: ['["AB","aB"]']
+
+ - title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`"
body: |
`gsub` is like `sub` but all the non-overlapping occurrences of the regex are
- replaced by the string, after interpolation.
+ replaced by `tostring`, after interpolation. If the second argument is a stream
+ of jq strings, then `gsub` will produce a corresponding stream of JSON strings.
example:
- program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
input: '"Abcabc"'
- output: '"+A-+a-"'
+ output: ['"+A-+a-"']
+
+ - program: '[gsub("p"; "a", "b")]'
+ input: '"p"'
+ output: ['["a","b"]']
- title: Advanced features
diff --git a/src/builtin.c b/src/builtin.c
index 9b2d9a23..2d1156dc 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) {
match = jv_object_set(match, jv_string("string"), jv_string(""));
match = jv_object_set(match, jv_string("captures"), jv_array());
result = jv_array_append(result, match);
- start += 1;
+ // ensure '"qux" | match("(?=u)"; "g")' matches just once
+ start = (const UChar*)(input_string+region->end[0]+1);
continue;
}
diff --git a/src/builtin.jq b/src/builtin.jq
index a102fd51..09663511 100644
--- a/src/builtin.jq
+++ b/src/builtin.jq
@@ -99,8 +99,10 @@ def scan(re):
#
# If input is an array, then emit a stream of successive subarrays of length n (or less),
# and similarly for strings.
-def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end;
-def _nwise($n): _nwise(.; $n);
+def _nwise($n):
+ def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end;
+ n;
+def _nwise(a; $n): a | _nwise($n);
#
# splits/1 produces a stream; split/1 is retained for backward compatibility.
def splits($re; flags): . as $s
@@ -114,47 +116,26 @@ def splits($re): splits($re; null);
# split emits an array for backward compatibility
def split($re; flags): [ splits($re; flags) ];
#
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s):
- . as $in
- | [match($re)]
- | if length == 0 then $in
- else .[0]
- | . as $r
-# # create the "capture" object:
- | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
- ({}; . + $pair)
- | $in[0:$r.offset] + s + $in[$r.offset+$r.length:]
- end ;
-#
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s; flags):
- def subg: [explode[] | select(. != 103)] | implode;
- # "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g
- def sub1(fla; gs):
- def mysub:
- . as $in
- | [match($re; fla)]
- | if length == 0 then $in
- else .[0] as $edit
- | ($edit | .offset + .length) as $len
- # create the "capture" object:
- | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
- ({}; . + $pair)
- | $in[0:$edit.offset]
- + s
- + ($in[$len:] | if length > 0 and gs then mysub else . end)
- end ;
- mysub ;
- (flags | index("g")) as $gs
- | (flags | if $gs then subg else . end) as $fla
- | sub1($fla; $gs);
+# If s contains capture variables, then create a capture object and pipe it to s, bearing
+# in mind that s could be a stream
+def sub($re; s; $flags):
+ . as $in
+ | (reduce match($re; $flags) as $edit
+ ({result: [], previous: 0};
+ $in[ .previous: ($edit | .offset) ] as $gap
+ # create the "capture" objects (one per item in s)
+ | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
+ ({}; . + $pair) | s ] as $inserts
+ | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix])
+ | .previous = ($edit | .offset + .length ) )
+ | .result[] + $in[.previous:] )
+ // $in;
#
def sub($re; s): sub($re; s; "");
-# repeated substitution of re (which may contain named captures)
+#
def gsub($re; s; flags): sub($re; s; flags + "g");
def gsub($re; s): sub($re; s; "g");
-
+#
########################################################################
# generic iterator/generator
def while(cond; update):
@@ -206,7 +187,7 @@ def transpose:
| length as $length
| reduce range(0; $max) as $j
([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] )
- end;
+ end;
def in(xs): . as $x | xs | has($x);
def inside(xs): . as $x | xs | contains($x);
def repeat(exp):
@@ -237,7 +218,6 @@ def tostream:
getpath($p) |
reduce path(.[]?) as $q ([$p, .]; [$p+$q]);
-
# Assuming the input array is sorted, bsearch/1 returns
# the index of the target if the target is in the input array; and otherwise
# (-1 - ix), where ix is the insertion point that would leave the array sorted.
diff --git a/tests/jq.test b/tests/jq.test
index ca8e2705..78c4017b 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -1731,3 +1731,4 @@ false
. |= try . catch .
1
1
+
diff --git a/tests/onig.test b/tests/onig.test
index daacae9c..805efaba 100644
--- a/tests/onig.test
+++ b/tests/onig.test
@@ -75,6 +75,45 @@ gsub( "(.*)"; ""; "x")
""
""
+gsub( ""; "a"; "g")
+""
+"a"
+
+gsub( "^"; ""; "g")
+"a"
+"a"
+
+
+# The following is a regression test and should not be construed as a requirement other than that execution should terminate:
+gsub( ""; "a"; "g")
+"a"
+"aa"
+
+gsub( "$"; "a"; "g")
+"a"
+"aa"
+
+gsub( "^"; "a")
+""
+"a"
+
+gsub("(?=u)"; "u")
+"qux"
+"quux"
+
+gsub("^.*a"; "b")
+"aaa"
+"b"
+
+gsub("^.*?a"; "b")
+"aaa"
+"baa"
+
+# The following is for regression testing and should not be construed as a requirement:
+[gsub("a"; "b", "c")]
+"a"
+["b","c"]
+
[.[] | scan(", ")]
["a,b, c, d, e,f",", a,b, c, d, e,f, "]
[", ",", ",", ",", ",", ",", ",", ",", "]
@@ -92,7 +131,33 @@ gsub("(?<x>.)[^a]*"; "+\(.x)-")
"Abcabc"
"+A-+a-"
+gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)")
+"A1 B2 CD"
+"a1 b2 CD"
+
+gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)")
+"ABC DEF"
+"aBC dEF"
+
# utf-8
sub("(?<x>.)"; "\(.x)!")
"’"
"’!"
+
+[sub("a"; "b", "c")]
+"a"
+["b","c"]
+
+[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","aB","cB"]
+
+[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","ab","cc"]
+
+# splits and _nwise
+[splits("")]
+"ab"
+["","a","b"]
+