diff options
author | pkoppstein <pkoppstein@gmail.com> | 2023-07-03 18:46:29 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-07-04 07:46:29 +0900 |
commit | 83f375cc831039396167d4d2b5f901f4b33a8707 (patch) | |
tree | 1ec0e167f315971aefeceed86b1d3d283a3528e3 | |
parent | edb0d88e3bb563bbc671c87d139d3c051987e77c (diff) |
Revamp sub/3 to resolve most issues with gsub (and sub with "g") (#2641)
The primary purpose of this commit is to rectify most problems with
`gsub` (and also `sub` with the `g` option), in particular fix #1425 ('\b'),
fix #2354 (lookahead), and fix #2532 (regex == `"^(?!cd ).*$|^cd "`).
This commit also partly resolves #2148 and resolves #1206 in that
`gsub` no longer loops infinitely; however, because the new `gsub`
depends critically on `match/2`, the behavior when regex == `""` is
sometimes non-standard.
The documentation has been updated to reflect the fact that `sub`
and `gsub` are intended to be regular in the second argument: if that
argument is a stream of strings, a corresponding stream of result
strings is emitted.
Also, `_nwise/1` has been tweaked to take advantage of tail-call optimization (TCO).
-rw-r--r-- | docs/content/manual/manual.yml | 37 | ||||
-rw-r--r-- | src/builtin.c | 3 | ||||
-rw-r--r-- | src/builtin.jq | 62 | ||||
-rw-r--r-- | tests/jq.test | 1 | ||||
-rw-r--r-- | tests/onig.test | 65 |
5 files changed, 113 insertions, 55 deletions
diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml index f9bdb102..d5918be8 100644 --- a/docs/content/manual/manual.yml +++ b/docs/content/manual/manual.yml @@ -2481,32 +2481,43 @@ sections: input: '("ab,cd", "ef, gh")' output: ['"ab"', '"cd"', '"ef"', '"gh"'] - - title: "`sub(regex; tostring)`, `sub(regex; string; flags)`" + - title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`" body: | - Emit the string obtained by replacing the first match of regex in the - input string with `tostring`, after interpolation. `tostring` should - be a jq string, and may contain references to named captures. The - named captures are, in effect, presented as a JSON object (as - constructed by `capture`) to `tostring`, so a reference to a captured - variable named "x" would take the form: `"\(.x)"`. + Emit the string obtained by replacing the first match of + regex in the input string with `tostring`, after + interpolation. `tostring` should be a jq string or a stream + of such strings, each of which may contain references to + named captures. The named captures are, in effect, presented + as a JSON object (as constructed by `capture`) to + `tostring`, so a reference to a captured variable named "x" + would take the form: `"\(.x)"`. example: - - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")' - input: '"123abc456"' - output: '"ZabcZabc"' + - program: 'sub("[^a-z]*(?<x>[a-z]*)"; "Z\(.x)"; "g")' + input: '"123abc456def"' + output: ['"ZabcZdef"'] - - title: "`gsub(regex; string)`, `gsub(regex; string; flags)`" + - program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]' + input: '"aB"' + output: ['["AB","aB"]'] + + - title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`" body: | `gsub` is like `sub` but all the non-overlapping occurrences of the regex are - replaced by the string, after interpolation. + replaced by `tostring`, after interpolation. 
If the second argument is a stream + of jq strings, then `gsub` will produce a corresponding stream of JSON strings. example: - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")' input: '"Abcabc"' - output: '"+A-+a-"' + output: ['"+A-+a-"'] + + - program: '[gsub("p"; "a", "b")]' + input: '"p"' + output: ['["a","b"]'] - title: Advanced features diff --git a/src/builtin.c b/src/builtin.c index 9b2d9a23..2d1156dc 100644 --- a/src/builtin.c +++ b/src/builtin.c @@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) { match = jv_object_set(match, jv_string("string"), jv_string("")); match = jv_object_set(match, jv_string("captures"), jv_array()); result = jv_array_append(result, match); - start += 1; + // ensure '"qux" | match("(?=u)"; "g")' matches just once + start = (const UChar*)(input_string+region->end[0]+1); continue; } diff --git a/src/builtin.jq b/src/builtin.jq index a102fd51..09663511 100644 --- a/src/builtin.jq +++ b/src/builtin.jq @@ -99,8 +99,10 @@ def scan(re): # # If input is an array, then emit a stream of successive subarrays of length n (or less), # and similarly for strings. -def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end; -def _nwise($n): _nwise(.; $n); +def _nwise($n): + def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end; + n; +def _nwise(a; $n): a | _nwise($n); # # splits/1 produces a stream; split/1 is retained for backward compatibility. def splits($re; flags): . as $s @@ -114,47 +116,26 @@ def splits($re): splits($re; null); # split emits an array for backward compatibility def split($re; flags): [ splits($re; flags) ]; # -# If s contains capture variables, then create a capture object and pipe it to s -def sub($re; s): - . as $in - | [match($re)] - | if length == 0 then $in - else .[0] - | . as $r -# # create the "capture" object: - | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair - ({}; . 
+ $pair) - | $in[0:$r.offset] + s + $in[$r.offset+$r.length:] - end ; -# -# If s contains capture variables, then create a capture object and pipe it to s -def sub($re; s; flags): - def subg: [explode[] | select(. != 103)] | implode; - # "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g - def sub1(fla; gs): - def mysub: - . as $in - | [match($re; fla)] - | if length == 0 then $in - else .[0] as $edit - | ($edit | .offset + .length) as $len - # create the "capture" object: - | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair - ({}; . + $pair) - | $in[0:$edit.offset] - + s - + ($in[$len:] | if length > 0 and gs then mysub else . end) - end ; - mysub ; - (flags | index("g")) as $gs - | (flags | if $gs then subg else . end) as $fla - | sub1($fla; $gs); +# If s contains capture variables, then create a capture object and pipe it to s, bearing +# in mind that s could be a stream +def sub($re; s; $flags): + . as $in + | (reduce match($re; $flags) as $edit + ({result: [], previous: 0}; + $in[ .previous: ($edit | .offset) ] as $gap + # create the "capture" objects (one per item in s) + | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair + ({}; . + $pair) | s ] as $inserts + | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix]) + | .previous = ($edit | .offset + .length ) ) + | .result[] + $in[.previous:] ) + // $in; # def sub($re; s): sub($re; s; ""); -# repeated substitution of re (which may contain named captures) +# def gsub($re; s; flags): sub($re; s; flags + "g"); def gsub($re; s): sub($re; s; "g"); - +# ######################################################################## # generic iterator/generator def while(cond; update): @@ -206,7 +187,7 @@ def transpose: | length as $length | reduce range(0; $max) as $j ([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] ) - end; + end; def in(xs): . 
as $x | xs | has($x); def inside(xs): . as $x | xs | contains($x); def repeat(exp): @@ -237,7 +218,6 @@ def tostream: getpath($p) | reduce path(.[]?) as $q ([$p, .]; [$p+$q]); - # Assuming the input array is sorted, bsearch/1 returns # the index of the target if the target is in the input array; and otherwise # (-1 - ix), where ix is the insertion point that would leave the array sorted. diff --git a/tests/jq.test b/tests/jq.test index ca8e2705..78c4017b 100644 --- a/tests/jq.test +++ b/tests/jq.test @@ -1731,3 +1731,4 @@ false . |= try . catch . 1 1 + diff --git a/tests/onig.test b/tests/onig.test index daacae9c..805efaba 100644 --- a/tests/onig.test +++ b/tests/onig.test @@ -75,6 +75,45 @@ gsub( "(.*)"; ""; "x") "" "" +gsub( ""; "a"; "g") +"" +"a" + +gsub( "^"; ""; "g") +"a" +"a" + + +# The following is a regression test and should not be construed as a requirement other than that execution should terminate: +gsub( ""; "a"; "g") +"a" +"aa" + +gsub( "$"; "a"; "g") +"a" +"aa" + +gsub( "^"; "a") +"" +"a" + +gsub("(?=u)"; "u") +"qux" +"quux" + +gsub("^.*a"; "b") +"aaa" +"b" + +gsub("^.*?a"; "b") +"aaa" +"baa" + +# The following is for regression testing and should not be construed as a requirement: +[gsub("a"; "b", "c")] +"a" +["b","c"] + [.[] | scan(", ")] ["a,b, c, d, e,f",", a,b, c, d, e,f, "] [", ",", ",", ",", ",", ",", ",", ",", "] @@ -92,7 +131,33 @@ gsub("(?<x>.)[^a]*"; "+\(.x)-") "Abcabc" "+A-+a-" +gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)") +"A1 B2 CD" +"a1 b2 CD" + +gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)") +"ABC DEF" +"aBC dEF" + # utf-8 sub("(?<x>.)"; "\(.x)!") "’" "’!" + +[sub("a"; "b", "c")] +"a" +["b","c"] + +[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")] +"aB" +["AB","aB","cB"] + +[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")] +"aB" +["AB","ab","cc"] + +# splits and _nwise +[splits("")] +"ab" +["","a","b"] + |