Revamp sub/3 to resolve most issues with gsub (and sub with "g") (#2641)

The primary purpose of this commit is to rectify most problems with `gsub` (and also `sub` with the `g` option), in particular fix #1425 ('\b'), fix #2354 (lookahead), and fix #2532 (regex == `"^(?!cd ).*$|^cd "`). This commit also partly resolves #2148 and resolves #1206 in that `gsub` no longer loops infinitely; however, because the new `gsub` depends critically on `match/2`, the behavior when regex == `""` is sometimes non-standard. The documentation has been updated to reflect the fact that `sub` and `gsub` are intended to be regular in the second argument, i.e. a stream of replacement strings yields a corresponding stream of results. Also, `_nwise/1` has been tweaked to take advantage of tail-call optimization (TCO).
author: pkoppstein <pkoppstein@gmail.com> 2023-07-03 18:46:29 -0400
committer: GitHub <noreply@github.com> 2023-07-04 07:46:29 +0900
commit: 83f375cc831039396167d4d2b5f901f4b33a8707 (patch)
tree: 1ec0e167f315971aefeceed86b1d3d283a3528e3
parent: edb0d88e3bb563bbc671c87d139d3c051987e77c (diff)
5 files changed, 113 insertions, 55 deletions
diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index f9bdb102..d5918be8 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -2481,32 +2481,43 @@ sections:
             input: '("ab,cd", "ef, gh")'
             output: ['"ab"', '"cd"', '"ef"', '"gh"']
 
-      - title: "`sub(regex; tostring)`, `sub(regex; string; flags)`"
+      - title: "`sub(regex; tostring)`, `sub(regex; tostring; flags)`"
         body: |
 
-          Emit the string obtained by replacing the first match of regex in the
-          input string with `tostring`, after interpolation.  `tostring` should
-          be a jq string, and may contain references to named captures. The
-          named captures are, in effect, presented as a JSON object (as
-          constructed by `capture`) to `tostring`, so a reference to a captured
-          variable named "x" would take the form: `"\(.x)"`.
+          Emit the string obtained by replacing the first match of
+          regex in the input string with `tostring`, after
+          interpolation.  `tostring` should be a jq string or a stream
+          of such strings, each of which may contain references to
+          named captures. The named captures are, in effect, presented
+          as a JSON object (as constructed by `capture`) to
+          `tostring`, so a reference to a captured variable named "x"
+          would take the form: `"\(.x)"`.
 
         example:
-          - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")'
-            input: '"123abc456"'
-            output: '"ZabcZabc"'
 
+          - program: 'sub("[^a-z]*(?<x>[a-z]*)"; "Z\(.x)"; "g")'
+            input: '"123abc456def"'
+            output: ['"ZabcZdef"']
 
-      - title: "`gsub(regex; string)`, `gsub(regex; string; flags)`"
+          - program: '[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)")]'
+            input: '"aB"'
+            output: ['["AB","aB"]']
+
+      - title: "`gsub(regex; tostring)`, `gsub(regex; tostring; flags)`"
         body: |
 
           `gsub` is like `sub` but all the non-overlapping occurrences of the regex are
-          replaced by the string, after interpolation.
+          replaced by `tostring`, after interpolation. If the second argument is a stream
+          of jq strings, then `gsub` will produce a corresponding stream of JSON strings.
 
         example:
           - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
             input: '"Abcabc"'
-            output: '"+A-+a-"'
+            output: ['"+A-+a-"']
+
+          - program: '[gsub("p"; "a", "b")]'
+            input: '"p"'
+            output: ['["a","b"]']
 
 
   - title: Advanced features
diff --git a/src/builtin.c b/src/builtin.c
index 9b2d9a23..2d1156dc 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -930,7 +930,8 @@ static jv f_match(jq_state *jq, jv input, jv regex, jv modifiers, jv testmode) {
         match = jv_object_set(match, jv_string("string"), jv_string(""));
         match = jv_object_set(match, jv_string("captures"), jv_array());
         result = jv_array_append(result, match);
-        start += 1;
+        // ensure '"qux" | match("(?=u)"; "g")' matches just once
+        start = (const UChar*)(input_string+region->end[0]+1);
         continue;
       }
 
diff --git a/src/builtin.jq b/src/builtin.jq
index a102fd51..09663511 100644
--- a/src/builtin.jq
+++ b/src/builtin.jq
@@ -99,8 +99,10 @@ def scan(re):
 #
 # If input is an array, then emit a stream of successive subarrays of length n (or less),
 # and similarly for strings.
-def _nwise(a; $n): if a|length <= $n then a else a[0:$n] , _nwise(a[$n:]; $n) end;
-def _nwise($n): _nwise(.; $n);
+def _nwise($n):
+  def n: if length <= $n then . else .[0:$n] , (.[$n:] | n) end;
+  n;
+def _nwise(a; $n): a | _nwise($n);
 #
 # splits/1 produces a stream; split/1 is retained for backward compatibility.
 def splits($re; flags): . as $s
@@ -114,47 +116,26 @@ def splits($re): splits($re; null);
 # split emits an array for backward compatibility
 def split($re; flags): [ splits($re; flags) ];
 #
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s):
-  . as $in
-  | [match($re)]
-  | if length == 0 then $in
-    else .[0]
-    | . as $r
-#  # create the "capture" object:
-    | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
-        ({}; . + $pair)
-    | $in[0:$r.offset] + s + $in[$r.offset+$r.length:]
-    end ;
-#
-# If s contains capture variables, then create a capture object and pipe it to s
-def sub($re; s; flags):
-  def subg: [explode[] | select(. != 103)] | implode;
-  # "fla" should be flags with all occurrences of g removed; gs should be non-nil if flags has a g
-  def sub1(fla; gs):
-    def mysub:
-      . as $in
-      | [match($re; fla)]
-      | if length == 0 then $in
-        else .[0] as $edit
-        | ($edit | .offset + .length) as $len
-        # create the "capture" object:
-        | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
-            ({}; . + $pair)
-        | $in[0:$edit.offset]
-          + s
-          + ($in[$len:] | if length > 0 and gs then mysub else . end)
-        end ;
-    mysub ;
-    (flags | index("g")) as $gs
-    | (flags | if $gs then subg else . end) as $fla
-    | sub1($fla; $gs);
+# If s contains capture variables, then create a capture object and pipe it to s, bearing
+# in mind that s could be a stream
+def sub($re; s; $flags):
+   . as $in
+   | (reduce match($re; $flags) as $edit
+        ({result: [], previous: 0};
+            $in[ .previous: ($edit | .offset) ] as $gap
+            # create the "capture" objects (one per item in s)
+            | [reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair
+                 ({}; . + $pair) | s ] as $inserts
+            | reduce range(0; $inserts|length) as $ix (.; .result[$ix] += $gap + $inserts[$ix])
+            | .previous = ($edit | .offset + .length ) )
+          | .result[] + $in[.previous:] )
+      // $in;
 #
 def sub($re; s): sub($re; s; "");
-# repeated substitution of re (which may contain named captures)
+#
 def gsub($re; s; flags): sub($re; s; flags + "g");
 def gsub($re; s): sub($re; s; "g");
-
+#
 ########################################################################
 # generic iterator/generator
 def while(cond; update):
@@ -206,7 +187,7 @@ def transpose:
   | length as $length
   | reduce range(0; $max) as $j
       ([]; . + [reduce range(0;$length) as $i ([]; . + [ $in[$i][$j] ] )] )
-	        end;
+  end;
 def in(xs): . as $x | xs | has($x);
 def inside(xs): . as $x | xs | contains($x);
 def repeat(exp):
@@ -237,7 +218,6 @@ def tostream:
   getpath($p) |
   reduce path(.[]?) as $q ([$p, .]; [$p+$q]);
 
-
 # Assuming the input array is sorted, bsearch/1 returns
 # the index of the target if the target is in the input array; and otherwise
 #  (-1 - ix), where ix is the insertion point that would leave the array sorted.
diff --git a/tests/jq.test b/tests/jq.test
index ca8e2705..78c4017b 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -1731,3 +1731,4 @@ false
 . |= try . catch .
 1
 1
+
diff --git a/tests/onig.test b/tests/onig.test
index daacae9c..805efaba 100644
--- a/tests/onig.test
+++ b/tests/onig.test
@@ -75,6 +75,45 @@ gsub( "(.*)"; "";  "x")
 ""
 ""
 
+gsub( ""; "a";  "g")
+""
+"a"
+
+gsub( "^"; "";  "g")
+"a"
+"a"
+
+
+# The following is a regression test and should not be construed as a requirement other than that execution should terminate:
+gsub( ""; "a";  "g")
+"a"
+"aa"
+
+gsub( "$"; "a";  "g")
+"a"
+"aa"
+
+gsub( "^"; "a")
+""
+"a"
+
+gsub("(?=u)"; "u")
+"qux"
+"quux"
+
+gsub("^.*a"; "b")
+"aaa"
+"b"
+
+gsub("^.*?a"; "b")
+"aaa"
+"baa"
+
+# The following is for regression testing and should not be construed as a requirement:
+[gsub("a"; "b", "c")]
+"a"
+["b","c"]
+
 [.[] | scan(", ")]
 ["a,b, c, d, e,f",", a,b, c, d, e,f, "]
 [", ",", ",", ",", ",", ",", ",", ",", "]
@@ -92,7 +131,33 @@ gsub("(?<x>.)[^a]*"; "+\(.x)-")
 "Abcabc"
 "+A-+a-"
 
+gsub("(?<x>.)(?<y>[0-9])"; "\(.x|ascii_downcase)\(.y)")
+"A1 B2 CD"
+"a1 b2 CD"
+
+gsub("\\b(?<x>.)"; "\(.x|ascii_downcase)")
+"ABC DEF"
+"aBC dEF"
+
 # utf-8
 sub("(?<x>.)"; "\(.x)!")
 "’"
 "’!"
+
+[sub("a"; "b", "c")]
+"a"
+["b","c"]
+
+[sub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","aB","cB"]
+
+[gsub("(?<a>.)"; "\(.a|ascii_upcase)", "\(.a|ascii_downcase)", "c")]
+"aB"
+["AB","ab","cc"]
+
+# splits and _nwise
+[splits("")]
+"ab"
+["","a","b"]
+
author	pkoppstein <pkoppstein@gmail.com>	2023-07-03 18:46:29 -0400
committer	GitHub <noreply@github.com>	2023-07-04 07:46:29 +0900
commit	83f375cc831039396167d4d2b5f901f4b33a8707 (patch)
tree	1ec0e167f315971aefeceed86b1d3d283a3528e3
parent	edb0d88e3bb563bbc671c87d139d3c051987e77c (diff)