regex filters (#432): scan, splits, split, sub, gsub

author: pkoppstein <pkoppstein@gmail.com> 2014-07-31 20:32:44 -0400
committer: Nicolas Williams <nico@cryptonector.com> 2014-08-08 17:00:14 -0500
commit: a696c6b551879c7a9d16cfaa867c6f1bec57e6f8 (patch)
tree: fbc3d7c676db862eb427c3bad351ddef17023e90
parent: 0d437e25de7d14dc780fd152e86e0414a027a2f5 (diff)
3 files changed, 185 insertions, 11 deletions
diff --git a/builtin.c b/builtin.c
index 4fb496cd..8acde3a6 100644
--- a/builtin.c
+++ b/builtin.c
@@ -974,23 +974,78 @@ static const char* const jq_builtins[] = {
   "def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
   "def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);",
   "def range(x): x as $x | range(0;$x);",
-  // regular expressions:
   "def match(re; mode): _match_impl(re; mode; false)|.[];",
   "def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)"
-   "  elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
-   "  elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
-   "  else error( $vt + \" not a string or array\") end;",
+  "   elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
+  "   elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
+  "   else error( $vt + \" not a string or array\") end;",
   "def test(re; mode): _match_impl(re; mode; true);",
   "def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)"
-   "  elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
-   "  elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
-   "  else error( $vt + \" not a string or array\") end;",
-  // Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a"
+  "   elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
+  "   elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
+  "   else error( $vt + \" not a string or array\") end;",
   "def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);",
   "def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)"
-   "  elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
-   "  elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
-   "  else error( $vt + \" not a string or array\") end;",
+  "   elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
+  "   elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
+  "   else error( $vt + \" not a string or array\") end;",
+  "def scan(re):"
+  "  match(re; \"g\")"
+  "  | .captures as $caps"
+  "  | if ($caps | length) == 0"
+  "    then .string"
+  "    else [ $caps | .[] | .string ] end ;",
+  //
+  // nwise(a; n): emit a in successive slices a[0:n], a[n:2n], ...; the final slice is whatever
+  // remains (length <= n; it is a itself when a|length <= n). nwise(n) applies this to the input.
+  "def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;",
+  "def nwise(n): nwise(.; n);",
+  //
+  // splits emits, as a stream, the pieces of the input string between successive matches of re; split wraps that stream in an array.
+  "def splits(re; flags): . as $s"
+     //  # "g" is always prepended to flags; per the original note, a repeated "g" is acceptable
+  "  | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]"
+  "  | [0] + . +[$s|length]"
+  "  | nwise(2)"
+  "  | $s[.[0]:.[1] ] ;",
+  "def splits(re): splits(re; null);",
+  //
+  // split collects the stream produced by splits into one array (array output kept for backward compatibility)
+  "def split(re; flags): [ splits(re; flags) ];",
+  "def split(re): [ splits(re; null) ];",
+  //
+  // sub(re; s): replace the first match of re in the input string with s; the input is returned unchanged if re does not match.
+  "def sub(re; s):"
+  "  . as $in"
+  "  | [match(re)]"
+  "  | if length == 0 then $in else .[0]"
+  "  | . as $r"
+     //  # capture object built from the named captures; s is evaluated against it (or against $in when there are none)
+  "  | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
+  "      ({}; . + $pair)"
+  "  | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]"
+  "    else $in[0:$r.offset] + (. | s) + $in[$r.offset+$r.length:]"
+  "    end end ;",
+  //
+  // gsub(re; s): replace every non-overlapping match of re (which may contain named captures) with s
+  "def gsub(re; s):"
+  //   # _stredit(edits;s) - applies the match objects in edits to the input string, last match first, so that
+  //   # the offsets of the remaining (earlier) matches stay valid; s is the \"to\" string and may reference captures
+  "   def _stredit(edits; s):"
+  "     if (edits|length) == 0 then ."
+  "     else . as $in"
+  "       | (edits|length -1) as $l"
+  "       | (edits[$l]) as $edit"
+  //       # create the \"capture\" object; s sees it as input when it is non-empty, otherwise s sees $in
+  "       | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
+  "         ({}; . + $pair) )"
+  "       | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
+  "         else $in[0:$edit.offset] + (. | s) + $in[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
+  "         end"
+  "     end ;"
+  "  [match(re;\"g\")] as $edits | _stredit($edits; s) ;",
+
+  //#######################################################################
   // range/3, with a `by` expression argument
   "def range(init; upto; by): "
   "    init as $init |"
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index 8d0ebee5..58517a3a 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1721,6 +1721,91 @@ sections:
             - program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")'
               input: '"xyzzy-14"'
               output: '{ "a": "xyzzy", "n": "14" }''
+
+  - title: "`scan(regex)`"
+  body: |
+  
+  Emit a stream of the non-overlapping substrings of the input
+  that match the regex.  If the regex contains capture groups, each
+  match is emitted as an array of the captured strings instead.
+  If there is no match, the stream is empty.  To collect all the
+  matches for each input string, use the idiom [ expr ], e.g. [ scan(regex) ].
+  
+  example:
+  - program: 'scan("c")'
+    input: '"abcdefabc"'
+    output: ['"c"',
+             '"c"']
+  
+  - program: '[scan("b")]'
+    input: '""'
+    output:
+            - '[]'
+  
+  - title: "`split(regex)`, `split(regex; flags)`"
+  body: |
+  
+  For backwards compatibility, `split` emits an array of the strings
+  corresponding to the successive segments of the input string after it
+  has been split at the boundaries defined by the regex and any
+  specified flags.  The substrings corresponding to the boundaries
+  themselves are excluded.  If regex is the empty string, then the first
+  match will be the empty string.
+  
+  `split(regex)` can be thought of as a wrapper around `splits(regex)`,
+  and similarly for `split(regex; flags)`.
+  
+  example:
+  - program: 'split(", *")'
+    input: '"ab,cd, ef"'
+    output: '["ab","cd","ef"]'
+  
+  
+  - title: "`splits(regex)`, `splits(regex; flags)`"
+  body: |
+  
+  These provide the same results as their `split` counterparts,
+  but as a stream instead of an array.
+  
+  example:
+  - program: 'splits(", *")'
+    input: '"ab,cd, ef, gh"'
+    output:
+           - '"ab"'
+           - '"cd"'
+           - '"ef"'
+           - '"gh"'
+  
+  - title: "`sub(regex; tostring)`"
+  
+  body: |
+  
+  Emit the string obtained by replacing the first match of regex in the
+  input string with `tostring`, after interpolation.  `tostring` should
+  be a jq string, and may contain references to named captures. The
+  named captures are, in effect, presented as a JSON object (as
+  constructed by `capture`) to `tostring`, so a reference to a captured
+  variable named "x" would take the form: "\(.x)".
+  
+  example:
+  - program: 'sub("^[^a-z]*(?<x>[a-z]*).*"; "Z\(.x)Z\(.x)")'
+    input: '"123abc456"'
+    output: '"ZabcZabc"'
+  
+  
+  - title: "`gsub(regex; string)`"
+  
+  body: |
+  
+  `gsub` is like `sub` but all the non-overlapping occurrences of the regex are
+  replaced by the string, after interpolation.
+  
+  example:
+  - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
+  
+    input: '"Abcabc"'
+    output: '"+A-+a-"'
+  
   
   - title: Advanced features
     body: |
diff --git a/tests/all.test b/tests/all.test
index a9a153df..c57c6d07 100644
--- a/tests/all.test
+++ b/tests/all.test
@@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
 "xyzzy-14"
 {"a":"xyzzy","n":"14"}
 
+
+# jq-coded utilities built on match:
+#
+# The second element in these tests' inputs tests the case where the
+# fromstring matches both the head and tail of the string
+[.[] | sub(", "; ":")]
+["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
+["a,b:c, d, e,f",":a,b, c, d, e,f, "]
+
+[.[] | gsub(", "; ":")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+["a,b:c:d:e,f",":a,b:c:d:e,f:"]
+
+[.[] | scan(", ")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[", ",", ",", ",", ",", ",", ",", ",", "]
+
+[.[] | split(", ")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[["a,b","c","d","e,f"],["","a,b","c","d","e,f",""]]
+########################
+[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]]
+
+[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
+
+# reference to named captures
+gsub("(?<x>.)[^a]*"; "+\(.x)-")
+"Abcabc"
+"+A-+a-"
+
 [.[]|ltrimstr("foo")]
 ["fo", "foo", "barfoo", "foobar", "afoo"]
 ["fo","","barfoo","bar","afoo"]
author	pkoppstein <pkoppstein@gmail.com>	2014-07-31 20:32:44 -0400
committer	Nicolas Williams <nico@cryptonector.com>	2014-08-08 17:00:14 -0500
commit	a696c6b551879c7a9d16cfaa867c6f1bec57e6f8 (patch)
tree	fbc3d7c676db862eb427c3bad351ddef17023e90
parent	0d437e25de7d14dc780fd152e86e0414a027a2f5 (diff)