summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorpkoppstein <pkoppstein@gmail.com>2014-07-31 20:32:44 -0400
committerNicolas Williams <nico@cryptonector.com>2014-08-08 17:00:14 -0500
commita696c6b551879c7a9d16cfaa867c6f1bec57e6f8 (patch)
treefbc3d7c676db862eb427c3bad351ddef17023e90
parent0d437e25de7d14dc780fd152e86e0414a027a2f5 (diff)
regex filters (#432): scan, splits, split, sub, gsub
-rw-r--r--builtin.c77
-rw-r--r--docs/content/3.manual/manual.yml85
-rw-r--r--tests/all.test34
3 files changed, 185 insertions, 11 deletions
diff --git a/builtin.c b/builtin.c
index 4fb496cd..8acde3a6 100644
--- a/builtin.c
+++ b/builtin.c
@@ -974,23 +974,78 @@ static const char* const jq_builtins[] = {
"def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);",
"def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);",
"def range(x): x as $x | range(0;$x);",
- // regular expressions:
"def match(re; mode): _match_impl(re; mode; false)|.[];",
"def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)"
- " elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
- " elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
- " else error( $vt + \" not a string or array\") end;",
+ " elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])"
+ " elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)"
+ " else error( $vt + \" not a string or array\") end;",
"def test(re; mode): _match_impl(re; mode; true);",
"def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)"
- " elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
- " elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
- " else error( $vt + \" not a string or array\") end;",
- // Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a"
+ " elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])"
+ " elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)"
+ " else error( $vt + \" not a string or array\") end;",
"def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);",
"def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)"
- " elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
- " elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
- " else error( $vt + \" not a string or array\") end;",
+ " elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])"
+ " elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)"
+ " else error( $vt + \" not a string or array\") end;",
+ "def scan(re):"
+ " match(re; \"g\")"
+ " | if (.captures|length > 0)"
+ " then [ .captures | .[] | .string ]"
+ " else .string"
+ " end ;",
+ //
+ // If input is an array, then emit a stream of successive subarrays of length n (or less),
+ // and similarly for strings.
+ "def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;",
+ "def nwise(n): nwise(.; n);",
+ //
+ // splits/1 produces a stream; split/1 is retained for backward compatibility.
+ "def splits(re; flags): . as $s"
+ // # multiple occurrences of "g" are acceptable
+ " | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]"
+ " | [0] + . +[$s|length]"
+ " | nwise(2)"
+ " | $s[.[0]:.[1] ] ;",
+ "def splits(re): splits(re; null);",
+ //
+ // split emits an array for backward compatibility
+ "def split(re; flags): [ splits(re; flags) ];",
+ "def split(re): [ splits(re; null) ];",
+ //
+ // If s contains capture variables, then create a capture object and pipe it to s
+ "def sub(re; s):"
+ " . as $in"
+ " | [match(re)]"
+ " | .[0]"
+ " | . as $r"
// # create the "capture" object:
+ " | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
+ " ({}; . + $pair)"
+ " | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]"
+ " else (. | s)"
+ " end ;",
+ //
+ // repeated substitution of re (which may contain named captures)
+ "def gsub(re; s):"
// # _stredit(edits;s) - s is the "to" string, which might contain capture variables,
// # so if an edit contains captures, then create the capture object and pipe it to s
+ " def _stredit(edits; s):"
+ " if (edits|length) == 0 then ."
+ " else . as $in"
+ " | (edits|length -1) as $l"
+ " | (edits[$l]) as $edit"
// # create the "capture" object:
+ " | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair"
+ " ({}; . + $pair) )"
+ " | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)"
+ " else (if $l == 0 then \"\" else ($in | _stredit(edits[0:$l]; s)) end) + (. | s)"
+ " end"
+ " end ;"
+ " [match(re;\"g\")] as $edits | _stredit($edits; s) ;",
+
+ //#######################################################################
// range/3, with a `by` expression argument
"def range(init; upto; by): "
" init as $init |"
diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml
index 8d0ebee5..58517a3a 100644
--- a/docs/content/3.manual/manual.yml
+++ b/docs/content/3.manual/manual.yml
@@ -1721,6 +1721,91 @@ sections:
- program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")'
input: '"xyzzy-14"'
output: '{ "a": "xyzzy", "n": "14" }'
+
+ - title: "`scan(regex)`, `scan(regex; flags)`"
+ body: |
+
+ Emit a stream of the non-overlapping substrings of the input
+ that match the regex in accordance with the flags, if any
+ have been specified. If there is no match, the stream is empty.
+ To capture all the matches for each input string, use the idiom
+ [ expr ], e.g. [ scan(regex) ].
+
+ example:
+ - program: 'scan("c")'
+ input: '"abcdefabc"'
+ output: '"c"'
+ '"c"'
+
+ - program: 'scan("b")'
+ input: ("", "")
+ output: '[]'
+ '[]'
+
+ - title: "`split(regex)`, `split(regex; flags)`"
+ body: |
+
+ For backwards compatibility, `split` emits an array of the strings
+ corresponding to the successive segments of the input string after it
+ has been split at the boundaries defined by the regex and any
+ specified flags. The substrings corresponding to the boundaries
+ themselves are excluded. If regex is the empty string, then the first
+ match will be the empty string.
+
+ `split(regex)` can be thought of as a wrapper around `splits(regex)`,
+ and similarly for `split(regex; flags)`.
+
+ example:
+ - program: 'split(", *")'
+ input: '"ab,cd, ef"'
+ output: '["ab","cd","ef"]'
+
+
+ - title: "`splits(regex)`, `splits(regex; flags)`"
+ body: |
+
+ These provide the same results as their `split` counterparts,
+ but as a stream instead of an array.
+
+ example:
+ - program: 'splits(", *")'
+ input: '("ab,cd", "ef, gh")'
+ output:
+ '"ab"'
+ '"cd"'
+ '"ef"'
+ '"gh"'
+
+ - title: "`sub(regex; tostring)`"
+
+ body: |
+
+ Emit the string obtained by replacing the first match of regex in the
+ input string with `tostring`, after interpolation. `tostring` should
+ be a jq string, and may contain references to named captures. The
+ named captures are, in effect, presented as a JSON object (as
+ constructed by `capture`) to `tostring`, so a reference to a captured
+ variable named "x" would take the form: "\(.x)".
+
+ example:
+ - program: 'sub("^[^a-z]*(?<x>[a-z]*).*"; "Z\(.x)Z\(.x)")'
+ input: '"123abc456"'
+ output: '"ZabcZabc"'
+
+
+ - title: "`gsub(regex; string)`"
+
+ body: |
+
+ `gsub` is like `sub` but all the non-overlapping occurrences of the regex are
+ replaced by the string, after interpolation.
+
+ example:
+ - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")'
+
+ input: '"Abcabc"'
+ output: '"+A-+a-"'
+
- title: Advanced features
body: |
diff --git a/tests/all.test b/tests/all.test
index a9a153df..c57c6d07 100644
--- a/tests/all.test
+++ b/tests/all.test
@@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)")
"xyzzy-14"
{"a":"xyzzy","n":"14"}
+
+# jq-coded utilities built on match:
+#
+# The second element in these tests' inputs tests the case where the
+# fromstring matches both the head and tail of the string
+[.[] | sub(", "; ":")]
+["a,b, c, d, e,f", ", a,b, c, d, e,f, "]
+["a,b:c, d, e,f",":a,b, c, d, e,f, "]
+, #2 [", ",", ",", "],["a,b","c","d","e,f"]], #3 [[":a,b, c, d, e,f,"],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
+
+[.[] | gsub(", "; ":")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+["a,b:c:d:e,f",":a,b:c:d:e,f:"]
+
+[.[] | scan(", ")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+
+[.[] | split(", ")]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+
+########################
+[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]]
+
+[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]]
+["a,b, c, d, e,f",", a,b, c, d, e,f, "]
+[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]]
+
+# reference to named captures
+gsub("(?<x>.)[^a]*"; "+\(.x)-")
+"Abcabc"
+"+A-+a-"
+
[.[]|ltrimstr("foo")]
["fo", "foo", "barfoo", "foobar", "afoo"]
["fo","","barfoo","bar","afoo"]