From a696c6b551879c7a9d16cfaa867c6f1bec57e6f8 Mon Sep 17 00:00:00 2001 From: pkoppstein Date: Thu, 31 Jul 2014 20:32:44 -0400 Subject: regex filters (#432): scan, splits, split, sub, gsub --- builtin.c | 77 ++++++++++++++++++++++++++++++------ docs/content/3.manual/manual.yml | 85 ++++++++++++++++++++++++++++++++++++++++ tests/all.test | 34 ++++++++++++++++ 3 files changed, 185 insertions(+), 11 deletions(-) diff --git a/builtin.c b/builtin.c index 4fb496cd..8acde3a6 100644 --- a/builtin.c +++ b/builtin.c @@ -974,23 +974,78 @@ static const char* const jq_builtins[] = { "def flatten: reduce .[] as $i ([]; if $i | type == \"array\" then . + ($i | flatten) else . + [$i] end);", "def flatten(x): x as $x | reduce .[] as $i ([]; if $i | type == \"array\" and $x > 0 then . + ($i | flatten($x-1)) else . + [$i] end);", "def range(x): x as $x | range(0;$x);", - // regular expressions: "def match(re; mode): _match_impl(re; mode; false)|.[];", "def match(val): (val|type) as $vt | if $vt == \"string\" then match(val; null)" - " elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])" - " elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)" - " else error( $vt + \" not a string or array\") end;", + " elif $vt == \"array\" and (val | length) > 1 then match(val[0]; val[1])" + " elif $vt == \"array\" and (val | length) > 0 then match(val[0]; null)" + " else error( $vt + \" not a string or array\") end;", "def test(re; mode): _match_impl(re; mode; true);", "def test(val): (val|type) as $vt | if $vt == \"string\" then test(val; null)" - " elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])" - " elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)" - " else error( $vt + \" not a string or array\") end;", - // Ex.: "a1" | capture( "(?<x>[a-z*])" ).x => "a" + " elif $vt == \"array\" and (val | length) > 1 then test(val[0]; val[1])" + " elif $vt == \"array\" and (val | length) > 0 then test(val[0]; null)" + " else error( 
$vt + \" not a string or array\") end;", "def capture(re; mods): match(re; mods) | reduce ( .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair ({}; . + $pair);", "def capture(val): (val|type) as $vt | if $vt == \"string\" then capture(val; null)" - " elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])" - " elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)" - " else error( $vt + \" not a string or array\") end;", + " elif $vt == \"array\" and (val | length) > 1 then capture(val[0]; val[1])" + " elif $vt == \"array\" and (val | length) > 0 then capture(val[0]; null)" + " else error( $vt + \" not a string or array\") end;", + "def scan(re):" + " match(re; \"g\")" + " | if (.captures|length > 0)" + " then [ .captures | .[] | .string ]" + " else .string" + " end ;", + // + // If input is an array, then emit a stream of successive subarrays of length n (or less), + // and similarly for strings. + "def nwise(a; n): if a|length <= n then a else a[0:n] , nwise(a[n:]; n) end;", + "def nwise(n): nwise(.; n);", + // + // splits/1 produces a stream; split/1 is retained for backward compatibility. + "def splits(re; flags): . as $s" + // # multiple occurrences of "g" are acceptable + " | [ match(re; \"g\" + flags) | (.offset, .offset + .length) ]" + " | [0] + . +[$s|length]" + " | nwise(2)" + " | $s[.[0]:.[1] ] ;", + "def splits(re): splits(re; null);", + // + // split emits an array for backward compatibility + "def split(re; flags): [ splits(re; flags) ];", + "def split(re): [ splits(re; null) ];", + // + // If s contains capture variables, then create a capture object and pipe it to s + "def sub(re; s):" + " . as $in" + " | [match(re)]" + " | .[0]" + " | . as $r" + // # create the \"capture\" object: + " | reduce ( $r | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair" + " ({}; . + $pair)" + " | if . == {} then $in | .[0:$r.offset]+s+.[$r.offset+$r.length:]" + " else (. 
| s)" + " end ;", + // + // repeated substitution of re (which may contain named captures) + "def gsub(re; s):" + // # _stredit(edits;s) - s is the \"to\" string, which might contain capture variables, + // # so if an edit contains captures, then create the capture object and pipe it to s + " def _stredit(edits; s):" + " if (edits|length) == 0 then ." + " else . as $in" + " | (edits|length -1) as $l" + " | (edits[$l]) as $edit" + // # create the \"capture\" object: + " | ($edit | reduce ( $edit | .captures | .[] | select(.name != null) | { (.name) : .string } ) as $pair" + " ({}; . + $pair) )" + " | if . == {} then $in | .[0:$edit.offset]+s+.[$edit.offset+$edit.length:] | _stredit(edits[0:$l]; s)" + " else (if $l == 0 then \"\" else ($in | _stredit(edits[0:$l]; s)) end) + (. | s)" + " end" + " end ;" + " [match(re;\"g\")] as $edits | _stredit($edits; s) ;", + + //####################################################################### // range/3, with a `by` expression argument "def range(init; upto; by): " " init as $init |" diff --git a/docs/content/3.manual/manual.yml b/docs/content/3.manual/manual.yml index 8d0ebee5..58517a3a 100644 --- a/docs/content/3.manual/manual.yml +++ b/docs/content/3.manual/manual.yml @@ -1721,6 +1721,91 @@ sections: - program: 'capture("(?<a>[a-z]+)-(?<n>[0-9]+)")' input: '"xyzzy-14"' output: '{ "a": "xyzzy", "n": "14" }'' + + - title: "`scan(regex)`, `scan(regex; flags)`" + body: | + + Emit a stream of the non-overlapping substrings of the input + that match the regex in accordance with the flags, if any + have been specified. If there is no match, the stream is empty. + To capture all the matches for each input string, use the idiom + [ expr ], e.g. [ scan(regex) ]. 
+ + example: + - program: 'scan("c")' + input: '"abcdefabc"' + output: '"c"' + '"c"' + + - program: 'scan("b")' + input: ("", "") + output: '[]' + '[]"' + + - title: "`split(regex)`, split(regex; flags)`" + body: | + + For backwards compatibility, `split` emits an array of the strings + corresponding to the successive segments of the input string after it + has been split at the boundaries defined by the regex and any + specified flags. The substrings corresponding to the boundaries + themselves are excluded. If regex is the empty string, then the first + match will be the empty string. + + `split(regex)` can be thought of as a wrapper around `splits(regex)`, + and similarly for `split(regex; flags)`. + + example: + - program: 'split(", *")' + input: '"ab,cd, ef"` + output: '["ab","cd","ef"]' + + + - title: "`splits(regex)`, splits(regex; flags)`" + body: | + + These provide the same results as their `split` counterparts, + but as a stream instead of an array. + + example: + - program: 'splits(", *")' + input: '("ab,cd", "ef, gh")` + output: + '"ab"' + '"cd"' + '"ef"' + '"gh"' + + - title: "`sub(regex; tostring)`" + + body: | + + Emit the string obtained by replacing the first match of regex in the + input string with `tostring`, after interpolation. `tostring` should + be a jq string, and may contain references to named captures. The + named captures are, in effect, presented as a JSON object (as + constructed by `capture`) to `tostring`, so a reference to a captured + variable named "x" would take the form: "\(.x)". + + example: + - program: 'sub("^[^a-z]*(?<x>[a-z]*).*")' + input: '"123abc456"' + output: '"ZabcZabc"' + + + - title: "`gsub(regex; string)`" + + body: | + + `gsub` is like `sub` but all the non-overlapping occurrences of the regex are + replaced by the string, after interpolation. 
+ + example: + - program: 'gsub("(?<x>.)[^a]*"; "+\(.x)-")' + + input: '"Abcabc"' + output: '"+A-+a-"' + - title: Advanced features body: | diff --git a/tests/all.test b/tests/all.test index a9a153df..c57c6d07 100644 --- a/tests/all.test +++ b/tests/all.test @@ -820,6 +820,40 @@ capture("(?<a>[a-z]+)-(?<n>[0-9]+)") "xyzzy-14" {"a":"xyzzy","n":"14"} + +# jq-coded utilities built on match: +# +# The second element in these tests' inputs tests the case where the +# fromstring matches both the head and tail of the string +[.[] | sub(", "; ":")] +["a,b, c, d, e,f", ", a,b, c, d, e,f, "] +["a,b:c, d, e,f",":a,b, c, d, e,f, "] +, #2 [", ",", ",", "],["a,b","c","d","e,f"]], #3 [[":a,b, c, d, e,f,"],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]] + +[.[] | gsub(", "; ":")] +["a,b, c, d, e,f",", a,b, c, d, e,f, "] +["a,b:c:d:e,f",":a,b:c:d:e,f:"] + +[.[] | scan(", ")] +["a,b, c, d, e,f",", a,b, c, d, e,f, "] + +[.[] | split(", ")] +["a,b, c, d, e,f",", a,b, c, d, e,f, "] + +######################## +[.[]|[[sub(", *";":")], [gsub(", *";":")], [scan(", *")], split(", *")]] +["a,b, c, d, e,f",", a,b, c, d, e,f, "] +[[["a:b, c, d, e,f"],["a:b:c:d:e:f"],[",",", ",", ",", ",","],["a","b","c","d","e","f"]],[[":a,b, c, d, e,f, "],[":a:b:c:d:e:f:"],[", ",",",", ",", ",", ",",",", "],["","a","b","c","d","e","f",""]]] + +[.[]|[[sub(", +";":")], [gsub(", +";":")], [scan(", +")], split(", +")]] +["a,b, c, d, e,f",", a,b, c, d, e,f, "] +[[["a,b:c, d, e,f"],["a,b:c:d:e,f"],[", ",", ",", "],["a,b","c","d","e,f"]],[[":a,b, c, d, e,f, "],[":a,b:c:d:e,f:"],[", ",", ",", ",", ",", "],["","a,b","c","d","e,f",""]]] + +# reference to named captures +gsub("(?<x>.)[^a]*"; "+\(.x)-") +"Abcabc" +"+A-+a-" + [.[]|ltrimstr("foo")] ["fo", "foo", "barfoo", "foobar", "afoo"] ["fo","","barfoo","bar","afoo"] -- cgit v1.2.3