diff options
author | Mattias Wadman <mattias.wadman@gmail.com> | 2024-03-20 11:04:17 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-20 11:04:17 +0100 |
commit | be437ec049bb2300731522ca93f37cd2629b4cc8 (patch) | |
tree | 43b578a215508e105f632e923a47bff93c3eda6b /src | |
parent | 81f4f883ac76de11ae9d99266554d31d3b7f0c0c (diff) |
Add trim/0, ltrim/0 and rtrim/0 that trims leading and trailing whitespace (#3056)
Diffstat (limited to 'src')
-rw-r--r-- | src/builtin.c | 55 | ||||
-rw-r--r-- | src/jv_unicode.c | 18 | ||||
-rw-r--r-- | src/jv_unicode.h | 2 |
3 files changed, 75 insertions, 0 deletions
```diff
diff --git a/src/builtin.c b/src/builtin.c
index 5f24cfb8..e93ac321 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
   return jv_string_indexes(a, b);
 }
 
+enum trim_op {
+  TRIM_LEFT = 1 << 0,
+  TRIM_RIGHT = 1 << 1
+};
+
+static jv string_trim(jv a, int op) {
+  if (jv_get_kind(a) != JV_KIND_STRING) {
+    return ret_error(a, jv_string("trim input must be a string"));
+  }
+
+  int len = jv_string_length_bytes(jv_copy(a));
+  const char *start = jv_string_value(a);
+  const char *trim_start = start;
+  const char *end = trim_start + len;
+  const char *trim_end = end;
+  int c;
+
+  if (op & TRIM_LEFT) {
+    for (;;) {
+      const char *ns = jvp_utf8_next(trim_start, end, &c);
+      if (!ns || !jvp_codepoint_is_whitespace(c))
+        break;
+      trim_start = ns;
+    }
+  }
+
+  // make sure not empty string or start trim has trimmed everything
+  if ((op & TRIM_RIGHT) && trim_end > trim_start) {
+    for (;;) {
+      const char *ns = jvp_utf8_backtrack(trim_end-1, trim_start, NULL);
+      jvp_utf8_next(ns, trim_end, &c);
+      if (!jvp_codepoint_is_whitespace(c))
+        break;
+      trim_end = ns;
+      if (ns == trim_start)
+        break;
+    }
+  }
+
+  // no new string needed if there is nothing to trim
+  if (trim_start == start && trim_end == end)
+    return a;
+
+  jv ts = jv_string_sized(trim_start, trim_end - trim_start);
+  jv_free(a);
+  return ts;
+}
+
+static jv f_string_trim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT | TRIM_RIGHT); }
+static jv f_string_ltrim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT); }
+static jv f_string_rtrim(jq_state *jq, jv a) { return string_trim(a, TRIM_RIGHT); }
+
 static jv f_string_implode(jq_state *jq, jv a) {
   if (jv_get_kind(a) != JV_KIND_ARRAY) {
     return ret_error(a, jv_string("implode input must be an array"));
@@ -1721,6 +1773,9 @@ BINOPS
   {f_string_explode, "explode", 1},
   {f_string_implode, "implode", 1},
   {f_string_indexes, "_strindices", 2},
+  {f_string_trim, "trim", 1},
+  {f_string_ltrim, "ltrim", 1},
+  {f_string_rtrim, "rtrim", 1},
   {f_setpath, "setpath", 3}, // FIXME typechecking
   {f_getpath, "getpath", 2},
   {f_delpaths, "delpaths", 2},
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index d197349f..5a762315 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
   assert(out - start == jvp_utf8_encode_length(codepoint));
   return out - start;
 }
+
+// characters with White_Space property in:
+// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+int jvp_codepoint_is_whitespace(int c) {
+  return
+    (c >= 0x0009 && c <= 0x000D) || // <control-0009>..<control-000D>
+    c == 0x0020 || // SPACE
+    c == 0x0085 || // <control-0085>
+    c == 0x00A0 || // NO-BREAK SPACE
+    c == 0x1680 || // OGHAM SPACE MARK
+    (c >= 0x2000 && c <= 0x200A) || // EN QUAD..HAIR SPACE
+    c == 0x2028 || // LINE SEPARATOR
+    c == 0x2029 || // PARAGRAPH SEPARATOR
+    c == 0x202F || // NARROW NO-BREAK SPACE
+    c == 0x205F || // MEDIUM MATHEMATICAL SPACE
+    c == 0x3000 // IDEOGRAPHIC SPACE
+  ;
+}
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 558721a8..0e5e9557 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);
 int jvp_utf8_encode_length(int codepoint);
 int jvp_utf8_encode(int codepoint, char* out);
+
+int jvp_codepoint_is_whitespace(int c);
 
 #endif
```