summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Mattias Wadman <mattias.wadman@gmail.com> 2024-03-20 11:04:17 +0100
committer GitHub <noreply@github.com> 2024-03-20 11:04:17 +0100
commit be437ec049bb2300731522ca93f37cd2629b4cc8 (patch)
tree 43b578a215508e105f632e923a47bff93c3eda6b /src
parent 81f4f883ac76de11ae9d99266554d31d3b7f0c0c (diff)
Add trim/0, ltrim/0 and rtrim/0 that trims leading and trailing whitespace (#3056)
Diffstat (limited to 'src')
-rw-r--r-- src/builtin.c 55
-rw-r--r-- src/jv_unicode.c 18
-rw-r--r-- src/jv_unicode.h 2
3 files changed, 75 insertions, 0 deletions
diff --git a/src/builtin.c b/src/builtin.c
index 5f24cfb8..e93ac321 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
return jv_string_indexes(a, b);
}
+enum trim_op { // bit flags telling string_trim() which end(s) to strip; may be OR-ed together
+ TRIM_LEFT = 1 << 0, // strip leading whitespace
+ TRIM_RIGHT = 1 << 1 // strip trailing whitespace
+};
+
+static jv string_trim(jv a, int op) { // trim whitespace codepoints from the start and/or end of string a, as selected by the trim_op bits in op
+ if (jv_get_kind(a) != JV_KIND_STRING) {
+ return ret_error(a, jv_string("trim input must be a string")); // non-string input: hand a and the message to ret_error
+ }
+
+ int len = jv_string_length_bytes(jv_copy(a)); // length in bytes; a copy is passed, presumably because the call consumes its argument (TODO confirm)
+ const char *start = jv_string_value(a);
+ const char *trim_start = start; // first byte to keep; advanced by the left trim
+ const char *end = trim_start + len; // one past the last byte of the input
+ const char *trim_end = end; // one past the last byte to keep; moved back by the right trim
+ int c; // codepoint most recently written by jvp_utf8_next
+
+ if (op & TRIM_LEFT) {
+ for (;;) {
+ const char *ns = jvp_utf8_next(trim_start, end, &c); // writes a codepoint to c; ns presumably points just past it (TODO confirm)
+ if (!ns || !jvp_codepoint_is_whitespace(c)) // stop when the helper returns NULL or at the first non-whitespace codepoint
+ break;
+ trim_start = ns;
+ }
+ }
+
+ // skip the right trim when the string is empty or the left trim has already consumed everything
+ if ((op & TRIM_RIGHT) && trim_end > trim_start) {
+ for (;;) {
+ const char *ns = jvp_utf8_backtrack(trim_end-1, trim_start, NULL); // presumably the first byte of the codepoint that ends at trim_end-1 (TODO confirm)
+ jvp_utf8_next(ns, trim_end, &c); // read that codepoint into c; the returned pointer is not needed
+ if (!jvp_codepoint_is_whitespace(c))
+ break;
+ trim_end = ns; // drop the whitespace codepoint from the kept range
+ if (ns == trim_start) // kept range is now empty, nothing left to examine
+ break;
+ }
+ }
+
+ // no new string needed if there is nothing to trim
+ if (trim_start == start && trim_end == end)
+ return a;
+
+ jv ts = jv_string_sized(trim_start, trim_end - trim_start); // build the result from the kept byte range
+ jv_free(a); // the input is released here; ts is returned in its place
+ return ts;
+}
+
+static jv f_string_trim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT | TRIM_RIGHT); } // strip both ends; jq is unused
+static jv f_string_ltrim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT); } // strip the start only; jq is unused
+static jv f_string_rtrim(jq_state *jq, jv a) { return string_trim(a, TRIM_RIGHT); } // strip the end only; jq is unused
+
static jv f_string_implode(jq_state *jq, jv a) {
if (jv_get_kind(a) != JV_KIND_ARRAY) {
return ret_error(a, jv_string("implode input must be an array"));
@@ -1721,6 +1773,9 @@ BINOPS
{f_string_explode, "explode", 1},
{f_string_implode, "implode", 1},
{f_string_indexes, "_strindices", 2},
+ {f_string_trim, "trim", 1},
+ {f_string_ltrim, "ltrim", 1},
+ {f_string_rtrim, "rtrim", 1},
{f_setpath, "setpath", 3}, // FIXME typechecking
{f_getpath, "getpath", 2},
{f_delpaths, "delpaths", 2},
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index d197349f..5a762315 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
assert(out - start == jvp_utf8_encode_length(codepoint));
return out - start;
}
+
+// returns 1 if c is one of the codepoints listed below, 0 otherwise; the list is the set of characters with White_Space property in:
+// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+int jvp_codepoint_is_whitespace(int c) {
+ return
+ (c >= 0x0009 && c <= 0x000D) || // <control-0009>..<control-000D> (TAB, LF, VT, FF, CR)
+ c == 0x0020 || // SPACE
+ c == 0x0085 || // <control-0085> (NEXT LINE)
+ c == 0x00A0 || // NO-BREAK SPACE
+ c == 0x1680 || // OGHAM SPACE MARK
+ (c >= 0x2000 && c <= 0x200A) || // EN QUAD..HAIR SPACE
+ c == 0x2028 || // LINE SEPARATOR
+ c == 0x2029 || // PARAGRAPH SEPARATOR
+ c == 0x202F || // NARROW NO-BREAK SPACE
+ c == 0x205F || // MEDIUM MATHEMATICAL SPACE
+ c == 0x3000 // IDEOGRAPHIC SPACE
+ ;
+}
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 558721a8..0e5e9557 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);
int jvp_utf8_encode_length(int codepoint);
int jvp_utf8_encode(int codepoint, char* out);
+
+int jvp_codepoint_is_whitespace(int c); // non-zero if codepoint c is in the Unicode White_Space list checked by the definition in jv_unicode.c
#endif