Add trim/0, ltrim/0 and rtrim/0 that trims leading and trailing whitespace (#3056)

author: Mattias Wadman <mattias.wadman@gmail.com> 2024-03-20 11:04:17 +0100
committer: GitHub <noreply@github.com> 2024-03-20 11:04:17 +0100
commit: be437ec049bb2300731522ca93f37cd2629b4cc8 (patch)
tree: 43b578a215508e105f632e923a47bff93c3eda6b
parent: 81f4f883ac76de11ae9d99266554d31d3b7f0c0c (diff)
7 files changed, 144 insertions, 0 deletions
diff --git a/docs/content/manual/manual.yml b/docs/content/manual/manual.yml
index ba2486c1..f2000f6f 100644
--- a/docs/content/manual/manual.yml
+++ b/docs/content/manual/manual.yml
@@ -1772,6 +1772,25 @@ sections:
             input: '["fo", "foo", "barfoo", "foobar", "foob"]'
             output: ['["fo","","bar","foobar","foob"]']
 
+      - title: "`trim`, `ltrim`, `rtrim`"
+        body: |
+
+          `trim` trims both leading and trailing whitespace.
+
+          `ltrim` trims only leading (left side) whitespace.
+
+          `rtrim` trims only trailing (right side) whitespace.
+
+          Whitespace characters are the usual `" "`, `"\n"`, `"\t"`, `"\r"`
+          and also all characters in the Unicode character database with the
+          whitespace property. Note that what is considered whitespace might
+          change in the future.
+
+        examples:
+          - program: 'trim, ltrim, rtrim'
+            input: '" abc "'
+            output: ['"abc"', '"abc "', '" abc"']
+
       - title: "`explode`"
         body: |
 
diff --git a/jq.1.prebuilt b/jq.1.prebuilt
index 32221515..efa5aa2f 100644
--- a/jq.1.prebuilt
+++ b/jq.1.prebuilt
@@ -1930,6 +1930,30 @@ jq \'[\.[]|rtrimstr("foo")]\'
 .
 .IP "" 0
 .
+.SS "trim, ltrim, rtrim"
+\fBtrim\fR trims both leading and trailing whitespace\.
+.
+.P
+\fBltrim\fR trims only leading (left side) whitespace\.
+.
+.P
+\fBrtrim\fR trims only trailing (right side) whitespace\.
+.
+.P
+Whitespace characters are the usual \fB" "\fR, \fB"\en"\fR, \fB"\et"\fR, \fB"\er"\fR and also all characters in the Unicode character database with the whitespace property\. Note that what is considered whitespace might change in the future\.
+.
+.IP "" 4
+.
+.nf
+
+jq \'trim, ltrim, rtrim\'
+   " abc "
+=> "abc", "abc ", " abc"
+.
+.fi
+.
+.IP "" 0
+.
 .SS "explode"
 Converts an input string into an array of the string\'s codepoint numbers\.
 .
diff --git a/src/builtin.c b/src/builtin.c
index 5f24cfb8..e93ac321 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
   return jv_string_indexes(a, b);
 }
 
+enum trim_op { // bit flags telling string_trim() which side(s) to strip
+  TRIM_LEFT  = 1 << 0, // strip leading whitespace
+  TRIM_RIGHT = 1 << 1  // strip trailing whitespace
+};
+
+static jv string_trim(jv a, int op) { // trim whitespace from a on the side(s) set in op (TRIM_* bits)
+  if (jv_get_kind(a) != JV_KIND_STRING) {
+    return ret_error(a, jv_string("trim input must be a string")); // reject non-string input
+  }
+
+  int len = jv_string_length_bytes(jv_copy(a)); // copy passed; presumably the callee consumes its argument
+  const char *start = jv_string_value(a);
+  const char *trim_start = start; // first byte to keep
+  const char *end = trim_start + len;
+  const char *trim_end = end; // one past the last byte to keep
+  int c; // codepoint decoded by jvp_utf8_next
+
+  if (op & TRIM_LEFT) {
+    for (;;) { // step trim_start forward one codepoint at a time
+      const char *ns = jvp_utf8_next(trim_start, end, &c);
+      if (!ns || !jvp_codepoint_is_whitespace(c)) // NULL ns presumably means end of input
+        break;
+      trim_start = ns;
+    }
+  }
+
+  // skip the right trim if the string is empty or the left trim consumed everything
+  if ((op & TRIM_RIGHT) && trim_end > trim_start) {
+    for (;;) { // step trim_end backward one codepoint at a time
+      const char *ns = jvp_utf8_backtrack(trim_end-1, trim_start, NULL); // presumably the start of the last codepoint
+      jvp_utf8_next(ns, trim_end, &c); // decode that codepoint into c
+      if (!jvp_codepoint_is_whitespace(c))
+        break;
+      trim_end = ns;
+      if (ns == trim_start) // nothing left to examine
+        break;
+    }
+  }
+
+  // no new string needed if there is nothing to trim
+  if (trim_start == start && trim_end == end)
+    return a;
+
+  jv ts = jv_string_sized(trim_start, trim_end - trim_start); // build the result from the kept byte range
+  jv_free(a); // release the input once its bytes are no longer read
+  return ts;
+}
+
+static jv f_string_trim(jq_state *jq, jv a)  { return string_trim(a, TRIM_LEFT | TRIM_RIGHT); } // "trim" builtin: strip both sides; jq is unused
+static jv f_string_ltrim(jq_state *jq, jv a) { return string_trim(a, TRIM_LEFT); } // "ltrim" builtin: strip leading whitespace only; jq is unused
+static jv f_string_rtrim(jq_state *jq, jv a) { return string_trim(a, TRIM_RIGHT); } // "rtrim" builtin: strip trailing whitespace only; jq is unused
+
 static jv f_string_implode(jq_state *jq, jv a) {
   if (jv_get_kind(a) != JV_KIND_ARRAY) {
     return ret_error(a, jv_string("implode input must be an array"));
@@ -1721,6 +1773,9 @@ BINOPS
   {f_string_explode, "explode", 1},
   {f_string_implode, "implode", 1},
   {f_string_indexes, "_strindices", 2},
+  {f_string_trim, "trim", 1},
+  {f_string_ltrim, "ltrim", 1},
+  {f_string_rtrim, "rtrim", 1},
   {f_setpath, "setpath", 3}, // FIXME typechecking
   {f_getpath, "getpath", 2},
   {f_delpaths, "delpaths", 2},
diff --git a/src/jv_unicode.c b/src/jv_unicode.c
index d197349f..5a762315 100644
--- a/src/jv_unicode.c
+++ b/src/jv_unicode.c
@@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
   assert(out - start == jvp_utf8_encode_length(codepoint));
   return out - start;
 }
+
+// returns nonzero if c is a codepoint with the White_Space property in
+// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt (else zero)
+int jvp_codepoint_is_whitespace(int c) {
+  return
+    (c >= 0x0009 && c <= 0x000D) || // <control-0009>..<control-000D> (TAB, LF, VT, FF, CR)
+    c == 0x0020                  || // SPACE
+    c == 0x0085                  || // <control-0085> (NEXT LINE)
+    c == 0x00A0                  || // NO-BREAK SPACE
+    c == 0x1680                  || // OGHAM SPACE MARK
+    (c >= 0x2000 && c <= 0x200A) || // EN QUAD..HAIR SPACE
+    c == 0x2028                  || // LINE SEPARATOR
+    c == 0x2029                  || // PARAGRAPH SEPARATOR
+    c == 0x202F                  || // NARROW NO-BREAK SPACE
+    c == 0x205F                  || // MEDIUM MATHEMATICAL SPACE
+    c == 0x3000                     // IDEOGRAPHIC SPACE
+    ;
+}
diff --git a/src/jv_unicode.h b/src/jv_unicode.h
index 558721a8..0e5e9557 100644
--- a/src/jv_unicode.h
+++ b/src/jv_unicode.h
@@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);
 
 int jvp_utf8_encode_length(int codepoint);
 int jvp_utf8_encode(int codepoint, char* out);
+
+int jvp_codepoint_is_whitespace(int c);
 #endif
diff --git a/tests/jq.test b/tests/jq.test
index 584ab2b6..eabf836f 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -1334,6 +1334,26 @@ split("")
 "xababababax"
 [1,7,[1,3,5,7]]
 
+# trim
+# \u000b is vertical tab (\v not supported by json)
+map(trim), map(ltrim), map(rtrim)
+[" \n\t\r\f\u000b", "","  ", "a", " a ", "abc", "  abc  ", "  abc", "abc  "]
+["", "", "", "a", "a", "abc", "abc", "abc", "abc"]
+["", "", "", "a", "a ", "abc", "abc  ", "abc", "abc  "]
+["", "", "", "a", " a", "abc", "  abc", "  abc", "abc"]
+
+trim, ltrim, rtrim
+"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
+"abc"
+"abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
+"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc"
+
+try trim catch ., try ltrim catch ., try rtrim catch .
+123
+"trim input must be a string"
+"trim input must be a string"
+"trim input must be a string"
+
 indices(1)
 [0,1,1,2,3,4,1,5]
 [1,2,6]
diff --git a/tests/man.test b/tests/man.test
index 07938cd5..31ae3bf2 100644
--- a/tests/man.test
+++ b/tests/man.test
@@ -602,6 +602,12 @@ combinations(2)
 ["fo", "foo", "barfoo", "foobar", "foob"]
 ["fo","","bar","foobar","foob"]
 
+trim, ltrim, rtrim
+" abc "
+"abc"
+"abc "
+" abc"
+
 explode
 "foobar"
 [102,111,111,98,97,114]
author	Mattias Wadman <mattias.wadman@gmail.com>	2024-03-20 11:04:17 +0100
committer	GitHub <noreply@github.com>	2024-03-20 11:04:17 +0100
commit	be437ec049bb2300731522ca93f37cd2629b4cc8 (patch)
tree	43b578a215508e105f632e923a47bff93c3eda6b
parent	81f4f883ac76de11ae9d99266554d31d3b7f0c0c (diff)