summaryrefslogtreecommitdiffstats
path: root/libnetdata
diff options
context:
space:
mode:
authorTimo <timotejs@gmail.com>2019-07-24 14:32:08 +0200
committerChris Akritidis <43294513+cakrit@users.noreply.github.com>2019-07-24 14:32:08 +0200
commit19f1bd14debbc3654f156d05e44b792efc45d3d3 (patch)
tree46d7fa0ff3e17c2c58e295044da6a37850c7255d /libnetdata
parentadb7026b14fea7bbebee93c8958d92ee9cf1efaf (diff)
Utf8 Badge Fix And URL Parser International Support (initial) (#6426)
#### Summary Fixes #3117. Additionally, it adds support for UTF-8 in the URL parser (as it should). Label sizes are now updated by the browser with JavaScript (although an initial guess is still calculated from verdana11_widths, with minor improvements). #### Component Name API/Badges, LibNetData/URL #### Additional Information It was found that not only did verdana11_widths need to be updated, but the URL parser also replaced international characters with spaces (one space for each byte of a multibyte character). Therefore I updated both to support international characters.
Diffstat (limited to 'libnetdata')
-rw-r--r--libnetdata/libnetdata.h1
-rw-r--r--libnetdata/string/utf8.h9
-rw-r--r--libnetdata/url/url.c144
3 files changed, 151 insertions, 3 deletions
diff --git a/libnetdata/libnetdata.h b/libnetdata/libnetdata.h
index 1672ae3004..ef883300b8 100644
--- a/libnetdata/libnetdata.h
+++ b/libnetdata/libnetdata.h
@@ -313,5 +313,6 @@ extern char *netdata_configured_host_prefix;
#include "url/url.h"
#include "json/json.h"
#include "health/health.h"
+#include "string/utf8.h"
#endif // NETDATA_LIB_H
diff --git a/libnetdata/string/utf8.h b/libnetdata/string/utf8.h
new file mode 100644
index 0000000000..133ec710b6
--- /dev/null
+++ b/libnetdata/string/utf8.h
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_STRING_UTF8_H
+#define NETDATA_STRING_UTF8_H 1
+
+// Non-zero when x has its high bit set, i.e. x belongs to a multi-byte
+// UTF-8 sequence (lead or continuation byte) rather than 7-bit ASCII.
+// The argument is parenthesized so compound expressions such as (a | b)
+// are tested as a whole.
+#define IS_UTF8_BYTE(x) ((x) & 0x80)
+// Non-zero when x has the bit pattern 11xxxxxx, i.e. it can only be the
+// first byte of a multi-byte UTF-8 sequence, never a continuation byte.
+// NOTE: x is evaluated twice - do not pass expressions with side effects.
+#define IS_UTF8_STARTBYTE(x) (IS_UTF8_BYTE(x) && ((x) & 0x40))
+
+#endif /* NETDATA_STRING_UTF8_H */
diff --git a/libnetdata/url/url.c b/libnetdata/url/url.c
index 07a9f8069e..1929d6686f 100644
--- a/libnetdata/url/url.c
+++ b/libnetdata/url/url.c
@@ -52,6 +52,125 @@ char *url_decode(char *str) {
return url_decode_r(buf, str, size);
}
+// Decode the percent-escape ("%XX") that s points at.
+// Returns the decoded byte, or 0 when s does not point at a '%' followed by
+// two hexadecimal digits. An encoded NUL ("%00") therefore also yields 0 and
+// is indistinguishable from a failure.
+char url_percent_escape_decode(char *s) {
+    // test the bytes in order, so nothing after the string terminator is read
+    if(unlikely(s[0] != '%' || !s[1] || !s[2]))
+        return 0;
+
+    // do not rely on from_hex() to reject non-hex characters: a malformed
+    // escape must fail here instead of decoding to an arbitrary byte
+    if(unlikely(!isxdigit((unsigned char)s[1]) || !isxdigit((unsigned char)s[2])))
+        return 0;
+
+    return from_hex(s[1]) << 4 | from_hex(s[2]);
+}
+
+// TODO: this UTF-8 string helper should eventually move to a separate file.
+//
+// Given the first byte of a character, return how many bytes the character
+// occupies: 1 for 7-bit ASCII, 2..4 for a UTF-8 lead byte, or -1 when c
+// cannot start a character (a continuation byte 10xxxxxx, or a lead byte
+// announcing more than 4 bytes).
+// NOTE: the return type is plain char; where char is unsigned, callers see
+// the -1 as a large positive value, so they should validate the 1..4 range.
+char url_utf8_get_byte_length(char c) {
+    if(!IS_UTF8_BYTE(c))
+        return 1;
+
+    // count the leading 1 bits on an unsigned copy: left-shifting a negative
+    // (signed char) value is undefined behaviour
+    unsigned char bits = (unsigned char)c;
+    char length = 0;
+    while(likely(bits & 0x80)) {
+        length++;
+        bits = (unsigned char)(bits << 1);
+    }
+    //4 bytes is the max size of a UTF-8 char
+    //10XX XXXX is not a valid first byte -> check length == 1
+    if(length > 4 || length == 1)
+        return -1;
+
+    return length;
+}
+
+// Decode one percent-encoded multi-byte UTF-8 character ("%XX%XX...")
+// starting at s and copy its raw bytes to d.
+// d_end marks the end of the destination; the character is written only
+// when it ends strictly before d_end.
+// Returns the number of bytes written to d (2..4), or 0 on any error:
+// s is not the escape of a lead byte, there is not enough room, or a
+// continuation escape is missing or malformed. On error, some bytes may
+// already have been stored at d.
+char url_decode_multibyte_utf8(char *s, char *d, char *d_end) {
+    char first_byte = url_percent_escape_decode(s);
+
+    if(unlikely(!first_byte || !IS_UTF8_STARTBYTE(first_byte)))
+        return 0;
+
+    char byte_length = url_utf8_get_byte_length(first_byte);
+
+    // a multi-byte character is 2..4 bytes long; testing the range (instead
+    // of "<= 0") also catches the -1 error value where plain char is unsigned
+    if(unlikely(byte_length < 2 || byte_length > 4))
+        return 0;
+
+    // compare distances instead of computing d+byte_length, which could
+    // point beyond the destination buffer
+    if(unlikely(d_end - d <= byte_length))
+        return 0;
+
+    char to_read = byte_length;
+    while(to_read > 0) {
+        // every byte must be its own "%XX" escape; stop at anything else
+        // (including the end of the string) instead of reading past it
+        if(unlikely(*s != '%'))
+            return 0;
+
+        char c = url_percent_escape_decode(s);
+
+        if(unlikely( !IS_UTF8_BYTE(c) ))
+            return 0;
+        // only the first byte of the sequence may be a lead byte
+        if((to_read != byte_length) && IS_UTF8_STARTBYTE(c))
+            return 0;
+
+        *d++ = c;
+        s+=3;
+        to_read--;
+    }
+
+    return byte_length;
+}
+
+/*
+ * The utf8_check() function scans the '\0'-terminated string starting
+ * at s. It returns a pointer to the first byte of the first malformed
+ * or overlong UTF-8 sequence found, or NULL if the string contains
+ * only correct UTF-8. It also spots UTF-8 sequences that could cause
+ * trouble if converted to UTF-16, namely surrogate characters
+ * (U+D800..U+DFFF) and non-Unicode positions (U+FFFE..U+FFFF). This
+ * routine is very likely to find a malformed sequence if the input
+ * uses any other encoding than UTF-8. It therefore can be used as a
+ * very effective heuristic for distinguishing between UTF-8 and other
+ * encodings.
+ *
+ * Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2005-03-30
+ * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
+ *
+ * Reading notes:
+ *  - s is only read, never written.
+ *  - In every branch the continuation bytes are tested left to right
+ *    with ||, so a sequence cut short by the terminating '\0' fails on
+ *    the '\0' itself and no byte after the terminator is read. Keep
+ *    that order when editing the conditions.
+ *  - Note the inverted convention: NULL means "valid", a non-NULL
+ *    pointer means "invalid starting here".
+ */
+
+unsigned char *utf8_check(unsigned char *s)
+{
+ while (*s)
+ {
+ if (*s < 0x80)
+ /* 0xxxxxxx */
+ s++;
+ else if ((s[0] & 0xe0) == 0xc0)
+ {
+ /* 110XXXXx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[0] & 0xfe) == 0xc0) /* overlong? */
+ return s;
+ else
+ s += 2;
+ }
+ else if ((s[0] & 0xf0) == 0xe0)
+ {
+ /* 1110XXXX 10Xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+ (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+ (s[0] == 0xef && s[1] == 0xbf &&
+ (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+ return s;
+ else
+ s += 3;
+ }
+ else if ((s[0] & 0xf8) == 0xf0)
+ {
+ /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+ if ((s[1] & 0xc0) != 0x80 ||
+ (s[2] & 0xc0) != 0x80 ||
+ (s[3] & 0xc0) != 0x80 ||
+ (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+ (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+ return s;
+ else
+ s += 4;
+ }
+ else
+ return s; /* stray continuation byte (10xxxxxx) or lead byte 0xf8..0xff */
+ }
+
+ return NULL; /* reached the terminator: no malformed sequence found */
+}
+
char *url_decode_r(char *to, char *url, size_t size) {
char *s = url, // source
*d = to, // destination
@@ -59,12 +178,24 @@ char *url_decode_r(char *to, char *url, size_t size) {
while(*s && d < e) {
if(unlikely(*s == '%')) {
- if(likely(s[1] && s[2])) {
- char t = from_hex(s[1]) << 4 | from_hex(s[2]);
+ char t = url_percent_escape_decode(s);
+ if(IS_UTF8_BYTE(t)) {
+ char bytes_written = url_decode_multibyte_utf8(s, d, e);
+ if(likely(bytes_written)){
+ d += bytes_written;
+ s += (bytes_written * 3)-1;
+ }
+ else {
+ goto fail_cleanup;
+ }
+ }
+ else if(likely(t) && isprint(t)) {
// avoid HTTP header injection
- *d++ = (char)((isprint(t))? t : ' ');
+ *d++ = t;
s += 2;
}
+ else
+ goto fail_cleanup;
}
else if(unlikely(*s == '+'))
*d++ = ' ';
@@ -77,5 +208,12 @@ char *url_decode_r(char *to, char *url, size_t size) {
*d = '\0';
+ if(unlikely( utf8_check(to) )) //NULL means sucess here
+ return NULL;
+
return to;
+
+fail_cleanup:
+ *d = '\0';
+ return NULL;
}