summaryrefslogtreecommitdiffstats
path: root/jv.c
diff options
context:
space:
mode:
authorStephen Dolan <mu@netsoc.tcd.ie>2013-06-22 13:34:24 +0100
committerStephen Dolan <mu@netsoc.tcd.ie>2013-06-22 13:34:24 +0100
commitff48bd6ec538b01d1057be8e93b94eef6914e9ef (patch)
tree49b16387bc0741afebd2b22e265f5db3518fb473 /jv.c
parent54b9c9bdb225af5d886466d72f47eafc51acb4f7 (diff)
Fix various UTF8 parsing bugs.
In particular, parse bad UTF8 by replacing the broken bits with U+FFFD and resynchronise correctly after broken sequences.
Diffstat (limited to 'jv.c')
-rw-r--r--jv.c41
1 files changed, 38 insertions, 3 deletions
diff --git a/jv.c b/jv.c
index 9316aec1..0c69a568 100644
--- a/jv.c
+++ b/jv.c
@@ -377,6 +377,32 @@ static jvp_string* jvp_string_alloc(uint32_t size) {
return s;
}
+/* Copy `length` bytes of UTF8 from `data` into a newly allocated string, replacing every code point decoded as -1 (badly encoded) with U+FFFD */
+static jv_nontrivial jvp_string_copy_replace_bad(const char* data, uint32_t length) {
+ const char* end = data + length; // one past the last input byte
+ const char* i = data; // read cursor into the input
+ const char* cstart; // start of the code point being decoded; assigned in the loop condition but not read in this function
+
+ uint32_t maxlength = length * 3 + 1; // worst case: all bad bytes, each becomes a 3-byte U+FFFD; the extra 1 keeps the strict '<' assert below true even then. NOTE(review): uint32_t arithmetic, wraps if length > (UINT32_MAX - 1) / 3
+ jvp_string* s = jvp_string_alloc(maxlength);
+ char* out = s->data; // write cursor into the new string
+ int c = 0; // most recently decoded code point, filled in by jvp_utf8_next
+
+ while ((i = jvp_utf8_next((cstart = i), end, &c))) { // loop ends when jvp_utf8_next returns NULL (presumably at end of input -- confirm)
+ if (c == -1) { // -1 is the decoder's marker for a badly encoded sequence
+ c = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
+ }
+ out += jvp_utf8_encode(c, out); // advance by the value the encoder returns (presumably the number of bytes written -- confirm)
+ assert(out < s->data + maxlength); // strict '<' so the terminating 0 written below still lands inside maxlength
+ }
+ length = out - s->data; // reuse the parameter to hold the output length in bytes
+ s->data[length] = 0; // terminate the copied string
+ s->length_hashed = length << 1; // length is stored shifted left by one, low bit 0 (presumably the "hash computed" flag -- confirm)
+ jv_nontrivial r = {&s->refcnt, {0,0}}; // the result refers to the string through its refcnt field
+ return r;
+}
+
+/* Assumes valid UTF8 */
static jv_nontrivial jvp_string_new(const char* data, uint32_t length) {
jvp_string* s = jvp_string_alloc(length);
s->length_hashed = length << 1;
@@ -523,7 +549,9 @@ static int jvp_string_equal(jv_nontrivial* a, jv_nontrivial* b) {
jv jv_string_sized(const char* str, int len) {
jv j;
j.kind = JV_KIND_STRING;
- j.val.nontrivial = jvp_string_new(str, len);
+ j.val.nontrivial = jvp_utf8_is_valid(str, str+len) ? // validate first so well-formed input takes the plain copy path
+ jvp_string_new(str, len) : // valid UTF8: jvp_string_new assumes valid input
+ jvp_string_copy_replace_bad(str, len); // invalid UTF8: copy with bad sequences replaced by U+FFFD. NOTE(review): len is int here but uint32_t in the callee
return j;
}
@@ -568,14 +596,21 @@ jv jv_string_concat(jv a, jv b) {
}
jv jv_string_append_buf(jv a, const char* buf, int len) {
- jvp_string_append(&a.val.nontrivial, buf, len);
+ if (jvp_utf8_is_valid(buf, buf+len)) { // well-formed input can be appended as-is
+ jvp_string_append(&a.val.nontrivial, buf, len);
+ } else { // malformed input: repair it into a temporary string, then concatenate
+ jv b; // temporary jv string wrapping the repaired copy of buf
+ b.kind = JV_KIND_STRING;
+ b.val.nontrivial = jvp_string_copy_replace_bad(buf, len); // bad sequences become U+FFFD
+ a = jv_string_concat(a, b); // NOTE(review): presumably jv_string_concat takes ownership of both arguments -- confirm
+ }
return a;
}
jv jv_string_append_str(jv a, const char* str) {
return jv_string_append_buf(a, str, strlen(str)); // str must be NUL-terminated: its length is taken with strlen and passed as the int len argument
}
-
+
jv jv_string_fmt(const char* fmt, ...) {
int size = 1024;
while (1) {