Move everything around - delete old Haskell code, clean up build.

author: Stephen Dolan <mu@netsoc.tcd.ie> 2012-09-18 17:44:43 +0100
committer: Stephen Dolan <mu@netsoc.tcd.ie> 2012-09-18 17:44:43 +0100
commit: a4eea165bbab6d13f89b59707e835d58b7014a66 (patch)
tree: b99ee5dde8540f8dbe5de3d87b99e04ac4dd2673 /jv_parse.c
parent: 25cbab056b1f73e96b636c88779a92400d92dc15 (diff)
1 files changed, 414 insertions, 0 deletions
diff --git a/jv_parse.c b/jv_parse.c
new file mode 100644
index 00000000..e4565ef7
--- /dev/null
+++ b/jv_parse.c
@@ -0,0 +1,414 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "jv.h"
+#include "jv_dtoa.h"
+#include "jv_parse.h"
+#include "jv_unicode.h"
+
+typedef const char* presult;
+
+#define TRY(x) do {presult msg__ = (x); if (msg__) return msg__; } while(0)
+#ifdef __GNUC__
+#define pfunc __attribute__((warn_unused_result)) presult
+#else
+#define pfunc presult
+#endif
+
+void jv_parser_init(struct jv_parser* p) {
+  p->stack = 0;
+  p->stacklen = p->stackpos = 0;
+  p->next = jv_invalid();
+  p->tokenbuf = 0;
+  p->tokenlen = p->tokenpos = 0;
+  p->st = JV_PARSER_NORMAL;
+  p->curr_buf = 0;
+  p->curr_buf_length = p->curr_buf_pos = p->curr_buf_is_partial = 0;
+  jvp_dtoa_context_init(&p->dtoa);
+}
+
+void jv_parser_free(struct jv_parser* p) {
+  jv_free(p->next);
+  for (int i=0; i<p->stackpos; i++) 
+    jv_free(p->stack[i]);
+  free(p->stack);
+  free(p->tokenbuf);
+  jvp_dtoa_context_free(&p->dtoa);
+}
+
+static pfunc value(struct jv_parser* p, jv val) {
+  if (jv_is_valid(p->next)) return "Expected separator between values";
+  jv_free(p->next);
+  p->next = val;
+  return 0;
+}
+
+static void push(struct jv_parser* p, jv v) {
+  assert(p->stackpos <= p->stacklen);
+  if (p->stackpos == p->stacklen) {
+    p->stacklen = p->stacklen * 2 + 10;
+    p->stack = realloc(p->stack, p->stacklen * sizeof(jv));
+  }
+  assert(p->stackpos < p->stacklen);
+  p->stack[p->stackpos++] = v;
+}
+
+static pfunc token(struct jv_parser* p, char ch) {
+  switch (ch) {
+  case '[':
+    if (jv_is_valid(p->next)) return "Expected separator between values";
+    push(p, jv_array());
+    break;
+
+  case '{':
+    if (jv_is_valid(p->next)) return "Expected separator between values";
+    push(p, jv_object());
+    break;
+
+  case ':':
+    if (!jv_is_valid(p->next)) 
+      return "Expected string key before ':'";
+    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
+      return "':' not as part of an object";
+    if (jv_get_kind(p->next) != JV_KIND_STRING)
+      return "Object keys must be strings";
+    push(p, p->next);
+    p->next = jv_invalid();
+    break;
+
+  case ',':
+    if (!jv_is_valid(p->next))
+      return "Expected value before ','";
+    if (p->stackpos == 0)
+      return "',' not as part of an object or array";
+    if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_ARRAY) {
+      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
+      p->next = jv_invalid();
+    } else if (jv_get_kind(p->stack[p->stackpos-1]) == JV_KIND_STRING) {
+      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
+      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], 
+                                              p->stack[p->stackpos-1], p->next);
+      p->stackpos--;
+      p->next = jv_invalid();
+    } else {
+      // this case hits on input like {"a", "b"}
+      return "Objects must consist of key:value pairs";
+    }
+    break;
+
+  case ']':
+    if (p->stackpos == 0 || jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_ARRAY)
+      return "Unmatched ']'";
+    if (jv_is_valid(p->next)) {
+      p->stack[p->stackpos-1] = jv_array_append(p->stack[p->stackpos-1], p->next);
+      p->next = jv_invalid();
+    } else {
+      if (jv_array_length(jv_copy(p->stack[p->stackpos-1])) != 0) {
+        // this case hits on input like [1,2,3,]
+        return "Expected another array element";
+      }
+    }
+    jv_free(p->next);
+    p->next = p->stack[--p->stackpos];
+    break;
+
+  case '}':
+    if (p->stackpos == 0)
+      return "Unmatched '}'";
+    if (jv_is_valid(p->next)) {
+      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_STRING)
+        return "Objects must consist of key:value pairs";
+      assert(p->stackpos > 1 && jv_get_kind(p->stack[p->stackpos-2]) == JV_KIND_OBJECT);
+      p->stack[p->stackpos-2] = jv_object_set(p->stack[p->stackpos-2], 
+                                              p->stack[p->stackpos-1], p->next);
+      p->stackpos--;
+      p->next = jv_invalid();
+    } else {
+      if (jv_get_kind(p->stack[p->stackpos-1]) != JV_KIND_OBJECT)
+        return "Unmatched '}'";
+      if (jv_object_length(jv_copy(p->stack[p->stackpos-1])) != 0)
+        return "Expected another key-value pair";
+    }
+    jv_free(p->next);
+    p->next = p->stack[--p->stackpos];
+    break;
+  }
+  return 0;
+}
+
+
+static void tokenadd(struct jv_parser* p, char c) {
+  assert(p->tokenpos <= p->tokenlen);
+  if (p->tokenpos == p->tokenlen) {
+    p->tokenlen = p->tokenlen*2 + 256;
+    p->tokenbuf = realloc(p->tokenbuf, p->tokenlen);
+  }
+  assert(p->tokenpos < p->tokenlen);
+  p->tokenbuf[p->tokenpos++] = c;
+}
+
+static int unhex4(char* hex) {
+  int r = 0;
+  for (int i=0; i<4; i++) {
+    char c = *hex++;
+    int n;
+    if ('0' <= c && c <= '9') n = c - '0';
+    else if ('a' <= c && c <= 'f') n = c - 'a' + 10;
+    else if ('A' <= c && c <= 'F') n = c - 'A' + 10;
+    r <<= 4;
+    r |= n;
+  }
+  return r;
+}
+
+static pfunc found_string(struct jv_parser* p) {
+  char* in = p->tokenbuf;
+  char* out = p->tokenbuf;
+  char* end = p->tokenbuf + p->tokenpos;
+  
+  while (in < end) {
+    char c = *in++;
+    if (c == '\\') {
+      if (in >= end)
+        return "Expected escape character at end of string";
+      c = *in++;
+      switch (c) {
+      case '\\':
+      case '"':
+      case '/': *out++ = c;    break;
+      case 'b': *out++ = '\b'; break;
+      case 'f': *out++ = '\f'; break;
+      case 't': *out++ = '\t'; break;
+      case 'n': *out++ = '\n'; break;
+      case 'r': *out++ = '\r'; break;
+
+      case 'u':
+        /* ahh, the complicated case */
+        if (in + 4 > end)
+          return "Invalid \\uXXXX escape";
+        unsigned long codepoint = unhex4(in);
+        in += 4;
+        if (0xD800 <= codepoint && codepoint <= 0xDBFF) {
+          /* who thought UTF-16 surrogate pairs were a good idea? */
+          if (in + 6 > end || in[0] != '\\' || in[1] != 'u')
+            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
+          unsigned long surrogate = unhex4(in+2);
+          if (!(0xDC00 <= surrogate && surrogate <= 0xDFFF))
+            return "Invalid \\uXXXX\\uXXXX surrogate pair escape";
+          in += 6;
+          codepoint = 0x10000 + (((codepoint - 0xD800) << 10)
+                                 |(surrogate - 0xDC00));
+        }
+        // FIXME assert valid codepoint
+        out += jvp_utf8_encode(codepoint, out);
+        break;
+
+      default:
+        return "Invalid escape";
+      }
+    } else {
+      *out++ = c;
+    }
+  }
+  TRY(value(p, jv_string_sized(p->tokenbuf, out - p->tokenbuf)));
+  p->tokenpos = 0;
+  return 0;
+}
+
+static pfunc check_literal(struct jv_parser* p) {
+  if (p->tokenpos == 0) return 0;
+
+  const char* pattern = 0;
+  int plen;
+  jv v;
+  switch (p->tokenbuf[0]) {
+  case 't': pattern = "true"; plen = 4; v = jv_true(); break;
+  case 'f': pattern = "false"; plen = 5; v = jv_false(); break;
+  case 'n': pattern = "null"; plen = 4; v = jv_null(); break;
+  }
+  if (pattern) {
+    if (p->tokenpos != plen) return "Invalid literal";
+    for (int i=0; i<plen; i++) 
+      if (p->tokenbuf[i] != pattern[i])
+        return "Invalid literal";
+    TRY(value(p, v));
+  } else {
+    // FIXME: better parser
+    p->tokenbuf[p->tokenpos] = 0; // FIXME: invalid
+    char* end = 0;
+    double d = jvp_strtod(&p->dtoa, p->tokenbuf, &end);
+    if (end == 0 || *end != 0)
+      return "Invalid numeric literal";
+    TRY(value(p, jv_number(d)));
+  }
+  p->tokenpos = 0;
+  return 0;
+}
+
+typedef enum {
+  LITERAL,
+  WHITESPACE,
+  STRUCTURE,
+  QUOTE,
+  INVALID
+} chclass;
+
+static chclass classify(char c) {
+  switch (c) {
+  case ' ':
+  case '\t':
+  case '\r':
+  case '\n':
+    return WHITESPACE;
+  case '"':
+    return QUOTE;
+  case '[':
+  case ',':
+  case ']':
+  case '{':
+  case ':':
+  case '}':
+    return STRUCTURE;
+  default:
+    return LITERAL;
+  }
+}
+
+
+static presult OK = "output produced";
+
+static int check_done(struct jv_parser* p, jv* out) {
+  if (p->stackpos == 0 && jv_is_valid(p->next)) {
+    *out = p->next;
+    p->next = jv_invalid();
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+static pfunc scan(struct jv_parser* p, char ch, jv* out) {
+  presult answer = 0;
+  if (p->st == JV_PARSER_NORMAL) {
+    chclass cls = classify(ch);
+    if (cls != LITERAL) {
+      TRY(check_literal(p));
+      if (check_done(p, out)) answer = OK;
+    }
+    switch (cls) {
+    case LITERAL:
+      tokenadd(p, ch);
+      break;
+    case WHITESPACE:
+      break;
+    case QUOTE:
+      p->st = JV_PARSER_STRING;
+      break;
+    case STRUCTURE:
+      TRY(token(p, ch));
+      break;
+    case INVALID:
+      return "Invalid character";
+    }
+    if (check_done(p, out)) answer = OK;
+  } else {
+    if (ch == '"' && p->st == JV_PARSER_STRING) {
+      TRY(found_string(p));
+      p->st = JV_PARSER_NORMAL;
+      if (check_done(p, out)) answer = OK;
+    } else {
+      tokenadd(p, ch);
+      if (ch == '\\' && p->st == JV_PARSER_STRING) {
+        p->st = JV_PARSER_STRING_ESCAPE;
+      } else {
+        p->st = JV_PARSER_STRING;
+      }
+    }
+  }
+  return answer;
+}
+
+void jv_parser_set_buf(struct jv_parser* p, const char* buf, int length, int is_partial) {
+  assert((p->curr_buf == 0 || p->curr_buf_pos == p->curr_buf_length)
+         && "previous buffer not exhausted");
+  p->curr_buf = buf;
+  p->curr_buf_length = length;
+  p->curr_buf_pos = 0;
+  p->curr_buf_is_partial = is_partial;
+}
+
+jv jv_parser_next(struct jv_parser* p) {
+  assert(p->curr_buf && "a buffer must be provided");
+  jv value;
+  presult msg = 0;
+  while (!msg && p->curr_buf_pos < p->curr_buf_length) {
+    char ch = p->curr_buf[p->curr_buf_pos++];
+    msg = scan(p, ch, &value);
+  }
+  if (msg == OK) {
+    return value;
+  } else if (msg) {
+    return jv_invalid_with_msg(jv_string(msg));
+  } else if (p->curr_buf_is_partial) {
+    assert(p->curr_buf_pos == p->curr_buf_length);
+    // need another buffer
+    return jv_invalid();
+  } else {
+    assert(p->curr_buf_pos == p->curr_buf_length);
+    // at EOF
+    if (p->st != JV_PARSER_NORMAL) 
+      return jv_invalid_with_msg(jv_string("Unfinished string"));
+    if ((msg = check_literal(p)))
+      return jv_invalid_with_msg(jv_string(msg));
+    if (p->stackpos != 0)
+      return jv_invalid_with_msg(jv_string("Unfinished JSON term"));
+    // p->next is either invalid (nothing here but no syntax error)
+    // or valid (this is the value). either way it's the thing to return
+    value = p->next;
+    p->next = jv_invalid();
+    return value;
+  }
+}
+
+jv jv_parse_sized(const char* string, int length) {
+  struct jv_parser parser;
+  jv_parser_init(&parser);
+  jv_parser_set_buf(&parser, string, length, 0);
+  jv value = jv_parser_next(&parser);
+  if (jv_is_valid(value)) {
+    jv next = jv_parser_next(&parser);
+    if (jv_is_valid(next)) {
+      // multiple JSON values, we only wanted one
+      jv_free(value);
+      jv_free(next);
+      value = jv_invalid_with_msg(jv_string("Unexpected extra JSON values"));
+    } else if (jv_invalid_has_msg(jv_copy(next))) {
+      // parser error after the first JSON value
+      jv_free(value);
+      value = next;
+    } else {
+      // a single valid JSON value
+      jv_free(next);
+    }
+  } else if (jv_invalid_has_msg(jv_copy(value))) {
+    // parse error, we'll return it
+  } else {
+    // no value at all
+    jv_free(value);
+    value = jv_invalid_with_msg(jv_string("Expected JSON value"));
+  }
+  jv_parser_free(&parser);
+
+  if (!jv_is_valid(value) && jv_invalid_has_msg(jv_copy(value))) {
+    jv msg = jv_invalid_get_msg(value);
+    value = jv_invalid_with_msg(jv_string_fmt("%s (while parsing '%s')",
+                                              jv_string_value(msg),
+                                              string));
+    jv_free(msg);
+  }
+  return value;
+}
+
+jv jv_parse(const char* string) {
+  return jv_parse_sized(string, strlen(string));
+}
author	Stephen Dolan <mu@netsoc.tcd.ie>	2012-09-18 17:44:43 +0100
committer	Stephen Dolan <mu@netsoc.tcd.ie>	2012-09-18 17:44:43 +0100
commit	a4eea165bbab6d13f89b59707e835d58b7014a66 (patch)
tree	b99ee5dde8540f8dbe5de3d87b99e04ac4dd2673 /jv_parse.c
parent	25cbab056b1f73e96b636c88779a92400d92dc15 (diff)