summaryrefslogtreecommitdiffstats
path: root/vendored_parsers/tree-sitter-elm/src/scanner.c
diff options
context:
space:
mode:
Diffstat (limited to 'vendored_parsers/tree-sitter-elm/src/scanner.c')
-rw-r--r--vendored_parsers/tree-sitter-elm/src/scanner.c496
1 files changed, 496 insertions, 0 deletions
diff --git a/vendored_parsers/tree-sitter-elm/src/scanner.c b/vendored_parsers/tree-sitter-elm/src/scanner.c
new file mode 100644
index 000000000..45b1ce6d3
--- /dev/null
+++ b/vendored_parsers/tree-sitter-elm/src/scanner.c
@@ -0,0 +1,496 @@
+#include "tree_sitter/parser.h"
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+// Larger of two values. Both arguments may be evaluated twice, so do not pass
+// expressions with side effects.
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// ---------------------------------------------------------------------------
+// Tiny growable-array helpers. They operate on any struct that has the fields
+// `len`, `cap` and `data` (see the `vec` typedef in this file).
+//
+// Unless noted otherwise every macro expands to a single statement or
+// expression, so it can be used in an unbraced if/else/for and must be
+// followed by a semicolon.
+// ---------------------------------------------------------------------------
+
+// Reallocate `vec.data` so that it has room for exactly `_cap` elements.
+// Allocation failure is treated as fatal via assert(). The temporary lives in
+// its own scope, so the macro can be used several times in the same block.
+#define VEC_RESIZE(vec, _cap)                                                  \
+    do {                                                                       \
+        void *vec_resized_ =                                                   \
+            realloc((vec).data, (_cap) * sizeof((vec).data[0]));               \
+        assert(vec_resized_ != NULL);                                          \
+        (vec).data = vec_resized_;                                             \
+        (vec).cap = (_cap);                                                    \
+    } while (0)
+
+// Make sure `vec` has capacity for at least `_cap` elements.
+// NOTE: deliberately left as a bare `if { ... }` rather than do/while(0),
+// because an existing call site invokes it without a trailing semicolon.
+// Do not use it as the unbraced body of an if that has an else branch.
+#define VEC_GROW(vec, _cap)                                                    \
+    if ((vec).cap < (_cap)) {                                                  \
+        VEC_RESIZE((vec), (_cap));                                             \
+    }
+
+// Append `el`, growing the storage to max(16, 2 * len) when it is full.
+#define VEC_PUSH(vec, el)                                                      \
+    do {                                                                       \
+        if ((vec).cap == (vec).len) {                                          \
+            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                         \
+        }                                                                      \
+        (vec).data[(vec).len++] = (el);                                        \
+    } while (0)
+
+// Drop the last element. The caller must make sure the vector is not empty.
+#define VEC_POP(vec) ((vec).len--)
+
+// Last element. The caller must make sure the vector is not empty.
+#define VEC_BACK(vec) ((vec).data[(vec).len - 1])
+
+// Release the storage and leave the vector empty. free(NULL) is a no-op, so no
+// NULL check is needed.
+#define VEC_FREE(vec)                                                          \
+    do {                                                                       \
+        free((vec).data);                                                      \
+        (vec).data = NULL;                                                     \
+        (vec).len = 0;                                                         \
+        (vec).cap = 0;                                                         \
+    } while (0)
+
+// Forget all elements but keep the allocated storage.
+#define VEC_CLEAR(vec) ((vec).len = 0)
+
+// Reverse the elements in place. Elements are assumed to be uint8_t.
+#define VEC_REVERSE(vec)                                                       \
+    do {                                                                       \
+        if ((vec).len > 1) {                                                   \
+            for (size_t vec_i_ = 0, vec_j_ = (vec).len - 1; vec_i_ < vec_j_;   \
+                 vec_i_++, vec_j_--) {                                         \
+                uint8_t vec_swap_ = (vec).data[vec_i_];                        \
+                (vec).data[vec_i_] = (vec).data[vec_j_];                       \
+                (vec).data[vec_j_] = vec_swap_;                                \
+            }                                                                  \
+        }                                                                      \
+    } while (0)
+
+// External tokens this scanner can report through lexer->result_symbol. The
+// values are also used as indices into the `valid_symbols` array.
+// NOTE(review): the order presumably has to match the grammar's list of
+// external tokens - confirm before reordering.
+enum TokenType {
+    // Emitted for a queued runback entry of 0, or at end of input.
+    VIRTUAL_END_DECL = 0,
+    // Emitted when the current column is pushed onto the indent stack.
+    VIRTUAL_OPEN_SECTION = 1,
+    // Emitted for a queued runback entry of 1, at end of input, or in front
+    // of an `in` keyword.
+    VIRTUAL_END_SECTION = 2,
+    // A '-' directly followed by an ASCII letter or '('.
+    MINUS_WITHOUT_TRAILING_WHITESPACE = 3,
+    // Input up to (not including) a closing `|]`, or up to a NUL lookahead.
+    GLSL_CONTENT = 4,
+    // Body of a block comment, with nested `{- -}` comments skipped over.
+    BLOCK_COMMENT_CONTENT = 5,
+};
+
+// Growable array of bytes, manipulated through the VEC_* macros.
+typedef struct {
+ uint32_t len; // number of elements currently stored
+ uint32_t cap; // number of elements `data` has room for
+ uint8_t *data; // heap storage; NULL while nothing has been allocated yet
+} vec;
+
+// Persistent state of the external scanner. It is written out by
+// tree_sitter_elm_external_scanner_serialize and restored by
+// tree_sitter_elm_external_scanner_deserialize.
+typedef struct {
+ // Column of the first non-space character after the most recent newline
+ // consumed by scan().
+ uint32_t indent_length;
+ // Stack of columns at which sections were opened. deserialize always seeds
+ // it with a single 0 entry.
+ vec indents;
+ // Tokens queued for later scan() calls: 0 means VIRTUAL_END_DECL,
+ // 1 means VIRTUAL_END_SECTION. Consumed from the back.
+ vec runback;
+} Scanner;
+
+// Move the lexer to the next character, passing `false` as the lexer's
+// "treat as whitespace" flag.
+static inline void advance(TSLexer *lexer) {
+    const bool as_whitespace = false;
+    lexer->advance(lexer, as_whitespace);
+}
+
+// Move the lexer to the next character, passing `true` as the lexer's
+// "treat as whitespace" flag.
+static inline void skip(TSLexer *lexer) {
+    const bool as_whitespace = true;
+    lexer->advance(lexer, as_whitespace);
+}
+
+// Tell whether the parser is currently in error recovery.
+//
+// > You can detect error recovery in the external scanner by the fact that
+// > _all_ tokens are considered valid at once.
+// https://github.com/tree-sitter/tree-sitter/pull/1783#issuecomment-1181011411
+//
+// Returns true only if every entry of `valid_symbols` from VIRTUAL_END_DECL
+// through BLOCK_COMMENT_CONTENT is set.
+static bool in_error_recovery(const bool *valid_symbols) {
+    for (int token = VIRTUAL_END_DECL; token <= BLOCK_COMMENT_CONTENT;
+         token++) {
+        if (!valid_symbols[token]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// True when the lookahead character is a space, carriage return or newline.
+static bool is_elm_space(TSLexer *lexer) {
+    switch (lexer->lookahead) {
+    case ' ':
+    case '\r':
+    case '\n':
+        return true;
+    default:
+        return false;
+    }
+}
+
+// Probe for the keyword `in` at the current position, which ends a `let`
+// section. Only attempted when VIRTUAL_END_SECTION is valid.
+//
+// Matching characters are consumed with skip() as the probe proceeds.
+//
+// Returns:
+//   0 - nothing consumed (token not valid here, or lookahead is not 'i')
+//   1 - partial match: 'i' or "in" was consumed but it is not the keyword
+//   2 - "in" followed by a space, '\r', '\n' or end of input
+static int checkForIn(TSLexer *lexer, const bool *valid_symbols) {
+    enum { IN_NONE = 0, IN_PARTIAL = 1, IN_FOUND = 2 };
+
+    if (!valid_symbols[VIRTUAL_END_SECTION] || lexer->lookahead != 'i') {
+        return IN_NONE;
+    }
+    skip(lexer);
+
+    if (lexer->lookahead != 'n') {
+        return IN_PARTIAL;
+    }
+    skip(lexer);
+
+    const bool keyword_ends = is_elm_space(lexer) || lexer->eof(lexer);
+    return keyword_ends ? IN_FOUND : IN_PARTIAL;
+}
+
+// Consume one `{- ... -}` block comment starting at the current position,
+// recursing on every '{' met inside it so nested comments are skipped too.
+//
+// Calls lexer->mark_end() on entry, before anything is consumed.
+//
+// Returns false if the input does not start with "{-" (a leading '{' is still
+// consumed in that case). Returns true once the closing "-}" has been
+// consumed, or when a NUL lookahead is reached before the comment is closed.
+static bool scan_block_comment(TSLexer *lexer) {
+    lexer->mark_end(lexer);
+
+    // Opening delimiter.
+    if (lexer->lookahead != '{') {
+        return false;
+    }
+    advance(lexer);
+    if (lexer->lookahead != '-') {
+        return false;
+    }
+    advance(lexer);
+
+    for (;;) {
+        const int32_t current = lexer->lookahead;
+
+        if (current == '\0') {
+            return true;
+        }
+        if (current == '{') {
+            // Possible nested comment; its result is intentionally ignored.
+            scan_block_comment(lexer);
+            continue;
+        }
+
+        advance(lexer);
+        if (current == '-' && lexer->lookahead == '}') {
+            advance(lexer);
+            return true;
+        }
+    }
+}
+
+// Consume characters until the lookahead is a newline or the input ends. The
+// newline itself is not consumed.
+static void advance_to_line_end(TSLexer *lexer) {
+    while (lexer->lookahead != '\n' && !lexer->eof(lexer)) {
+        advance(lexer);
+    }
+}
+
+// Main scanning routine behind tree_sitter_elm_external_scanner_scan.
+//
+// Looks at the upcoming input and at the external tokens the parser would
+// accept (`valid_symbols`, indexed by enum TokenType). On success the
+// recognised token is stored in lexer->result_symbol and true is returned;
+// false means no external token applies here.
+//
+// Order of work:
+// 1. Give up immediately during error recovery.
+// 2. Replay one queued runback entry (0 -> VIRTUAL_END_DECL,
+// 1 -> VIRTUAL_END_SECTION) left over from an earlier call.
+// 3. Skip spaces, '\r', newlines and (after a newline) `--` line comments,
+// recording in scanner->indent_length the column that follows the last
+// newline. MINUS_WITHOUT_TRAILING_WHITESPACE and end of input are also
+// handled inside this loop.
+// 4. Try, in this order: an `in` keyword closing a section,
+// VIRTUAL_OPEN_SECTION, BLOCK_COMMENT_CONTENT, the indentation-driven
+// runback tokens, and finally GLSL_CONTENT.
+static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
+ if (in_error_recovery(valid_symbols)) {
+ return false;
+ }
+
+ // First handle eventual runback tokens, we saved on a previous scan op
+ if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 &&
+ valid_symbols[VIRTUAL_END_DECL]) {
+ VEC_POP(scanner->runback);
+ lexer->result_symbol = VIRTUAL_END_DECL;
+ return true;
+ }
+ if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 &&
+ valid_symbols[VIRTUAL_END_SECTION]) {
+ VEC_POP(scanner->runback);
+ lexer->result_symbol = VIRTUAL_END_SECTION;
+ return true;
+ }
+ // Any queued entry the parser cannot accept right now is discarded.
+ VEC_CLEAR(scanner->runback);
+
+ // Check if we have newlines and how much indentation
+ bool has_newline = false;
+ bool found_in = false;
+ bool can_call_mark_end = true;
+ // Mark the token end before anything is consumed, so the zero-width
+ // virtual tokens returned below do not swallow the text inspected here.
+ lexer->mark_end(lexer);
+ while (true) {
+ if (lexer->lookahead == ' ' || lexer->lookahead == '\r') {
+ skip(lexer);
+ } else if (lexer->lookahead == '\n') {
+ skip(lexer);
+ has_newline = true;
+ while (true) {
+ if (lexer->lookahead == ' ') {
+ skip(lexer);
+ } else {
+ scanner->indent_length = lexer->get_column(lexer);
+ break;
+ }
+ }
+ } else if (!valid_symbols[BLOCK_COMMENT_CONTENT] &&
+ lexer->lookahead == '-') {
+ advance(lexer);
+ int32_t lookahead = lexer->lookahead;
+
+ // Handle minus without a whitespace for negate
+ if (valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] &&
+ ((lookahead >= 'a' && lookahead <= 'z') ||
+ (lookahead >= 'A' && lookahead <= 'Z') || lookahead == '(')) {
+ if (can_call_mark_end) {
+ lexer->result_symbol = MINUS_WITHOUT_TRAILING_WHITESPACE;
+ lexer->mark_end(lexer);
+ return true;
+ }
+ return false;
+ }
+ // Scan past line comments. As far as the special token
+ // types we're scanning for here are concerned line comments
+ // are like whitespace. There is nothing useful to be
+ // learned from, say, their indentation. So we advance past
+ // them here.
+ //
+ // The one thing we need to keep in mind is that we should
+ // not call `lexer->mark_end(lexer)` after this point, or
+ // the comment will be lost.
+ if (lookahead == '-' && has_newline) {
+ can_call_mark_end = false;
+ advance(lexer);
+ advance_to_line_end(lexer);
+ // NOTE(review): this arm is only reached when
+ // !valid_symbols[BLOCK_COMMENT_CONTENT] held above, so the condition
+ // below can never be true - the branch looks unreachable.
+ } else if (valid_symbols[BLOCK_COMMENT_CONTENT] &&
+ lexer->lookahead == '}') {
+ lexer->result_symbol = BLOCK_COMMENT_CONTENT;
+ return true;
+ } else {
+ return false;
+ }
+ } else if (lexer->eof(lexer)) {
+ // At end of input, close a section first if allowed, otherwise a
+ // declaration.
+ if (valid_symbols[VIRTUAL_END_SECTION]) {
+ lexer->result_symbol = VIRTUAL_END_SECTION;
+ return true;
+ }
+ if (valid_symbols[VIRTUAL_END_DECL]) {
+ lexer->result_symbol = VIRTUAL_END_DECL;
+ return true;
+ }
+
+ break;
+ } else {
+ break;
+ }
+ }
+
+ // checkForIn returns 2 only for a complete `in` keyword. On the same line
+ // the section is closed right away; after a newline the decision is
+ // deferred to the indentation handling further down.
+ if (checkForIn(lexer, valid_symbols) == 2) {
+ if (has_newline) {
+ found_in = true;
+ } else {
+ lexer->result_symbol = VIRTUAL_END_SECTION;
+ // NOTE(review): pops without checking that `indents` is non-empty.
+ VEC_POP(scanner->indents);
+ return true;
+ }
+ }
+
+ // Open section if the grammar lets us but only push to indent stack if
+ // we go further down in the stack
+ if (valid_symbols[VIRTUAL_OPEN_SECTION] && !lexer->eof(lexer)) {
+ // The column is stored in a uint8_t element, so a value that does not
+ // fit in 8 bits is truncated by the assignment.
+ VEC_PUSH(scanner->indents, lexer->get_column(lexer));
+ lexer->result_symbol = VIRTUAL_OPEN_SECTION;
+ return true;
+ }
+ if (valid_symbols[BLOCK_COMMENT_CONTENT]) {
+ if (!can_call_mark_end) {
+ return false;
+ }
+ lexer->mark_end(lexer);
+ // Consume comment text. The token end is re-marked just before each
+ // '-' so that a closing "-}" stays outside the token.
+ while (true) {
+ if (lexer->lookahead == '\0') {
+ break;
+ }
+ if (lexer->lookahead != '{' && lexer->lookahead != '-') {
+ advance(lexer);
+ } else if (lexer->lookahead == '-') {
+ lexer->mark_end(lexer);
+ advance(lexer);
+ if (lexer->lookahead == '}') {
+ break;
+ }
+ } else if (scan_block_comment(lexer)) {
+ lexer->mark_end(lexer);
+ advance(lexer);
+ if (lexer->lookahead == '-') {
+ break;
+ }
+ }
+ }
+
+ lexer->result_symbol = BLOCK_COMMENT_CONTENT;
+ return true;
+ }
+ if (has_newline) {
+ // We had a newline now it's time to check if we need to add
+ // multiple tokens to get back up to the right level
+ VEC_CLEAR(scanner->runback);
+
+ // NOTE(review): VEC_BACK reads data[len - 1]; this loop relies on
+ // `indents` never being empty here - confirm that invariant.
+ while (scanner->indent_length <= VEC_BACK(scanner->indents)) {
+ if (scanner->indent_length == VEC_BACK(scanner->indents)) {
+ if (found_in) {
+ VEC_PUSH(scanner->runback, 1);
+ found_in = false;
+ break;
+ }
+ // Don't insert VIRTUAL_END_DECL when there is a line
+ // comment incoming
+
+ if (lexer->lookahead == '-') {
+ skip(lexer);
+ if (lexer->lookahead == '-') {
+ break;
+ }
+ }
+ // Don't insert VIRTUAL_END_DECL when there is a block
+ // comment incoming
+ if (lexer->lookahead == '{') {
+ skip(lexer);
+ if (lexer->lookahead == '-') {
+ break;
+ }
+ }
+ VEC_PUSH(scanner->runback, 0);
+ break;
+ }
+ // Dedent: leave one section per popped indent level.
+ if (scanner->indent_length < VEC_BACK(scanner->indents)) {
+ VEC_POP(scanner->indents);
+ VEC_PUSH(scanner->runback, 1);
+ found_in = false;
+ }
+ }
+
+ // Needed for some of the more weird cases where let is in the same
+ // line as everything before the in in the next line
+ if (found_in) {
+ VEC_PUSH(scanner->runback, 1);
+ found_in = false;
+ }
+
+ // Our list is the wrong way around, reverse it
+ VEC_REVERSE(scanner->runback);
+ // Handle the first runback token if we have them, if there are more
+ // they will be handled on the next scan operation
+ if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 &&
+ valid_symbols[VIRTUAL_END_DECL]) {
+ VEC_POP(scanner->runback);
+ lexer->result_symbol = VIRTUAL_END_DECL;
+ return true;
+ }
+ if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 &&
+ valid_symbols[VIRTUAL_END_SECTION]) {
+ VEC_POP(scanner->runback);
+ lexer->result_symbol = VIRTUAL_END_SECTION;
+ return true;
+ }
+ if (lexer->eof(lexer) && valid_symbols[VIRTUAL_END_SECTION]) {
+ lexer->result_symbol = VIRTUAL_END_SECTION;
+ return true;
+ }
+ }
+
+ if (valid_symbols[GLSL_CONTENT]) {
+ if (!can_call_mark_end) {
+ return false;
+ }
+ lexer->result_symbol = GLSL_CONTENT;
+ // Consume up to a closing "|]" or a NUL lookahead. The token end is
+ // marked before each '|', so the closing "|]" is not part of the token.
+ while (true) {
+ switch (lexer->lookahead) {
+ case '|':
+ lexer->mark_end(lexer);
+ advance(lexer);
+ if (lexer->lookahead == ']') {
+ advance(lexer);
+ return true;
+ }
+ break;
+ case '\0':
+ lexer->mark_end(lexer);
+ return true;
+ default:
+ advance(lexer);
+ }
+ }
+ }
+
+ return false;
+}
+
+// --------------------------------------------------------------------------------------------------------
+// API
+// --------------------------------------------------------------------------------------------------------
+
+/**
+ * Allocate the scanner state that is handed to every other API function as
+ * `payload`.
+ *
+ * The memory comes from calloc, so it starts out zeroed: both vectors are
+ * empty with no storage and indent_length is 0. The result of calloc is
+ * returned unchecked.
+ */
+void *tree_sitter_elm_external_scanner_create() {
+    return calloc(1, sizeof(Scanner));
+}
+
+/**
+ * Scanner entry point called by the parser.
+ *
+ * `payload` is the Scanner allocated by the create function. All the work is
+ * delegated to scan(), whose result is returned unchanged.
+ */
+bool tree_sitter_elm_external_scanner_scan(void *payload, TSLexer *lexer,
+                                           const bool *valid_symbols) {
+    return scan((Scanner *)payload, lexer, valid_symbols);
+}
+
+/**
+ * Copy the current scanner state into `buffer` for later reuse and return the
+ * number of bytes written.
+ *
+ * Layout of the written bytes:
+ *   1 byte    number of runback entries that follow (capped at UINT8_MAX)
+ *   n bytes   the runback entries
+ *   1 byte    sizeof(indent_length)
+ *   m bytes   the raw bytes of indent_length
+ *   rest      indents[1 .. len-1], one byte each
+ *
+ * indents[0] is not stored; the deserialize function always re-creates it
+ * as 0.
+ *
+ * Returns 0 (nothing stored) when the state would not fit into
+ * TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes.
+ */
+unsigned tree_sitter_elm_external_scanner_serialize(void *payload,
+                                                    char *buffer) {
+    Scanner *scanner = (Scanner *)payload;
+    size_t size = 0;
+
+    // Two length bytes plus the indent_length payload precede the indents.
+    // Counting every indent and every runback entry keeps this estimate on
+    // the safe side of what is actually written below.
+    const size_t header_bytes = 2 + sizeof(scanner->indent_length);
+    if (header_bytes + (size_t)scanner->indents.len +
+            (size_t)scanner->runback.len >
+        TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+        return 0;
+    }
+
+    // The entry count has to fit into a single byte.
+    size_t runback_count = scanner->runback.len;
+    if (runback_count > UINT8_MAX) {
+        runback_count = UINT8_MAX;
+    }
+    buffer[size++] = (char)runback_count;
+
+    // runback.data may still be NULL when nothing was ever pushed, so only
+    // call memcpy when there is something to copy.
+    if (runback_count > 0) {
+        memcpy(&buffer[size], scanner->runback.data, runback_count);
+    }
+    size += runback_count;
+
+    const size_t indent_length_length = sizeof(scanner->indent_length);
+    buffer[size++] = (char)indent_length_length;
+    memcpy(&buffer[size], &scanner->indent_length, indent_length_length);
+    size += indent_length_length;
+
+    // Start at 1: the base indent is implicit. Using `<` (rather than `!=`)
+    // keeps the loop from running past the data when the stack is empty.
+    for (uint32_t iter = 1; iter < scanner->indents.len &&
+                            size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
+         ++iter) {
+        buffer[size++] = (char)scanner->indents.data[iter];
+    }
+
+    return (unsigned)size;
+}
+
+/**
+ * Load a previously serialized state into the scanner.
+ *
+ * `payload` is the Scanner to overwrite; `buffer` holds `length` bytes in the
+ * layout produced by the serialize function (e.g. when doing incremental
+ * parsing). The state is first reset to its initial form: empty runback,
+ * indent_length 0 and an indent stack holding the single entry 0. A `length`
+ * of 0 leaves it like that.
+ *
+ * Length fields found in the buffer are checked against `length` and against
+ * the size of indent_length before they are used. If they do not fit, the
+ * scanner is left in the initial state instead of reading or writing out of
+ * bounds.
+ */
+void tree_sitter_elm_external_scanner_deserialize(void *payload,
+                                                  const char *buffer,
+                                                  unsigned length) {
+    Scanner *scanner = (Scanner *)payload;
+    VEC_CLEAR(scanner->runback);
+    VEC_CLEAR(scanner->indents);
+    VEC_PUSH(scanner->indents, 0);
+    scanner->indent_length = 0;
+
+    if (length == 0) {
+        return;
+    }
+
+    size_t size = 0;
+
+    // Runback entries, preceded by their count.
+    size_t runback_count = (unsigned char)buffer[size++];
+    if (runback_count > length - size) {
+        return;
+    }
+    VEC_GROW(scanner->runback, runback_count);
+    if (runback_count > 0) {
+        memcpy(scanner->runback.data, &buffer[size], runback_count);
+        scanner->runback.len = (uint32_t)runback_count;
+        size += runback_count;
+    }
+
+    // indent_length, preceded by its size in bytes.
+    if (size >= length) {
+        return;
+    }
+    size_t indent_length_length = (unsigned char)buffer[size++];
+    if (indent_length_length > sizeof(scanner->indent_length) ||
+        indent_length_length > length - size) {
+        VEC_CLEAR(scanner->runback);
+        return;
+    }
+    if (indent_length_length > 0) {
+        memcpy(&scanner->indent_length, &buffer[size], indent_length_length);
+        size += indent_length_length;
+    }
+
+    // Everything that is left is the indent stack above the base entry.
+    for (; size < length; size++) {
+        VEC_PUSH(scanner->indents, (unsigned char)buffer[size]);
+    }
+}
+
+/**
+ * Release the scanner state: the storage of both vectors first, then the
+ * Scanner itself. `payload` must be a pointer obtained from the create
+ * function.
+ */
+void tree_sitter_elm_external_scanner_destroy(void *payload) {
+    Scanner *scanner = (Scanner *)payload;
+
+    // free(NULL) is a no-op, so vectors that never allocated are fine here.
+    free(scanner->indents.data);
+    free(scanner->runback.data);
+    free(scanner);
+}