diff options
Diffstat (limited to 'vendored_parsers/tree-sitter-elm/src/scanner.c')
-rw-r--r-- | vendored_parsers/tree-sitter-elm/src/scanner.c | 496 |
1 files changed, 496 insertions, 0 deletions
diff --git a/vendored_parsers/tree-sitter-elm/src/scanner.c b/vendored_parsers/tree-sitter-elm/src/scanner.c new file mode 100644 index 000000000..45b1ce6d3 --- /dev/null +++ b/vendored_parsers/tree-sitter-elm/src/scanner.c @@ -0,0 +1,496 @@ +#include "tree_sitter/parser.h" +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + assert((vec).data != NULL); \ + (vec).cap = (_cap); + +#define VEC_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + VEC_RESIZE((vec), (_cap)); \ + } + +#define VEC_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define VEC_POP(vec) (vec).len--; + +#define VEC_BACK(vec) ((vec).data[(vec).len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +#define VEC_CLEAR(vec) (vec).len = 0; + +#define VEC_REVERSE(vec) \ + do { \ + if ((vec).len > 1) { \ + for (size_t i = 0, j = (vec).len - 1; i < j; i++, j--) { \ + uint8_t tmp = (vec).data[i]; \ + (vec).data[i] = (vec).data[j]; \ + (vec).data[j] = tmp; \ + } \ + } \ + } while (0) + +enum TokenType { + VIRTUAL_END_DECL, + VIRTUAL_OPEN_SECTION, + VIRTUAL_END_SECTION, + MINUS_WITHOUT_TRAILING_WHITESPACE, + GLSL_CONTENT, + BLOCK_COMMENT_CONTENT, +}; + +typedef struct { + uint32_t len; + uint32_t cap; + uint8_t *data; +} vec; + +typedef struct { + uint32_t indent_length; + vec indents; + vec runback; +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +// > You can detect error recovery in the external scanner by the fact that +// > _all_ tokens are considered valid at once. +// https://github.com/tree-sitter/tree-sitter/pull/1783#issuecomment-1181011411 +static bool in_error_recovery(const bool *valid_symbols) { + return (valid_symbols[VIRTUAL_END_DECL] && + valid_symbols[VIRTUAL_OPEN_SECTION] && + valid_symbols[VIRTUAL_END_SECTION] && + valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] && + valid_symbols[GLSL_CONTENT] && + valid_symbols[BLOCK_COMMENT_CONTENT]); +} + +static bool is_elm_space(TSLexer *lexer) { + return lexer->lookahead == ' ' || lexer->lookahead == '\r' || + lexer->lookahead == '\n'; +} + +static int checkForIn(TSLexer *lexer, const bool *valid_symbols) { + // Are we at the end of a let (in) declaration + if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == 'i') { + skip(lexer); + + if (lexer->lookahead == 'n') { + skip(lexer); + if (is_elm_space(lexer) || lexer->eof(lexer)) { + return 2; // Success + } + return 1; // Partial + } + return 1; // Partial + } + return 0; +} + +static bool scan_block_comment(TSLexer *lexer) { + lexer->mark_end(lexer); + if (lexer->lookahead != '{') { + return false; + } + + advance(lexer); + if (lexer->lookahead != '-') { + return false; + } + + advance(lexer); + + while (true) { + switch (lexer->lookahead) { + case '{': + scan_block_comment(lexer); + break; + case '-': + advance(lexer); + if (lexer->lookahead == '}') { + advance(lexer); + return true; + } + break; + case '\0': + return true; + default: + advance(lexer); + } + } +} + +static void advance_to_line_end(TSLexer *lexer) { + while (true) { + if (lexer->lookahead == '\n' || lexer->eof(lexer)) { + break; + } + advance(lexer); + } +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + if (in_error_recovery(valid_symbols)) { + return false; + } + + // First handle eventual runback tokens, we saved on a previous scan op + if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 && + valid_symbols[VIRTUAL_END_DECL]) { + VEC_POP(scanner->runback); + lexer->result_symbol = VIRTUAL_END_DECL; + return true; + } + if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 && + valid_symbols[VIRTUAL_END_SECTION]) { + VEC_POP(scanner->runback); + lexer->result_symbol = VIRTUAL_END_SECTION; + return true; + } + VEC_CLEAR(scanner->runback); + + // Check if we have newlines and how much indentation + bool has_newline = false; + bool found_in = false; + bool can_call_mark_end = true; + lexer->mark_end(lexer); + while (true) { + if (lexer->lookahead == ' ' || lexer->lookahead == '\r') { + skip(lexer); + } else if (lexer->lookahead == '\n') { + skip(lexer); + has_newline = true; + while (true) { + if (lexer->lookahead == ' ') { + skip(lexer); + } else { + scanner->indent_length = lexer->get_column(lexer); + break; + } + } + } else if (!valid_symbols[BLOCK_COMMENT_CONTENT] && + lexer->lookahead == '-') { + advance(lexer); + int32_t lookahead = lexer->lookahead; + + // Handle minus without a whitespace for negate + if (valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] && + ((lookahead >= 'a' && lookahead <= 'z') || + (lookahead >= 'A' && lookahead <= 'Z') || lookahead == '(')) { + if (can_call_mark_end) { + lexer->result_symbol = MINUS_WITHOUT_TRAILING_WHITESPACE; + lexer->mark_end(lexer); + return true; + } + return false; + } + // Scan past line comments. As far as the special token + // types we're scanning for here are concerned line comments + // are like whitespace. There is nothing useful to be + // learned from, say, their indentation. So we advance past + // them here. + // + // The one thing we need to keep in mind is that we should + // not call `lexer->mark_end(lexer)` after this point, or + // the comment will be lost. + if (lookahead == '-' && has_newline) { + can_call_mark_end = false; + advance(lexer); + advance_to_line_end(lexer); + } else if (valid_symbols[BLOCK_COMMENT_CONTENT] && + lexer->lookahead == '}') { + lexer->result_symbol = BLOCK_COMMENT_CONTENT; + return true; + } else { + return false; + } + } else if (lexer->eof(lexer)) { + if (valid_symbols[VIRTUAL_END_SECTION]) { + lexer->result_symbol = VIRTUAL_END_SECTION; + return true; + } + if (valid_symbols[VIRTUAL_END_DECL]) { + lexer->result_symbol = VIRTUAL_END_DECL; + return true; + } + + break; + } else { + break; + } + } + + if (checkForIn(lexer, valid_symbols) == 2) { + if (has_newline) { + found_in = true; + } else { + lexer->result_symbol = VIRTUAL_END_SECTION; + VEC_POP(scanner->indents); + return true; + } + } + + // Open section if the grammar lets us but only push to indent stack if + // we go further down in the stack + if (valid_symbols[VIRTUAL_OPEN_SECTION] && !lexer->eof(lexer)) { + VEC_PUSH(scanner->indents, lexer->get_column(lexer)); + lexer->result_symbol = VIRTUAL_OPEN_SECTION; + return true; + } + if (valid_symbols[BLOCK_COMMENT_CONTENT]) { + if (!can_call_mark_end) { + return false; + } + lexer->mark_end(lexer); + while (true) { + if (lexer->lookahead == '\0') { + break; + } + if (lexer->lookahead != '{' && lexer->lookahead != '-') { + advance(lexer); + } else if (lexer->lookahead == '-') { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '}') { + break; + } + } else if (scan_block_comment(lexer)) { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == '-') { + break; + } + } + } + + lexer->result_symbol = BLOCK_COMMENT_CONTENT; + return true; + } + if (has_newline) { + // We had a newline now it's time to check if we need to add + // multiple tokens to get back up to the right level + VEC_CLEAR(scanner->runback); + + while (scanner->indent_length <= VEC_BACK(scanner->indents)) { + if (scanner->indent_length == VEC_BACK(scanner->indents)) { + if (found_in) { + VEC_PUSH(scanner->runback, 1); + found_in = false; + break; + } + // Don't insert VIRTUAL_END_DECL when there is a line + // comment incoming + + if (lexer->lookahead == '-') { + skip(lexer); + if (lexer->lookahead == '-') { + break; + } + } + // Don't insert VIRTUAL_END_DECL when there is a block + // comment incoming + if (lexer->lookahead == '{') { + skip(lexer); + if (lexer->lookahead == '-') { + break; + } + } + VEC_PUSH(scanner->runback, 0); + break; + } + if (scanner->indent_length < VEC_BACK(scanner->indents)) { + VEC_POP(scanner->indents); + VEC_PUSH(scanner->runback, 1); + found_in = false; + } + } + + // Needed for some of the more weird cases where let is in the same + // line as everything before the in in the next line + if (found_in) { + VEC_PUSH(scanner->runback, 1); + found_in = false; + } + + // Our list is the wrong way around, reverse it + VEC_REVERSE(scanner->runback); + // Handle the first runback token if we have them, if there are more + // they will be handled on the next scan operation + if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 && + valid_symbols[VIRTUAL_END_DECL]) { + VEC_POP(scanner->runback); + lexer->result_symbol = VIRTUAL_END_DECL; + return true; + } + if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 && + valid_symbols[VIRTUAL_END_SECTION]) { + VEC_POP(scanner->runback); + lexer->result_symbol = VIRTUAL_END_SECTION; + return true; + } + if (lexer->eof(lexer) && valid_symbols[VIRTUAL_END_SECTION]) { + lexer->result_symbol = VIRTUAL_END_SECTION; + return true; + } + } + + if (valid_symbols[GLSL_CONTENT]) { + if (!can_call_mark_end) { + return false; + } + lexer->result_symbol = GLSL_CONTENT; + while (true) { + switch (lexer->lookahead) { + case '|': + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == ']') { + advance(lexer); + return true; + } + break; + case '\0': + lexer->mark_end(lexer); + return true; + default: + advance(lexer); + } + } + } + + return false; +} + +// -------------------------------------------------------------------------------------------------------- +// API +// -------------------------------------------------------------------------------------------------------- + +/** + * This function allocates the persistent state of the parser that is passed + * into the other API functions. + */ +void *tree_sitter_elm_external_scanner_create() { + Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); + return scanner; +} + +/** + * Main logic entry point. + * Since the state is a singular vector, it can just be cast and used directly. + */ +bool tree_sitter_elm_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +/** + * Copy the current state to another location for later reuse. + * This is normally more complex, but since this parser's state constists solely + * of a vector of integers, it can just be copied. + */ +unsigned tree_sitter_elm_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + size_t size = 0; + + if (3 + scanner->indents.len + scanner->runback.len >= + TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + return 0; + } + + size_t runback_count = scanner->runback.len; + if (runback_count > UINT8_MAX) { + runback_count = UINT8_MAX; + } + buffer[size++] = (char)runback_count; + + if (runback_count > 0) { + memcpy(&buffer[size], scanner->runback.data, runback_count); + } + size += runback_count; + + size_t indent_length_length = sizeof(scanner->indent_length); + buffer[size++] = (char)indent_length_length; + if (indent_length_length > 0) { + memcpy(&buffer[size], &scanner->indent_length, indent_length_length); + } + size += indent_length_length; + + int iter = 1; + for (; iter != scanner->indents.len && + size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + ++iter) { + buffer[size++] = (char)scanner->indents.data[iter]; + } + + return size; +} + +/** + * Load another parser state into the currently active state. + * `payload` is the state of the previous parser execution, while `buffer` is + * the saved state of a different position (e.g. when doing incremental + * parsing). + */ +void tree_sitter_elm_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + VEC_CLEAR(scanner->runback); + VEC_CLEAR(scanner->indents); + VEC_PUSH(scanner->indents, 0); + + if (length == 0) { + return; + } + + size_t size = 0; + size_t runback_count = (unsigned char)buffer[size++]; + VEC_GROW(scanner->runback, runback_count) + if (runback_count > 0) { + memcpy(scanner->runback.data, &buffer[size], runback_count); + scanner->runback.len = runback_count; + size += runback_count; + } + + size_t indent_length_length = (unsigned char)buffer[size++]; + if (indent_length_length > 0) { + memcpy(&scanner->indent_length, &buffer[size], indent_length_length); + size += indent_length_length; + } + + for (; size < length; size++) { + VEC_PUSH(scanner->indents, (unsigned char)buffer[size]); + } + assert(size == length); +} + +/** + * Destroy the state. + */ +void tree_sitter_elm_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + VEC_FREE(scanner->indents); + VEC_FREE(scanner->runback); + free(scanner); +} |