From a901729099257aac932d79c60adb5e8a53fa7e6c Mon Sep 17 00:00:00 2001 From: Amaan Qureshi Date: Thu, 21 Sep 2023 03:19:36 -0400 Subject: [PATCH] fix: do not peek beyond comments if indent, dedent, or newline aren't valid --- .github/workflows/ci.yml | 1 + package.json | 3 +- src/parser.c | 256 +++++++++++++++++++-------------------- src/scanner.c | 22 ++-- 4 files changed, 144 insertions(+), 138 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5a381eb..57786cd5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,5 @@ name: CI + on: pull_request: branches: diff --git a/package.json b/package.json index ad65fb9f..06af5911 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,8 @@ "main": "bindings/node", "keywords": [ "parser", - "lexer" + "lexer", + "python" ], "author": "Max Brunsfeld", "license": "MIT", diff --git a/src/parser.c b/src/parser.c index da6e977b..faef70da 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,4 +1,4 @@ -#include +#include "tree_sitter/parser.h" #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push @@ -16,7 +16,7 @@ #define MAX_ALIAS_SEQUENCE_LENGTH 10 #define PRODUCTION_ID_COUNT 140 -enum { +enum ts_symbol_identifiers { sym_identifier = 1, anon_sym_SEMI = 2, anon_sym_import = 3, @@ -288,8 +288,8 @@ enum { aux_sym_format_specifier_repeat1 = 269, alias_sym_as_pattern_target = 270, alias_sym_format_expression = 271, - anon_alias_sym_is_SPACEnot = 272, - anon_alias_sym_not_SPACEin = 273, + anon_alias_sym_isnot = 272, + anon_alias_sym_notin = 273, }; static const char * const ts_symbol_names[] = { @@ -565,8 +565,8 @@ static const char * const ts_symbol_names[] = { [aux_sym_format_specifier_repeat1] = "format_specifier_repeat1", [alias_sym_as_pattern_target] = "as_pattern_target", [alias_sym_format_expression] = "format_expression", - [anon_alias_sym_is_SPACEnot] = "is not", - [anon_alias_sym_not_SPACEin] = "not in", + [anon_alias_sym_isnot] = "is not", + [anon_alias_sym_notin] = "not in", }; static const TSSymbol ts_symbol_map[] = { @@ -842,8 +842,8 @@ static const TSSymbol ts_symbol_map[] = { [aux_sym_format_specifier_repeat1] = aux_sym_format_specifier_repeat1, [alias_sym_as_pattern_target] = alias_sym_as_pattern_target, [alias_sym_format_expression] = alias_sym_format_expression, - [anon_alias_sym_is_SPACEnot] = anon_alias_sym_is_SPACEnot, - [anon_alias_sym_not_SPACEin] = anon_alias_sym_not_SPACEin, + [anon_alias_sym_isnot] = anon_alias_sym_isnot, + [anon_alias_sym_notin] = anon_alias_sym_notin, }; static const TSSymbolMetadata ts_symbol_metadata[] = { @@ -1939,17 +1939,17 @@ static const TSSymbolMetadata ts_symbol_metadata[] = { .visible = true, .named = true, }, - [anon_alias_sym_is_SPACEnot] = { + [anon_alias_sym_isnot] = { .visible = true, .named = false, }, - [anon_alias_sym_not_SPACEin] = { + [anon_alias_sym_notin] = { .visible = true, .named = false, }, }; -enum { +enum ts_field_identifiers { field_alias = 1, field_alternative = 2, field_argument = 3, @@ -2650,12 +2650,12 @@ static const TSSymbol ts_alias_sequences[PRODUCTION_ID_COUNT][MAX_ALIAS_SEQUENCE [3] = sym_block, }, [49] = { - [0] = anon_alias_sym_not_SPACEin, - [1] = anon_alias_sym_not_SPACEin, + [0] = anon_alias_sym_notin, + [1] = anon_alias_sym_notin, }, [50] = { - [0] = anon_alias_sym_is_SPACEnot, - [1] = anon_alias_sym_is_SPACEnot, + [0] = anon_alias_sym_isnot, + [1] = anon_alias_sym_isnot, }, [51] = { [0] = alias_sym_format_expression, @@ -12751,119 +12751,6 @@ static const TSLexMode ts_lex_modes[STATE_COUNT] = { [2818] = {.lex_state = 54, .external_lex_state = 11}, }; -enum { - ts_external_token__newline = 0, - ts_external_token__indent = 1, - ts_external_token__dedent = 2, - ts_external_token_string_start = 3, - ts_external_token__string_content = 4, - ts_external_token_escape_interpolation = 5, - ts_external_token_string_end = 6, - ts_external_token_comment = 7, - ts_external_token_RBRACK = 8, - ts_external_token_RPAREN = 9, - ts_external_token_RBRACE = 10, -}; - -static const TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = { - [ts_external_token__newline] = sym__newline, - [ts_external_token__indent] = sym__indent, - [ts_external_token__dedent] = sym__dedent, - [ts_external_token_string_start] = sym_string_start, - [ts_external_token__string_content] = sym__string_content, - [ts_external_token_escape_interpolation] = sym_escape_interpolation, - [ts_external_token_string_end] = sym_string_end, - [ts_external_token_comment] = sym_comment, - [ts_external_token_RBRACK] = anon_sym_RBRACK, - [ts_external_token_RPAREN] = anon_sym_RPAREN, - [ts_external_token_RBRACE] = anon_sym_RBRACE, -}; - -static const bool ts_external_scanner_states[17][EXTERNAL_TOKEN_COUNT] = { - [1] = { - [ts_external_token__newline] = true, - [ts_external_token__indent] = true, - [ts_external_token__dedent] = true, - [ts_external_token_string_start] = true, - [ts_external_token__string_content] = true, - [ts_external_token_escape_interpolation] = true, - [ts_external_token_string_end] = true, - [ts_external_token_comment] = true, - [ts_external_token_RBRACK] = true, - [ts_external_token_RPAREN] = true, - [ts_external_token_RBRACE] = true, - }, - [2] = { - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - }, - [3] = { - [ts_external_token__dedent] = true, - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - }, - [4] = { - [ts_external_token__newline] = true, - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - }, - [5] = { - [ts_external_token__newline] = true, - [ts_external_token__indent] = true, - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - }, - [6] = { - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - [ts_external_token_RBRACE] = true, - }, - [7] = { - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - [ts_external_token_RPAREN] = true, - }, - [8] = { - [ts_external_token_string_start] = true, - [ts_external_token_comment] = true, - [ts_external_token_RBRACK] = true, - }, - [9] = { - [ts_external_token__newline] = true, - [ts_external_token_comment] = true, - }, - [10] = { - [ts_external_token_comment] = true, - [ts_external_token_RBRACE] = true, - }, - [11] = { - [ts_external_token_comment] = true, - [ts_external_token_RPAREN] = true, - }, - [12] = { - [ts_external_token_comment] = true, - }, - [13] = { - [ts_external_token_comment] = true, - [ts_external_token_RBRACK] = true, - }, - [14] = { - [ts_external_token__string_content] = true, - [ts_external_token_escape_interpolation] = true, - [ts_external_token_string_end] = true, - [ts_external_token_comment] = true, - }, - [15] = { - [ts_external_token__dedent] = true, - [ts_external_token_comment] = true, - }, - [16] = { - [ts_external_token__newline] = true, - [ts_external_token__indent] = true, - [ts_external_token_comment] = true, - }, -}; - static const uint16_t ts_parse_table[LARGE_STATE_COUNT][SYMBOL_COUNT] = { [0] = { [ts_builtin_sym_end] = ACTIONS(1), @@ -132805,6 +132692,119 @@ static const TSParseActionEntry ts_parse_actions[] = { [5071] = {.entry = {.count = 1, .reusable = true}}, SHIFT(2540), }; +enum ts_external_scanner_symbol_identifiers { + ts_external_token__newline = 0, + ts_external_token__indent = 1, + ts_external_token__dedent = 2, + ts_external_token_string_start = 3, + ts_external_token__string_content = 4, + ts_external_token_escape_interpolation = 5, + ts_external_token_string_end = 6, + ts_external_token_comment = 7, + ts_external_token_RBRACK = 8, + ts_external_token_RPAREN = 9, + ts_external_token_RBRACE = 10, +}; + +static const TSSymbol ts_external_scanner_symbol_map[EXTERNAL_TOKEN_COUNT] = { + [ts_external_token__newline] = sym__newline, + [ts_external_token__indent] = sym__indent, + [ts_external_token__dedent] = sym__dedent, + [ts_external_token_string_start] = sym_string_start, + [ts_external_token__string_content] = sym__string_content, + [ts_external_token_escape_interpolation] = sym_escape_interpolation, + [ts_external_token_string_end] = sym_string_end, + [ts_external_token_comment] = sym_comment, + [ts_external_token_RBRACK] = anon_sym_RBRACK, + [ts_external_token_RPAREN] = anon_sym_RPAREN, + [ts_external_token_RBRACE] = anon_sym_RBRACE, +}; + +static const bool ts_external_scanner_states[17][EXTERNAL_TOKEN_COUNT] = { + [1] = { + [ts_external_token__newline] = true, + [ts_external_token__indent] = true, + [ts_external_token__dedent] = true, + [ts_external_token_string_start] = true, + [ts_external_token__string_content] = true, + [ts_external_token_escape_interpolation] = true, + [ts_external_token_string_end] = true, + [ts_external_token_comment] = true, + [ts_external_token_RBRACK] = true, + [ts_external_token_RPAREN] = true, + [ts_external_token_RBRACE] = true, + }, + [2] = { + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + }, + [3] = { + [ts_external_token__dedent] = true, + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + }, + [4] = { + [ts_external_token__newline] = true, + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + }, + [5] = { + [ts_external_token__newline] = true, + [ts_external_token__indent] = true, + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + }, + [6] = { + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + [ts_external_token_RBRACE] = true, + }, + [7] = { + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + [ts_external_token_RPAREN] = true, + }, + [8] = { + [ts_external_token_string_start] = true, + [ts_external_token_comment] = true, + [ts_external_token_RBRACK] = true, + }, + [9] = { + [ts_external_token__newline] = true, + [ts_external_token_comment] = true, + }, + [10] = { + [ts_external_token_comment] = true, + [ts_external_token_RBRACE] = true, + }, + [11] = { + [ts_external_token_comment] = true, + [ts_external_token_RPAREN] = true, + }, + [12] = { + [ts_external_token_comment] = true, + }, + [13] = { + [ts_external_token_comment] = true, + [ts_external_token_RBRACK] = true, + }, + [14] = { + [ts_external_token__string_content] = true, + [ts_external_token_escape_interpolation] = true, + [ts_external_token_string_end] = true, + [ts_external_token_comment] = true, + }, + [15] = { + [ts_external_token__dedent] = true, + [ts_external_token_comment] = true, + }, + [16] = { + [ts_external_token__newline] = true, + [ts_external_token__indent] = true, + [ts_external_token_comment] = true, + }, +}; + #ifdef __cplusplus extern "C" { #endif diff --git a/src/scanner.c b/src/scanner.c index b145dd45..2a755943 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -1,8 +1,9 @@ +#include "tree_sitter/parser.h" + #include #include #include #include -#include #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -173,9 +174,9 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, valid_symbols[CLOSE_PAREN] || valid_symbols[CLOSE_BRACKET]; - bool advanced_once = false; + bool advanced_once = false; if (valid_symbols[ESCAPE_INTERPOLATION] && scanner->delimiters.len > 0 && - (lexer->lookahead == '{' || lexer->lookahead == '}') && + (lexer->lookahead == '{' || lexer->lookahead == '}') && !error_recovery_mode) { Delimiter delimiter = VEC_BACK(scanner->delimiters); if (is_format(&delimiter)) { @@ -200,7 +201,8 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, int32_t end_char = end_character(&delimiter); bool has_content = advanced_once; while (lexer->lookahead) { - if ((advanced_once || lexer->lookahead == '{' || lexer->lookahead == '}') && + if ((advanced_once || lexer->lookahead == '{' || + lexer->lookahead == '}') && is_format(&delimiter)) { lexer->mark_end(lexer); lexer->result_symbol = STRING_CONTENT; @@ -216,12 +218,12 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, advance(lexer); } // Step over newlines - if (lexer -> lookahead == '\r') { - advance(lexer); - if (lexer -> lookahead == '\n') { + if (lexer->lookahead == '\r') { advance(lexer); + if (lexer->lookahead == '\n') { + advance(lexer); } - } else if (lexer -> lookahead == '\n') { + } else if (lexer->lookahead == '\n') { advance(lexer); } continue; @@ -309,7 +311,9 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, } else if (lexer->lookahead == '\t') { indent_length += 8; skip(lexer); - } else if (lexer->lookahead == '#') { + } else if (lexer->lookahead == '#' && + (valid_symbols[INDENT] || valid_symbols[DEDENT] || + valid_symbols[NEWLINE])) { // If we haven't found an EOL yet, // then this is a comment after an expression: // foo = bar # comment