diff --git a/.gitmodules b/.gitmodules
index d3004c26..64ab42fa 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -9,7 +9,7 @@
 	url = https://github.com/tree-sitter/tree-sitter-ruby.git
 [submodule "lang/semgrep-grammars/src/tree-sitter-kotlin"]
 	path = lang/semgrep-grammars/src/tree-sitter-kotlin
-	url = https://github.com/returntocorp/tree-sitter-kotlin.git
+	url = https://github.com/fwcd/tree-sitter-kotlin.git
 [submodule "lang/semgrep-grammars/src/tree-sitter-c-sharp"]
 	path = lang/semgrep-grammars/src/tree-sitter-c-sharp
 	url = https://github.com/tree-sitter/tree-sitter-c-sharp.git
diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/src/scanner.c b/lang/semgrep-grammars/src/semgrep-kotlin/src/scanner.c
index 0a7293e8..30d532d6 100644
--- a/lang/semgrep-grammars/src/semgrep-kotlin/src/scanner.c
+++ b/lang/semgrep-grammars/src/semgrep-kotlin/src/scanner.c
@@ -1,12 +1,10 @@
 //INVARIANT: THIS SHOULD BE ALMOST EXACTLY THE SAME FILE THAN
 //../../tree-sitter-kotlin/src/scanner.c EXCEPT FOR A FEW SEMGREP-SPECIFIC
 //EXTENSIONS
-// Also: Make sure that you are merging any commits into the `semgrep`
-// branch of `tree-sitter-kotlin`! This is because our version of
-// `tree-sitter-kotlin` is forked from the original repository, and we
-// want our branch to be kept separate.
+#include "tree_sitter/array.h"
+#include "tree_sitter/parser.h"
+
 #include <wctype.h>
-#include <tree_sitter/parser.h>
 #include <string.h>
 
 // Mostly a copy paste of tree-sitter-javascript/src/scanner.c
@@ -23,7 +21,7 @@ enum TokenType {
 
 /* Pretty much all of this code is taken from the Julia tree-sitter
    parser.
-   
+
    Julia has similar problems with multiline comments that can be
    nested, line comments, as well as line and multiline strings.
 
@@ -31,8 +29,6 @@ enum TokenType {
    particularly with respect to interpolation.
 */
 
-typedef char Delimiter;
-
 // Block comments are easy to parse, but strings require extra-attention.
 
 // The main problems that arise when parsing strings are:
@@ -41,87 +37,58 @@ typedef char Delimiter;
 //    sequences, but you can always write \" and \`.
 
 // To efficiently store a delimiter, we take advantage of the fact that:
-// (int)'"' == 34 && 34 % 2 == 0
+// (int)'"' == 34 && (34 & 1) == 0
 // i.e. " has an even numeric representation, so we can store a triple
 // quoted delimiter as (delimiter + 1).
 
-// We use a stack to keep track of the string delimiters.
-typedef struct {
-    Delimiter *arr;
-    unsigned len;
-} Stack;
-
-static Stack *new_stack() {
-    Delimiter *arr = malloc(TREE_SITTER_SERIALIZATION_BUFFER_SIZE);
-    if (arr == NULL) exit(1);
-    Stack *stack = malloc(sizeof(Stack));
-    if (stack == NULL) exit(1);
-    stack->arr = arr;
-    stack->len = 0;
-    return stack;
-}
+#define DELIMITER_LENGTH 3
 
-static void free_stack(Stack *stack) {
-    free(stack->arr);
-    free(stack);
-}
+typedef char Delimiter;
 
-static void push(Stack *stack, char c, bool triple) {
-    if (stack->len >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) exit(1);
-    stack->arr[stack->len++] = triple ? (c + 1) : c;
-}
+// We use a stack to keep track of the string delimiters.
+typedef Array(Delimiter) Stack;
 
-static Delimiter pop(Stack *stack) {
-    if (stack->len == 0) exit(1);
-    return stack->arr[stack->len--];
+static inline void stack_push(Stack *stack, char chr, bool triple) {
+    if (stack->size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) abort();
+    array_push(stack, (Delimiter)(triple ? (chr + 1) : chr));
 }
 
-static unsigned serialize_stack(Stack *stack, char *buffer) {
-    unsigned len = stack->len;
-    memcpy(buffer, stack->arr, len);
-    return len;
+static inline Delimiter stack_pop(Stack *stack) {
+    if (stack->size == 0) abort();
+    return array_pop(stack);
 }
 
-static void deserialize_stack(Stack *stack, const char *buffer, unsigned len) {
-    if (len > 0) {
-        memcpy(stack->arr, buffer, len);
-        stack->len = len;
-    } else {
-        stack->len = 0;
-    }
-}
+static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
 
-static void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
-static void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
-static void mark_end(TSLexer *lexer) { lexer->mark_end(lexer); }
+static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 
 // Scanner functions
 
 static bool scan_string_start(TSLexer *lexer, Stack *stack) {
     if (lexer->lookahead != '"') return false;
     advance(lexer);
-    mark_end(lexer);
-    for (unsigned count = 1; count < 3; count++) {
+    lexer->mark_end(lexer);
+    for (unsigned count = 1; count < DELIMITER_LENGTH; ++count) {
         if (lexer->lookahead != '"') {
             // It's not a triple quoted delimiter.
-            push(stack, '"', false);
+            stack_push(stack, '"', false);
             return true;
         }
         advance(lexer);
     }
-    mark_end(lexer);
-    push(stack, '"', true);
+    lexer->mark_end(lexer);
+    stack_push(stack, '"', true);
     return true;
 }
 
 static bool scan_string_content(TSLexer *lexer, Stack *stack) {
-    if (stack->len == 0) return false; // Stack is empty. We're not in a string.
-    Delimiter end_char = stack->arr[stack->len - 1]; // peek
+    if (stack->size == 0) return false; // Stack is empty. We're not in a string.
+    Delimiter end_char = stack->contents[stack->size - 1]; // peek
     bool is_triple = false;
     bool has_content = false;
-    if (end_char % 2 != 0) {
+    if (end_char & 1) {
         is_triple = true;
-        end_char--;
+        end_char -= 1;
     }
     while (lexer->lookahead) {
         if (lexer->lookahead == '$') {
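
The `(34 & 1)` rewrite above relies on the invariant spelled out in the comment: `"` has an even character code, so a triple-quoted delimiter can be stored as the quote character plus one and recovered by checking the low bit. A minimal standalone sketch of that encoding (illustration only, not part of this diff):

#include <stdbool.h>
#include <stdio.h>

typedef char Delimiter;

/* Encode: an odd value means "triple-quoted"; this assumes the quote
   character itself has an even code point, e.g. (int)'"' == 34. */
static Delimiter delim_encode(char quote, bool triple) {
    return (Delimiter)(triple ? quote + 1 : quote);
}

static bool delim_is_triple(Delimiter d) { return (d & 1) != 0; }

static char delim_quote(Delimiter d) { return (char)(delim_is_triple(d) ? d - 1 : d); }

int main(void) {
    Delimiter d = delim_encode('"', true);
    printf("triple=%d quote=%c\n", delim_is_triple(d), delim_quote(d)); /* triple=1 quote=" */
    return 0;
}
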
@@ -132,49 +99,57 @@ static bool scan_string_content(TSLexer *lexer, Stack *stack) {
                 lexer->result_symbol = STRING_CONTENT;
                 return has_content;
             }
-            // otherwise, if this is the start, determine if it is an 
+            // otherwise, if this is the start, determine if it is an
             // interpolated identifier.
-            // otherwise, it's just string content, so continue
-            else {
-                advance(lexer);
-                if (iswalpha(lexer->lookahead) || lexer->lookahead == '{') {
-                    // this must be a string interpolation, let's
-                    // fail so we parse it as such
-                    return false;
-                }
-                lexer->result_symbol = STRING_CONTENT;
-                mark_end(lexer);
-                return true;
+            // otherwise, it's just string content, so continue
+            advance(lexer);
+            if (iswalpha(lexer->lookahead) || lexer->lookahead == '{') {
+                // this must be a string interpolation, let's
+                // fail so we parse it as such
+                return false;
             }
-        } else if (lexer->lookahead == '\\') {
+            lexer->result_symbol = STRING_CONTENT;
+            lexer->mark_end(lexer);
+            return true;
+        }
+        if (lexer->lookahead == '\\') {
             // if we see a \, then this might possibly escape a dollar sign
-            // in which case, we need to not defer to the interpolation
-            has_content = true;
+            // in which case, we should not defer to the interpolation
             advance(lexer);
             // this dollar sign is escaped, so it must be content. 
             // we consume it here so we don't enter the dollar sign case above,
-            // which leaves the possibility that it is an interpolation 
+            // which leaves the possibility that it is an interpolation
             if (lexer->lookahead == '$') {
                 advance(lexer);
+                // however this leaves an edgecase where an escaped dollar sign could
+                // appear at the end of a string (e.g "aa\$") which isn't handled
+                // correctly; if we were at the end of the string, terminate properly
+                if (lexer->lookahead == end_char) {
+                    stack_pop(stack);
+                    advance(lexer);
+                    lexer->mark_end(lexer);
+                    lexer->result_symbol = STRING_END;
+                    return true;
+                }
             }
         } else if (lexer->lookahead == end_char) {
             if (is_triple) {
-                mark_end(lexer);
-                for (unsigned count = 1; count < 3; count++) {
+                lexer->mark_end(lexer);
+                for (unsigned count = 1; count < DELIMITER_LENGTH; ++count) {
                     advance(lexer);
                     if (lexer->lookahead != end_char) {
-                        mark_end(lexer);
+                        lexer->mark_end(lexer);
                         lexer->result_symbol = STRING_CONTENT;
                         return true;
                     }
                 }
-                /* This is so if we lex something like 
+                /* This is so if we lex something like
                    """foo"""
-                   ^ 
+                   ^
                    where we are at the `f`, we should quit after reading `foo`,
                    and ascribe it to STRING_CONTENT.
-                   
+
                    Then, we restart and try to read the end. This is to prevent
                    `foo` from being absorbed into the STRING_END token.
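
The new `\$` branch above exists for strings that end right after an escaped dollar sign, e.g. the Kotlin literal "aa\$". A standalone sketch of just that decision (simplified to plain non-triple strings, no interpolation or other escapes; not the scanner's actual code):

#include <stdio.h>
#include <string.h>

// Walks the body of a plain double-quoted string, starting just after the
// opening quote, and returns the index just past the closing quote, treating
// "\$" as literal content the way the scanner change above does.
static size_t scan_plain_string_body(const char *s) {
    size_t i = 0;
    while (s[i]) {
        if (s[i] == '\\' && s[i + 1] == '$') {
            i += 2;                            // escaped dollar sign is plain content
            if (s[i] == '"') return i + 1;     // ...even right before the closing quote
            continue;
        }
        if (s[i] == '"') return i + 1;         // unescaped quote ends the string
        i++;
    }
    return i;                                  // unterminated string
}

int main(void) {
    const char *body = "aa\\$\"";              // the source characters  a a \ $ "
    printf("consumed %zu of %zu chars\n", scan_plain_string_body(body), strlen(body));
    return 0;
}
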
@@ -185,33 +160,31 @@ static bool scan_string_content(TSLexer *lexer, Stack *stack) {
                 }
 
                 /* Since the string internals are all hidden in the syntax
-                   tree anyways, there's no point in going to the effort of 
+                   tree anyways, there's no point in going to the effort of
                    specifically separating the string end from string contents.
 
                    If we see a bunch of quotes in a row, then we just go until
                    they stop appearing, then stop lexing and call it the string's end.
                  */
                 lexer->result_symbol = STRING_END;
-                mark_end(lexer);
+                lexer->mark_end(lexer);
                 while (lexer->lookahead == end_char) {
                     advance(lexer);
-                    mark_end(lexer);
+                    lexer->mark_end(lexer);
                 }
-                pop(stack);
+                stack_pop(stack);
+                return true;
+            }
+            if (has_content) {
+                lexer->mark_end(lexer);
+                lexer->result_symbol = STRING_CONTENT;
                 return true;
-            } else {
-                if (has_content) {
-                    mark_end(lexer);
-                    lexer->result_symbol = STRING_CONTENT;
-                    return true;
-                } else {
-                    pop(stack);
-                    advance(lexer);
-                    mark_end(lexer);
-                    lexer->result_symbol = STRING_END;
-                    return true;
-                }
             }
+            stack_pop(stack);
+            advance(lexer);
+            lexer->mark_end(lexer);
+            lexer->result_symbol = STRING_END;
+            return true;
         }
         advance(lexer);
         has_content = true;
@@ -219,7 +192,7 @@
     return false;
 }
 
-bool scan_multiline_comment(TSLexer *lexer) {
+static bool scan_multiline_comment(TSLexer *lexer) {
     if (lexer->lookahead != '/') return false;
     advance(lexer);
     if (lexer->lookahead != '*') return false;
     advance(lexer);
@@ -237,16 +210,16 @@
                 advance(lexer);
                 if (after_star) {
                     after_star = false;
-                    nesting_depth--;
+                    nesting_depth -= 1;
                     if (nesting_depth == 0) {
                         lexer->result_symbol = MULTILINE_COMMENT;
-                        mark_end(lexer);
+                        lexer->mark_end(lexer);
                         return true;
                     }
                 } else {
                     after_star = false;
                     if (lexer->lookahead == '*') {
-                        nesting_depth++;
+                        nesting_depth += 1;
                         advance(lexer);
                     }
                 }
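
Kotlin block comments nest, which is why the hunk above keeps a `nesting_depth` counter instead of stopping at the first `*/`. A standalone sketch of the same counting idea over a plain string (illustration only, not the scanner's code):

#include <stdbool.h>
#include <stdio.h>

// Every "/*" increases the depth, every "*/" decreases it; the comment is
// only closed once the depth returns to zero.
static bool is_balanced_block_comment(const char *s) {
    int depth = 0;
    for (size_t i = 0; s[i]; i++) {
        if (s[i] == '/' && s[i + 1] == '*') {
            depth++;
            i++;
        } else if (s[i] == '*' && s[i + 1] == '/') {
            depth--;
            i++;
            if (depth == 0) return s[i + 1] == '\0';
        }
    }
    return false;
}

int main(void) {
    printf("%d\n", is_balanced_block_comment("/* outer /* inner */ still comment */")); // 1
    printf("%d\n", is_balanced_block_comment("/* unterminated /* nested */"));          // 0
    return 0;
}
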
@@ -262,36 +235,26 @@
 }
 
 static bool scan_whitespace_and_comments(TSLexer *lexer) {
-    for (;;) {
-        while (iswspace(lexer->lookahead)) {
-            skip(lexer);
-        }
-
-        if (lexer->lookahead == '/') {
-            return false;
-        } else {
-            return true;
-        }
-    }
+    while (iswspace(lexer->lookahead)) skip(lexer);
+    return lexer->lookahead != '/';
 }
 
-bool scan_for_word(TSLexer *lexer, char* word, unsigned len) {
+static bool scan_for_word(TSLexer *lexer, const char* word, unsigned len) {
     skip(lexer);
-    for (unsigned i = 0; i < len; i++) {
+    for (unsigned i = 0; i < len; ++i) {
         if (lexer->lookahead != word[i]) return false;
         skip(lexer);
     }
     return true;
 }
 
-bool scan_automatic_semicolon(TSLexer *lexer) {
+static bool scan_automatic_semicolon(TSLexer *lexer) {
     lexer->result_symbol = AUTOMATIC_SEMICOLON;
     lexer->mark_end(lexer);
 
     bool sameline = true;
     for (;;) {
-        if (lexer->eof(lexer))
-            return true;
+        if (lexer->eof(lexer)) return true;
 
         if (lexer->lookahead == ';') {
             advance(lexer);
@@ -299,13 +262,10 @@
             return true;
         }
 
-        if (!iswspace(lexer->lookahead)) {
-            break;
-        }
+        if (!iswspace(lexer->lookahead)) break;
 
         if (lexer->lookahead == '\n') {
             skip(lexer);
-            sameline = false;
             break;
         }
 
@@ -313,9 +273,7 @@
         if (lexer->lookahead == '\r') {
             skip(lexer);
 
-            if (lexer->lookahead == '\n') {
-                skip(lexer);
-            }
+            if (lexer->lookahead == '\n') skip(lexer);
 
             sameline = false;
             break;
@@ -363,24 +321,23 @@
         case '&':
         case '/':
             return false;
-            
+
         //semgrep-ext: could be start of method chain, or start of ...
         case '.': {
             advance(lexer);
            return lexer->lookahead == '.';
         }
-            
+
         // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`.
         // Insert before +/-Float
         case '+':
             skip(lexer);
-            if (lexer->lookahead == '+')
-                return true;
+            if (lexer->lookahead == '+') return true;
             return iswdigit(lexer->lookahead);
+
        case '-':
             skip(lexer);
-            if (lexer->lookahead == '-')
-                return true;
+            if (lexer->lookahead == '-') return true;
             return iswdigit(lexer->lookahead);
 
         // Don't insert a semicolon before `!=`, but do insert one before a unary `!`.
@@ -396,26 +353,22 @@ bool scan_automatic_semicolon(TSLexer *lexer) {
         // before an identifier or an import.
         case 'i':
             skip(lexer);
-            if (lexer->lookahead != 'n')
-                return true;
-            
+            if (lexer->lookahead != 'n') return true;
             skip(lexer);
-            if (!iswalpha(lexer->lookahead))
-                return false;
-            
+            if (!iswalpha(lexer->lookahead)) return false;
             return !scan_for_word(lexer, "stanceof", 8);
 
-       case ';':
-           advance(lexer);
-           lexer->mark_end(lexer);
-           return true;
+        case ';':
+            advance(lexer);
+            lexer->mark_end(lexer);
+            return true;
 
         default:
             return true;
     }
 }
 
-bool scan_safe_nav(TSLexer *lexer) {
+static bool scan_safe_nav(TSLexer *lexer) {
     lexer->result_symbol = SAFE_NAV;
     lexer->mark_end(lexer);
 
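
The `'+'` and `'-'` cases a few hunks up encode the rule the comments describe: after a line break, a semicolon is inserted before `++`, `--`, or a signed literal, but not before a binary `+`/`-` that continues the expression. A standalone sketch of that lookahead decision (illustration only, not the scanner's code):

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

// rest points at the first non-whitespace character on the next line.
static bool semicolon_before_sign(const char *rest) {
    char sign = rest[0];
    if (sign != '+' && sign != '-') return false;
    if (rest[1] == sign) return true;           // ++ or -- starts a new statement
    return isdigit((unsigned char)rest[1]);     // +1 / -2 starts a new statement
}

int main(void) {
    printf("%d\n", semicolon_before_sign("++x"));  // 1
    printf("%d\n", semicolon_before_sign("-1"));   // 1
    printf("%d\n", semicolon_before_sign("+ y"));  // 0: binary plus, keep the line going
    return 0;
}
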
@@ -439,7 +392,7 @@ bool scan_safe_nav(TSLexer *lexer) {
     return true;
 }
 
-bool scan_line_sep(TSLexer *lexer) {
+static bool scan_line_sep(TSLexer *lexer) {
     // Line Seps: [ CR, LF, CRLF ]
     int state = 0;
     while (true) {
@@ -473,7 +426,7 @@
     }
 }
 
-bool scan_import_list_delimiter(TSLexer *lexer) {
+static bool scan_import_list_delimiter(TSLexer *lexer) {
     // Import lists are terminated either by an empty line or a non import statement
     lexer->result_symbol = IMPORT_LIST_DELIMITER;
     lexer->mark_end(lexer);
@@ -513,36 +466,30 @@
     }
 }
 
-bool tree_sitter_kotlin_external_scanner_scan(void *payload, TSLexer *lexer,
-                                              const bool *valid_symbols) {
-
-    bool *in_string = payload;
-
+bool tree_sitter_kotlin_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
     if (valid_symbols[AUTOMATIC_SEMICOLON]) {
         bool ret = scan_automatic_semicolon(lexer);
-        if (!ret && valid_symbols[SAFE_NAV] && lexer->lookahead == '?')
+        if (!ret && valid_symbols[SAFE_NAV] && lexer->lookahead == '?') {
             return scan_safe_nav(lexer);
+        }
 
-        // if we fail to find an automatic semicolon, it's still possible that we may 
+        // if we fail to find an automatic semicolon, it's still possible that we may
        // want to lex a string or comment later
-        if (ret)
-            return ret;
+        if (ret) return ret;
     }
 
-    if (valid_symbols[IMPORT_LIST_DELIMITER])
+    if (valid_symbols[IMPORT_LIST_DELIMITER]) {
         return scan_import_list_delimiter(lexer);
+    }
 
     // content or end
-    if (valid_symbols[STRING_CONTENT] &&
-        scan_string_content(lexer, payload)) {
+    if (valid_symbols[STRING_CONTENT] && scan_string_content(lexer, payload)) {
         return true;
     }
 
-    // a string might follow after some whitespace, so we can't lookahead 
+    // a string might follow after some whitespace, so we can't lookahead
     // until we get rid of it
-    while(iswspace(lexer->lookahead)) {
-        skip(lexer);
-    }
+    while (iswspace(lexer->lookahead)) skip(lexer);
 
     if (valid_symbols[STRING_START] && scan_string_start(lexer, payload)) {
         lexer->result_symbol = STRING_START;
@@ -560,23 +507,35 @@ bool tree_sitter_kotlin_external_scanner_scan(void *payload, TSLexer *lexer,
     return false;
 }
 
-void *tree_sitter_kotlin_external_scanner_create() { return new_stack(); }
+void *tree_sitter_kotlin_external_scanner_create() {
+    Stack *stack = ts_calloc(1, sizeof(Stack));
+    if (stack == NULL) abort();
+    array_init(stack);
+    return stack;
+}
 
 void tree_sitter_kotlin_external_scanner_destroy(void *payload) {
-    free_stack(payload);
+    Stack *stack = (Stack *)payload;
+    array_delete(stack);
+    ts_free(stack);
 }
 
-unsigned tree_sitter_kotlin_external_scanner_serialize(
-    void *payload,
-    char *buffer
-) {
-    return serialize_stack(payload, buffer);
+unsigned tree_sitter_kotlin_external_scanner_serialize(void *payload, char *buffer) {
+    Stack *stack = (Stack *)payload;
+    if (stack->size > 0) {
+        // it's an undefined behavior to memcpy 0 bytes
+        memcpy(buffer, stack->contents, stack->size);
+    }
+    return stack->size;
}
 
-void tree_sitter_kotlin_external_scanner_deserialize(
-    void *payload,
-    const char *buffer,
-    unsigned length
-) {
-    deserialize_stack(payload, buffer, length);
+void tree_sitter_kotlin_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+    Stack *stack = (Stack *)payload;
+    if (length > 0) {
+        array_reserve(stack, length);
+        memcpy(stack->contents, buffer, length);
+        stack->size = length;
+    } else {
+        array_clear(stack);
+    }
 }
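
The serialize/deserialize pair above is what lets tree-sitter snapshot and restore the delimiter stack between scanner calls; the `size > 0` guard avoids the zero-length memcpy the in-code comment calls out. A standalone round-trip sketch of the same contract, using a plain struct with the same `contents`/`size` shape instead of the tree-sitter `Array` type (illustration only):

#include <stdio.h>
#include <string.h>

#define BUFFER_SIZE 1024   /* stand-in for TREE_SITTER_SERIALIZATION_BUFFER_SIZE */

typedef struct { char contents[BUFFER_SIZE]; unsigned size; } DelimStack;

static unsigned serialize(const DelimStack *stack, char *buffer) {
    if (stack->size > 0) memcpy(buffer, stack->contents, stack->size);
    return stack->size;
}

static void deserialize(DelimStack *stack, const char *buffer, unsigned length) {
    if (length > 0) memcpy(stack->contents, buffer, length);
    stack->size = length;   // length == 0 simply clears the stack
}

int main(void) {
    DelimStack a = { { '"', '"' + 1 }, 2 };   // one plain and one triple-quoted delimiter
    char wire[BUFFER_SIZE];
    DelimStack b = { { 0 }, 0 };
    deserialize(&b, wire, serialize(&a, wire));
    printf("restored %u delimiters, triple=%d\n", b.size, b.contents[1] & 1);  // 2, 1
    return 0;
}
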
diff --git a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt
index b4f6687d..95800ec7 100644
--- a/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt
+++ b/lang/semgrep-grammars/src/semgrep-kotlin/test/corpus/semgrep.txt
@@ -42,6 +42,7 @@ val x = ($X : int)
 
 (source_file
   (property_declaration
+    (binding_pattern_kind)
     (variable_declaration
       (simple_identifier))
     (typed_metavar
@@ -61,6 +62,7 @@ val x = 2
 (source_file
   (semgrep_named_ellipsis)
   (property_declaration
+    (binding_pattern_kind)
     (variable_declaration
       (simple_identifier))
     (integer_literal)))
@@ -110,6 +112,7 @@ class Foo {
     (function_body
       (statements
         (property_declaration
+          (binding_pattern_kind)
           (variable_declaration
             (simple_identifier))
           (call_expression
@@ -290,9 +293,11 @@ constructor(arg: Int) {
       (property_declaration
         (modifiers
           (visibility_modifier))
+        (binding_pattern_kind)
         (variable_declaration
           (simple_identifier))
         (string_literal
+          (string_content)
          (interpolated_expression
            (simple_identifier)))))))
 
@@ -325,9 +330,11 @@ class Foo {
           (property_declaration
             (modifiers
               (visibility_modifier))
+            (binding_pattern_kind)
             (variable_declaration
               (simple_identifier))
             (string_literal
+              (string_content)
              (interpolated_expression
                (simple_identifier)))))))))
 
diff --git a/lang/semgrep-grammars/src/tree-sitter-kotlin b/lang/semgrep-grammars/src/tree-sitter-kotlin
index beafd098..76f53c48 160000
--- a/lang/semgrep-grammars/src/tree-sitter-kotlin
+++ b/lang/semgrep-grammars/src/tree-sitter-kotlin
@@ -1 +1 @@
-Subproject commit beafd098d7e0f57f0541a52d6050a7638aee91ab
+Subproject commit 76f53c48d29e8588934fb55b0240d7bdfe00bfe5
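
With the submodule bumped to the fwcd grammar and the parser regenerated, the new `binding_pattern_kind` and `string_content` nodes in the corpus expectations above can be spot-checked with a small harness against the tree-sitter C API. A sketch, assuming the generated Kotlin parser is linked in and exposes the conventional `tree_sitter_kotlin()` entry point:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tree_sitter/api.h>

const TSLanguage *tree_sitter_kotlin(void);

int main(void) {
    TSParser *parser = ts_parser_new();
    ts_parser_set_language(parser, tree_sitter_kotlin());

    const char *source = "val greeting = \"hello $name\"";
    TSTree *tree = ts_parser_parse_string(parser, NULL, source, (uint32_t)strlen(source));

    // Print the parse as an S-expression; with the bumped grammar the output
    // should contain nodes like property_declaration, binding_pattern_kind,
    // string_content and interpolated_expression, matching the corpus diffs.
    char *sexp = ts_node_string(ts_tree_root_node(tree));
    printf("%s\n", sexp);

    free(sexp);
    ts_tree_delete(tree);
    ts_parser_delete(parser);
    return 0;
}
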