From cd3cb7e332fb0a00de2ccd9b671a2335873820b0 Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Sun, 17 Dec 2023 09:03:49 -0800
Subject: [PATCH 1/2] Added treesitter_updater automation to update this project to using the specified version of the treesitter library, in _automation/treesitter_updater/main.go.

---
 _automation/treesitter_updater/README.md |  23 +++
 _automation/treesitter_updater/main.go   | 245 +++++++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 _automation/treesitter_updater/README.md
 create mode 100644 _automation/treesitter_updater/main.go

diff --git a/_automation/treesitter_updater/README.md b/_automation/treesitter_updater/README.md
new file mode 100644
index 00000000..47739750
--- /dev/null
+++ b/_automation/treesitter_updater/README.md
@@ -0,0 +1,23 @@
+# Tree Sitter Updater
+
+This Go program automates the process of downloading, extracting, and processing the specified version of the [Tree Sitter library](https://github.com/tree-sitter/tree-sitter). It's designed to simplify the updating of the upstream Tree Sitter library for use in this project.
+
+
+## Usage
+
+If you want to change the version of the Tree Sitter library that is retrieved, update the `sitterVersion` constant in `main.go`.
+
+Note: you must run this program from within the `_automation/treesitter_updater` directory, because it assumes that the final destination for the `.c` and `.h` files is two directories up from its current working directory.
+
+```bash
+cd _automation/treesitter_updater
+go run main.go
+```
+
+On success, a list of (1) new files and (2) replaced files is printed to stdout, followed by `Done!`. On failure, the error is logged to stderr and the program exits with a non-zero status.
+
+## Constants
+
+- `sitterVersion`: Specifies the version of Tree Sitter to download.
+- `sitterURL`: The URL to the Tree Sitter source for the specified version.
+
diff --git a/_automation/treesitter_updater/main.go b/_automation/treesitter_updater/main.go
new file mode 100644
index 00000000..4e2b8b0d
--- /dev/null
+++ b/_automation/treesitter_updater/main.go
@@ -0,0 +1,276 @@
+// Command treesitter_updater downloads a tagged release of the upstream
+// tree-sitter library and copies its C sources and headers into the root of
+// this repository.
+//
+// It must be run from the _automation/treesitter_updater directory: the
+// destination is resolved as two levels above the current working directory.
+package main
+
+import (
+	"archive/tar"
+	"compress/gzip"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+const (
+	// sitterVersion is the upstream tree-sitter release to vendor.
+	sitterVersion = "0.20.8"
+	// sitterURL is the source tarball for sitterVersion.
+	sitterURL = "https://github.com/tree-sitter/tree-sitter/archive/refs/tags/v" + sitterVersion + ".tar.gz"
+	// tmpDirName is the scratch directory created under the working directory.
+	tmpDirName = "tmpts"
+	// downloadTimeout bounds the whole download, including reading the body.
+	downloadTimeout = 2 * time.Minute
+)
+
+func main() {
+	if err := run(); err != nil {
+		log.Fatalf("Error: %v", err)
+	}
+	fmt.Printf("\n\nDone!\n")
+}
+
+// run performs the whole update and returns the first error encountered. The
+// scratch directory is removed on success and on failure alike, so a failed
+// run leaves nothing behind to pollute the next one.
+func run() (err error) {
+	currentDir, err := os.Getwd()
+	if err != nil {
+		return fmt.Errorf("getting current directory: %w", err)
+	}
+	tmpDir := filepath.Join(currentDir, tmpDirName)
+	extractedDir := filepath.Join(tmpDir, "tree-sitter-"+sitterVersion)
+	libDir := filepath.Join(extractedDir, "lib")
+
+	// Discard leftovers from an earlier interrupted run; otherwise stale files
+	// would be copied into the repository along with the fresh ones.
+	if err := os.RemoveAll(tmpDir); err != nil {
+		return fmt.Errorf("clearing %s directory: %w", tmpDirName, err)
+	}
+	defer func() {
+		if rmErr := os.RemoveAll(tmpDir); rmErr != nil && err == nil {
+			err = fmt.Errorf("removing %s directory: %w", tmpDirName, rmErr)
+		}
+	}()
+
+	if err := downloadAndExtractSitter(sitterURL, sitterVersion); err != nil {
+		return fmt.Errorf("downloading tree-sitter %s: %w", sitterVersion, err)
+	}
+
+	// Flatten the headers and sources the project needs into tmpDir.
+	sources := []struct{ dir, pattern string }{
+		{filepath.Join(libDir, "include", "tree_sitter"), "*.h"},
+		{filepath.Join(libDir, "src"), "*.c"},
+		{filepath.Join(libDir, "src"), "*.h"},
+		{filepath.Join(libDir, "src", "unicode"), "*.h"},
+	}
+	for _, s := range sources {
+		if err := copyFiles(s.dir, tmpDir, s.pattern); err != nil {
+			return fmt.Errorf("copying %s from %s: %w", s.pattern, s.dir, err)
+		}
+	}
+
+	if err := os.RemoveAll(extractedDir); err != nil {
+		return fmt.Errorf("removing extracted treesitter directory: %w", err)
+	}
+	// The files now sit side by side, so includes must not name subdirectories.
+	if err := modifyIncludePaths(tmpDir); err != nil {
+		return fmt.Errorf("modifying include paths: %w", err)
+	}
+	if err := cleanup(tmpDir); err != nil {
+		return fmt.Errorf("cleaning up %s directory: %w", tmpDirName, err)
+	}
+	// The repository root is two levels above this tool's directory.
+	if err := copyAndReportFiles(tmpDir, filepath.Join(currentDir, "..", "..")); err != nil {
+		return fmt.Errorf("copying and reporting files: %w", err)
+	}
+	return nil
+}
+
+// copyAndReportFiles copies every regular file under srcDir to the same
+// relative path under dstDir, printing whether each one replaced an existing
+// file or is new.
+func copyAndReportFiles(srcDir, dstDir string) error {
+	return filepath.Walk(srcDir, func(srcPath string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			return nil
+		}
+
+		relPath, err := filepath.Rel(srcDir, srcPath)
+		if err != nil {
+			return err
+		}
+		dstPath := filepath.Join(dstDir, relPath)
+
+		status := "[replaced]"
+		if _, err := os.Stat(dstPath); os.IsNotExist(err) {
+			status = "[new file]"
+		} else if err != nil {
+			return err
+		}
+		fmt.Printf("%-39s %s\n", filepath.Base(dstPath), status)
+
+		return copyFile(srcPath, dstPath)
+	})
+}
+
+// copyFiles copies the regular files directly inside srcDir whose names match
+// the glob pattern into dstDir. Subdirectories are not descended into.
+func copyFiles(srcDir, dstDir, pattern string) error {
+	entries, err := os.ReadDir(srcDir)
+	if err != nil {
+		return err
+	}
+	for _, entry := range entries {
+		matched, err := filepath.Match(pattern, entry.Name())
+		if err != nil {
+			return err
+		}
+		if !matched || entry.IsDir() {
+			continue
+		}
+		if err := copyFile(filepath.Join(srcDir, entry.Name()), filepath.Join(dstDir, entry.Name())); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// copyFile copies the contents of src to dst, creating dst or truncating it if
+// it already exists.
+func copyFile(src, dst string) error {
+	data, err := os.ReadFile(src)
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(dst, data, 0644)
+}
+
+// modifyIncludePaths rewrites the quoted "tree_sitter/..." and "unicode/..."
+// include prefixes in every .c and .h file under root, because the vendored
+// files all live in one flat directory.
+func modifyIncludePaths(root string) error {
+	return filepath.Walk(root, func(filePath string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		ext := filepath.Ext(filePath)
+		if info.IsDir() || (ext != ".c" && ext != ".h") {
+			return nil
+		}
+
+		content, err := os.ReadFile(filePath)
+		if err != nil {
+			return err
+		}
+		modified := strings.ReplaceAll(string(content), `"tree_sitter/`, `"`)
+		modified = strings.ReplaceAll(modified, `"unicode/`, `"`)
+		return os.WriteFile(filePath, []byte(modified), info.Mode())
+	})
+}
+
+// downloadAndExtractSitter downloads the gzipped tarball at url and unpacks
+// the lib/src and lib/include trees of tree-sitter-<version> beneath the
+// scratch directory, keeping the archive's directory layout.
+func downloadAndExtractSitter(url, version string) error {
+	client := &http.Client{Timeout: downloadTimeout}
+	resp, err := client.Get(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	// Without this check an error page would surface as a confusing gzip error.
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
+	}
+
+	gzr, err := gzip.NewReader(resp.Body)
+	if err != nil {
+		return err
+	}
+	defer gzr.Close()
+
+	root := "tree-sitter-" + version
+	srcPrefix := root + "/lib/src"
+	includePrefix := root + "/lib/include"
+
+	tr := tar.NewReader(gzr)
+	for {
+		header, err := tr.Next()
+		if err == io.EOF {
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+
+		// Clean before filtering so that ".." elements cannot smuggle an entry
+		// past the prefix check and out of the scratch directory.
+		name := path.Clean(header.Name)
+		if !strings.HasPrefix(name, srcPrefix) && !strings.HasPrefix(name, includePrefix) {
+			continue
+		}
+		target := filepath.Join(tmpDirName, filepath.FromSlash(name))
+
+		switch header.Typeflag {
+		case tar.TypeDir:
+			if err := os.MkdirAll(target, 0755); err != nil {
+				return err
+			}
+		case tar.TypeReg:
+			if err := extractFile(tr, target); err != nil {
+				return err
+			}
+		}
+	}
+}
+
+// extractFile writes the current tar entry read from r to target. The parent
+// directory is created first because archives are not required to list a
+// directory before the files inside it.
+func extractFile(r io.Reader, target string) error {
+	if err := os.MkdirAll(filepath.Dir(target), 0755); err != nil {
+		return err
+	}
+	outFile, err := os.Create(target)
+	if err != nil {
+		return err
+	}
+	if _, err := io.Copy(outFile, r); err != nil {
+		outFile.Close()
+		return err
+	}
+	// A failed Close can mean buffered data never reached the disk.
+	return outFile.Close()
+}
+
+// cleanup deletes every file under root that should not be vendored: anything
+// that is not a .c or .h file, plus lib.c (presumably because upstream's lib.c
+// only #includes the other sources -- confirm before changing this rule).
+func cleanup(root string) error {
+	return filepath.Walk(root, func(filePath string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			return nil
+		}
+		ext := filepath.Ext(filePath)
+		isSource := ext == ".c" || ext == ".h"
+		if !isSource || filepath.Base(filePath) == "lib.c" {
+			return os.Remove(filePath)
+		}
+		return nil
+	})
+}
From 55cfc70084d13ca4779a1f9358c033105b24c44e Mon Sep 17 00:00:00 2001
From: Mick Grove
Date: Sun, 17 Dec 2023 09:04:31 -0800
Subject: [PATCH 2/2] Bumped treesitter to v0.20.8.

---
 api.h         |   36 +-
 array.h       |    6 +-
 clock.h       |    5 +
 lexer.c       |   54 +-
 parser.c      |   26 +-
 query.c       | 1669 +++++++++++++++++++++++++++----------------------
 stack.c       |    2 +-
 subtree.h     |    8 +-
 tree.c        |   50 +-
 tree.h        |    2 +
 tree_cursor.c |  118 ++--
 tree_cursor.h |   15 +
 12 files changed, 1134 insertions(+), 857 deletions(-)

diff --git a/api.h b/api.h
index 727dded3..edc1c36a 100644
--- a/api.h
+++ b/api.h
@@ -381,6 +381,13 @@ TSNode ts_tree_root_node_with_offset(
  */
 const TSLanguage *ts_tree_language(const TSTree *);
 
+/**
+ * Get the array of included ranges that was used to parse the syntax tree.
+ *
+ * The returned pointer must be freed by the caller.
+ */ +TSRange *ts_tree_included_ranges(const TSTree *, uint32_t *length); + /** * Edit the syntax tree to keep it in sync with source code that has been * edited. @@ -413,7 +420,7 @@ TSRange *ts_tree_get_changed_ranges( /** * Write a DOT graph describing the syntax tree to the given file. */ -void ts_tree_print_dot_graph(const TSTree *, FILE *); +void ts_tree_print_dot_graph(const TSTree *, int file_descriptor); /******************/ /* Section - Node */ @@ -743,15 +750,26 @@ const TSQueryPredicateStep *ts_query_predicates_for_pattern( uint32_t *length ); -bool ts_query_is_pattern_rooted( - const TSQuery *self, - uint32_t pattern_index -); +/* + * Check if the given pattern in the query has a single root node. + */ +bool ts_query_is_pattern_rooted(const TSQuery *self, uint32_t pattern_index); -bool ts_query_is_pattern_guaranteed_at_step( - const TSQuery *self, - uint32_t byte_offset -); +/* + * Check if the given pattern in the query is 'non local'. + * + * A non-local pattern has multiple root nodes and can match within a + * repeating sequence of nodes, as specified by the grammar. Non-local + * patterns disable certain optimizations that would otherwise be possible + * when executing a query on a specific range of a syntax tree. + */ +bool ts_query_is_pattern_non_local(const TSQuery *self, uint32_t pattern_index); + +/* + * Check if a given pattern is guaranteed to match once a given step is reached. + * The step is specified by its byte offset in the query's source code. 
+ */ +bool ts_query_is_pattern_guaranteed_at_step(const TSQuery *self, uint32_t byte_offset); /** * Get the name and length of one of the query's captures, or one of the diff --git a/array.h b/array.h index 5ff5580a..abec9410 100644 --- a/array.h +++ b/array.h @@ -170,10 +170,10 @@ static inline void array__swap(VoidArray *self, VoidArray *other) { *self = swap; } -static inline void array__grow(VoidArray *self, size_t count, size_t element_size) { - size_t new_size = self->size + count; +static inline void array__grow(VoidArray *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; if (new_size > self->capacity) { - size_t new_capacity = self->capacity * 2; + uint32_t new_capacity = self->capacity * 2; if (new_capacity < 8) new_capacity = 8; if (new_capacity < new_size) new_capacity = new_size; array__reserve(self, element_size, new_capacity); diff --git a/clock.h b/clock.h index 94545f35..6e75729e 100644 --- a/clock.h +++ b/clock.h @@ -1,6 +1,7 @@ #ifndef TREE_SITTER_CLOCK_H_ #define TREE_SITTER_CLOCK_H_ +#include #include typedef uint64_t TSDuration; @@ -82,6 +83,10 @@ static inline TSClock clock_after(TSClock base, TSDuration duration) { TSClock result = base; result.tv_sec += duration / 1000000; result.tv_nsec += (duration % 1000000) * 1000; + if (result.tv_nsec >= 1000000000) { + result.tv_nsec -= 1000000000; + ++(result.tv_sec); + } return result; } diff --git a/lexer.c b/lexer.c index f2c10fbd..acaf3e8c 100644 --- a/lexer.c +++ b/lexer.c @@ -104,13 +104,16 @@ static void ts_lexer__get_lookahead(Lexer *self) { static void ts_lexer_goto(Lexer *self, Length position) { self->current_position = position; - bool found_included_range = false; // Move to the first valid position at or after the given position. 
+ bool found_included_range = false; for (unsigned i = 0; i < self->included_range_count; i++) { TSRange *included_range = &self->included_ranges[i]; - if (included_range->end_byte > position.bytes) { - if (included_range->start_byte >= position.bytes) { + if ( + included_range->end_byte > self->current_position.bytes && + included_range->end_byte > included_range->start_byte + ) { + if (included_range->start_byte >= self->current_position.bytes) { self->current_position = (Length) { .bytes = included_range->start_byte, .extent = included_range->start_point, @@ -127,8 +130,8 @@ static void ts_lexer_goto(Lexer *self, Length position) { // If the current position is outside of the current chunk of text, // then clear out the current chunk of text. if (self->chunk && ( - position.bytes < self->chunk_start || - position.bytes >= self->chunk_start + self->chunk_size + self->current_position.bytes < self->chunk_start || + self->current_position.bytes >= self->chunk_start + self->chunk_size )) { ts_lexer__clear_chunk(self); } @@ -164,27 +167,31 @@ static void ts_lexer__do_advance(Lexer *self, bool skip) { } } - const TSRange *current_range = NULL; - if (self->current_included_range_index < self->included_range_count) { - current_range = &self->included_ranges[self->current_included_range_index]; - if (self->current_position.bytes == current_range->end_byte) { - self->current_included_range_index++; - if (self->current_included_range_index < self->included_range_count) { - current_range++; - self->current_position = (Length) { - current_range->start_byte, - current_range->start_point, - }; - } else { - current_range = NULL; - } + const TSRange *current_range = &self->included_ranges[self->current_included_range_index]; + while ( + self->current_position.bytes >= current_range->end_byte || + current_range->end_byte == current_range->start_byte + ) { + self->current_included_range_index++; + if (self->current_included_range_index < self->included_range_count) { + 
current_range++; + self->current_position = (Length) { + current_range->start_byte, + current_range->start_point, + }; + } else { + current_range = NULL; + break; } } if (skip) self->token_start_position = self->current_position; if (current_range) { - if (self->current_position.bytes >= self->chunk_start + self->chunk_size) { + if ( + self->current_position.bytes < self->chunk_start || + self->current_position.bytes >= self->chunk_start + self->chunk_size + ) { ts_lexer__get_chunk(self); } ts_lexer__get_lookahead(self); @@ -339,6 +346,13 @@ void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) { ts_lexer__mark_end(&self->data); } + // If the token ended at an included range boundary, then its end position + // will have been reset to the end of the preceding range. Reset the start + // position to match. + if (self->token_end_position.bytes < self->token_start_position.bytes) { + self->token_start_position = self->token_end_position; + } + uint32_t current_lookahead_end_byte = self->current_position.bytes + 1; // In order to determine that a byte sequence is invalid UTF8 or UTF16, diff --git a/parser.c b/parser.c index 6f914a8e..6955b1eb 100644 --- a/parser.c +++ b/parser.c @@ -447,8 +447,14 @@ static Subtree ts_parser__lex( // avoid infinite loops which could otherwise occur, because the lexer is // looking for any possible token, instead of looking for the specific set of // tokens that are valid in some parse state. + // + // Note that it's possible that the token end position may be *before* the + // original position of the lexer because of the way that tokens are positioned + // at included range boundaries: when a token is terminated at the start of + // an included range, it is marked as ending at the *end* of the preceding + // included range. 
if ( - self->lexer.token_end_position.bytes == current_position.bytes && + self->lexer.token_end_position.bytes <= current_position.bytes && (error_mode || !ts_stack_has_advanced_since_error(self->stack, version)) && !external_scanner_state_changed ) { @@ -525,10 +531,6 @@ static Subtree ts_parser__lex( self->language ); } else { - if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) { - self->lexer.token_start_position = self->lexer.token_end_position; - } - bool is_keyword = false; TSSymbol symbol = self->lexer.data.result_symbol; Length padding = length_sub(self->lexer.token_start_position, start_position); @@ -605,7 +607,7 @@ static Subtree ts_parser__get_cached_token( static void ts_parser__set_cached_token( TSParser *self, - size_t byte_index, + uint32_t byte_index, Subtree last_external_token, Subtree token ) { @@ -1461,7 +1463,9 @@ static bool ts_parser__advance( ((self->cancellation_flag && atomic_load(self->cancellation_flag)) || (!clock_is_null(self->end_clock) && clock_is_gt(clock_now(), self->end_clock))) ) { - ts_subtree_release(&self->tree_pool, lookahead); + if (lookahead.ptr) { + ts_subtree_release(&self->tree_pool, lookahead); + } return false; } @@ -1937,8 +1941,16 @@ TSTree *ts_parser_parse( } } + // After advancing each version of the stack, re-sort the versions by their cost, + // removing any versions that are no longer worth pursuing. unsigned min_error_cost = ts_parser__condense_stack(self); + + // If there's already a finished parse tree that's better than any in-progress version, + // then terminate parsing. Clear the parse stack to remove any extra references to subtrees + // within the finished tree, ensuring that these subtrees can be safely mutated in-place + // for rebalancing. 
if (self->finished_tree.ptr && ts_subtree_error_cost(self->finished_tree) < min_error_cost) { + ts_stack_clear(self->stack); break; } diff --git a/query.c b/query.c index cf4f9bf5..4d0de69d 100644 --- a/query.c +++ b/query.c @@ -146,6 +146,7 @@ typedef struct { Slice steps; Slice predicate_steps; uint32_t start_byte; + bool is_non_local; } QueryPattern; typedef struct { @@ -228,11 +229,20 @@ typedef struct { AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH]; uint16_t depth; uint16_t step_index; + TSSymbol root_symbol; } AnalysisState; typedef Array(AnalysisState *) AnalysisStateSet; -typedef Array(AnalysisState *) AnalysisStatePool; +typedef struct { + AnalysisStateSet states; + AnalysisStateSet next_states; + AnalysisStateSet deeper_states; + AnalysisStateSet state_pool; + Array(uint16_t) final_step_indices; + Array(TSSymbol) finished_parent_symbols; + bool did_abort; +} QueryAnalysis; /* * AnalysisSubgraph - A subset of the states in the parse table that are used @@ -253,6 +263,8 @@ typedef struct { Array(AnalysisSubgraphNode) nodes; } AnalysisSubgraph; +typedef Array(AnalysisSubgraph) AnalysisSubgraphArray; + /* * StatePredecessorMap - A map that stores the predecessors of each parse state. 
* This is used during query analysis to determine which parse states can lead @@ -269,8 +281,8 @@ typedef struct { */ struct TSQuery { SymbolTable captures; - Array(CaptureQuantifiers) capture_quantifiers; SymbolTable predicate_values; + Array(CaptureQuantifiers) capture_quantifiers; Array(QueryStep) steps; Array(PatternEntry) pattern_map; Array(TSQueryPredicateStep) predicate_steps; @@ -278,6 +290,7 @@ struct TSQuery { Array(StepOffset) step_offsets; Array(TSFieldId) negated_fields; Array(char) string_buffer; + Array(TSSymbol) repeat_symbols_with_rootless_patterns; const TSLanguage *language; uint16_t wildcard_root_pattern_count; }; @@ -297,6 +310,7 @@ struct TSQueryCursor { TSPoint start_point; TSPoint end_point; uint32_t next_state_id; + bool on_visible_node; bool ascending; bool halted; bool did_exceed_match_limit; @@ -934,30 +948,23 @@ static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol s return false; } -static inline AnalysisState *analysis_state__clone(AnalysisState const *self) { - AnalysisState *new_state = ts_malloc(sizeof(AnalysisState)); - *new_state = *self; - return new_state; -} - -/**************** +/****************** * AnalysisStateSet - ****************/ + ******************/ // Obtains an `AnalysisState` instance, either by consuming one from this set's object pool, or by // cloning one from scratch. static inline AnalysisState *analysis_state_pool__clone_or_reuse( - AnalysisStatePool *self, + AnalysisStateSet *self, AnalysisState *borrowed_item ) { AnalysisState *new_item; if (self->size) { new_item = array_pop(self); - *new_item = *borrowed_item; } else { - new_item = analysis_state__clone(borrowed_item); + new_item = ts_malloc(sizeof(AnalysisState)); } - + *new_item = *borrowed_item; return new_item; } @@ -967,9 +974,9 @@ static inline AnalysisState *analysis_state_pool__clone_or_reuse( // // The caller retains ownership of the passed-in memory. 
However, the clone that is created by this // function will be managed by the state set. -static inline void analysis_state_set__insert_sorted_by_clone( +static inline void analysis_state_set__insert_sorted( AnalysisStateSet *self, - AnalysisStatePool *pool, + AnalysisStateSet *pool, AnalysisState *borrowed_item ) { unsigned index, exists; @@ -988,9 +995,9 @@ static inline void analysis_state_set__insert_sorted_by_clone( // // The caller retains ownership of the passed-in memory. However, the clone that is created by this // function will be managed by the state set. -static inline void analysis_state_set__push_by_clone( +static inline void analysis_state_set__push( AnalysisStateSet *self, - AnalysisStatePool *pool, + AnalysisStateSet *pool, AnalysisState *borrowed_item ) { AnalysisState *new_item = analysis_state_pool__clone_or_reuse(pool, borrowed_item); @@ -998,7 +1005,7 @@ static inline void analysis_state_set__push_by_clone( } // Removes all items from this set, returning it to an empty state. 
-static inline void analysis_state_set__clear(AnalysisStateSet *self, AnalysisStatePool *pool) { +static inline void analysis_state_set__clear(AnalysisStateSet *self, AnalysisStateSet *pool) { array_push_all(pool, self); array_clear(self); } @@ -1012,6 +1019,31 @@ static inline void analysis_state_set__delete(AnalysisStateSet *self) { array_delete(self); } +/**************** + * QueryAnalyzer + ****************/ + +static inline QueryAnalysis query_analysis__new() { + return (QueryAnalysis) { + .states = array_new(), + .next_states = array_new(), + .deeper_states = array_new(), + .state_pool = array_new(), + .final_step_indices = array_new(), + .finished_parent_symbols = array_new(), + .did_abort = false, + }; +} + +static inline void query_analysis__delete(QueryAnalysis *self) { + analysis_state_set__delete(&self->states); + analysis_state_set__delete(&self->next_states); + analysis_state_set__delete(&self->deeper_states); + analysis_state_set__delete(&self->state_pool); + array_delete(&self->final_step_indices); + array_delete(&self->finished_parent_symbols); +} + /*********************** * AnalysisSubgraphNode ***********************/ @@ -1113,7 +1145,322 @@ static inline void ts_query__pattern_map_insert( array_insert(&self->pattern_map, index, new_entry); } +// Walk the subgraph for this non-terminal, tracking all of the possible +// sequences of progress within the pattern. +static void ts_query__perform_analysis( + TSQuery *self, + const AnalysisSubgraphArray *subgraphs, + QueryAnalysis *analysis +) { + unsigned recursion_depth_limit = 0; + unsigned prev_final_step_count = 0; + array_clear(&analysis->final_step_indices); + array_clear(&analysis->finished_parent_symbols); + + for (unsigned iteration = 0;; iteration++) { + if (iteration == MAX_ANALYSIS_ITERATION_COUNT) { + analysis->did_abort = true; + break; + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("Iteration: %u. 
Final step indices:", iteration); + for (unsigned j = 0; j < analysis->final_step_indices.size; j++) { + printf(" %4u", analysis->final_step_indices.contents[j]); + } + printf("\n"); + for (unsigned j = 0; j < analysis->states.size; j++) { + AnalysisState *state = analysis->states.contents[j]; + printf(" %3u: step: %u, stack: [", j, state->step_index); + for (unsigned k = 0; k < state->depth; k++) { + printf( + " {%s, child: %u, state: %4u", + self->language->symbol_names[state->stack[k].parent_symbol], + state->stack[k].child_index, + state->stack[k].parse_state + ); + if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); + if (state->stack[k].done) printf(", DONE"); + printf("}"); + } + printf(" ]\n"); + } + #endif + + // If no further progress can be made within the current recursion depth limit, then + // bump the depth limit by one, and continue to process the states the exceeded the + // limit. But only allow this if progress has been made since the last time the depth + // limit was increased. + if (analysis->states.size == 0) { + if ( + analysis->deeper_states.size > 0 && + analysis->final_step_indices.size > prev_final_step_count + ) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); + #endif + + prev_final_step_count = analysis->final_step_indices.size; + recursion_depth_limit++; + AnalysisStateSet _states = analysis->states; + analysis->states = analysis->deeper_states; + analysis->deeper_states = _states; + continue; + } + + break; + } + + analysis_state_set__clear(&analysis->next_states, &analysis->state_pool); + for (unsigned j = 0; j < analysis->states.size; j++) { + AnalysisState * const state = analysis->states.contents[j]; + + // For efficiency, it's important to avoid processing the same analysis state more + // than once. To achieve this, keep the states in order of ascending position within + // their hypothetical syntax trees. 
In each iteration of this loop, start by advancing + // the states that have made the least progress. Avoid advancing states that have already + // made more progress. + if (analysis->next_states.size > 0) { + int comparison = analysis_state__compare_position( + &state, + array_back(&analysis->next_states) + ); + if (comparison == 0) { + analysis_state_set__insert_sorted(&analysis->next_states, &analysis->state_pool, state); + continue; + } else if (comparison > 0) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Terminate iteration at state %u\n", j); + #endif + while (j < analysis->states.size) { + analysis_state_set__push( + &analysis->next_states, + &analysis->state_pool, + analysis->states.contents[j] + ); + j++; + } + break; + } + } + + const TSStateId parse_state = analysis_state__top(state)->parse_state; + const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; + const TSFieldId parent_field_id = analysis_state__top(state)->field_id; + const unsigned child_index = analysis_state__top(state)->child_index; + const QueryStep * const step = &self->steps.contents[state->step_index]; + + unsigned subgraph_index, exists; + array_search_sorted_by(subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); + if (!exists) continue; + const AnalysisSubgraph *subgraph = &subgraphs->contents[subgraph_index]; + + // Follow every possible path in the parse table, but only visit states that + // are part of the subgraph for the current symbol. 
+ LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); + while (ts_lookahead_iterator_next(&lookahead_iterator)) { + TSSymbol sym = lookahead_iterator.symbol; + + AnalysisSubgraphNode successor = { + .state = parse_state, + .child_index = child_index, + }; + if (lookahead_iterator.action_count) { + const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; + if (action->type == TSParseActionTypeShift) { + if (!action->shift.extra) { + successor.state = action->shift.state; + successor.child_index++; + } + } else { + continue; + } + } else if (lookahead_iterator.next_state != 0) { + successor.state = lookahead_iterator.next_state; + successor.child_index++; + } else { + continue; + } + + unsigned node_index; + array_search_sorted_with( + &subgraph->nodes, + analysis_subgraph_node__compare, &successor, + &node_index, &exists + ); + while (node_index < subgraph->nodes.size) { + AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; + if (node->state != successor.state || node->child_index != successor.child_index) break; + + // Use the subgraph to determine what alias and field will eventually be applied + // to this child node. + TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); + TSSymbol visible_symbol = alias + ? alias + : self->language->symbol_metadata[sym].visible + ? self->language->public_symbol_map[sym] + : 0; + TSFieldId field_id = parent_field_id; + if (!field_id) { + const TSFieldMapEntry *field_map, *field_map_end; + ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); + for (; field_map != field_map_end; field_map++) { + if (!field_map->inherited && field_map->child_index == child_index) { + field_id = field_map->field_id; + break; + } + } + } + + // Create a new state that has advanced past this hypothetical subtree. 
+ AnalysisState next_state = *state; + AnalysisStateEntry *next_state_top = analysis_state__top(&next_state); + next_state_top->child_index = successor.child_index; + next_state_top->parse_state = successor.state; + if (node->done) next_state_top->done = true; + + // Determine if this hypothetical child node would match the current step + // of the query pattern. + bool does_match = false; + if (visible_symbol) { + does_match = true; + if (step->symbol == WILDCARD_SYMBOL) { + if ( + step->is_named && + !self->language->symbol_metadata[visible_symbol].named + ) does_match = false; + } else if (step->symbol != visible_symbol) { + does_match = false; + } + if (step->field && step->field != field_id) { + does_match = false; + } + if ( + step->supertype_symbol && + !analysis_state__has_supertype(state, step->supertype_symbol) + ) does_match = false; + } + + // If this child is hidden, then descend into it and walk through its children. + // If the top entry of the stack is at the end of its rule, then that entry can + // be replaced. Otherwise, push a new entry onto the stack. + else if (sym >= self->language->token_count) { + if (!next_state_top->done) { + if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { + #ifdef DEBUG_ANALYZE_QUERY + printf("Exceeded depth limit for state %u\n", j); + #endif + + analysis->did_abort = true; + continue; + } + + next_state.depth++; + next_state_top = analysis_state__top(&next_state); + } + + *next_state_top = (AnalysisStateEntry) { + .parse_state = parse_state, + .parent_symbol = sym, + .child_index = 0, + .field_id = field_id, + .done = false, + }; + + if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { + analysis_state_set__insert_sorted( + &analysis->deeper_states, + &analysis->state_pool, + &next_state + ); + continue; + } + } + + // Pop from the stack when this state reached the end of its current syntax node. 
+ while (next_state.depth > 0 && next_state_top->done) { + next_state.depth--; + next_state_top = analysis_state__top(&next_state); + } + + // If this hypothetical child did match the current step of the query pattern, + // then advance to the next step at the current depth. This involves skipping + // over any descendant steps of the current child. + const QueryStep *next_step = step; + if (does_match) { + for (;;) { + next_state.step_index++; + next_step = &self->steps.contents[next_state.step_index]; + if ( + next_step->depth == PATTERN_DONE_MARKER || + next_step->depth <= step->depth + ) break; + } + } else if (successor.state == parse_state) { + continue; + } + + for (;;) { + // Skip pass-through states. Although these states have alternatives, they are only + // used to implement repetitions, and query analysis does not need to process + // repetitions in order to determine whether steps are possible and definite. + if (next_step->is_pass_through) { + next_state.step_index++; + next_step++; + continue; + } + + // If the pattern is finished or hypothetical parent node is complete, then + // record that matching can terminate at this step of the pattern. Otherwise, + // add this state to the list of states to process on the next iteration. + if (!next_step->is_dead_end) { + bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != step->depth; + if (did_finish_pattern) { + array_insert_sorted_by(&analysis->finished_parent_symbols, , state->root_symbol); + } else if (next_state.depth == 0) { + array_insert_sorted_by(&analysis->final_step_indices, , next_state.step_index); + } else { + analysis_state_set__insert_sorted(&analysis->next_states, &analysis->state_pool, &next_state); + } + } + + // If the state has advanced to a step with an alternative step, then add another state + // at that alternative step. 
This process is simpler than the process of actually matching a + // pattern during query execution, because for the purposes of query analysis, there is no + // need to process repetitions. + if ( + does_match && + next_step->alternative_index != NONE && + next_step->alternative_index > next_state.step_index + ) { + next_state.step_index = next_step->alternative_index; + next_step = &self->steps.contents[next_state.step_index]; + } else { + break; + } + } + } + } + } + + AnalysisStateSet _states = analysis->states; + analysis->states = analysis->next_states; + analysis->next_states = _states; + } +} + static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { + Array(uint16_t) non_rooted_pattern_start_steps = array_new(); + for (unsigned i = 0; i < self->pattern_map.size; i++) { + PatternEntry *pattern = &self->pattern_map.contents[i]; + if (!pattern->is_rooted) { + QueryStep *step = &self->steps.contents[pattern->step_index]; + if (step->symbol != WILDCARD_SYMBOL) { + array_push(&non_rooted_pattern_start_steps, i); + } + } + } + // Walk forward through all of the steps in the query, computing some // basic information about each step. Mark all of the steps that contain // captures, and record the indices of all of the steps that have child steps. @@ -1158,7 +1505,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // of the hidden symbols in the grammar, because these might occur within // one of the parent nodes, such that their children appear to belong to the // parent. 
- Array(AnalysisSubgraph) subgraphs = array_new(); + AnalysisSubgraphArray subgraphs = array_new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint32_t parent_step_index = parent_step_indices.contents[i]; TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol; @@ -1320,11 +1667,7 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // For each non-terminal pattern, determine if the pattern can successfully match, // and identify all of the possible children within the pattern where matching could fail. bool all_patterns_are_valid = true; - AnalysisStateSet states = array_new(); - AnalysisStateSet next_states = array_new(); - AnalysisStateSet deeper_states = array_new(); - AnalysisStatePool state_pool = array_new(); - Array(uint16_t) final_step_indices = array_new(); + QueryAnalysis analysis = query_analysis__new(); for (unsigned i = 0; i < parent_step_indices.size; i++) { uint16_t parent_step_index = parent_step_indices.contents[i]; uint16_t parent_depth = self->steps.contents[parent_step_index].depth; @@ -1348,11 +1691,11 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Initialize an analysis state at every parse state in the table where // this parent symbol can occur. 
AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; - analysis_state_set__clear(&states, &state_pool); - analysis_state_set__clear(&deeper_states, &state_pool); + analysis_state_set__clear(&analysis.states, &analysis.state_pool); + analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); for (unsigned j = 0; j < subgraph->start_states.size; j++) { TSStateId parse_state = subgraph->start_states.contents[j]; - analysis_state_set__push_by_clone(&states, &state_pool, &((AnalysisState) { + analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { .step_index = parent_step_index + 1, .stack = { [0] = { @@ -1364,312 +1707,23 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { }, }, .depth = 1, + .root_symbol = parent_symbol, })); } - // Walk the subgraph for this non-terminal, tracking all of the possible - // sequences of progress within the pattern. - bool can_finish_pattern = false; - bool did_abort_analysis = false; - unsigned recursion_depth_limit = 0; - unsigned prev_final_step_count = 0; - array_clear(&final_step_indices); - for (unsigned iteration = 0;; iteration++) { - if (iteration == MAX_ANALYSIS_ITERATION_COUNT) { - did_abort_analysis = true; - break; - } - - #ifdef DEBUG_ANALYZE_QUERY - printf("Iteration: %u. 
Final step indices:", iteration); - for (unsigned j = 0; j < final_step_indices.size; j++) { - printf(" %4u", final_step_indices.contents[j]); - } - printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol)); - for (unsigned j = 0; j < states.size; j++) { - AnalysisState *state = states.contents[j]; - printf(" %3u: step: %u, stack: [", j, state->step_index); - for (unsigned k = 0; k < state->depth; k++) { - printf( - " {%s, child: %u, state: %4u", - self->language->symbol_names[state->stack[k].parent_symbol], - state->stack[k].child_index, - state->stack[k].parse_state - ); - if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]); - if (state->stack[k].done) printf(", DONE"); - printf("}"); - } - printf(" ]\n"); - } - #endif - - // If no further progress can be made within the current recursion depth limit, then - // bump the depth limit by one, and continue to process the states the exceeded the - // limit. But only allow this if progress has been made since the last time the depth - // limit was increased. - if (states.size == 0) { - if ( - deeper_states.size > 0 - && final_step_indices.size > prev_final_step_count - ) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1); - #endif - - prev_final_step_count = final_step_indices.size; - recursion_depth_limit++; - AnalysisStateSet _states = states; - states = deeper_states; - deeper_states = _states; - continue; - } - - break; - } - - analysis_state_set__clear(&next_states, &state_pool); - for (unsigned j = 0; j < states.size; j++) { - AnalysisState * const state = states.contents[j]; - - // For efficiency, it's important to avoid processing the same analysis state more - // than once. To achieve this, keep the states in order of ascending position within - // their hypothetical syntax trees. 
In each iteration of this loop, start by advancing - // the states that have made the least progress. Avoid advancing states that have already - // made more progress. - if (next_states.size > 0) { - int comparison = analysis_state__compare_position( - &state, - array_back(&next_states) - ); - if (comparison == 0) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Skip iteration for state %u\n", j); - #endif - analysis_state_set__insert_sorted_by_clone(&next_states, &state_pool, state); - continue; - } else if (comparison > 0) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Terminate iteration at state %u\n", j); - #endif - while (j < states.size) { - analysis_state_set__push_by_clone( - &next_states, - &state_pool, - states.contents[j] - ); - j++; - } - break; - } - } - - const TSStateId parse_state = analysis_state__top(state)->parse_state; - const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol; - const TSFieldId parent_field_id = analysis_state__top(state)->field_id; - const unsigned child_index = analysis_state__top(state)->child_index; - const QueryStep * const step = &self->steps.contents[state->step_index]; - - unsigned subgraph_index, exists; - array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists); - if (!exists) continue; - const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index]; - - // Follow every possible path in the parse table, but only visit states that - // are part of the subgraph for the current symbol. 
- LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state); - while (ts_lookahead_iterator_next(&lookahead_iterator)) { - TSSymbol sym = lookahead_iterator.symbol; - - AnalysisSubgraphNode successor = { - .state = parse_state, - .child_index = child_index, - }; - if (lookahead_iterator.action_count) { - const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1]; - if (action->type == TSParseActionTypeShift) { - if (!action->shift.extra) { - successor.state = action->shift.state; - successor.child_index++; - } - } else { - continue; - } - } else if (lookahead_iterator.next_state != 0) { - successor.state = lookahead_iterator.next_state; - successor.child_index++; - } else { - continue; - } - - unsigned node_index; - array_search_sorted_with( - &subgraph->nodes, - analysis_subgraph_node__compare, &successor, - &node_index, &exists - ); - while (node_index < subgraph->nodes.size) { - AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++]; - if (node->state != successor.state || node->child_index != successor.child_index) break; - - // Use the subgraph to determine what alias and field will eventually be applied - // to this child node. - TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index); - TSSymbol visible_symbol = alias - ? alias - : self->language->symbol_metadata[sym].visible - ? self->language->public_symbol_map[sym] - : 0; - TSFieldId field_id = parent_field_id; - if (!field_id) { - const TSFieldMapEntry *field_map, *field_map_end; - ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end); - for (; field_map != field_map_end; field_map++) { - if (!field_map->inherited && field_map->child_index == child_index) { - field_id = field_map->field_id; - break; - } - } - } - - // Create a new state that has advanced past this hypothetical subtree. 
- AnalysisState next_state = *state; - AnalysisStateEntry *next_state_top = analysis_state__top(&next_state); - next_state_top->child_index = successor.child_index; - next_state_top->parse_state = successor.state; - if (node->done) next_state_top->done = true; - - // Determine if this hypothetical child node would match the current step - // of the query pattern. - bool does_match = false; - if (visible_symbol) { - does_match = true; - if (step->symbol == WILDCARD_SYMBOL) { - if ( - step->is_named && - !self->language->symbol_metadata[visible_symbol].named - ) does_match = false; - } else if (step->symbol != visible_symbol) { - does_match = false; - } - if (step->field && step->field != field_id) { - does_match = false; - } - if ( - step->supertype_symbol && - !analysis_state__has_supertype(state, step->supertype_symbol) - ) does_match = false; - } - - // If this child is hidden, then descend into it and walk through its children. - // If the top entry of the stack is at the end of its rule, then that entry can - // be replaced. Otherwise, push a new entry onto the stack. - else if (sym >= self->language->token_count) { - if (!next_state_top->done) { - if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) { - #ifdef DEBUG_ANALYZE_QUERY - printf("Exceeded depth limit for state %u\n", j); - #endif - - did_abort_analysis = true; - continue; - } - - next_state.depth++; - next_state_top = analysis_state__top(&next_state); - } - - *next_state_top = (AnalysisStateEntry) { - .parse_state = parse_state, - .parent_symbol = sym, - .child_index = 0, - .field_id = field_id, - .done = false, - }; - - if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) { - analysis_state_set__insert_sorted_by_clone( - &deeper_states, - &state_pool, - &next_state - ); - continue; - } - } - - // Pop from the stack when this state reached the end of its current syntax node. 
- while (next_state.depth > 0 && next_state_top->done) { - next_state.depth--; - next_state_top = analysis_state__top(&next_state); - } - - // If this hypothetical child did match the current step of the query pattern, - // then advance to the next step at the current depth. This involves skipping - // over any descendant steps of the current child. - const QueryStep *next_step = step; - if (does_match) { - for (;;) { - next_state.step_index++; - next_step = &self->steps.contents[next_state.step_index]; - if ( - next_step->depth == PATTERN_DONE_MARKER || - next_step->depth <= parent_depth + 1 - ) break; - } - } else if (successor.state == parse_state) { - continue; - } - - for (;;) { - // Skip pass-through states. Although these states have alternatives, they are only - // used to implement repetitions, and query analysis does not need to process - // repetitions in order to determine whether steps are possible and definite. - if (next_step->is_pass_through) { - next_state.step_index++; - next_step++; - continue; - } - - // If the pattern is finished or hypothetical parent node is complete, then - // record that matching can terminate at this step of the pattern. Otherwise, - // add this state to the list of states to process on the next iteration. - if (!next_step->is_dead_end) { - bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1; - if (did_finish_pattern) can_finish_pattern = true; - if (did_finish_pattern || next_state.depth == 0) { - array_insert_sorted_by(&final_step_indices, , next_state.step_index); - } else { - analysis_state_set__insert_sorted_by_clone(&next_states, &state_pool, &next_state); - } - } - - // If the state has advanced to a step with an alternative step, then add another state - // at that alternative step. This process is simpler than the process of actually matching a - // pattern during query execution, because for the purposes of query analysis, there is no - // need to process repetitions. 
- if ( - does_match && - next_step->alternative_index != NONE && - next_step->alternative_index > next_state.step_index - ) { - next_state.step_index = next_step->alternative_index; - next_step = &self->steps.contents[next_state.step_index]; - } else { - break; - } - } - } - } - } + #ifdef DEBUG_ANALYZE_QUERY + printf( + "\nWalk states for %s:\n", + ts_language_symbol_name(self->language, analysis.states.contents[0]->stack[0].parent_symbol) + ); + #endif - AnalysisStateSet _states = states; - states = next_states; - next_states = _states; - } + analysis.did_abort = false; + ts_query__perform_analysis(self, &subgraphs, &analysis); // If this pattern could not be fully analyzed, then every step should // be considered fallible. - if (did_abort_analysis) { + if (analysis.did_abort) { for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) { QueryStep *step = &self->steps.contents[j]; if ( @@ -1686,9 +1740,9 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // If this pattern cannot match, store the pattern index so that it can be // returned to the caller. - if (!can_finish_pattern) { - assert(final_step_indices.size > 0); - uint16_t impossible_step_index = *array_back(&final_step_indices); + if (analysis.finished_parent_symbols.size == 0) { + assert(analysis.final_step_indices.size > 0); + uint16_t impossible_step_index = *array_back(&analysis.final_step_indices); uint32_t i, exists; array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists); if (i >= self->step_offsets.size) i = self->step_offsets.size - 1; @@ -1699,8 +1753,8 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { // Mark as fallible any step where a match terminated. // Later, this property will be propagated to all of the step's predecessors. 
- for (unsigned j = 0; j < final_step_indices.size; j++) { - uint32_t final_step_index = final_step_indices.contents[j]; + for (unsigned j = 0; j < analysis.final_step_indices.size; j++) { + uint32_t final_step_index = analysis.final_step_indices.contents[j]; QueryStep *step = &self->steps.contents[final_step_index]; if ( step->depth != PATTERN_DONE_MARKER && @@ -1810,21 +1864,81 @@ static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) { } #endif + // Determine which repetition symbols in this language have the possibility + // of matching non-rooted patterns in this query. These repetition symbols + // prevent certain optimizations with range restrictions. + analysis.did_abort = false; + for (uint32_t i = 0; i < non_rooted_pattern_start_steps.size; i++) { + uint16_t pattern_entry_index = non_rooted_pattern_start_steps.contents[i]; + PatternEntry *pattern_entry = &self->pattern_map.contents[pattern_entry_index]; + + analysis_state_set__clear(&analysis.states, &analysis.state_pool); + analysis_state_set__clear(&analysis.deeper_states, &analysis.state_pool); + for (unsigned j = 0; j < subgraphs.size; j++) { + AnalysisSubgraph *subgraph = &subgraphs.contents[j]; + TSSymbolMetadata metadata = ts_language_symbol_metadata(self->language, subgraph->symbol); + if (metadata.visible || metadata.named) continue; + + for (uint32_t k = 0; k < subgraph->start_states.size; k++) { + TSStateId parse_state = subgraph->start_states.contents[k]; + analysis_state_set__push(&analysis.states, &analysis.state_pool, &((AnalysisState) { + .step_index = pattern_entry->step_index, + .stack = { + [0] = { + .parse_state = parse_state, + .parent_symbol = subgraph->symbol, + .child_index = 0, + .field_id = 0, + .done = false, + }, + }, + .root_symbol = subgraph->symbol, + .depth = 1, + })); + } + } + + #ifdef DEBUG_ANALYZE_QUERY + printf("\nWalk states for rootless pattern step %u:\n", step_index); + #endif + + ts_query__perform_analysis( + self, + &subgraphs, + &analysis 
+ ); + + if (analysis.finished_parent_symbols.size > 0) { + self->patterns.contents[pattern_entry->pattern_index].is_non_local = true; + } + + for (unsigned k = 0; k < analysis.finished_parent_symbols.size; k++) { + TSSymbol symbol = analysis.finished_parent_symbols.contents[k]; + array_insert_sorted_by(&self->repeat_symbols_with_rootless_patterns, , symbol); + } + } + + #ifdef DEBUG_ANALYZE_QUERY + if (self->repeat_symbols_with_rootless_patterns.size > 0) { + printf("\nRepetition symbols with rootless patterns:\n"); + printf("aborted analysis: %d\n", analysis.did_abort); + for (unsigned i = 0; i < self->repeat_symbols_with_rootless_patterns.size; i++) { + TSSymbol symbol = self->repeat_symbols_with_rootless_patterns.contents[i]; + printf(" %u, %s\n", symbol, ts_language_symbol_name(self->language, symbol)); + } + printf("\n"); + } + #endif + // Cleanup for (unsigned i = 0; i < subgraphs.size; i++) { array_delete(&subgraphs.contents[i].start_states); array_delete(&subgraphs.contents[i].nodes); } array_delete(&subgraphs); - for (unsigned i = 0; i < state_pool.size; i++) { - ts_free(state_pool.contents[i]); - } - array_delete(&state_pool); + query_analysis__delete(&analysis); array_delete(&next_nodes); - analysis_state_set__delete(&states); - analysis_state_set__delete(&next_states); - analysis_state_set__delete(&deeper_states); - array_delete(&final_step_indices); + array_delete(&non_rooted_pattern_start_steps); array_delete(&parent_step_indices); array_delete(&predicate_capture_ids); state_predecessor_map_delete(&predecessor_map); @@ -1918,11 +2032,11 @@ static TSQueryError ts_query__parse_string_literal( prev_position = stream->input + stream->next_size; } else { if (stream->next == '\\') { - array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + array_extend(&self->string_buffer, (uint32_t)(stream->input - prev_position), prev_position); prev_position = stream->input + 1; is_escaped = true; } else if (stream->next == '"') { - 
array_extend(&self->string_buffer, (stream->input - prev_position), prev_position); + array_extend(&self->string_buffer, (uint32_t)(stream->input - prev_position), prev_position); stream_advance(stream); return TSQueryErrorNone; } else if (stream->next == '\n') { @@ -2571,6 +2685,7 @@ TSQuery *ts_query_new( .step_offsets = array_new(), .string_buffer = array_new(), .negated_fields = array_new(), + .repeat_symbols_with_rootless_patterns = array_new(), .wildcard_root_pattern_count = 0, .language = language, }; @@ -2588,6 +2703,7 @@ TSQuery *ts_query_new( .steps = (Slice) {.offset = start_step_index}, .predicate_steps = (Slice) {.offset = start_predicate_step_index}, .start_byte = stream_offset(&stream), + .is_non_local = false, })); CaptureQuantifiers capture_quantifiers = capture_quantifiers_new(); *error_type = ts_query__parse_pattern(self, &stream, 0, false, &capture_quantifiers); @@ -2685,6 +2801,7 @@ void ts_query_delete(TSQuery *self) { array_delete(&self->step_offsets); array_delete(&self->string_buffer); array_delete(&self->negated_fields); + array_delete(&self->repeat_symbols_with_rootless_patterns); symbol_table_delete(&self->captures); symbol_table_delete(&self->predicate_values); for (uint32_t index = 0; index < self->capture_quantifiers.size; index++) { @@ -2766,6 +2883,17 @@ bool ts_query_is_pattern_rooted( return true; } +bool ts_query_is_pattern_non_local( + const TSQuery *self, + uint32_t pattern_index +) { + if (pattern_index < self->patterns.size) { + return self->patterns.contents[pattern_index].is_non_local; + } else { + return false; + } +} + bool ts_query_is_pattern_guaranteed_at_step( const TSQuery *self, uint32_t byte_offset @@ -2880,6 +3008,7 @@ void ts_query_cursor_exec( array_clear(&self->finished_states); ts_tree_cursor_reset(&self->cursor, node); capture_list_pool_reset(&self->capture_list_pool); + self->on_visible_node = true; self->next_state_id = 0; self->depth = 0; self->ascending = false; @@ -3214,6 +3343,50 @@ static QueryState 
*ts_query_cursor__copy_state( return &self->states.contents[state_index + 1]; } +static inline bool ts_query_cursor__should_descend_outside_of_range( + TSQueryCursor *self +) { + // If there are in-progress matches whose remaining steps occur + // deeper in the tree, then descend. + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i];; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if ( + next_step->depth != PATTERN_DONE_MARKER && + state->start_depth + next_step->depth > self->depth + ) { + return true; + } + } + + // If the current node is hidden, then a non-rooted pattern might match + // one if its roots inside of this node, and match another of its roots + // as part of a sibling node, so we may need to descend. + if (!self->on_visible_node) { + // Descending into a repetition node outside of the range can be + // expensive, because these nodes can have many visible children. + // Avoid descending into repetition nodes unless we have already + // determined that this query can match rootless patterns inside + // of this type of repetition node. + Subtree subtree = ts_tree_cursor_current_subtree(&self->cursor); + if (ts_subtree_is_repetition(subtree)) { + bool exists; + uint32_t index; + array_search_sorted_by( + &self->query->repeat_symbols_with_rootless_patterns,, + ts_subtree_symbol(subtree), + &index, + &exists + ); + return exists; + } + + return true; + } + + return false; +} + // Walk the tree, processing patterns until at least one pattern finishes, // If one or more patterns finish, return `true` and store their states in the // `finished_states` array. Multiple patterns can finish on the same node. If @@ -3238,61 +3411,80 @@ static inline bool ts_query_cursor__advance( // Exit the current node. if (self->ascending) { - LOG( - "leave node. 
depth:%u, type:%s\n", - self->depth, - ts_node_type(ts_tree_cursor_current_node(&self->cursor)) - ); + if (self->on_visible_node) { + LOG( + "leave node. depth:%u, type:%s\n", + self->depth, + ts_node_type(ts_tree_cursor_current_node(&self->cursor)) + ); + } // Leave this node by stepping to its next sibling or to its parent. - if (ts_tree_cursor_goto_next_sibling(&self->cursor)) { - self->ascending = false; - } else if (ts_tree_cursor_goto_parent(&self->cursor)) { - self->depth--; - } else { - LOG("halt at root\n"); - self->halted = true; + switch (ts_tree_cursor_goto_next_sibling_internal(&self->cursor)) { + case TreeCursorStepVisible: + if (!self->on_visible_node) { + self->depth++; + self->on_visible_node = true; + } + self->ascending = false; + break; + case TreeCursorStepHidden: + if (self->on_visible_node) { + self->depth--; + self->on_visible_node = false; + } + self->ascending = false; + break; + default: + if (ts_tree_cursor_goto_parent(&self->cursor)) { + self->depth--; + } else { + LOG("halt at root\n"); + self->halted = true; + } } - // After leaving a node, remove any states that cannot make further progress. - uint32_t deleted_count = 0; - for (unsigned i = 0, n = self->states.size; i < n; i++) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - - // If a state completed its pattern inside of this node, but was deferred from finishing - // in order to search for longer matches, mark it as finished. - if (step->depth == PATTERN_DONE_MARKER) { - if (state->start_depth > self->depth || self->halted) { - LOG(" finish pattern %u\n", state->pattern_index); - array_push(&self->finished_states, *state); - did_match = true; + if (self->on_visible_node) { + // After leaving a node, remove any states that cannot make further progress. 
+ uint32_t deleted_count = 0; + for (unsigned i = 0, n = self->states.size; i < n; i++) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + + // If a state completed its pattern inside of this node, but was deferred from finishing + // in order to search for longer matches, mark it as finished. + if (step->depth == PATTERN_DONE_MARKER) { + if (state->start_depth > self->depth || self->halted) { + LOG(" finish pattern %u\n", state->pattern_index); + array_push(&self->finished_states, *state); + did_match = true; + deleted_count++; + continue; + } + } + + // If a state needed to match something within this node, then remove that state + // as it has failed to match. + else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { + LOG( + " failed to match. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); deleted_count++; continue; } - } - - // If a state needed to match something within this node, then remove that state - // as it has failed to match. - else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) { - LOG( - " failed to match. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - deleted_count++; - continue; - } - if (deleted_count > 0) { - self->states.contents[i - deleted_count] = *state; + if (deleted_count > 0) { + self->states.contents[i - deleted_count] = *state; + } } + self->states.size -= deleted_count; } - self->states.size -= deleted_count; } // Enter a new node. @@ -3300,413 +3492,410 @@ static inline bool ts_query_cursor__advance( // Get the properties of the current node. 
TSNode node = ts_tree_cursor_current_node(&self->cursor); TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor); - TSSymbol symbol = ts_node_symbol(node); - bool is_named = ts_node_is_named(node); - bool has_later_siblings; - bool has_later_named_siblings; - bool can_have_later_siblings_with_this_field; - TSFieldId field_id = 0; - TSSymbol supertypes[8] = {0}; - unsigned supertype_count = 8; - ts_tree_cursor_current_status( - &self->cursor, - &field_id, - &has_later_siblings, - &has_later_named_siblings, - &can_have_later_siblings_with_this_field, - supertypes, - &supertype_count + bool parent_precedes_range = !ts_node_is_null(parent_node) && ( + ts_node_end_byte(parent_node) <= self->start_byte || + point_lte(ts_node_end_point(parent_node), self->start_point) ); - LOG( - "enter node. depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", - self->depth, - ts_node_type(node), - ts_language_field_name_for_id(self->query->language, field_id), - ts_node_start_point(node).row, - self->states.size, - self->finished_states.size + bool parent_follows_range = !ts_node_is_null(parent_node) && ( + ts_node_start_byte(parent_node) >= self->end_byte || + point_gte(ts_node_start_point(parent_node), self->end_point) ); - - bool node_intersects_range = ( - ts_node_end_byte(node) > self->start_byte && - ts_node_start_byte(node) < self->end_byte && - point_gt(ts_node_end_point(node), self->start_point) && - point_lt(ts_node_start_point(node), self->end_point) + bool node_precedes_range = parent_precedes_range || ( + ts_node_end_byte(node) <= self->start_byte || + point_lte(ts_node_end_point(node), self->start_point) ); - bool parent_intersects_range = ts_node_is_null(parent_node) || ( - ts_node_end_byte(parent_node) > self->start_byte && - ts_node_start_byte(parent_node) < self->end_byte && - point_gt(ts_node_end_point(parent_node), self->start_point) && - point_lt(ts_node_start_point(parent_node), self->end_point) + bool node_follows_range = 
parent_follows_range || ( + ts_node_start_byte(node) >= self->end_byte || + point_gte(ts_node_start_point(node), self->end_point) ); - bool node_is_error = symbol == ts_builtin_sym_error; - bool parent_is_error = - !ts_node_is_null(parent_node) && - ts_node_symbol(parent_node) == ts_builtin_sym_error; - - // Add new states for any patterns whose root node is a wildcard. - if (!node_is_error) { - for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; + bool parent_intersects_range = !parent_precedes_range && !parent_follows_range; + bool node_intersects_range = !node_precedes_range && !node_follows_range; + + if (self->on_visible_node) { + TSSymbol symbol = ts_node_symbol(node); + bool is_named = ts_node_is_named(node); + bool has_later_siblings; + bool has_later_named_siblings; + bool can_have_later_siblings_with_this_field; + TSFieldId field_id = 0; + TSSymbol supertypes[8] = {0}; + unsigned supertype_count = 8; + ts_tree_cursor_current_status( + &self->cursor, + &field_id, + &has_later_siblings, + &has_later_named_siblings, + &can_have_later_siblings_with_this_field, + supertypes, + &supertype_count + ); + LOG( + "enter node. depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n", + self->depth, + ts_node_type(node), + ts_language_field_name_for_id(self->query->language, field_id), + ts_node_start_point(node).row, + self->states.size, + self->finished_states.size + ); - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - if ( - (pattern->is_rooted ? 
- node_intersects_range : - (parent_intersects_range && !parent_is_error)) && - (!step->field || field_id == step->field) && - (!step->supertype_symbol || supertype_count > 0) - ) { - ts_query_cursor__add_state(self, pattern); + bool node_is_error = symbol == ts_builtin_sym_error; + bool parent_is_error = + !ts_node_is_null(parent_node) && + ts_node_symbol(parent_node) == ts_builtin_sym_error; + + // Add new states for any patterns whose root node is a wildcard. + if (!node_is_error) { + for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) { + PatternEntry *pattern = &self->query->pattern_map.contents[i]; + + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + if ( + (pattern->is_rooted ? + node_intersects_range : + (parent_intersects_range && !parent_is_error)) && + (!step->field || field_id == step->field) && + (!step->supertype_symbol || supertype_count > 0) + ) { + ts_query_cursor__add_state(self, pattern); + } } } - } - - // Add new states for any patterns whose root node matches this node. - unsigned i; - if (ts_query__pattern_map_search(self->query, symbol, &i)) { - PatternEntry *pattern = &self->query->pattern_map.contents[i]; - - QueryStep *step = &self->query->steps.contents[pattern->step_index]; - do { - // If this node matches the first step of the pattern, then add a new - // state at the start of this pattern. - if ( - (pattern->is_rooted ? - node_intersects_range : - (parent_intersects_range && !parent_is_error)) && - (!step->field || field_id == step->field) - ) { - ts_query_cursor__add_state(self, pattern); - } - // Advance to the next pattern whose root node matches this node. 
- i++; - if (i == self->query->pattern_map.size) break; - pattern = &self->query->pattern_map.contents[i]; - step = &self->query->steps.contents[pattern->step_index]; - } while (step->symbol == symbol); - } + // Add new states for any patterns whose root node matches this node. + unsigned i; + if (ts_query__pattern_map_search(self->query, symbol, &i)) { + PatternEntry *pattern = &self->query->pattern_map.contents[i]; - // Update all of the in-progress states with current node. - for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { - QueryState *state = &self->states.contents[i]; - QueryStep *step = &self->query->steps.contents[state->step_index]; - state->has_in_progress_alternatives = false; - copy_count = 0; - - // Check that the node matches all of the criteria for the next - // step of the pattern. - if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; - - // Determine if this node matches this step of the pattern, and also - // if this node can have later siblings that match this step of the - // pattern. - bool node_does_match = false; - if (step->symbol == WILDCARD_SYMBOL) { - node_does_match = !node_is_error && (is_named || !step->is_named); - } else { - node_does_match = symbol == step->symbol; - } - bool later_sibling_can_match = has_later_siblings; - if ((step->is_immediate && is_named) || state->seeking_immediate_match) { - later_sibling_can_match = false; - } - if (step->is_last_child && has_later_named_siblings) { - node_does_match = false; - } - if (step->supertype_symbol) { - bool has_supertype = false; - for (unsigned j = 0; j < supertype_count; j++) { - if (supertypes[j] == step->supertype_symbol) { - has_supertype = true; - break; + QueryStep *step = &self->query->steps.contents[pattern->step_index]; + do { + // If this node matches the first step of the pattern, then add a new + // state at the start of this pattern. + if ( + (pattern->is_rooted ? 
+ node_intersects_range : + (parent_intersects_range && !parent_is_error)) && + (!step->field || field_id == step->field) + ) { + ts_query_cursor__add_state(self, pattern); } - } - if (!has_supertype) node_does_match = false; + + // Advance to the next pattern whose root node matches this node. + i++; + if (i == self->query->pattern_map.size) break; + pattern = &self->query->pattern_map.contents[i]; + step = &self->query->steps.contents[pattern->step_index]; + } while (step->symbol == symbol); } - if (step->field) { - if (step->field == field_id) { - if (!can_have_later_siblings_with_this_field) { - later_sibling_can_match = false; - } + + // Update all of the in-progress states with current node. + for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) { + QueryState *state = &self->states.contents[i]; + QueryStep *step = &self->query->steps.contents[state->step_index]; + state->has_in_progress_alternatives = false; + copy_count = 0; + + // Check that the node matches all of the criteria for the next + // step of the pattern. + if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue; + + // Determine if this node matches this step of the pattern, and also + // if this node can have later siblings that match this step of the + // pattern. 
+ bool node_does_match = false; + if (step->symbol == WILDCARD_SYMBOL) { + node_does_match = !node_is_error && (is_named || !step->is_named); } else { + node_does_match = symbol == step->symbol; + } + bool later_sibling_can_match = has_later_siblings; + if ((step->is_immediate && is_named) || state->seeking_immediate_match) { + later_sibling_can_match = false; + } + if (step->is_last_child && has_later_named_siblings) { node_does_match = false; } - } - - if (step->negated_field_list_id) { - TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; - for (;;) { - TSFieldId negated_field_id = *negated_field_ids; - if (negated_field_id) { - negated_field_ids++; - if (ts_node_child_by_field_id(node, negated_field_id).id) { - node_does_match = false; + if (step->supertype_symbol) { + bool has_supertype = false; + for (unsigned j = 0; j < supertype_count; j++) { + if (supertypes[j] == step->supertype_symbol) { + has_supertype = true; break; } + } + if (!has_supertype) node_does_match = false; + } + if (step->field) { + if (step->field == field_id) { + if (!can_have_later_siblings_with_this_field) { + later_sibling_can_match = false; + } } else { - break; + node_does_match = false; } } - } - // Remove states immediately if it is ever clear that they cannot match. - if (!node_does_match) { - if (!later_sibling_can_match) { - LOG( - " discard state. 
pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release( - &self->capture_list_pool, - state->capture_list_id - ); - array_erase(&self->states, i); - i--; + if (step->negated_field_list_id) { + TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id]; + for (;;) { + TSFieldId negated_field_id = *negated_field_ids; + if (negated_field_id) { + negated_field_ids++; + if (ts_node_child_by_field_id(node, negated_field_id).id) { + node_does_match = false; + break; + } + } else { + break; + } + } } - continue; - } - // Some patterns can match their root node in multiple ways, capturing different - // children. If this pattern step could match later children within the same - // parent, then this query state cannot simply be updated in place. It must be - // split into two states: one that matches this node, and one which skips over - // this node, to preserve the possibility of matching later siblings. - if (later_sibling_can_match && ( - step->contains_captures || - ts_query__step_is_fallible(self->query, state->step_index) - )) { - if (ts_query_cursor__copy_state(self, &state)) { - LOG( - " split state for capture. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); - copy_count++; + // Remove states immediately if it is ever clear that they cannot match. + if (!node_does_match) { + if (!later_sibling_can_match) { + LOG( + " discard state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release( + &self->capture_list_pool, + state->capture_list_id + ); + array_erase(&self->states, i); + i--; + } + continue; } - } - // If this pattern started with a wildcard, such that the pattern map - // actually points to the *second* step of the pattern, then check - // that the node has a parent, and capture the parent node if necessary. 
- if (state->needs_parent) { - TSNode parent = ts_tree_cursor_parent_node(&self->cursor); - if (ts_node_is_null(parent)) { - LOG(" missing parent node\n"); - state->dead = true; - } else { - state->needs_parent = false; - QueryStep *skipped_wildcard_step = step; - do { - skipped_wildcard_step--; - } while ( - skipped_wildcard_step->is_dead_end || - skipped_wildcard_step->is_pass_through || - skipped_wildcard_step->depth > 0 - ); - if (skipped_wildcard_step->capture_ids[0] != NONE) { - LOG(" capture wildcard parent\n"); - ts_query_cursor__capture( - self, - state, - skipped_wildcard_step, - parent + // Some patterns can match their root node in multiple ways, capturing different + // children. If this pattern step could match later children within the same + // parent, then this query state cannot simply be updated in place. It must be + // split into two states: one that matches this node, and one which skips over + // this node, to preserve the possibility of matching later siblings. + if (later_sibling_can_match && ( + step->contains_captures || + ts_query__step_is_fallible(self->query, state->step_index) + )) { + if (ts_query_cursor__copy_state(self, &state)) { + LOG( + " split state for capture. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index ); + copy_count++; } } - } - // If the current node is captured in this pattern, add it to the capture list. - if (step->capture_ids[0] != NONE) { - ts_query_cursor__capture(self, state, step, node); - } + // If this pattern started with a wildcard, such that the pattern map + // actually points to the *second* step of the pattern, then check + // that the node has a parent, and capture the parent node if necessary. 
+ if (state->needs_parent) { + TSNode parent = ts_tree_cursor_parent_node(&self->cursor); + if (ts_node_is_null(parent)) { + LOG(" missing parent node\n"); + state->dead = true; + } else { + state->needs_parent = false; + QueryStep *skipped_wildcard_step = step; + do { + skipped_wildcard_step--; + } while ( + skipped_wildcard_step->is_dead_end || + skipped_wildcard_step->is_pass_through || + skipped_wildcard_step->depth > 0 + ); + if (skipped_wildcard_step->capture_ids[0] != NONE) { + LOG(" capture wildcard parent\n"); + ts_query_cursor__capture( + self, + state, + skipped_wildcard_step, + parent + ); + } + } + } - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } + // If the current node is captured in this pattern, add it to the capture list. + if (step->capture_ids[0] != NONE) { + ts_query_cursor__capture(self, state, step, node); + } - // Advance this state to the next step of its pattern. - state->step_index++; - state->seeking_immediate_match = false; - LOG( - " advance state. pattern:%u, step:%u\n", - state->pattern_index, - state->step_index - ); + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; + // Advance this state to the next step of its pattern. + state->step_index++; + state->seeking_immediate_match = false; + LOG( + " advance state. pattern:%u, step:%u\n", + state->pattern_index, + state->step_index + ); - // If this state's next step has an alternative step, then copy the state in order - // to pursue both alternatives. The alternative step itself may have an alternative, - // so this is an interactive process. 
- unsigned end_index = i + 1; - for (unsigned j = i; j < end_index; j++) { - QueryState *state = &self->states.contents[j]; QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->alternative_index != NONE) { - // A "dead-end" step exists only to add a non-sequential jump into the step sequence, - // via its alternative index. When a state reaches a dead-end step, it jumps straight - // to the step's alternative. - if (next_step->is_dead_end) { - state->step_index = next_step->alternative_index; - j--; - continue; - } + if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true; + + // If this state's next step has an alternative step, then copy the state in order + // to pursue both alternatives. The alternative step itself may have an alternative, + // so this is an interactive process. + unsigned end_index = i + 1; + for (unsigned j = i; j < end_index; j++) { + QueryState *state = &self->states.contents[j]; + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->alternative_index != NONE) { + // A "dead-end" step exists only to add a non-sequential jump into the step sequence, + // via its alternative index. When a state reaches a dead-end step, it jumps straight + // to the step's alternative. + if (next_step->is_dead_end) { + state->step_index = next_step->alternative_index; + j--; + continue; + } - // A "pass-through" step exists only to add a branch into the step sequence, - // via its alternative_index. When a state reaches a pass-through step, it splits - // in order to process the alternative step, and then it advances to the next step. - if (next_step->is_pass_through) { - state->step_index++; - j--; - } + // A "pass-through" step exists only to add a branch into the step sequence, + // via its alternative_index. When a state reaches a pass-through step, it splits + // in order to process the alternative step, and then it advances to the next step. 
+ if (next_step->is_pass_through) { + state->step_index++; + j--; + } - QueryState *copy = ts_query_cursor__copy_state(self, &state); - if (copy) { - LOG( - " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", - copy->pattern_index, - copy->step_index, - next_step->alternative_index, - next_step->alternative_is_immediate, - capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size - ); - end_index++; - copy_count++; - copy->step_index = next_step->alternative_index; - if (next_step->alternative_is_immediate) { - copy->seeking_immediate_match = true; + QueryState *copy = ts_query_cursor__copy_state(self, &state); + if (copy) { + LOG( + " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n", + copy->pattern_index, + copy->step_index, + next_step->alternative_index, + next_step->alternative_is_immediate, + capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size + ); + end_index++; + copy_count++; + copy->step_index = next_step->alternative_index; + if (next_step->alternative_is_immediate) { + copy->seeking_immediate_match = true; + } } } } } - } - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i]; - if (state->dead) { - array_erase(&self->states, i); - i--; - continue; - } + for (unsigned i = 0; i < self->states.size; i++) { + QueryState *state = &self->states.contents[i]; + if (state->dead) { + array_erase(&self->states, i); + i--; + continue; + } - // Enfore the longest-match criteria. When a query pattern contains optional or - // repeated nodes, this is necessary to avoid multiple redundant states, where - // one state has a strict subset of another state's captures. - bool did_remove = false; - for (unsigned j = i + 1; j < self->states.size; j++) { - QueryState *other_state = &self->states.contents[j]; - - // Query states are kept in ascending order of start_depth and pattern_index. 
- // Since the longest-match criteria is only used for deduping matches of the same - // pattern and root node, we only need to perform pairwise comparisons within a - // small slice of the states array. - if ( - other_state->start_depth != state->start_depth || - other_state->pattern_index != state->pattern_index - ) break; - - bool left_contains_right, right_contains_left; - ts_query_cursor__compare_captures( - self, - state, - other_state, - &left_contains_right, - &right_contains_left - ); - if (left_contains_right) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); - array_erase(&self->states, j); - j--; - continue; + // Enfore the longest-match criteria. When a query pattern contains optional or + // repeated nodes, this is necessary to avoid multiple redundant states, where + // one state has a strict subset of another state's captures. + bool did_remove = false; + for (unsigned j = i + 1; j < self->states.size; j++) { + QueryState *other_state = &self->states.contents[j]; + + // Query states are kept in ascending order of start_depth and pattern_index. + // Since the longest-match criteria is only used for deduping matches of the same + // pattern and root node, we only need to perform pairwise comparisons within a + // small slice of the states array. + if ( + other_state->start_depth != state->start_depth || + other_state->pattern_index != state->pattern_index + ) break; + + bool left_contains_right, right_contains_left; + ts_query_cursor__compare_captures( + self, + state, + other_state, + &left_contains_right, + &right_contains_left + ); + if (left_contains_right) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. 
pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id); + array_erase(&self->states, j); + j--; + continue; + } + other_state->has_in_progress_alternatives = true; } - other_state->has_in_progress_alternatives = true; - } - if (right_contains_left) { - if (state->step_index == other_state->step_index) { - LOG( - " drop shorter state. pattern: %u, step_index: %u\n", - state->pattern_index, - state->step_index - ); - capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); - array_erase(&self->states, i); - i--; - did_remove = true; - break; + if (right_contains_left) { + if (state->step_index == other_state->step_index) { + LOG( + " drop shorter state. pattern: %u, step_index: %u\n", + state->pattern_index, + state->step_index + ); + capture_list_pool_release(&self->capture_list_pool, state->capture_list_id); + array_erase(&self->states, i); + i--; + did_remove = true; + break; + } + state->has_in_progress_alternatives = true; } - state->has_in_progress_alternatives = true; } - } - // If the state is at the end of its pattern, remove it from the list - // of in-progress states and add it to the list of finished states. - if (!did_remove) { - LOG( - " keep state. 
pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", - state->pattern_index, - state->start_depth, - state->step_index, - capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size - ); - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if (next_step->depth == PATTERN_DONE_MARKER) { - if (state->has_in_progress_alternatives) { - LOG(" defer finishing pattern %u\n", state->pattern_index); - } else { - LOG(" finish pattern %u\n", state->pattern_index); - array_push(&self->finished_states, *state); - array_erase(&self->states, state - self->states.contents); - did_match = true; - i--; + // If the state is at the end of its pattern, remove it from the list + // of in-progress states and add it to the list of finished states. + if (!did_remove) { + LOG( + " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n", + state->pattern_index, + state->start_depth, + state->step_index, + capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size + ); + QueryStep *next_step = &self->query->steps.contents[state->step_index]; + if (next_step->depth == PATTERN_DONE_MARKER) { + if (state->has_in_progress_alternatives) { + LOG(" defer finishing pattern %u\n", state->pattern_index); + } else { + LOG(" finish pattern %u\n", state->pattern_index); + array_push(&self->finished_states, *state); + array_erase(&self->states, (uint32_t)(state - self->states.contents)); + did_match = true; + i--; + } } } } } - // When the current node ends prior to the desired start offset, - // only descend for the purpose of continuing in-progress matches. 
- bool should_descend = node_intersects_range; - if (!should_descend) { - for (unsigned i = 0; i < self->states.size; i++) { - QueryState *state = &self->states.contents[i];; - QueryStep *next_step = &self->query->steps.contents[state->step_index]; - if ( - next_step->depth != PATTERN_DONE_MARKER && - state->start_depth + next_step->depth > self->depth - ) { - should_descend = true; + bool should_descend = + node_intersects_range || + ts_query_cursor__should_descend_outside_of_range(self); + if (should_descend) { + switch (ts_tree_cursor_goto_first_child_internal(&self->cursor)) { + case TreeCursorStepVisible: + self->depth++; + self->on_visible_node = true; + continue; + case TreeCursorStepHidden: + self->on_visible_node = false; + continue; + default: break; - } } } - if (!should_descend) { - LOG( - " not descending. node end byte: %u, start byte: %u\n", - ts_node_end_byte(node), - self->start_byte - ); - } - - if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) { - self->depth++; - } else { - self->ascending = true; - } + self->ascending = true; } } } diff --git a/stack.c b/stack.c index caad7b47..98e3a96f 100644 --- a/stack.c +++ b/stack.c @@ -326,7 +326,7 @@ inline StackSliceArray stack__iter( bool include_subtrees = false; if (goal_subtree_count >= 0) { include_subtrees = true; - array_reserve(&iterator.subtrees, ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree)); + array_reserve(&iterator.subtrees, (uint32_t)ts_subtree_alloc_size(goal_subtree_count) / sizeof(Subtree)); } array_push(&self->iterators, iterator); diff --git a/subtree.h b/subtree.h index 2c40922e..59cd7fd5 100644 --- a/subtree.h +++ b/subtree.h @@ -114,7 +114,7 @@ typedef struct { Length size; uint32_t lookahead_bytes; uint32_t error_cost; - uint16_t child_count; + uint32_t child_count; TSSymbol symbol; TSStateId parse_state; @@ -291,6 +291,12 @@ static inline uint32_t ts_subtree_repeat_depth(Subtree self) { return self.data.is_inline ? 
0 : self.ptr->repeat_depth; } +static inline uint32_t ts_subtree_is_repetition(Subtree self) { + return self.data.is_inline + ? 0 + : !self.ptr->named && !self.ptr->visible && self.ptr->child_count != 0; +} + static inline uint32_t ts_subtree_node_count(Subtree self) { return (self.data.is_inline || self.ptr->child_count == 0) ? 1 : self.ptr->node_count; } diff --git a/tree.c b/tree.c index 7be69b6f..b45f7fec 100644 --- a/tree.c +++ b/tree.c @@ -66,17 +66,23 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { range->end_point = POINT_MAX; } } - if (range->start_byte >= edit->old_end_byte) { - range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); - range->start_point = point_add( - edit->new_end_point, - point_sub(range->start_point, edit->old_end_point) - ); - if (range->start_byte < edit->new_end_byte) { - range->start_byte = UINT32_MAX; - range->start_point = POINT_MAX; - } + } else if (range->end_byte > edit->start_byte) { + range->end_byte = edit->start_byte; + range->end_point = edit->start_point; + } + if (range->start_byte >= edit->old_end_byte) { + range->start_byte = edit->new_end_byte + (range->start_byte - edit->old_end_byte); + range->start_point = point_add( + edit->new_end_point, + point_sub(range->start_point, edit->old_end_point) + ); + if (range->start_byte < edit->new_end_byte) { + range->start_byte = UINT32_MAX; + range->start_point = POINT_MAX; } + } else if (range->start_byte > edit->start_byte) { + range->start_byte = edit->start_byte; + range->start_point = edit->start_point; } } @@ -85,6 +91,13 @@ void ts_tree_edit(TSTree *self, const TSInputEdit *edit) { ts_subtree_pool_delete(&pool); } +TSRange *ts_tree_included_ranges(const TSTree *self, uint32_t *length) { + *length = self->included_range_count; + TSRange *ranges = ts_calloc(self->included_range_count, sizeof(TSRange)); + memcpy(ranges, self->included_ranges, self->included_range_count * sizeof(TSRange)); + return ranges; +} + TSRange 
*ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uint32_t *count) { TreeCursor cursor1 = {NULL, array_new()}; TreeCursor cursor2 = {NULL, array_new()}; @@ -110,6 +123,21 @@ TSRange *ts_tree_get_changed_ranges(const TSTree *self, const TSTree *other, uin return result; } -void ts_tree_print_dot_graph(const TSTree *self, FILE *file) { +#ifdef _WIN32 + +void ts_tree_print_dot_graph(const TSTree *self, int fd) { + (void)self; + (void)fd; +} + +#else + +#include + +void ts_tree_print_dot_graph(const TSTree *self, int fd) { + FILE *file = fdopen(dup(fd), "a"); ts_subtree_print_dot_graph(self->root, self->language, file); + fclose(file); } + +#endif diff --git a/tree.h b/tree.h index 0334b824..f012f888 100644 --- a/tree.h +++ b/tree.h @@ -1,6 +1,8 @@ #ifndef TREE_SITTER_TREE_H_ #define TREE_SITTER_TREE_H_ +#include "./subtree.h" + #ifdef __cplusplus extern "C" { #endif diff --git a/tree_cursor.c b/tree_cursor.c index 2833aa48..008d6638 100644 --- a/tree_cursor.c +++ b/tree_cursor.c @@ -98,34 +98,43 @@ void ts_tree_cursor_delete(TSTreeCursor *_self) { // TSTreeCursor - walking the tree -bool ts_tree_cursor_goto_first_child(TSTreeCursor *_self) { +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *_self) { TreeCursor *self = (TreeCursor *)_self; + bool visible; + TreeCursorEntry entry; + CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); + while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { + if (visible) { + array_push(&self->stack, entry); + return TreeCursorStepVisible; + } + if (ts_subtree_visible_child_count(*entry.subtree) > 0) { + array_push(&self->stack, entry); + return TreeCursorStepHidden; + } + } + return TreeCursorStepNone; +} - bool did_descend; - do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - if (visible) { - 
array_push(&self->stack, entry); +bool ts_tree_cursor_goto_first_child(TSTreeCursor *self) { + for (;;) { + switch (ts_tree_cursor_goto_first_child_internal(self)) { + case TreeCursorStepHidden: + continue; + case TreeCursorStepVisible: return true; - } - - if (ts_subtree_visible_child_count(*entry.subtree) > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } + default: + return false; } - } while (did_descend); - + } return false; } -int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t goal_byte) { +static inline int64_t ts_tree_cursor_goto_first_child_for_byte_and_point( + TSTreeCursor *_self, + uint32_t goal_byte, + TSPoint goal_point +) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; uint32_t visible_child_index = 0; @@ -138,16 +147,14 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t g TreeCursorEntry entry; CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - uint32_t end_byte = entry.position.bytes + ts_subtree_size(*entry.subtree).bytes; - bool at_goal = end_byte >= goal_byte; + Length entry_end = length_add(entry.position, ts_subtree_size(*entry.subtree)); + bool at_goal = entry_end.bytes >= goal_byte && point_gte(entry_end.extent, goal_point); uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); - if (at_goal) { if (visible) { array_push(&self->stack, entry); return visible_child_index; } - if (visible_child_count > 0) { array_push(&self->stack, entry); did_descend = true; @@ -165,45 +172,15 @@ int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *_self, uint32_t g return -1; } -int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *_self, TSPoint goal_point) { - TreeCursor *self = (TreeCursor *)_self; - uint32_t initial_size = self->stack.size; - uint32_t visible_child_index = 0; - - bool did_descend; - 
do { - did_descend = false; - - bool visible; - TreeCursorEntry entry; - CursorChildIterator iterator = ts_tree_cursor_iterate_children(self); - while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { - TSPoint end_point = point_add(entry.position.extent, ts_subtree_size(*entry.subtree).extent); - bool at_goal = point_gte(end_point, goal_point); - uint32_t visible_child_count = ts_subtree_visible_child_count(*entry.subtree); - if (at_goal) { - if (visible) { - array_push(&self->stack, entry); - return visible_child_index; - } - if (visible_child_count > 0) { - array_push(&self->stack, entry); - did_descend = true; - break; - } - } else if (visible) { - visible_child_index++; - } else { - visible_child_index += visible_child_count; - } - } - } while (did_descend); +int64_t ts_tree_cursor_goto_first_child_for_byte(TSTreeCursor *self, uint32_t goal_byte) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, goal_byte, POINT_ZERO); +} - self->stack.size = initial_size; - return -1; +int64_t ts_tree_cursor_goto_first_child_for_point(TSTreeCursor *self, TSPoint goal_point) { + return ts_tree_cursor_goto_first_child_for_byte_and_point(self, 0, goal_point); } -bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *_self) { TreeCursor *self = (TreeCursor *)_self; uint32_t initial_size = self->stack.size; @@ -221,19 +198,30 @@ bool ts_tree_cursor_goto_next_sibling(TSTreeCursor *_self) { while (ts_tree_cursor_child_iterator_next(&iterator, &entry, &visible)) { if (visible) { array_push(&self->stack, entry); - return true; + return TreeCursorStepVisible; } if (ts_subtree_visible_child_count(*entry.subtree)) { array_push(&self->stack, entry); - ts_tree_cursor_goto_first_child(_self); - return true; + return TreeCursorStepHidden; } } } self->stack.size = initial_size; - return false; + return TreeCursorStepNone; +} + +bool ts_tree_cursor_goto_next_sibling(TSTreeCursor 
*self) { + switch (ts_tree_cursor_goto_next_sibling_internal(self)) { + case TreeCursorStepHidden: + ts_tree_cursor_goto_first_child(self); + return true; + case TreeCursorStepVisible: + return true; + default: + return false; + } } bool ts_tree_cursor_goto_parent(TSTreeCursor *_self) { diff --git a/tree_cursor.h b/tree_cursor.h index 69647d1d..7b94db6b 100644 --- a/tree_cursor.h +++ b/tree_cursor.h @@ -15,6 +15,12 @@ typedef struct { Array(TreeCursorEntry) stack; } TreeCursor; +typedef enum { + TreeCursorStepNone, + TreeCursorStepHidden, + TreeCursorStepVisible, +} TreeCursorStep; + void ts_tree_cursor_init(TreeCursor *, TSNode); void ts_tree_cursor_current_status( const TSTreeCursor *, @@ -26,6 +32,15 @@ void ts_tree_cursor_current_status( unsigned * ); +TreeCursorStep ts_tree_cursor_goto_first_child_internal(TSTreeCursor *); +TreeCursorStep ts_tree_cursor_goto_next_sibling_internal(TSTreeCursor *); + +static inline Subtree ts_tree_cursor_current_subtree(const TSTreeCursor *_self) { + const TreeCursor *self = (const TreeCursor *)_self; + TreeCursorEntry *last_entry = array_back(&self->stack); + return *last_entry->subtree; +} + TSNode ts_tree_cursor_parent_node(const TSTreeCursor *); #endif // TREE_SITTER_TREE_CURSOR_H_