formatter: add tree-sitter dependency and commit early draft work on a proper code formatter (#2536)

Author: Tyler Wilding, 2023-04-24 22:46:55 -05:00, committed by GitHub
parent 83f43b7153
commit 0ffb912a04
385 changed files with 71427 additions and 28 deletions

View File

@ -242,6 +242,20 @@
"projectTarget" : "type_searcher.exe (bin\\type_searcher.exe)",
"name" : "Tools - Type Searcher",
"args" : ["--game", "jak2", "--output-path", "./search-results.json", "--size", 255, "--fields", "[{\\\"type\\\":\\\"quaternion\\\",\\\"offset\\\":48}]"]
},
{
"type" : "default",
"project" : "CMakeLists.txt",
"projectTarget" : "formatter.exe (bin\\formatter.exe)",
"name" : "Tools - Formatter",
"args" : ["--new", "--file", "C:\\Users\\xtvas\\Repos\\opengoal\\jak-project\\test-formatter.gc"]
},
{
"type": "default",
"project": "CMakeLists.txt",
"projectTarget": "goalc-test.exe (bin\\goalc-test.exe)",
"name": "Tests - Formatter",
"args": ["--gtest_brief=0", "--gtest_filter=\"*FormatterTests*\""]
}
]
}

View File

@ -182,6 +182,11 @@ include_directories(third-party/SQLiteCpp/include)
add_subdirectory(third-party/SQLiteCpp)
string(REPLACE " ${THIRDPARTY_IGNORED_WARNINGS} " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
# build tree-sitter parser
include_directories(third-party/tree-sitter/tree-sitter/lib/include)
include_directories(third-party/tree-sitter/tree-sitter-opengoal/include)
add_subdirectory(third-party/tree-sitter EXCLUDE_FROM_ALL)
# build common library
add_subdirectory(common)

View File

@ -75,6 +75,7 @@ tasks:
format:
desc: "Format code"
cmds:
- cmd: python ./scripts/cpp/format-includes.py
- cmd: python ./third-party/run-clang-format/run-clang-format.py -r common decompiler game goalc test tools lsp -i
# npm install -g prettier
- cmd: npx prettier --write ./decompiler/config/jak1/**/*.jsonc

View File

@ -22,32 +22,34 @@ endfunction()
write_revision_h()
add_library(common
versions/versions.cpp
audio/audio_formats.cpp
cross_os_debug/xdbg.cpp
cross_sockets/XSocket.cpp
cross_sockets/XSocketServer.cpp
cross_sockets/XSocketClient.cpp
cross_sockets/XSocketServer.cpp
custom_data/pack_helpers.cpp
custom_data/TFrag3Data.cpp
dma/dma.cpp
dma/dma_copy.cpp
dma/dma.cpp
dma/gs.cpp
formatter/formatter.cpp
global_profiler/GlobalProfiler.cpp
goos/Interpreter.cpp
goos/Object.cpp
goos/ParseHelpers.cpp
goos/Printer.cpp
goos/PrettyPrinter.cpp
goos/PrettyPrinter2.cpp
goos/Printer.cpp
goos/Reader.cpp
goos/TextDB.cpp
repl/config.cpp
repl/util.cpp
log/log.cpp
math/geometry.cpp
repl/config.cpp
repl/nrepl/ReplClient.cpp
repl/nrepl/ReplServer.cpp
repl/util.cpp
serialization/subtitles/subtitles_deser.cpp
serialization/subtitles/subtitles_ser.cpp
type_system/defenum.cpp
type_system/deftype.cpp
type_system/state.cpp
@ -55,8 +57,6 @@ add_library(common
type_system/TypeFieldLookup.cpp
type_system/TypeSpec.cpp
type_system/TypeSystem.cpp
serialization/subtitles/subtitles_ser.cpp
serialization/subtitles/subtitles_deser.cpp
util/Assert.cpp
util/BitUtils.cpp
util/compress.cpp
@ -67,18 +67,20 @@ add_library(common
util/diff.cpp
util/FileUtil.cpp
util/FontUtils.cpp
util/FrameLimiter.cpp
util/json_util.cpp
util/os.cpp
util/print_float.cpp
util/read_iso_file.cpp
util/SimpleThreadGroup.cpp
util/string_util.cpp
util/term_util.cpp
util/Timer.cpp
util/os.cpp
util/print_float.cpp
util/FrameLimiter.cpp
util/unicode_util.cpp
util/term_util.cpp )
versions/versions.cpp
)
target_link_libraries(common fmt lzokay replxx libzstd_static)
target_link_libraries(common fmt lzokay replxx libzstd_static tree-sitter)
if(WIN32)
target_link_libraries(common wsock32 ws2_32 windowsapp)

View File

@ -0,0 +1,161 @@
#include "formatter.h"
#include "common/util/FileUtil.h"
#include "common/util/string_util.h"
#include "tree_sitter/api.h"
#include "third-party/fmt/core.h"
// Declare the `tree_sitter_opengoal` function, which is
// implemented by the `tree-sitter-opengoal` library.
extern "C" {
extern const TSLanguage* tree_sitter_opengoal();
}
void walk_tree(TSTreeCursor* cursor, std::string& output, const std::string& source_code) {
// an imperative depth-first traversal of the syntax tree
while (true) {
// Process the node
const auto curr_node = ts_tree_cursor_current_node(cursor);
const std::string curr_node_type = ts_node_type(curr_node);
std::string curr_node_field_name;
if (ts_tree_cursor_current_field_name(cursor)) {
curr_node_field_name = ts_tree_cursor_current_field_name(cursor);
}
if (curr_node_field_name == "open") {
output += "(";
} else if (curr_node_field_name == "close") {
output.pop_back();
output += ") ";
}
if (curr_node_type == "sym_name" || curr_node_type == "num_lit" ||
curr_node_type == "str_lit") {
uint32_t start = ts_node_start_byte(curr_node);
uint32_t end = ts_node_end_byte(curr_node);
const char* type = ts_node_type(curr_node);
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
const auto contents = source_code.substr(start, end - start);
output += contents + " ";
}
if (ts_tree_cursor_goto_first_child(cursor)) {
continue;
}
if (ts_tree_cursor_goto_next_sibling(cursor)) {
continue;
}
while (true) {
if (!ts_tree_cursor_goto_parent(cursor)) {
if (!output.empty() && output.back() == ' ') {
output.pop_back();
}
return;
}
if (ts_tree_cursor_goto_next_sibling(cursor)) {
break;
}
}
}
}
// TODO - move this to str_util
std::string repeat(size_t n, const std::string& str) {
if (n == 0 || str.empty())
return {};
if (n == 1)
return str;
const auto period = str.size();
if (period == 1)
return std::string(n, str.front());
std::string ret(str);
ret.reserve(period * n);
std::size_t m{2};
for (; m < n; m *= 2)
ret += ret;
ret.append(ret.c_str(), (n - (m / 2)) * period);
return ret;
}
// It's possible to walk a tree-sitter tree imperatively with a cursor,
// but the code for that is more verbose and less intuitive, and I'm not sure how much
// of a benefit I'd get out of it since for formatting I basically have to convert every
// cursor to its fat node anyway.
//
// In any case, do it the easy way first and refactor later
void format_code(const std::string& source,
TSNode curr_node,
std::string& output,
std::string curr_form_head = "",
int indent = 0) {
if (ts_node_child_count(curr_node) == 0) {
uint32_t start = ts_node_start_byte(curr_node);
uint32_t end = ts_node_end_byte(curr_node);
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
const auto contents = source.substr(start, end - start);
if (contents == ")") {
output.pop_back();
output += ") ";
} else if (contents == "(") {
output += "(";
} else {
output += contents + " ";
}
return;
}
const std::string curr_node_type = ts_node_type(curr_node);
for (int i = 0; i < ts_node_child_count(curr_node); i++) {
auto child_node = ts_node_child(curr_node, i);
// If we are opening a list, peek at the first element in the list
// this is so we can properly handle indentation based on different forms
if (curr_node_type == "list_lit" && i == 1) {
uint32_t start = ts_node_start_byte(child_node);
uint32_t end = ts_node_end_byte(child_node);
// TODO - if it's a string literal, take out any newlines and reflow the string to the
// line-length
curr_form_head = source.substr(start, end - start);
}
std::string curr_node_field_name;
auto curr_field_name_raw = ts_node_field_name_for_child(
curr_node, i); // TODO - why is this always returning `close` for the opening paren..
if (curr_field_name_raw) {
curr_node_field_name = curr_field_name_raw;
}
if (curr_form_head == "defun" && i == 4) {
indent += 2;
output += "\n" + repeat(indent, " ");
} else if (curr_form_head == "defun" && i == 5) {
output += "\n" + repeat(indent, " ");
}
format_code(source, child_node, output, curr_form_head, indent);
if (curr_node_type == "source") {
output += "\n\n";
}
}
}
std::string formatter::format_code(const std::string& source) {
// Create a parser.
std::shared_ptr<TSParser> parser(ts_parser_new(), TreeSitterParserDeleter());
// Set the parser's language (JSON in this case).
ts_parser_set_language(parser.get(), tree_sitter_opengoal());
// Build a syntax tree based on source code stored in a string.
std::shared_ptr<TSTree> tree(
ts_parser_parse_string(parser.get(), NULL, source.c_str(), source.length()),
TreeSitterTreeDeleter());
// Get the root node of the syntax tree.
TSNode root_node = ts_tree_root_node(tree.get());
std::string output = "";
format_code(source, root_node, output, "", 0);
return str_util::trim(output);
}
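
Not part of this commit, but for context: a minimal sketch of how the new formatter::format_code entry point could be driven from a small standalone tool such as the formatter.exe target referenced in the launch configuration above. The command-line handling below is hypothetical; only formatter::format_code comes from this commit.

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "common/formatter/formatter.h"

int main(int argc, char** argv) {
  if (argc < 2) {
    std::cerr << "usage: formatter <path-to-.gc-file>\n";
    return 1;
  }
  // Read the whole source file into a string (plain ifstream keeps the sketch self-contained).
  std::ifstream in(argv[1]);
  if (!in) {
    std::cerr << "could not open " << argv[1] << "\n";
    return 1;
  }
  std::stringstream buffer;
  buffer << in.rdbuf();
  // Run the tree-sitter backed formatter and print the result.
  std::cout << formatter::format_code(buffer.str()) << "\n";
  return 0;
}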

View File

@ -0,0 +1,17 @@
#pragma once
#include <string>
#include "tree_sitter/api.h"
namespace formatter {
struct TreeSitterParserDeleter {
void operator()(TSParser* ptr) const { ts_parser_delete(ptr); }
};
struct TreeSitterTreeDeleter {
void operator()(TSTree* ptr) const { ts_tree_delete(ptr); }
};
std::string format_code(const std::string& source);
} // namespace formatter
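
A side note on the deleter structs above: they let any standard smart pointer own the raw tree-sitter handles. formatter.cpp uses std::shared_ptr; a minimal sketch with std::unique_ptr (hypothetical, not in the commit) would look like this:

#include <memory>

#include "common/formatter/formatter.h"
#include "tree_sitter/api.h"

// The OpenGOAL grammar entry point, as declared in formatter.cpp.
extern "C" const TSLanguage* tree_sitter_opengoal();

using ParserPtr = std::unique_ptr<TSParser, formatter::TreeSitterParserDeleter>;

int main() {
  ParserPtr parser(ts_parser_new());
  ts_parser_set_language(parser.get(), tree_sitter_opengoal());
  // ... parse and inspect trees here ...
  return 0;  // ts_parser_delete runs automatically when `parser` goes out of scope
}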

View File

@ -93,4 +93,12 @@ std::vector<std::string> regex_get_capture_groups(const std::string& str,
}
return groups;
}
bool replace(std::string& str, const std::string& from, const std::string& to) {
size_t start_pos = str.find(from);
if (start_pos == std::string::npos)
return false;
str.replace(start_pos, from.length(), to);
return true;
}
} // namespace str_util

View File

@ -20,4 +20,5 @@ std::string diff(const std::string& lhs, const std::string& rhs);
std::vector<std::string> split(const ::std::string& str, char delimiter = '\n');
std::string join(const std::vector<std::string>& strs, const std::string& join_with);
std::vector<std::string> regex_get_capture_groups(const std::string& str, const std::string& regex);
bool replace(std::string& str, const std::string& from, const std::string& to);
} // namespace str_util
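
For clarity, the new str_util::replace helper only rewrites the first occurrence of `from` and reports whether anything changed; a minimal usage sketch (hypothetical values) is:

#include <cassert>
#include <string>

#include "common/util/string_util.h"

int main() {
  std::string path = "a/b/a";
  bool changed = str_util::replace(path, "a", "x");
  assert(changed);
  assert(path == "x/b/a");                        // only the first "a" is replaced
  assert(!str_util::replace(path, "zzz", "y"));   // no match, string left untouched
  return 0;
}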

View File

@ -47,11 +47,11 @@ std::vector<std::string> valid_game_version_names() {
}
std::string build_revision() {
if (BUILT_TAG != "") {
return BUILT_TAG;
if (std::string(BUILT_TAG) != "") {
return std::string(BUILT_TAG);
}
if (BUILT_SHA != "") {
return BUILT_SHA;
if (std::string(BUILT_SHA) != "") {
return std::string(BUILT_SHA);
}
return "Unknown Revision";
}
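
The change above wraps BUILT_TAG / BUILT_SHA in std::string before comparing. Assuming those macros expand to string literals (as the surrounding code suggests), comparing them directly against "" compares pointers rather than contents and most compilers warn about it; the wrapper forces a content comparison. A small illustration with a hypothetical macro:

#include <cassert>
#include <string>

#define HYPOTHETICAL_TAG ""  // stand-in for BUILT_TAG

int main() {
  // (HYPOTHETICAL_TAG != "") would compare the addresses of two string
  // literals, which is not a content comparison and triggers a warning.
  // Wrapping one side in std::string compares the actual characters:
  assert(std::string(HYPOTHETICAL_TAG) == "");
  return 0;
}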

View File

@ -3,8 +3,8 @@
#include <string>
#include <unordered_map>
#include <decompiler/ObjectFile/ObjectFileDB.h>
#include "decompiler/Function/Function.h"
#include "decompiler/ObjectFile/ObjectFileDB.h"
#include "decompiler/util/DecompilerTypeSystem.h"
namespace decompiler {

View File

@ -1,5 +1,3 @@
{
"hud": [
[14, "(function object :behavior hud)"]
]
}
"hud": [[14, "(function object :behavior hud)"]]
}

View File

@ -9,6 +9,7 @@
#include "runtime.h"
#include "common/global_profiler/GlobalProfiler.h"
#include "common/log/log.h"
#include "common/util/FileUtil.h"
#include "common/util/os.h"

View File

@ -11,7 +11,8 @@
#include "common/log/log.h"
#include "sfxblock2.h"
#include <third-party/fmt/core.h>
#include "third-party/fmt/core.h"
namespace snd {
enum chunk : u32 { bank, samples, midi };

View File

@ -7,7 +7,8 @@
#include "common/log/log.h"
#include "game/sound/989snd/util.h"
#include <third-party/fmt/core.h>
#include "third-party/fmt/core.h"
namespace snd {
/*

View File

@ -4,7 +4,7 @@
#include <fstream>
#include <third-party/fmt/core.h>
#include "third-party/fmt/core.h"
#ifdef _WIN32
#include <combaseapi.h>

View File

@ -11,5 +11,5 @@ add_executable(lsp
protocol/hover.cpp
state/data/mips_instruction.cpp)
target_link_libraries(lsp common decomp)
target_link_libraries(lsp common decomp tree-sitter)

View File

@ -0,0 +1,34 @@
# Visual Studio is dumb and doesn't let you customize the automatic include formats
# so I'll do it myself.
import glob
import re
folders_to_check = ["common", "decompiler", "game", "goalc", "test", "tools", "lsp"]
for folder in folders_to_check:
files_to_check = glob.glob("./{}/**/*.cpp".format(folder), recursive=True)
files_to_check += glob.glob("./{}/**/*.h".format(folder), recursive=True)
for filename in files_to_check:
# Get the file contents
with open(filename, "r", encoding="utf-8") as f:
lines = f.readlines()
new_lines = []
need_to_write = False
for i, line in enumerate(lines):
include_match = re.search(r"#include <(.*)>", line)
if include_match:
include = include_match.groups()[0]
if include.startswith("sys/") or include.startswith("netinet/") or include.startswith("arpa/"):
new_lines.append(line)
elif "/" in include:
new_lines.append(line.replace("<", "\"").replace(">", "\""))
need_to_write = True
else:
new_lines.append(line)
else:
new_lines.append(line)
if need_to_write:
print("Fixing includes in {}".format(filename))
with open(filename, "w", encoding="utf-8") as f:
f.writelines(new_lines)

View File

@ -35,10 +35,12 @@ add_executable(goalc-test
${CMAKE_CURRENT_LIST_DIR}/decompiler/test_DisasmVifDecompile.cpp
${CMAKE_CURRENT_LIST_DIR}/decompiler/test_VuDisasm.cpp
${CMAKE_CURRENT_LIST_DIR}/game/test_newpad.cpp
${CMAKE_CURRENT_LIST_DIR}/common/formatter/test_formatter.cpp
${GOALC_TEST_FRAMEWORK_SOURCES}
${GOALC_TEST_CASES})
${GOALC_TEST_CASES}
)
target_link_libraries(goalc-test common runtime compiler gtest decomp Zydis libzstd_static)
target_link_libraries(goalc-test common runtime compiler gtest decomp Zydis libzstd_static tree-sitter)
if(WIN32)
target_link_libraries(goalc-test mman)

View File

@ -0,0 +1,34 @@
===
Basic Function
===
(defun test-function ((hello string))
"world hello"
(+ 1 1))
---
(defun test-function ((hello string))
"world hello"
(+ 1 1))
===
Two Functions
===
(defun test-function ((hello string))
"world hello"
(+ 1 1))
(defun test-function ((hello string))
"world hello"
(+ 1 1))
---
(defun test-function ((hello string))
"world hello"
(+ 1 1))
(defun test-function ((hello string))
"world hello"
(+ 1 1))

View File

@ -0,0 +1,105 @@
// TODO - eventually replace our `goalc` tests with this setup
// A simple test runner framework for debugging / iterating on the formatter
// Tests are defined in files as such:
/*
===
TEST NAME
===
INPUT
---
EXPECTED OUTPUT
*/
// Test files can contain multiple tests; when run, we recurse through a directory
// looking for any `.test` files and run each of them through the framework
//
// Any differences will be diff'd and displayed
#include "common/formatter/formatter.h"
#include "common/util/FileUtil.h"
#include "common/util/string_util.h"
#include "gtest/gtest.h"
#include "third-party/fmt/core.h"
struct TestDefinition {
std::string name;
std::string input;
std::string output;
};
bool run_tests(fs::path file_path) {
// Read in the file, and run the test
const auto contents = str_util::split(file_util::read_text_file(file_path));
std::vector<TestDefinition> tests;
TestDefinition curr_test;
int i = 0;
while (i < contents.size()) {
const auto& line = contents.at(i);
if (line == "===") {
curr_test = TestDefinition();
curr_test.name = contents.at(i + 1);
i += 3;
continue;
}
// Parse the input and output
if (!curr_test.name.empty() && line.empty()) {
i++;
while (true) {
if (contents.at(i) == "---") {
i++;
curr_test.input = str_util::trim(curr_test.input);
break;
}
curr_test.input += contents.at(i) + "\n";
i++;
}
i++;
while (true) {
if (i == contents.size() || contents.at(i) == "===") {
curr_test.output = str_util::trim(curr_test.output);
tests.push_back(curr_test);
break;
}
curr_test.output += contents.at(i) + "\n";
i++;
}
continue;
}
}
// Run the tests, report successes and failures
bool test_failed = false;
fmt::print("{}:\n", file_util::base_name(file_path.string()));
for (const auto& test : tests) {
const auto formatted_result = formatter::format_code(test.input);
if (formatted_result != test.output) {
fmt::print(" ❌ - {}\n", test.name);
fmt::print("{}\n", str_util::diff(test.output, formatted_result));
test_failed = true;
} else {
fmt::print(" ✅ - {}\n", test.name);
}
}
return test_failed;
}
bool find_and_run_tests() {
// Enumerate test files
const auto test_files = file_util::find_files_recursively(
file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\\.test$"));
bool failed = false;
for (const auto& file : test_files) {
failed = run_tests(file) || failed;
}
return !failed;
}
TEST(Formatter, FormatterTests) {
EXPECT_TRUE(find_and_run_tests());
}

4
third-party/tree-sitter/CMakeLists.txt generated vendored Normal file
View File

@ -0,0 +1,4 @@
add_library(tree-sitter
tree-sitter/lib/src/lib.c
tree-sitter-opengoal/parser.c)

View File

@ -0,0 +1,669 @@
{
"name": "opengoal",
"rules": {
"source": {
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_form"
},
{
"type": "SYMBOL",
"name": "_gap"
}
]
}
},
"_gap": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_ws"
},
{
"type": "SYMBOL",
"name": "comment"
},
{
"type": "SYMBOL",
"name": "comment_multiline"
}
]
},
"_ws": {
"type": "TOKEN",
"content": {
"type": "REPEAT1",
"content": {
"type": "PATTERN",
"value": "[\\f\\n\\r\\t \\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
}
}
},
"comment": {
"type": "TOKEN",
"content": {
"type": "PATTERN",
"value": "(;).*\\n?"
}
},
"comment_multiline": {
"type": "SEQ",
"members": [
{
"type": "TOKEN",
"content": {
"type": "STRING",
"value": "#|"
}
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[^|#]+"
},
{
"type": "PATTERN",
"value": "#[^|]"
},
{
"type": "PATTERN",
"value": "[^#]\\|"
},
{
"type": "PATTERN",
"value": "[\\n\\r]+"
}
]
}
},
{
"type": "TOKEN",
"content": {
"type": "STRING",
"value": "|#"
}
}
]
},
"_form": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "num_lit"
},
{
"type": "SYMBOL",
"name": "kwd_lit"
},
{
"type": "SYMBOL",
"name": "str_lit"
},
{
"type": "SYMBOL",
"name": "char_lit"
},
{
"type": "SYMBOL",
"name": "null_lit"
},
{
"type": "SYMBOL",
"name": "bool_lit"
},
{
"type": "SYMBOL",
"name": "sym_lit"
},
{
"type": "SYMBOL",
"name": "list_lit"
},
{
"type": "SYMBOL",
"name": "quoting_lit"
},
{
"type": "SYMBOL",
"name": "quasi_quoting_lit"
},
{
"type": "SYMBOL",
"name": "unquote_splicing_lit"
},
{
"type": "SYMBOL",
"name": "unquoting_lit"
}
]
},
"num_lit": {
"type": "TOKEN",
"content": {
"type": "PREC",
"value": 10,
"content": {
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[+-]"
},
{
"type": "BLANK"
}
]
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#x"
},
{
"type": "REPEAT1",
"content": {
"type": "PATTERN",
"value": "[0-9a-fA-F]"
}
}
]
},
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#b"
},
{
"type": "REPEAT1",
"content": {
"type": "PATTERN",
"value": "[0-1]"
}
}
]
},
{
"type": "SEQ",
"members": [
{
"type": "REPEAT1",
"content": {
"type": "PATTERN",
"value": "[0-9]"
}
},
{
"type": "CHOICE",
"members": [
{
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "."
},
{
"type": "REPEAT",
"content": {
"type": "PATTERN",
"value": "[0-9]"
}
}
]
},
{
"type": "BLANK"
}
]
}
]
},
{
"type": "SEQ",
"members": [
{
"type": "REPEAT1",
"content": {
"type": "PATTERN",
"value": "[0-9]"
}
}
]
}
]
}
]
}
}
},
"kwd_lit": {
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_kwd_unqualified"
}
]
},
"_kwd_unqualified": {
"type": "PREC",
"value": 1,
"content": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "marker",
"content": {
"type": "SYMBOL",
"name": "_kwd_marker"
}
},
{
"type": "FIELD",
"name": "name",
"content": {
"type": "ALIAS",
"content": {
"type": "TOKEN",
"content": {
"type": "SEQ",
"members": [
{
"type": "PATTERN",
"value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[:']"
},
{
"type": "PATTERN",
"value": "[^\\f\\n\\r\\t ()\\[\\]{}\"@~^;`\\\\,:/\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
}
]
}
}
]
}
},
"named": true,
"value": "kwd_name"
}
}
]
}
},
"_kwd_marker": {
"type": "CHOICE",
"members": [
{
"type": "TOKEN",
"content": {
"type": "STRING",
"value": ":"
}
}
]
},
"str_lit": {
"type": "TOKEN",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "\""
},
{
"type": "REPEAT",
"content": {
"type": "PATTERN",
"value": "[^\"\\\\]"
}
},
{
"type": "REPEAT",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "\\"
},
{
"type": "PATTERN",
"value": "."
},
{
"type": "REPEAT",
"content": {
"type": "PATTERN",
"value": "[^\"\\\\]"
}
}
]
}
},
{
"type": "STRING",
"value": "\""
}
]
}
},
"char_lit": {
"type": "TOKEN",
"content": {
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "#\\"
},
{
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": ".|\\n"
},
{
"type": "STRING",
"value": "\\s"
},
{
"type": "STRING",
"value": "\\n"
},
{
"type": "STRING",
"value": "\\t"
}
]
}
]
}
},
"null_lit": {
"type": "TOKEN",
"content": {
"type": "STRING",
"value": "none"
}
},
"bool_lit": {
"type": "TOKEN",
"content": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "#f"
},
{
"type": "STRING",
"value": "#t"
}
]
}
},
"sym_lit": {
"type": "SEQ",
"members": [
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_sym_unqualified"
}
]
}
]
},
"_sym_unqualified": {
"type": "FIELD",
"name": "name",
"content": {
"type": "ALIAS",
"content": {
"type": "CHOICE",
"members": [
{
"type": "STRING",
"value": "/"
},
{
"type": "TOKEN",
"content": {
"type": "SEQ",
"members": [
{
"type": "PATTERN",
"value": "[^\\f\\n\\r\\t \\/()\\[\\]{}\"@~^;`\\\\,:#'0-9\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "PATTERN",
"value": "[^\\f\\n\\r\\t \\/()\\[\\]{}\"@~^;`\\\\,:#'0-9\\u000B\\u001C\\u001D\\u001E\\u001F\\u2028\\u2029\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008\\u2009\\u200a\\u205f\\u3000]"
},
{
"type": "PATTERN",
"value": "[:#'0-9]"
}
]
}
}
]
}
}
]
},
"named": true,
"value": "sym_name"
}
},
"list_lit": {
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_bare_list_lit"
}
]
},
"_bare_list_lit": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "open",
"content": {
"type": "STRING",
"value": "("
}
},
{
"type": "REPEAT",
"content": {
"type": "CHOICE",
"members": [
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_form"
}
},
{
"type": "SYMBOL",
"name": "_gap"
}
]
}
},
{
"type": "FIELD",
"name": "close",
"content": {
"type": "STRING",
"value": ")"
}
}
]
},
"quoting_lit": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "marker",
"content": {
"type": "STRING",
"value": "'"
}
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "_gap"
}
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_form"
}
}
]
},
"quasi_quoting_lit": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "marker",
"content": {
"type": "STRING",
"value": "`"
}
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "_gap"
}
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_form"
}
}
]
},
"unquote_splicing_lit": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "marker",
"content": {
"type": "STRING",
"value": ",@"
}
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "_gap"
}
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_form"
}
}
]
},
"unquoting_lit": {
"type": "SEQ",
"members": [
{
"type": "FIELD",
"name": "marker",
"content": {
"type": "STRING",
"value": ","
}
},
{
"type": "REPEAT",
"content": {
"type": "SYMBOL",
"name": "_gap"
}
},
{
"type": "FIELD",
"name": "value",
"content": {
"type": "SYMBOL",
"name": "_form"
}
}
]
}
},
"extras": [],
"conflicts": [],
"precedences": [],
"externals": [],
"inline": [
"_kwd_unqualified",
"_sym_unqualified"
],
"supertypes": []
}
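
The "open", "close", "value" and "marker" fields declared in the grammar above are what the formatter keys on (e.g. the curr_node_field_name == "open" check in formatter.cpp). As a sketch of an alternative to walking children by index, the tree-sitter C API can also look fields up by name directly; the helper below is hypothetical and not part of the commit:

#include <cstdint>
#include <string>

#include "tree_sitter/api.h"

// Given a list_lit node, return the text of its first "value" child
// (the head symbol of the form, e.g. `defun`).
std::string form_head(TSNode list_lit, const std::string& source) {
  TSNode head = ts_node_child_by_field_name(list_lit, "value", 5);
  if (ts_node_is_null(head)) {
    return "";  // empty list: only the "open"/"close" parens are present
  }
  uint32_t start = ts_node_start_byte(head);
  uint32_t end = ts_node_end_byte(head);
  return source.substr(start, end - start);
}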

View File

@ -0,0 +1,614 @@
[
{
"type": "comment_multiline",
"named": true,
"fields": {}
},
{
"type": "kwd_lit",
"named": true,
"fields": {
"marker": {
"multiple": false,
"required": true,
"types": [
{
"type": ":",
"named": false
}
]
},
"name": {
"multiple": false,
"required": true,
"types": [
{
"type": "kwd_name",
"named": true
}
]
}
}
},
{
"type": "list_lit",
"named": true,
"fields": {
"close": {
"multiple": false,
"required": true,
"types": [
{
"type": ")",
"named": false
}
]
},
"open": {
"multiple": false,
"required": true,
"types": [
{
"type": "(",
"named": false
}
]
},
"value": {
"multiple": true,
"required": false,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
}
]
}
},
{
"type": "quasi_quoting_lit",
"named": true,
"fields": {
"marker": {
"multiple": false,
"required": true,
"types": [
{
"type": "`",
"named": false
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
}
]
}
},
{
"type": "quoting_lit",
"named": true,
"fields": {
"marker": {
"multiple": false,
"required": true,
"types": [
{
"type": "'",
"named": false
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
}
]
}
},
{
"type": "source",
"named": true,
"fields": {},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
{
"type": "sym_lit",
"named": true,
"fields": {
"name": {
"multiple": false,
"required": true,
"types": [
{
"type": "sym_name",
"named": true
}
]
}
}
},
{
"type": "unquote_splicing_lit",
"named": true,
"fields": {
"marker": {
"multiple": false,
"required": true,
"types": [
{
"type": ",@",
"named": false
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
}
]
}
},
{
"type": "unquoting_lit",
"named": true,
"fields": {
"marker": {
"multiple": false,
"required": true,
"types": [
{
"type": ",",
"named": false
}
]
},
"value": {
"multiple": false,
"required": true,
"types": [
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "kwd_lit",
"named": true
},
{
"type": "list_lit",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "quasi_quoting_lit",
"named": true
},
{
"type": "quoting_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_lit",
"named": true
},
{
"type": "unquote_splicing_lit",
"named": true
},
{
"type": "unquoting_lit",
"named": true
}
]
}
},
"children": {
"multiple": true,
"required": false,
"types": [
{
"type": "comment",
"named": true
},
{
"type": "comment_multiline",
"named": true
}
]
}
},
{
"type": "#|",
"named": false
},
{
"type": "'",
"named": false
},
{
"type": "(",
"named": false
},
{
"type": ")",
"named": false
},
{
"type": ",",
"named": false
},
{
"type": ",@",
"named": false
},
{
"type": ":",
"named": false
},
{
"type": "`",
"named": false
},
{
"type": "bool_lit",
"named": true
},
{
"type": "char_lit",
"named": true
},
{
"type": "comment",
"named": true
},
{
"type": "kwd_name",
"named": true
},
{
"type": "null_lit",
"named": true
},
{
"type": "num_lit",
"named": true
},
{
"type": "str_lit",
"named": true
},
{
"type": "sym_name",
"named": true
},
{
"type": "|#",
"named": false
}
]

1947
third-party/tree-sitter/tree-sitter-opengoal/parser.c generated vendored Normal file

File diff suppressed because it is too large.

View File

@ -0,0 +1,224 @@
#ifndef TREE_SITTER_PARSER_H_
#define TREE_SITTER_PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#define ts_builtin_sym_error ((TSSymbol)-1)
#define ts_builtin_sym_end 0
#define TREE_SITTER_SERIALIZATION_BUFFER_SIZE 1024
typedef uint16_t TSStateId;
#ifndef TREE_SITTER_API_H_
typedef uint16_t TSSymbol;
typedef uint16_t TSFieldId;
typedef struct TSLanguage TSLanguage;
#endif
typedef struct {
TSFieldId field_id;
uint8_t child_index;
bool inherited;
} TSFieldMapEntry;
typedef struct {
uint16_t index;
uint16_t length;
} TSFieldMapSlice;
typedef struct {
bool visible;
bool named;
bool supertype;
} TSSymbolMetadata;
typedef struct TSLexer TSLexer;
struct TSLexer {
int32_t lookahead;
TSSymbol result_symbol;
void (*advance)(TSLexer *, bool);
void (*mark_end)(TSLexer *);
uint32_t (*get_column)(TSLexer *);
bool (*is_at_included_range_start)(const TSLexer *);
bool (*eof)(const TSLexer *);
};
typedef enum {
TSParseActionTypeShift,
TSParseActionTypeReduce,
TSParseActionTypeAccept,
TSParseActionTypeRecover,
} TSParseActionType;
typedef union {
struct {
uint8_t type;
TSStateId state;
bool extra;
bool repetition;
} shift;
struct {
uint8_t type;
uint8_t child_count;
TSSymbol symbol;
int16_t dynamic_precedence;
uint16_t production_id;
} reduce;
uint8_t type;
} TSParseAction;
typedef struct {
uint16_t lex_state;
uint16_t external_lex_state;
} TSLexMode;
typedef union {
TSParseAction action;
struct {
uint8_t count;
bool reusable;
} entry;
} TSParseActionEntry;
struct TSLanguage {
uint32_t version;
uint32_t symbol_count;
uint32_t alias_count;
uint32_t token_count;
uint32_t external_token_count;
uint32_t state_count;
uint32_t large_state_count;
uint32_t production_id_count;
uint32_t field_count;
uint16_t max_alias_sequence_length;
const uint16_t *parse_table;
const uint16_t *small_parse_table;
const uint32_t *small_parse_table_map;
const TSParseActionEntry *parse_actions;
const char * const *symbol_names;
const char * const *field_names;
const TSFieldMapSlice *field_map_slices;
const TSFieldMapEntry *field_map_entries;
const TSSymbolMetadata *symbol_metadata;
const TSSymbol *public_symbol_map;
const uint16_t *alias_map;
const TSSymbol *alias_sequences;
const TSLexMode *lex_modes;
bool (*lex_fn)(TSLexer *, TSStateId);
bool (*keyword_lex_fn)(TSLexer *, TSStateId);
TSSymbol keyword_capture_token;
struct {
const bool *states;
const TSSymbol *symbol_map;
void *(*create)(void);
void (*destroy)(void *);
bool (*scan)(void *, TSLexer *, const bool *symbol_whitelist);
unsigned (*serialize)(void *, char *);
void (*deserialize)(void *, const char *, unsigned);
} external_scanner;
const TSStateId *primary_state_ids;
};
/*
* Lexer Macros
*/
#define START_LEXER() \
bool result = false; \
bool skip = false; \
bool eof = false; \
int32_t lookahead; \
goto start; \
next_state: \
lexer->advance(lexer, skip); \
start: \
skip = false; \
lookahead = lexer->lookahead;
#define ADVANCE(state_value) \
{ \
state = state_value; \
goto next_state; \
}
#define SKIP(state_value) \
{ \
skip = true; \
state = state_value; \
goto next_state; \
}
#define ACCEPT_TOKEN(symbol_value) \
result = true; \
lexer->result_symbol = symbol_value; \
lexer->mark_end(lexer);
#define END_STATE() return result;
/*
* Parse Table Macros
*/
#define SMALL_STATE(id) id - LARGE_STATE_COUNT
#define STATE(id) id
#define ACTIONS(id) id
#define SHIFT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value \
} \
}}
#define SHIFT_REPEAT(state_value) \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.state = state_value, \
.repetition = true \
} \
}}
#define SHIFT_EXTRA() \
{{ \
.shift = { \
.type = TSParseActionTypeShift, \
.extra = true \
} \
}}
#define REDUCE(symbol_val, child_count_val, ...) \
{{ \
.reduce = { \
.type = TSParseActionTypeReduce, \
.symbol = symbol_val, \
.child_count = child_count_val, \
__VA_ARGS__ \
}, \
}}
#define RECOVER() \
{{ \
.type = TSParseActionTypeRecover \
}}
#define ACCEPT_INPUT() \
{{ \
.type = TSParseActionTypeAccept \
}}
#ifdef __cplusplus
}
#endif
#endif // TREE_SITTER_PARSER_H_

5
third-party/tree-sitter/tree-sitter/.gitattributes generated vendored Normal file
View File

@ -0,0 +1,5 @@
/lib/src/unicode/*.h linguist-vendored
/lib/src/unicode/LICENSE linguist-vendored
/cli/src/generate/prepare_grammar/*.json -diff
Cargo.lock -diff

View File

@ -0,0 +1,10 @@
#!/bin/bash
set -x
set -e
if [ "$CROSS" != 1 ]; then
exit 111
fi
docker run --rm -v /home/runner:/home/runner -w "$PWD" "$CROSS_IMAGE" "$@"

View File

@ -0,0 +1,19 @@
#!/bin/bash
set -x
set -e
if [ "$CROSS" = 1 ]; then
if [ -z "$CC" ]; then
echo "make.sh: CC is not set" >&2
exit 111
fi
if [ -z "$AR" ]; then
echo "make.sh: AR is not set" >&2
exit 111
fi
cross.sh make CC=$CC AR=$AR "$@"
else
make "$@"
fi

View File

@ -0,0 +1,12 @@
#!/bin/bash
set -x
set -e
tree_sitter="$ROOT"/target/"$TARGET"/release/tree-sitter
if [ "$CROSS" = 1 ]; then
cross.sh $CROSS_RUNNER "$tree_sitter" "$@"
else
"$tree_sitter" "$@"
fi

View File

@ -0,0 +1,69 @@
name: CICD
on:
workflow_dispatch:
pull_request:
push:
branches:
- master
- check/*
concurrency:
group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
cancel-in-progress: true
jobs:
init:
name: Init
runs-on: ubuntu-latest
steps:
- name: Get PR head ref
if: ${{ github.event_name == 'pull_request' }}
id: ref
run: |
echo "ref=refs/pull/${{ github.event.pull_request.number }}/head" >> $GITHUB_OUTPUT
outputs:
ref: >-
${{
(github.event_name == 'pull_request' && startsWith(github.head_ref, 'release/v'))
&& steps.ref.outputs.ref
|| github.ref
}}
fast_checks:
name: Fast checks
uses: ./.github/workflows/fast_checks.yml
full_checks:
name: Full Rust checks
needs: fast_checks
uses: ./.github/workflows/full_rust_checks.yml
min_version:
name: Minimum supported rust version
needs: fast_checks
uses: ./.github/workflows/msrv.yml
with:
package: tree-sitter-cli
build:
name: Build & Test
needs: [init, fast_checks]
uses: ./.github/workflows/build.yml
with:
ref: ${{ needs.init.outputs.ref }}
release:
name: Release
needs: [init, fast_checks, full_checks, min_version, build]
if: >
github.event.pull_request.head.repo.full_name == github.repository &&
startsWith(github.head_ref, 'release/v')
uses: ./.github/workflows/release.yml
with:
ref: ${{ needs.init.outputs.ref }}
publish:
name: Publish
needs: release
uses: ./.github/workflows/publish.yml

View File

@ -0,0 +1,171 @@
name: Build & Test
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-D warnings"
CROSS_DEBUG: 1
on:
workflow_call:
inputs:
ref:
default: ${{ github.ref }}
type: string
jobs:
build:
name: ${{ matrix.job.name }} (${{ matrix.job.target }}) (${{ matrix.job.os }})
runs-on: ${{ matrix.job.os }}
strategy:
fail-fast: false
matrix:
job:
- { name: linux-aarch64 , target: aarch64-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
- { name: linux-arm , target: arm-unknown-linux-gnueabihf , os: ubuntu-latest , use-cross: true }
- { name: linux-x64 , target: x86_64-unknown-linux-gnu , os: ubuntu-latest }
- { name: linux-x86 , target: i686-unknown-linux-gnu , os: ubuntu-latest , use-cross: true }
- { name: windows-x64 , target: x86_64-pc-windows-msvc , os: windows-latest }
- { name: windows-x86 , target: i686-pc-windows-msvc , os: windows-latest }
- { name: macos-x64 , target: x86_64-apple-darwin , os: macos-latest }
env:
BUILD_CMD: cargo
defaults:
run:
shell: bash
steps:
- name: Checkout source code
uses: actions/checkout@v3
with:
ref: ${{ inputs.ref }}
- name: Read Emscripten version
run: |
echo "EMSCRIPTEN_VERSION=$(cat cli/emscripten-version)" >> $GITHUB_ENV
- name: Install Emscripten
uses: mymindstorm/setup-emsdk@v12
with:
version: ${{ env.EMSCRIPTEN_VERSION }}
- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
with:
targets: ${{ matrix.job.target }}
- name: Install cross
if: matrix.job.use-cross
uses: taiki-e/install-action@v2
with:
tool: cross
- name: Build custom cross image
if: ${{ matrix.job.use-cross && matrix.job.os == 'ubuntu-latest' }}
run: |
cd ..
target="${{ matrix.job.target }}"
image=ghcr.io/cross-rs/$target:custom
echo "CROSS_IMAGE=$image" >> $GITHUB_ENV
echo "[target.$target]" >> Cross.toml
echo "image = \"$image\"" >> Cross.toml
echo "CROSS_CONFIG=$PWD/Cross.toml" >> $GITHUB_ENV
echo "FROM ghcr.io/cross-rs/$target:edge" >> Dockerfile
echo "ENV DEBIAN_FRONTEND=noninteractive" >> Dockerfile
echo "RUN apt-get update && apt-get install -y nodejs" >> Dockerfile
docker build -t $image .
docker images
docker run --rm $image env
cd -
- name: Setup extra env
run: |
PATH="$PWD/.github/scripts:$PATH"
echo "PATH=$PATH" >> $GITHUB_ENV
echo "ROOT=$PWD" >> $GITHUB_ENV
echo "TREE_SITTER=tree-sitter.sh" >> $GITHUB_ENV
export TARGET=${{ matrix.job.target }}
echo "TARGET=$TARGET" >> $GITHUB_ENV
USE_CROSS="${{ matrix.job.use-cross }}"
if [ "$USE_CROSS" == "true" ]; then
echo "BUILD_CMD=cross" >> $GITHUB_ENV
export CROSS=1; echo "CROSS=$CROSS" >> $GITHUB_ENV
runner=$(cross.sh bash -c "env | sed -nr '/^CARGO_TARGET_.*_RUNNER=/s///p'")
[ -n "$runner" ] && echo "CROSS_RUNNER=$runner" >> $GITHUB_ENV
echo "runner: $runner"
case "$TARGET" in
i686-unknown-linux-gnu) CC=i686-linux-gnu-gcc AR=i686-linux-gnu-ar ;;
aarch64-unknown-linux-gnu) CC=aarch64-linux-gnu-gcc AR=aarch64-linux-gnu-ar ;;
arm-unknown-linux-gnueabihf) CC=arm-unknown-linux-gnueabihf-gcc AR=arm-unknown-linux-gnueabihf-gcc-ar ;;
esac
[ -n "$CC" ] && echo "CC=$CC" >> $GITHUB_ENV
[ -n "$AR" ] && echo "AR=$AR" >> $GITHUB_ENV
fi
case "$TARGET" in
*-windows-*)
echo "RUST_TEST_THREADS=1" >> $GITHUB_ENV # See #2041 tree-sitter issue
;;
esac
- name: Build C library
if: "!contains(matrix.job.os, 'windows')" # Requires an additional adapted Makefile for `cl.exe` compiler
run: make.sh CFLAGS="-Werror" -j
- name: Build wasm library
run: script/build-wasm
- name: Build CLI
run: $BUILD_CMD build --release --target=${{ matrix.job.target }}
- name: Fetch fixtures
run: script/fetch-fixtures
- name: Generate fixtures
run: script/generate-fixtures
- name: Generate WASM fixtures
if: "!matrix.job.use-cross"
run: script/generate-fixtures-wasm
- name: Run main tests
run: $BUILD_CMD test --target=${{ matrix.job.target }}
- name: Run wasm tests
if: "!matrix.job.use-cross" # TODO: Install Emscripten into custom cross images
run: script/test-wasm
- name: Run benchmarks
if: "!matrix.job.use-cross" # It doesn't make sense to benchmark something in an emulator
run: $BUILD_CMD bench benchmark -p tree-sitter-cli --target=${{ matrix.job.target }}
- name: Upload CLI artifact
uses: actions/upload-artifact@v3
with:
name: tree-sitter.${{ matrix.job.name }}
path: target/${{ matrix.job.target }}/release/tree-sitter${{ contains(matrix.job.target, 'windows') && '.exe' || '' }}
if-no-files-found: error
retention-days: 7
- name: Upload WASM artifacts
if: ${{ matrix.job.name == 'linux-x64' }}
uses: actions/upload-artifact@v3
with:
name: tree-sitter.wasm
path: |
lib/binding_web/tree-sitter.js
lib/binding_web/tree-sitter.wasm
if-no-files-found: error
retention-days: 7

View File

@ -0,0 +1,31 @@
name: Fast checks to fail fast on any simple code issues
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-D warnings"
on:
workflow_call:
jobs:
check_rust_formatting:
name: Check Rust formatting
runs-on: ubuntu-latest
steps:
- name: Checkout source code
uses: actions/checkout@v3
- name: Run cargo fmt
run: cargo fmt -- --check
check_c_warnings:
name: Check C warnings
runs-on: ubuntu-latest
steps:
- name: Checkout source code
uses: actions/checkout@v3
- name: Make C library to check that it's able to compile without warnings
run: make -j CFLAGS="-Werror"

View File

@ -0,0 +1,32 @@
name: Full Rust codebase checks
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-D warnings"
on:
workflow_call:
jobs:
run:
name: Run checks
runs-on: ubuntu-latest
steps:
- name: Checkout source code
uses: actions/checkout@v3
- name: Install rust toolchain
uses: dtolnay/rust-toolchain@master
with:
toolchain: stable
components: clippy, rustfmt
- name: Run cargo fmt
run: cargo fmt -- --check
# - name: Run clippy
# run: cargo clippy --all-targets
- name: Run cargo check
run: cargo check --workspace --examples --tests --benches --bins

View File

@ -0,0 +1,42 @@
name: Minimum supported rust version
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-D warnings"
on:
workflow_call:
inputs:
package:
description: Target cargo package name
required: true
type: string
jobs:
run:
name: Run checks
runs-on: ubuntu-latest
steps:
- name: Checkout source code
uses: actions/checkout@v3
- name: Get the MSRV from the package metadata
id: msrv
run: cargo metadata --no-deps --format-version 1 | jq -r '"version=" + (.packages[] | select(.name == "${{ inputs.package }}").rust_version)' >> $GITHUB_OUTPUT
- name: Install rust toolchain (v${{ steps.msrv.outputs.version }})
uses: dtolnay/rust-toolchain@master
with:
toolchain: ${{ steps.msrv.outputs.version }}
components: clippy, rustfmt
- name: Run cargo fmt
run: cargo fmt -- --check
# - name: Run clippy (on minimum supported rust version to prevent warnings we can't fix)
# run: cargo clippy --all-targets
# - name: Run main tests
# run: cargo test

View File

@ -0,0 +1,21 @@
name: Publish to registries
on:
workflow_call:
jobs:
crates_io:
name: Publish to Crates.io
runs-on: ubuntu-latest
steps:
- name: Publish packages
run: |
echo "::warning::TODO: add a Crates.io publish logic"
npm:
name: Publish to npmjs.com
runs-on: ubuntu-latest
steps:
- name: Publish packages
run: |
echo "::warning::TODO: add a npmjs.com publish logic"

View File

@ -0,0 +1,101 @@
name: Release
on:
workflow_call:
inputs:
ref:
default: ${{ github.ref }}
type: string
jobs:
permissions:
name: Check permissions
runs-on: ubuntu-latest
outputs:
release_allowed: ${{ steps.maintainer.outputs.is_maintainer == 'true' }}
steps:
- name: Is maintainer
id: maintainer
env:
GH_TOKEN: ${{ github.token }}
repo: ${{ github.repository }}
actor: ${{ github.actor }}
run: |
maintainer=$(
gh api "/repos/${repo}/collaborators" |
jq ".[] | {login, maintainer: .permissions | .maintain} | select(.login == \"${actor}\") | .maintainer"
);
if [ "$maintainer" == "true" ]; then
echo "@${actor} has maintainer level permissions :rocket:" >> $GITHUB_STEP_SUMMARY;
echo "is_maintainer=true" >> $GITHUB_OUTPUT
fi
release:
name: Release
needs: permissions
if: needs.permissions.outputs.release_allowed
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- name: Checkout source code
uses: actions/checkout@v3
with:
ref: ${{ inputs.ref }}
- name: Download build artifacts
uses: actions/download-artifact@v3
with:
path: artifacts
- name: Display structure of downloaded files
run: ls -lR
working-directory: artifacts
- name: Prepare release artifacts
run: |
mkdir -p target
mv artifacts/tree-sitter.wasm/* target/
rm -r artifacts/tree-sitter.wasm
for platform in $(cd artifacts; ls | sed 's/^tree-sitter\.//'); do
exe=$(ls artifacts/tree-sitter.$platform/tree-sitter*)
gzip --stdout --name $exe > target/tree-sitter-$platform.gz
done
rm -rf artifacts
ls -l target/
- name: Get tag name from a release/v* branch name
id: tag_name
env:
tag: ${{ github.head_ref }}
run: echo "tag=${tag#release/}" >> $GITHUB_OUTPUT
- name: Add a release tag
env:
ref: ${{ inputs.ref }}
tag: ${{ steps.tag_name.outputs.tag }}
message: "Release ${{ steps.tag_name.outputs.tag }}"
run: |
git config user.name "${GITHUB_ACTOR}"
git config user.email "${GITHUB_ACTOR}@users.noreply.github.com"
git tag -a "$tag" HEAD -m "$message"
git push origin "$tag"
- name: Create release
uses: softprops/action-gh-release@v1
with:
name: ${{ steps.tag_name.outputs.tag }}
tag_name: ${{ steps.tag_name.outputs.tag }}
fail_on_unmatched_files: true
files: |
target/tree-sitter-*.gz
target/tree-sitter.wasm
target/tree-sitter.js
- name: Merge release PR
env:
GH_TOKEN: ${{ github.token }}
run: |
gh pr merge ${{ github.event.pull_request.html_url }} --match-head-commit $(git rev-parse HEAD) --merge --delete-branch

27
third-party/tree-sitter/tree-sitter/.gitignore generated vendored Normal file
View File

@ -0,0 +1,27 @@
log*.html
.idea
*.xcodeproj
.vscode
.cache
fuzz-results
test/fixtures/grammars/*
!test/fixtures/grammars/.gitkeep
package-lock.json
node_modules
docs/assets/js/tree-sitter.js
/target
*.rs.bk
*.a
*.dylib
*.so
*.so.[0-9]*
*.o
*.obj
*.exp
*.lib
*.wasm

1
third-party/tree-sitter/tree-sitter/CONTRIBUTING.md generated vendored Normal file
View File

@ -0,0 +1 @@
docs/section-6-contributing.md

1116
third-party/tree-sitter/tree-sitter/Cargo.lock generated vendored Normal file

File diff suppressed because it is too large.

10
third-party/tree-sitter/tree-sitter/Cargo.toml generated vendored Normal file
View File

@ -0,0 +1,10 @@
[workspace]
default-members = ["cli"]
members = ["cli", "lib"]
resolver = "2"
[workspace.package]
rust-version = "1.65"
[profile.release]
strip = true

21
third-party/tree-sitter/tree-sitter/LICENSE generated vendored Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2018-2021 Max Brunsfeld
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

71
third-party/tree-sitter/tree-sitter/Makefile generated vendored Normal file
View File

@ -0,0 +1,71 @@
VERSION := 0.20.9
# install directory layout
PREFIX ?= /usr/local
INCLUDEDIR ?= $(PREFIX)/include
LIBDIR ?= $(PREFIX)/lib
PCLIBDIR ?= $(LIBDIR)/pkgconfig
# collect sources
ifneq ($(AMALGAMATED),1)
SRC := $(wildcard lib/src/*.c)
# do not double-include amalgamation
SRC := $(filter-out lib/src/lib.c,$(SRC))
else
# use amalgamated build
SRC := lib/src/lib.c
endif
OBJ := $(SRC:.c=.o)
# define default flags, and override to append mandatory flags
CFLAGS ?= -O3 -Wall -Wextra -Werror
override CFLAGS += -std=gnu99 -fPIC -Ilib/src -Ilib/include
# ABI versioning
SONAME_MAJOR := 0
SONAME_MINOR := 0
# OS-specific bits
ifeq ($(shell uname),Darwin)
SOEXT = dylib
SOEXTVER_MAJOR = $(SONAME_MAJOR).dylib
SOEXTVER = $(SONAME_MAJOR).$(SONAME_MINOR).dylib
LINKSHARED += -dynamiclib -Wl,-install_name,$(LIBDIR)/libtree-sitter.$(SONAME_MAJOR).dylib
else
SOEXT = so
SOEXTVER_MAJOR = so.$(SONAME_MAJOR)
SOEXTVER = so.$(SONAME_MAJOR).$(SONAME_MINOR)
LINKSHARED += -shared -Wl,-soname,libtree-sitter.so.$(SONAME_MAJOR)
endif
ifneq (,$(filter $(shell uname),FreeBSD NetBSD DragonFly))
PCLIBDIR := $(PREFIX)/libdata/pkgconfig
endif
all: libtree-sitter.a libtree-sitter.$(SOEXTVER)
libtree-sitter.a: $(OBJ)
$(AR) rcs $@ $^
libtree-sitter.$(SOEXTVER): $(OBJ)
$(CC) $(LDFLAGS) $(LINKSHARED) $^ $(LDLIBS) -o $@
ln -sf $@ libtree-sitter.$(SOEXT)
ln -sf $@ libtree-sitter.$(SOEXTVER_MAJOR)
install: all
install -d '$(DESTDIR)$(LIBDIR)'
install -m755 libtree-sitter.a '$(DESTDIR)$(LIBDIR)'/libtree-sitter.a
install -m755 libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER)
ln -sf libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXTVER_MAJOR)
ln -sf libtree-sitter.$(SOEXTVER) '$(DESTDIR)$(LIBDIR)'/libtree-sitter.$(SOEXT)
install -d '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter
install -m644 lib/include/tree_sitter/*.h '$(DESTDIR)$(INCLUDEDIR)'/tree_sitter/
install -d '$(DESTDIR)$(PCLIBDIR)'
sed -e 's|@LIBDIR@|$(LIBDIR)|;s|@INCLUDEDIR@|$(INCLUDEDIR)|;s|@VERSION@|$(VERSION)|' \
-e 's|=$(PREFIX)|=$${prefix}|' \
-e 's|@PREFIX@|$(PREFIX)|' \
tree-sitter.pc.in > '$(DESTDIR)$(PCLIBDIR)'/tree-sitter.pc
clean:
rm -f lib/src/*.o libtree-sitter.a libtree-sitter.$(SOEXT) libtree-sitter.$(SOEXTVER_MAJOR) libtree-sitter.$(SOEXTVER)
.PHONY: all install clean

18
third-party/tree-sitter/tree-sitter/README.md generated vendored Normal file
View File

@ -0,0 +1,18 @@
# tree-sitter
[![CICD](https://github.com/tree-sitter/tree-sitter/actions/workflows/CICD.yml/badge.svg)](https://github.com/tree-sitter/tree-sitter/actions/workflows/CICD.yml)
[![DOI](https://zenodo.org/badge/14164618.svg)](https://zenodo.org/badge/latestdoi/14164618)
Tree-sitter is a parser generator tool and an incremental parsing library. It can build a concrete syntax tree for a source file and efficiently update the syntax tree as the source file is edited. Tree-sitter aims to be:
- **General** enough to parse any programming language
- **Fast** enough to parse on every keystroke in a text editor
- **Robust** enough to provide useful results even in the presence of syntax errors
- **Dependency-free** so that the runtime library (which is written in pure C) can be embedded in any application
## Links
- [Documentation](https://tree-sitter.github.io)
- [Rust binding](lib/binding_rust/README.md)
- [WASM binding](lib/binding_web/README.md)
- [Command-line interface](cli/README.md)

82
third-party/tree-sitter/tree-sitter/cli/Cargo.toml generated vendored Normal file
View File

@ -0,0 +1,82 @@
[package]
name = "tree-sitter-cli"
description = "CLI tool for developing, testing, and using Tree-sitter parsers"
version = "0.20.8"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2021"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
repository = "https://github.com/tree-sitter/tree-sitter"
rust-version.workspace = true
[[bin]]
name = "tree-sitter"
path = "src/main.rs"
[[bench]]
name = "benchmark"
harness = false
[dependencies]
ansi_term = "0.12"
anyhow = "1.0"
atty = "0.2"
clap = "2.32"
difference = "2.0"
dirs = "3.0"
glob = "0.3.0"
html-escape = "0.2.6"
indexmap = "1"
lazy_static = "1.2.0"
regex = "1"
regex-syntax = "0.6.4"
rustc-hash = "1"
semver = "1.0"
serde = { version = "1.0.130", features = ["derive"] }
smallbitvec = "2.5.1"
tiny_http = "0.12.0"
walkdir = "2.3"
webbrowser = "0.8.3"
which = "4.1.0"
[dependencies.tree-sitter]
version = "0.20.10"
path = "../lib"
[dependencies.tree-sitter-config]
version = "0.19.0"
path = "config"
[dependencies.tree-sitter-highlight]
version = "0.20"
path = "../highlight"
[dependencies.tree-sitter-loader]
version = "0.20"
path = "loader"
[dependencies.tree-sitter-tags]
version = "0.20"
path = "../tags"
[dependencies.serde_json]
version = "1.0"
features = ["preserve_order"]
[dependencies.log]
version = "0.4.6"
features = ["std"]
[dev-dependencies]
proc_macro = { path = "src/tests/proc_macro" }
rand = "0.8"
tempfile = "3"
pretty_assertions = "0.7.2"
ctor = "0.1"
unindent = "0.2"
[build-dependencies]
toml = "0.5"

37
third-party/tree-sitter/tree-sitter/cli/README.md generated vendored Normal file
View File

@ -0,0 +1,37 @@
Tree-sitter CLI
===============
[![Crates.io](https://img.shields.io/crates/v/tree-sitter-cli.svg)](https://crates.io/crates/tree-sitter-cli)
The Tree-sitter CLI allows you to develop, test, and use Tree-sitter grammars from the command line. It works on MacOS, Linux, and Windows.
### Installation
You can install the `tree-sitter-cli` with `cargo`:
```sh
cargo install tree-sitter-cli
```
or with `npm`:
```sh
npm install tree-sitter-cli
```
You can also download a pre-built binary for your platform from [the releases page](https://github.com/tree-sitter/tree-sitter/releases/latest).
### Dependencies
The `tree-sitter` binary itself has no dependencies, but specific commands have dependencies that must be present at runtime:
* To generate a parser from a grammar, you must have [`node`](https://nodejs.org) on your PATH.
* To run and test parsers, you must have a C and C++ compiler on your system.
### Commands
* `generate` - The `tree-sitter generate` command will generate a Tree-sitter parser based on the grammar in the current working directory. See [the documentation](http://tree-sitter.github.io/tree-sitter/creating-parsers) for more information.
* `test` - The `tree-sitter test` command will run the unit tests for the Tree-sitter parser in the current working directory. See [the documentation](http://tree-sitter.github.io/tree-sitter/creating-parsers) for more information.
* `parse` - The `tree-sitter parse` command will parse a file (or list of files) using Tree-sitter parsers.

View File

@ -0,0 +1,214 @@
use anyhow::Context;
use lazy_static::lazy_static;
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::time::Instant;
use std::{env, fs, str, usize};
use tree_sitter::{Language, Parser, Query};
use tree_sitter_loader::Loader;
include!("../src/tests/helpers/dirs.rs");
lazy_static! {
static ref LANGUAGE_FILTER: Option<String> =
env::var("TREE_SITTER_BENCHMARK_LANGUAGE_FILTER").ok();
static ref EXAMPLE_FILTER: Option<String> =
env::var("TREE_SITTER_BENCHMARK_EXAMPLE_FILTER").ok();
static ref REPETITION_COUNT: usize = env::var("TREE_SITTER_BENCHMARK_REPETITION_COUNT")
.map(|s| usize::from_str_radix(&s, 10).unwrap())
.unwrap_or(5);
static ref TEST_LOADER: Loader = Loader::with_parser_lib_path(SCRATCH_DIR.clone());
static ref EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR: BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)> = {
fn process_dir(result: &mut BTreeMap<PathBuf, (Vec<PathBuf>, Vec<PathBuf>)>, dir: &Path) {
if dir.join("grammar.js").exists() {
let relative_path = dir.strip_prefix(GRAMMARS_DIR.as_path()).unwrap();
let (example_paths, query_paths) =
result.entry(relative_path.to_owned()).or_default();
if let Ok(example_files) = fs::read_dir(&dir.join("examples")) {
example_paths.extend(example_files.filter_map(|p| {
let p = p.unwrap().path();
if p.is_file() {
Some(p.to_owned())
} else {
None
}
}));
}
if let Ok(query_files) = fs::read_dir(&dir.join("queries")) {
query_paths.extend(query_files.filter_map(|p| {
let p = p.unwrap().path();
if p.is_file() {
Some(p.to_owned())
} else {
None
}
}));
}
} else {
for entry in fs::read_dir(&dir).unwrap() {
let entry = entry.unwrap().path();
if entry.is_dir() {
process_dir(result, &entry);
}
}
}
}
let mut result = BTreeMap::new();
process_dir(&mut result, &GRAMMARS_DIR);
result
};
}
fn main() {
let max_path_length = EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR
.values()
.flat_map(|(e, q)| {
e.iter()
.chain(q.iter())
.map(|s| s.file_name().unwrap().to_str().unwrap().len())
})
.max()
.unwrap_or(0);
eprintln!("Benchmarking with {} repetitions", *REPETITION_COUNT);
let mut parser = Parser::new();
let mut all_normal_speeds = Vec::new();
let mut all_error_speeds = Vec::new();
for (language_path, (example_paths, query_paths)) in
EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter()
{
let language_name = language_path.file_name().unwrap().to_str().unwrap();
if let Some(filter) = LANGUAGE_FILTER.as_ref() {
if language_name != filter.as_str() {
continue;
}
}
eprintln!("\nLanguage: {}", language_name);
let language = get_language(language_path);
parser.set_language(language).unwrap();
eprintln!(" Constructing Queries");
for path in query_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() {
if !path.to_str().unwrap().contains(filter.as_str()) {
continue;
}
}
parse(&path, max_path_length, |source| {
Query::new(language, str::from_utf8(source).unwrap())
.expect("Failed to parse query");
});
}
eprintln!(" Parsing Valid Code:");
let mut normal_speeds = Vec::new();
for example_path in example_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() {
if !example_path.to_str().unwrap().contains(filter.as_str()) {
continue;
}
}
normal_speeds.push(parse(example_path, max_path_length, |code| {
parser.parse(code, None).expect("Failed to parse");
}));
}
eprintln!(" Parsing Invalid Code (mismatched languages):");
let mut error_speeds = Vec::new();
for (other_language_path, (example_paths, _)) in
EXAMPLE_AND_QUERY_PATHS_BY_LANGUAGE_DIR.iter()
{
if other_language_path != language_path {
for example_path in example_paths {
if let Some(filter) = EXAMPLE_FILTER.as_ref() {
if !example_path.to_str().unwrap().contains(filter.as_str()) {
continue;
}
}
error_speeds.push(parse(example_path, max_path_length, |code| {
parser.parse(code, None).expect("Failed to parse");
}));
}
}
}
if let Some((average_normal, worst_normal)) = aggregate(&normal_speeds) {
eprintln!(" Average Speed (normal): {} bytes/ms", average_normal);
eprintln!(" Worst Speed (normal): {} bytes/ms", worst_normal);
}
if let Some((average_error, worst_error)) = aggregate(&error_speeds) {
eprintln!(" Average Speed (errors): {} bytes/ms", average_error);
eprintln!(" Worst Speed (errors): {} bytes/ms", worst_error);
}
all_normal_speeds.extend(normal_speeds);
all_error_speeds.extend(error_speeds);
}
eprintln!("\n Overall");
if let Some((average_normal, worst_normal)) = aggregate(&all_normal_speeds) {
eprintln!(" Average Speed (normal): {} bytes/ms", average_normal);
eprintln!(" Worst Speed (normal): {} bytes/ms", worst_normal);
}
if let Some((average_error, worst_error)) = aggregate(&all_error_speeds) {
eprintln!(" Average Speed (errors): {} bytes/ms", average_error);
eprintln!(" Worst Speed (errors): {} bytes/ms", worst_error);
}
eprintln!("");
}
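// Returns the (average, worst) speed in bytes/ms for a set of runs; the "worst"
// value is the minimum speed observed.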
fn aggregate(speeds: &Vec<usize>) -> Option<(usize, usize)> {
if speeds.is_empty() {
return None;
}
let mut total = 0;
let mut max = usize::MAX;
for speed in speeds.iter().cloned() {
total += speed;
if speed < max {
max = speed;
}
}
Some((total / speeds.len(), max))
}
fn parse(path: &Path, max_path_length: usize, mut action: impl FnMut(&[u8])) -> usize {
eprint!(
" {:width$}\t",
path.file_name().unwrap().to_str().unwrap(),
width = max_path_length
);
let source_code = fs::read(path)
.with_context(|| format!("Failed to read {:?}", path))
.unwrap();
let time = Instant::now();
for _ in 0..*REPETITION_COUNT {
action(&source_code);
}
let duration = time.elapsed() / (*REPETITION_COUNT as u32);
let duration_ms = duration.as_millis();
let speed = source_code.len() as u128 / (duration_ms + 1);
eprintln!("time {} ms\tspeed {} bytes/ms", duration_ms as usize, speed);
speed as usize
}
fn get_language(path: &Path) -> Language {
let src_dir = GRAMMARS_DIR.join(path).join("src");
TEST_LOADER
.load_language_at_path(&src_dir, &src_dir)
.with_context(|| format!("Failed to load language at path {:?}", src_dir))
.unwrap()
}

126
third-party/tree-sitter/tree-sitter/cli/build.rs generated vendored Normal file
View File

@ -0,0 +1,126 @@
use std::ffi::OsStr;
use std::path::{Path, PathBuf};
use std::{env, fs};
fn main() {
if let Some(git_sha) = read_git_sha() {
println!("cargo:rustc-env={}={}", "BUILD_SHA", git_sha);
}
if web_playground_files_present() {
println!("cargo:rustc-cfg={}", "TREE_SITTER_EMBED_WASM_BINDING");
}
let rust_binding_version = read_rust_binding_version();
println!(
"cargo:rustc-env={}={}",
"RUST_BINDING_VERSION", rust_binding_version,
);
let emscripten_version = fs::read_to_string("emscripten-version").unwrap();
println!(
"cargo:rustc-env={}={}",
"EMSCRIPTEN_VERSION", emscripten_version,
);
}
fn web_playground_files_present() -> bool {
let paths = [
"../docs/assets/js/playground.js",
"../lib/binding_web/tree-sitter.js",
"../lib/binding_web/tree-sitter.wasm",
];
paths.iter().all(|p| Path::new(p).exists())
}
fn read_git_sha() -> Option<String> {
let mut repo_path = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
let mut git_path;
loop {
git_path = repo_path.join(".git");
if git_path.exists() {
break;
} else if !repo_path.pop() {
return None;
}
}
let git_dir_path;
if git_path.is_dir() {
git_dir_path = git_path;
} else if let Ok(git_path_content) = fs::read_to_string(&git_path) {
git_dir_path = repo_path.join(git_path_content.get("gitdir: ".len()..).unwrap().trim_end());
} else {
return None;
}
let git_head_path = git_dir_path.join("HEAD");
if let Some(path) = git_head_path.to_str() {
println!("cargo:rerun-if-changed={}", path);
}
if let Ok(mut head_content) = fs::read_to_string(&git_head_path) {
if head_content.ends_with("\n") {
head_content.pop();
}
// If we're on a branch, read the SHA from the ref file.
if head_content.starts_with("ref: ") {
head_content.replace_range(0.."ref: ".len(), "");
let ref_filename = {
// Go to real non-worktree gitdir
let git_dir_path = git_dir_path
.parent()
.map(|p| {
p.file_name()
.map(|n| n == OsStr::new("worktrees"))
.and_then(|x| x.then(|| p.parent()))
})
.flatten()
.flatten()
.unwrap_or(&git_dir_path);
let file = git_dir_path.join(&head_content);
if file.is_file() {
file
} else {
let packed_refs = git_dir_path.join("packed-refs");
if let Ok(packed_refs_content) = fs::read_to_string(&packed_refs) {
for line in packed_refs_content.lines() {
if let Some((hash, r#ref)) = line.split_once(' ') {
if r#ref == head_content {
if let Some(path) = packed_refs.to_str() {
println!("cargo:rerun-if-changed={}", path);
}
return Some(hash.to_string());
}
}
}
}
return None;
}
};
if let Some(path) = ref_filename.to_str() {
println!("cargo:rerun-if-changed={}", path);
}
return fs::read_to_string(&ref_filename).ok();
}
// If we're on a detached commit, then the `HEAD` file itself contains the sha.
else if head_content.len() == 40 {
return Some(head_content);
}
}
None
}
fn read_rust_binding_version() -> String {
let path = "Cargo.toml";
let text = fs::read_to_string(path).unwrap();
let cargo_toml = toml::from_str::<toml::Value>(text.as_ref()).unwrap();
cargo_toml["dependencies"]["tree-sitter"]["version"]
.as_str()
.unwrap()
.trim_matches('"')
.to_string()
}

View File

@ -0,0 +1,21 @@
[package]
name = "tree-sitter-config"
description = "User configuration of tree-sitter's command line programs"
version = "0.19.0"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2018"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
repository = "https://github.com/tree-sitter/tree-sitter"
rust-version.workspace = true
[dependencies]
anyhow = "1.0"
dirs = "3.0"
serde = { version = "1.0.130", features = ["derive"] }
[dependencies.serde_json]
version = "1.0.45"
features = ["preserve_order"]

View File

@ -0,0 +1,5 @@
# `tree-sitter-config`
You can use a configuration file to control the behavior of the `tree-sitter`
command-line program. This crate implements the logic for finding and parsing
the contents of the configuration file.

View File

@ -0,0 +1,131 @@
//! Manages tree-sitter's configuration file.
use anyhow::{anyhow, Context, Result};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::path::PathBuf;
use std::{env, fs};
/// Holds the contents of tree-sitter's configuration file.
///
/// The file typically lives at `~/.config/tree-sitter/config.json`, but see the [`Config::load`][]
/// method for the full details on where it might be located.
///
/// This type holds the generic JSON content of the configuration file. Individual tree-sitter
/// components will use the [`Config::get`][] method to parse that JSON to extract configuration
/// fields that are specific to that component.
#[derive(Debug)]
pub struct Config {
pub location: PathBuf,
pub config: Value,
}
impl Config {
pub fn find_config_file() -> Result<Option<PathBuf>> {
if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path);
path.push("config.json");
if !path.exists() {
return Ok(None);
}
if path.is_file() {
return Ok(Some(path));
}
}
let xdg_path = Self::xdg_config_file()?;
if xdg_path.is_file() {
return Ok(Some(xdg_path));
}
let legacy_path = dirs::home_dir()
.ok_or(anyhow!("Cannot determine home directory"))?
.join(".tree-sitter")
.join("config.json");
if legacy_path.is_file() {
return Ok(Some(legacy_path));
}
Ok(None)
}
fn xdg_config_file() -> Result<PathBuf> {
let xdg_path = dirs::config_dir()
.ok_or(anyhow!("Cannot determine config directory"))?
.join("tree-sitter")
.join("config.json");
Ok(xdg_path)
}
/// Locates and loads in the user's configuration file. We search for the configuration file
/// in the following locations, in order:
///
/// - `$TREE_SITTER_DIR/config.json`, if the `TREE_SITTER_DIR` environment variable is set
/// - `tree-sitter/config.json` in your default user configuration directory, as determined
/// by [`dirs::config_dir`](https://docs.rs/dirs/*/dirs/fn.config_dir.html)
/// - `$HOME/.tree-sitter/config.json` as a fallback from where tree-sitter _used_ to store
/// its configuration
pub fn load() -> Result<Config> {
let location = match Self::find_config_file()? {
Some(location) => location,
None => return Config::initial(),
};
let content = fs::read_to_string(&location)
.with_context(|| format!("Failed to read {}", &location.to_string_lossy()))?;
let config = serde_json::from_str(&content)
.with_context(|| format!("Bad JSON config {}", &location.to_string_lossy()))?;
Ok(Config { location, config })
}
/// Creates an empty initial configuration file. You can then use the [`Config::add`][] method
/// to add the component-specific configuration types for any components that want to add
/// content to the default file, and then use [`Config::save`][] to write the configuration to
/// disk.
///
/// (Note that this is typically only done by the `tree-sitter init-config` command.)
pub fn initial() -> Result<Config> {
let location = if let Ok(path) = env::var("TREE_SITTER_DIR") {
let mut path = PathBuf::from(path);
path.push("config.json");
path
} else {
Self::xdg_config_file()?
};
let config = serde_json::json!({});
Ok(Config { location, config })
}
/// Saves this configuration to the file that it was originally loaded from.
pub fn save(&self) -> Result<()> {
let json = serde_json::to_string_pretty(&self.config)?;
fs::create_dir_all(self.location.parent().unwrap())?;
fs::write(&self.location, json)?;
Ok(())
}
/// Parses a component-specific configuration from the configuration file. The type `C` must
/// be [deserializable](https://docs.rs/serde/*/serde/trait.Deserialize.html) from a JSON
/// object, and must only include the fields relevant to that component.
pub fn get<C>(&self) -> Result<C>
where
C: for<'de> Deserialize<'de>,
{
let config = serde_json::from_value(self.config.clone())?;
Ok(config)
}
/// Adds a component-specific configuration to the configuration file. The type `C` must be
/// [serializable](https://docs.rs/serde/*/serde/trait.Serialize.html) into a JSON object, and
/// must only include the fields relevant to that component.
pub fn add<C>(&mut self, config: C) -> Result<()>
where
C: Serialize,
{
let mut config = serde_json::to_value(&config)?;
self.config
.as_object_mut()
.unwrap()
.append(config.as_object_mut().unwrap());
Ok(())
}
}
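// Illustrative usage sketch (editor's addition, not part of the upstream crate):
// loads the shared configuration file and extracts a hypothetical component-specific
// section via `Config::get`. `ExampleToolConfig` and its field are assumed names.
#[allow(dead_code)]
#[derive(Deserialize, Default)]
struct ExampleToolConfig {
    #[serde(default)]
    verbose: bool,
}

#[allow(dead_code)]
fn load_example_tool_config() -> Result<ExampleToolConfig> {
    // Finds config.json via TREE_SITTER_DIR, the XDG config directory, or the legacy
    // ~/.tree-sitter location, falling back to an empty in-memory configuration.
    let config = Config::load()?;
    // Unknown top-level keys in config.json are ignored by serde's default behavior.
    config.get::<ExampleToolConfig>()
}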

View File

@ -0,0 +1 @@
3.1.29

View File

@ -0,0 +1,37 @@
[package]
name = "tree-sitter-loader"
description = "Locates, builds, and loads tree-sitter grammars at runtime"
version = "0.20.0"
authors = ["Max Brunsfeld <maxbrunsfeld@gmail.com>"]
edition = "2018"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing"]
categories = ["command-line-utilities", "parsing"]
repository = "https://github.com/tree-sitter/tree-sitter"
rust-version.workspace = true
[dependencies]
anyhow = "1.0"
cc = "^1.0.58"
dirs = "3.0"
libloading = "0.7"
once_cell = "1.7"
regex = "1"
serde = { version = "1.0.130", features = ["derive"] }
[dependencies.serde_json]
version = "1.0"
features = ["preserve_order"]
[dependencies.tree-sitter]
version = "0.20"
path = "../../lib"
[dependencies.tree-sitter-highlight]
version = "0.20"
path = "../../highlight"
[dependencies.tree-sitter-tags]
version = "0.20"
path = "../../tags"

View File

@ -0,0 +1,6 @@
# `tree-sitter-loader`
The `tree-sitter` command-line program will dynamically find and build grammars
at runtime, if you have cloned the grammars' repositories to your local
filesystem. This helper crate implements that logic so that you can use it in
your own program analysis tools as well.

View File

@ -0,0 +1,6 @@
fn main() {
println!(
"cargo:rustc-env=BUILD_TARGET={}",
std::env::var("TARGET").unwrap()
);
}

View File

@ -0,0 +1,853 @@
use anyhow::{anyhow, Context, Error, Result};
use libloading::{Library, Symbol};
use once_cell::unsync::OnceCell;
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Deserializer, Serialize};
use std::collections::HashMap;
use std::io::BufReader;
use std::ops::Range;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::Mutex;
use std::time::SystemTime;
use std::{env, fs, mem};
use tree_sitter::{Language, QueryError, QueryErrorKind};
use tree_sitter_highlight::HighlightConfiguration;
use tree_sitter_tags::{Error as TagsError, TagsConfiguration};
#[derive(Default, Deserialize, Serialize)]
pub struct Config {
#[serde(default)]
#[serde(
rename = "parser-directories",
deserialize_with = "deserialize_parser_directories"
)]
pub parser_directories: Vec<PathBuf>,
}
// Replace `~` or `$HOME` with home path string.
// (While paths like "~/.tree-sitter/config.json" can be deserialized,
// they're not valid paths for I/O modules.)
fn deserialize_parser_directories<'de, D>(deserializer: D) -> Result<Vec<PathBuf>, D::Error>
where
D: Deserializer<'de>,
{
let paths = Vec::<PathBuf>::deserialize(deserializer)?;
let home = match dirs::home_dir() {
Some(home) => home,
None => return Ok(paths),
};
let standardized = paths
.into_iter()
.map(|path| standardize_path(path, &home))
.collect();
Ok(standardized)
}
fn standardize_path(path: PathBuf, home: &Path) -> PathBuf {
if let Ok(p) = path.strip_prefix("~") {
return home.join(p);
}
if let Ok(p) = path.strip_prefix("$HOME") {
return home.join(p);
}
path
}
impl Config {
pub fn initial() -> Config {
let home_dir = dirs::home_dir().expect("Cannot determine home directory");
Config {
parser_directories: vec![
home_dir.join("github"),
home_dir.join("src"),
home_dir.join("source"),
],
}
}
}
#[cfg(unix)]
const DYLIB_EXTENSION: &'static str = "so";
#[cfg(windows)]
const DYLIB_EXTENSION: &'static str = "dll";
const BUILD_TARGET: &'static str = env!("BUILD_TARGET");
pub struct LanguageConfiguration<'a> {
pub scope: Option<String>,
pub content_regex: Option<Regex>,
pub _first_line_regex: Option<Regex>,
pub injection_regex: Option<Regex>,
pub file_types: Vec<String>,
pub root_path: PathBuf,
pub highlights_filenames: Option<Vec<String>>,
pub injections_filenames: Option<Vec<String>>,
pub locals_filenames: Option<Vec<String>>,
pub tags_filenames: Option<Vec<String>>,
language_id: usize,
highlight_config: OnceCell<Option<HighlightConfiguration>>,
tags_config: OnceCell<Option<TagsConfiguration>>,
highlight_names: &'a Mutex<Vec<String>>,
use_all_highlight_names: bool,
}
pub struct Loader {
parser_lib_path: PathBuf,
languages_by_id: Vec<(PathBuf, OnceCell<Language>)>,
language_configurations: Vec<LanguageConfiguration<'static>>,
language_configuration_ids_by_file_type: HashMap<String, Vec<usize>>,
highlight_names: Box<Mutex<Vec<String>>>,
use_all_highlight_names: bool,
debug_build: bool,
}
unsafe impl Send for Loader {}
unsafe impl Sync for Loader {}
impl Loader {
pub fn new() -> Result<Self> {
let parser_lib_path = match env::var("TREE_SITTER_LIBDIR") {
Ok(path) => PathBuf::from(path),
_ => dirs::cache_dir()
.ok_or(anyhow!("Cannot determine cache directory"))?
.join("tree-sitter")
.join("lib"),
};
Ok(Self::with_parser_lib_path(parser_lib_path))
}
pub fn with_parser_lib_path(parser_lib_path: PathBuf) -> Self {
Loader {
parser_lib_path,
languages_by_id: Vec::new(),
language_configurations: Vec::new(),
language_configuration_ids_by_file_type: HashMap::new(),
highlight_names: Box::new(Mutex::new(Vec::new())),
use_all_highlight_names: true,
debug_build: false,
}
}
pub fn configure_highlights(&mut self, names: &Vec<String>) {
self.use_all_highlight_names = false;
let mut highlights = self.highlight_names.lock().unwrap();
highlights.clear();
highlights.extend(names.iter().cloned());
}
pub fn highlight_names(&self) -> Vec<String> {
self.highlight_names.lock().unwrap().clone()
}
pub fn find_all_languages(&mut self, config: &Config) -> Result<()> {
if config.parser_directories.is_empty() {
eprintln!("Warning: You have not configured any parser directories!");
eprintln!("Please run `tree-sitter init-config` and edit the resulting");
eprintln!("configuration file to indicate where we should look for");
eprintln!("language grammars.");
eprintln!("");
}
for parser_container_dir in &config.parser_directories {
if let Ok(entries) = fs::read_dir(parser_container_dir) {
for entry in entries {
let entry = entry?;
if let Some(parser_dir_name) = entry.file_name().to_str() {
if parser_dir_name.starts_with("tree-sitter-") {
self.find_language_configurations_at_path(
&parser_container_dir.join(parser_dir_name),
)
.ok();
}
}
}
}
}
Ok(())
}
pub fn languages_at_path(&mut self, path: &Path) -> Result<Vec<Language>> {
if let Ok(configurations) = self.find_language_configurations_at_path(path) {
let mut language_ids = configurations
.iter()
.map(|c| c.language_id)
.collect::<Vec<_>>();
language_ids.sort();
language_ids.dedup();
language_ids
.into_iter()
.map(|id| self.language_for_id(id))
.collect::<Result<Vec<_>>>()
} else {
Ok(Vec::new())
}
}
pub fn get_all_language_configurations(&self) -> Vec<(&LanguageConfiguration, &Path)> {
self.language_configurations
.iter()
.map(|c| (c, self.languages_by_id[c.language_id].0.as_ref()))
.collect()
}
pub fn language_configuration_for_scope(
&self,
scope: &str,
) -> Result<Option<(Language, &LanguageConfiguration)>> {
for configuration in &self.language_configurations {
if configuration.scope.as_ref().map_or(false, |s| s == scope) {
let language = self.language_for_id(configuration.language_id)?;
return Ok(Some((language, configuration)));
}
}
Ok(None)
}
pub fn language_configuration_for_file_name(
&self,
path: &Path,
) -> Result<Option<(Language, &LanguageConfiguration)>> {
// Find all the language configurations that match this file name
// or a suffix of the file name.
let configuration_ids = path
.file_name()
.and_then(|n| n.to_str())
.and_then(|file_name| self.language_configuration_ids_by_file_type.get(file_name))
.or_else(|| {
path.extension()
.and_then(|extension| extension.to_str())
.and_then(|extension| {
self.language_configuration_ids_by_file_type.get(extension)
})
});
if let Some(configuration_ids) = configuration_ids {
if !configuration_ids.is_empty() {
let configuration;
// If there is only one language configuration, then use it.
if configuration_ids.len() == 1 {
configuration = &self.language_configurations[configuration_ids[0]];
}
// If multiple language configurations match, then determine which
// one to use by applying the configurations' content regexes.
else {
let file_contents = fs::read(path)
.with_context(|| format!("Failed to read path {:?}", path))?;
let file_contents = String::from_utf8_lossy(&file_contents);
let mut best_score = -2isize;
let mut best_configuration_id = None;
for configuration_id in configuration_ids {
let config = &self.language_configurations[*configuration_id];
// If the language configuration has a content regex, assign
// a score based on the length of the first match.
let score;
if let Some(content_regex) = &config.content_regex {
if let Some(mat) = content_regex.find(&file_contents) {
score = (mat.end() - mat.start()) as isize;
}
// If the content regex does not match, then *penalize* this
// language configuration, so that language configurations
// without content regexes are preferred over those with
// non-matching content regexes.
else {
score = -1;
}
} else {
score = 0;
}
if score > best_score {
best_configuration_id = Some(*configuration_id);
best_score = score;
}
}
configuration = &self.language_configurations[best_configuration_id.unwrap()];
}
let language = self.language_for_id(configuration.language_id)?;
return Ok(Some((language, configuration)));
}
}
Ok(None)
}
pub fn language_configuration_for_injection_string(
&self,
string: &str,
) -> Result<Option<(Language, &LanguageConfiguration)>> {
let mut best_match_length = 0;
let mut best_match_position = None;
for (i, configuration) in self.language_configurations.iter().enumerate() {
if let Some(injection_regex) = &configuration.injection_regex {
if let Some(mat) = injection_regex.find(string) {
let length = mat.end() - mat.start();
if length > best_match_length {
best_match_position = Some(i);
best_match_length = length;
}
}
}
}
if let Some(i) = best_match_position {
let configuration = &self.language_configurations[i];
let language = self.language_for_id(configuration.language_id)?;
Ok(Some((language, configuration)))
} else {
Ok(None)
}
}
fn language_for_id(&self, id: usize) -> Result<Language> {
let (path, language) = &self.languages_by_id[id];
language
.get_or_try_init(|| {
let src_path = path.join("src");
self.load_language_at_path(&src_path, &src_path)
})
.map(|l| *l)
}
pub fn load_language_at_path(&self, src_path: &Path, header_path: &Path) -> Result<Language> {
let grammar_path = src_path.join("grammar.json");
let parser_path = src_path.join("parser.c");
let mut scanner_path = src_path.join("scanner.c");
#[derive(Deserialize)]
struct GrammarJSON {
name: String,
}
let mut grammar_file =
fs::File::open(grammar_path).with_context(|| "Failed to read grammar.json")?;
let grammar_json: GrammarJSON = serde_json::from_reader(BufReader::new(&mut grammar_file))
.with_context(|| "Failed to parse grammar.json")?;
let scanner_path = if scanner_path.exists() {
Some(scanner_path)
} else {
scanner_path.set_extension("cc");
if scanner_path.exists() {
Some(scanner_path)
} else {
None
}
};
self.load_language_from_sources(
&grammar_json.name,
&header_path,
&parser_path,
&scanner_path,
)
}
pub fn load_language_from_sources(
&self,
name: &str,
header_path: &Path,
parser_path: &Path,
scanner_path: &Option<PathBuf>,
) -> Result<Language> {
let mut lib_name = name.to_string();
if self.debug_build {
lib_name.push_str(".debug._");
}
let mut library_path = self.parser_lib_path.join(lib_name);
library_path.set_extension(DYLIB_EXTENSION);
let recompile = needs_recompile(&library_path, &parser_path, &scanner_path)
.with_context(|| "Failed to compare source and binary timestamps")?;
if recompile {
fs::create_dir_all(&self.parser_lib_path)?;
let mut config = cc::Build::new();
config
.cpp(true)
.opt_level(2)
.cargo_metadata(false)
.target(BUILD_TARGET)
.host(BUILD_TARGET);
let compiler = config.get_compiler();
let mut command = Command::new(compiler.path());
for (key, value) in compiler.env() {
command.env(key, value);
}
if cfg!(windows) {
command.args(&["/nologo", "/LD", "/I"]).arg(header_path);
if self.debug_build {
command.arg("/Od");
} else {
command.arg("/O2");
}
command.arg(parser_path);
if let Some(scanner_path) = scanner_path.as_ref() {
command.arg(scanner_path);
}
command
.arg("/link")
.arg(format!("/out:{}", library_path.to_str().unwrap()));
} else {
command
.arg("-shared")
.arg("-fPIC")
.arg("-fno-exceptions")
.arg("-g")
.arg("-I")
.arg(header_path)
.arg("-o")
.arg(&library_path);
if self.debug_build {
command.arg("-O0");
} else {
command.arg("-O2");
}
// For conditional compilation of external scanner code when
// used internally by `tree-sitter parse` and other subcommands.
command.arg("-DTREE_SITTER_INTERNAL_BUILD");
if let Some(scanner_path) = scanner_path.as_ref() {
if scanner_path.extension() == Some("c".as_ref()) {
command.arg("-xc").arg("-std=c99").arg(scanner_path);
} else {
command.arg(scanner_path);
}
}
command.arg("-xc").arg(parser_path);
}
let output = command
.output()
.with_context(|| "Failed to execute C compiler")?;
if !output.status.success() {
return Err(anyhow!(
"Parser compilation failed.\nStdout: {}\nStderr: {}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
));
}
}
let library = unsafe { Library::new(&library_path) }
.with_context(|| format!("Error opening dynamic library {:?}", &library_path))?;
let language_fn_name = format!("tree_sitter_{}", replace_dashes_with_underscores(name));
let language = unsafe {
let language_fn: Symbol<unsafe extern "C" fn() -> Language> = library
.get(language_fn_name.as_bytes())
.with_context(|| format!("Failed to load symbol {}", language_fn_name))?;
language_fn()
};
mem::forget(library);
Ok(language)
}
pub fn highlight_config_for_injection_string<'a>(
&'a self,
string: &str,
) -> Option<&'a HighlightConfiguration> {
match self.language_configuration_for_injection_string(string) {
Err(e) => {
eprintln!(
"Failed to load language for injection string '{}': {}",
string, e
);
None
}
Ok(None) => None,
Ok(Some((language, configuration))) => match configuration.highlight_config(language) {
Err(e) => {
eprintln!(
"Failed to load property sheet for injection string '{}': {}",
string, e
);
None
}
Ok(None) => None,
Ok(Some(config)) => Some(config),
},
}
}
pub fn find_language_configurations_at_path<'a>(
&'a mut self,
parser_path: &Path,
) -> Result<&[LanguageConfiguration]> {
#[derive(Deserialize)]
#[serde(untagged)]
enum PathsJSON {
Empty,
Single(String),
Multiple(Vec<String>),
}
impl Default for PathsJSON {
fn default() -> Self {
PathsJSON::Empty
}
}
impl PathsJSON {
fn into_vec(self) -> Option<Vec<String>> {
match self {
PathsJSON::Empty => None,
PathsJSON::Single(s) => Some(vec![s]),
PathsJSON::Multiple(s) => Some(s),
}
}
}
#[derive(Deserialize)]
struct LanguageConfigurationJSON {
#[serde(default)]
path: PathBuf,
scope: Option<String>,
#[serde(rename = "file-types")]
file_types: Option<Vec<String>>,
#[serde(rename = "content-regex")]
content_regex: Option<String>,
#[serde(rename = "first-line-regex")]
first_line_regex: Option<String>,
#[serde(rename = "injection-regex")]
injection_regex: Option<String>,
#[serde(default)]
highlights: PathsJSON,
#[serde(default)]
injections: PathsJSON,
#[serde(default)]
locals: PathsJSON,
#[serde(default)]
tags: PathsJSON,
}
#[derive(Deserialize)]
struct PackageJSON {
#[serde(default)]
#[serde(rename = "tree-sitter")]
tree_sitter: Vec<LanguageConfigurationJSON>,
}
let initial_language_configuration_count = self.language_configurations.len();
if let Ok(package_json_contents) = fs::read_to_string(&parser_path.join("package.json")) {
let package_json = serde_json::from_str::<PackageJSON>(&package_json_contents);
if let Ok(package_json) = package_json {
let language_count = self.languages_by_id.len();
for config_json in package_json.tree_sitter {
// Determine the path to the parser directory. This can be specified in
// the package.json, but defaults to the directory containing the package.json.
let language_path = parser_path.join(config_json.path);
// Determine if a previous language configuration in this package.json file
// already uses the same language.
let mut language_id = None;
for (id, (path, _)) in
self.languages_by_id.iter().enumerate().skip(language_count)
{
if language_path == *path {
language_id = Some(id);
}
}
// If not, add a new language path to the list.
let language_id = language_id.unwrap_or_else(|| {
self.languages_by_id.push((language_path, OnceCell::new()));
self.languages_by_id.len() - 1
});
let configuration = LanguageConfiguration {
root_path: parser_path.to_path_buf(),
scope: config_json.scope,
language_id,
file_types: config_json.file_types.unwrap_or(Vec::new()),
content_regex: Self::regex(config_json.content_regex),
_first_line_regex: Self::regex(config_json.first_line_regex),
injection_regex: Self::regex(config_json.injection_regex),
injections_filenames: config_json.injections.into_vec(),
locals_filenames: config_json.locals.into_vec(),
tags_filenames: config_json.tags.into_vec(),
highlights_filenames: config_json.highlights.into_vec(),
highlight_config: OnceCell::new(),
tags_config: OnceCell::new(),
highlight_names: &*self.highlight_names,
use_all_highlight_names: self.use_all_highlight_names,
};
for file_type in &configuration.file_types {
self.language_configuration_ids_by_file_type
.entry(file_type.to_string())
.or_insert(Vec::new())
.push(self.language_configurations.len());
}
self.language_configurations
.push(unsafe { mem::transmute(configuration) });
}
}
}
if self.language_configurations.len() == initial_language_configuration_count
&& parser_path.join("src").join("grammar.json").exists()
{
let configuration = LanguageConfiguration {
root_path: parser_path.to_owned(),
language_id: self.languages_by_id.len(),
file_types: Vec::new(),
scope: None,
content_regex: None,
_first_line_regex: None,
injection_regex: None,
injections_filenames: None,
locals_filenames: None,
highlights_filenames: None,
tags_filenames: None,
highlight_config: OnceCell::new(),
tags_config: OnceCell::new(),
highlight_names: &*self.highlight_names,
use_all_highlight_names: self.use_all_highlight_names,
};
self.language_configurations
.push(unsafe { mem::transmute(configuration) });
self.languages_by_id
.push((parser_path.to_owned(), OnceCell::new()));
}
Ok(&self.language_configurations[initial_language_configuration_count..])
}
fn regex(pattern: Option<String>) -> Option<Regex> {
pattern.and_then(|r| RegexBuilder::new(&r).multi_line(true).build().ok())
}
pub fn select_language(
&mut self,
path: &Path,
current_dir: &Path,
scope: Option<&str>,
) -> Result<Language> {
if let Some(scope) = scope {
if let Some(config) = self
.language_configuration_for_scope(scope)
.with_context(|| format!("Failed to load language for scope '{}'", scope))?
{
Ok(config.0)
} else {
return Err(anyhow!("Unknown scope '{}'", scope));
}
} else if let Some((lang, _)) = self
.language_configuration_for_file_name(path)
.with_context(|| {
format!(
"Failed to load language for file name {}",
&path.file_name().unwrap().to_string_lossy()
)
})?
{
Ok(lang)
} else if let Some(lang) = self
.languages_at_path(&current_dir)
.with_context(|| "Failed to load language in current directory")?
.first()
.cloned()
{
Ok(lang)
} else {
Err(anyhow!("No language found"))
}
}
pub fn use_debug_build(&mut self, flag: bool) {
self.debug_build = flag;
}
}
impl<'a> LanguageConfiguration<'a> {
pub fn highlight_config(&self, language: Language) -> Result<Option<&HighlightConfiguration>> {
return self
.highlight_config
.get_or_try_init(|| {
let (highlights_query, highlight_ranges) =
self.read_queries(&self.highlights_filenames, "highlights.scm")?;
let (injections_query, injection_ranges) =
self.read_queries(&self.injections_filenames, "injections.scm")?;
let (locals_query, locals_ranges) =
self.read_queries(&self.locals_filenames, "locals.scm")?;
if highlights_query.is_empty() {
Ok(None)
} else {
let mut result = HighlightConfiguration::new(
language,
&highlights_query,
&injections_query,
&locals_query,
)
.map_err(|error| match error.kind {
QueryErrorKind::Language => Error::from(error),
_ => {
if error.offset < injections_query.len() {
Self::include_path_in_query_error(
error,
&injection_ranges,
&injections_query,
0,
)
} else if error.offset < injections_query.len() + locals_query.len() {
Self::include_path_in_query_error(
error,
&locals_ranges,
&locals_query,
injections_query.len(),
)
} else {
Self::include_path_in_query_error(
error,
&highlight_ranges,
&highlights_query,
injections_query.len() + locals_query.len(),
)
}
}
})?;
let mut all_highlight_names = self.highlight_names.lock().unwrap();
if self.use_all_highlight_names {
for capture_name in result.query.capture_names() {
if !all_highlight_names.contains(capture_name) {
all_highlight_names.push(capture_name.clone());
}
}
}
result.configure(&all_highlight_names.as_slice());
Ok(Some(result))
}
})
.map(Option::as_ref);
}
pub fn tags_config(&self, language: Language) -> Result<Option<&TagsConfiguration>> {
self.tags_config
.get_or_try_init(|| {
let (tags_query, tags_ranges) =
self.read_queries(&self.tags_filenames, "tags.scm")?;
let (locals_query, locals_ranges) =
self.read_queries(&self.locals_filenames, "locals.scm")?;
if tags_query.is_empty() {
Ok(None)
} else {
TagsConfiguration::new(language, &tags_query, &locals_query)
.map(Some)
.map_err(|error| {
if let TagsError::Query(error) = error {
if error.offset < locals_query.len() {
Self::include_path_in_query_error(
error,
&locals_ranges,
&locals_query,
0,
)
} else {
Self::include_path_in_query_error(
error,
&tags_ranges,
&tags_query,
locals_query.len(),
)
}
.into()
} else {
error.into()
}
})
}
})
.map(Option::as_ref)
}
fn include_path_in_query_error<'b>(
mut error: QueryError,
ranges: &'b Vec<(String, Range<usize>)>,
source: &str,
start_offset: usize,
) -> Error {
let offset_within_section = error.offset - start_offset;
let (path, range) = ranges
.iter()
.find(|(_, range)| range.contains(&offset_within_section))
.unwrap();
error.offset = offset_within_section - range.start;
error.row = source[range.start..offset_within_section]
.chars()
.filter(|c| *c == '\n')
.count();
Error::from(error).context(format!("Error in query file {:?}", path))
}
fn read_queries(
&self,
paths: &Option<Vec<String>>,
default_path: &str,
) -> Result<(String, Vec<(String, Range<usize>)>)> {
let mut query = String::new();
let mut path_ranges = Vec::new();
if let Some(paths) = paths.as_ref() {
for path in paths {
let abs_path = self.root_path.join(path);
let prev_query_len = query.len();
query += &fs::read_to_string(&abs_path)
.with_context(|| format!("Failed to read query file {:?}", path))?;
path_ranges.push((path.clone(), prev_query_len..query.len()));
}
} else {
let queries_path = self.root_path.join("queries");
let path = queries_path.join(default_path);
if path.exists() {
query = fs::read_to_string(&path)
.with_context(|| format!("Failed to read query file {:?}", path))?;
path_ranges.push((default_path.to_string(), 0..query.len()));
}
}
Ok((query, path_ranges))
}
}
fn needs_recompile(
lib_path: &Path,
parser_c_path: &Path,
scanner_path: &Option<PathBuf>,
) -> Result<bool> {
if !lib_path.exists() {
return Ok(true);
}
let lib_mtime = mtime(lib_path)?;
if mtime(parser_c_path)? > lib_mtime {
return Ok(true);
}
if let Some(scanner_path) = scanner_path {
if mtime(scanner_path)? > lib_mtime {
return Ok(true);
}
}
Ok(false)
}
fn mtime(path: &Path) -> Result<SystemTime> {
Ok(fs::metadata(path)?.modified()?)
}
fn replace_dashes_with_underscores(name: &str) -> String {
let mut result = String::with_capacity(name.len());
for c in name.chars() {
if c == '-' {
result.push('_');
} else {
result.push(c);
}
}
result
}
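// Illustrative usage sketch (editor's addition, not part of the upstream crate):
// discovers grammars under the default parser directories and selects a language
// for a given file name. The caller-provided `path` is arbitrary.
#[allow(dead_code)]
fn load_language_for_file(path: &Path) -> Result<Option<Language>> {
    let mut loader = Loader::new()?;
    // Scan ~/github, ~/src, and ~/source for directories named `tree-sitter-*`.
    loader.find_all_languages(&Config::initial())?;
    // Match on file name or extension (and content regexes if several grammars apply),
    // compiling and dynamically loading the chosen parser on demand.
    Ok(loader
        .language_configuration_for_file_name(path)?
        .map(|(language, _configuration)| language))
}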

View File

@ -0,0 +1,5 @@
tree-sitter
tree-sitter.exe
*.gz
*.tgz
LICENSE

12
third-party/tree-sitter/tree-sitter/cli/npm/cli.js generated vendored Normal file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env node
const path = require('path');
const spawn = require("child_process").spawn;
const executable = process.platform === 'win32'
? 'tree-sitter.exe'
: 'tree-sitter';
spawn(
path.join(__dirname, executable),
process.argv.slice(2),
{stdio: 'inherit'}
).on('close', process.exit)

369
third-party/tree-sitter/tree-sitter/cli/npm/dsl.d.ts generated vendored Normal file
View File

@ -0,0 +1,369 @@
type AliasRule = {type: 'ALIAS'; named: boolean; content: Rule; value: string};
type BlankRule = {type: 'BLANK'};
type ChoiceRule = {type: 'CHOICE'; members: Rule[]};
type FieldRule = {type: 'FIELD'; name: string; content: Rule};
type ImmediateTokenRule = {type: 'IMMEDIATE_TOKEN'; content: Rule};
type PatternRule = {type: 'PATTERN'; value: string};
type PrecDynamicRule = {type: 'PREC_DYNAMIC'; content: Rule; value: number};
type PrecLeftRule = {type: 'PREC_LEFT'; content: Rule; value: number};
type PrecRightRule = {type: 'PREC_RIGHT'; content: Rule; value: number};
type PrecRule = {type: 'PREC'; content: Rule; value: number};
type Repeat1Rule = {type: 'REPEAT1'; content: Rule};
type RepeatRule = {type: 'REPEAT'; content: Rule};
type SeqRule = {type: 'SEQ'; members: Rule[]};
type StringRule = {type: 'STRING'; value: string};
type SymbolRule<Name extends string> = {type: 'SYMBOL'; name: Name};
type TokenRule = {type: 'TOKEN'; content: Rule};
type Rule =
| AliasRule
| BlankRule
| ChoiceRule
| FieldRule
| ImmediateTokenRule
| PatternRule
| PrecDynamicRule
| PrecLeftRule
| PrecRightRule
| PrecRule
| Repeat1Rule
| RepeatRule
| SeqRule
| StringRule
| SymbolRule<string>
| TokenRule;
type RuleOrLiteral = Rule | RegExp | string;
type GrammarSymbols<RuleName extends string> = {
[name in RuleName]: SymbolRule<name>;
} &
Record<string, SymbolRule<string>>;
type RuleBuilder<RuleName extends string> = (
$: GrammarSymbols<RuleName>,
) => RuleOrLiteral;
type RuleBuilders<
RuleName extends string,
BaseGrammarRuleName extends string
> = {
[name in RuleName]: RuleBuilder<RuleName | BaseGrammarRuleName>;
};
interface Grammar<
RuleName extends string,
BaseGrammarRuleName extends string = never,
Rules extends RuleBuilders<RuleName, BaseGrammarRuleName> = RuleBuilders<
RuleName,
BaseGrammarRuleName
>
> {
/**
* Name of the grammar language.
*/
name: string;
/** Mapping of grammar rule names to rule builder functions. */
rules: Rules;
/**
* An array of arrays of precedence names. Each inner array represents
* a *descending* ordering. Names listed earlier in one of these arrays
* have higher precedence than any names listed later in the same array.
*/
precedences?: () => String[][],
/**
* An array of arrays of rule names. Each inner array represents a set of
* rules that's involved in an _LR(1) conflict_ that is _intended to exist_
* in the grammar. When these conflicts occur at runtime, Tree-sitter will
* use the GLR algorithm to explore all of the possible interpretations. If
* _multiple_ parses end up succeeding, Tree-sitter will pick the subtree
* whose corresponding rule has the highest total _dynamic precedence_.
*
* @param $ grammar rules
*/
conflicts?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>,
) => RuleOrLiteral[][];
/**
* An array of token names which can be returned by an _external scanner_.
* External scanners allow you to write custom C code which runs during the
* lexing process in order to handle lexical rules (e.g. Python's indentation
* tokens) that cannot be described by regular expressions.
*
* @param $ grammar rules
* @param previous array of externals from the base schema, if any
*
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
*/
externals?: (
$: Record<string, SymbolRule<string>>,
previous: Rule[],
) => SymbolRule<string>[];
/**
* An array of tokens that may appear anywhere in the language. This
* is often used for whitespace and comments. The default value of
* extras is to accept whitespace. To control whitespace explicitly,
* specify extras: `$ => []` in your grammar.
*
* @param $ grammar rules
*/
extras?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>,
) => RuleOrLiteral[];
/**
* An array of rules that should be automatically removed from the
* grammar by replacing all of their usages with a copy of their definition.
* This is useful for rules that are used in multiple places but for which
* you don't want to create syntax tree nodes at runtime.
*
* @param $ grammar rules
*/
inline?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>,
) => RuleOrLiteral[];
/**
* A list of hidden rule names that should be considered supertypes in the
* generated node types file.
*
* @param $ grammar rules
*
* @see http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
*/
supertypes?: (
$: GrammarSymbols<RuleName | BaseGrammarRuleName>,
) => RuleOrLiteral[];
/**
* The name of a token that will match keywords for the purpose of the
* keyword extraction optimization.
*
* @param $ grammar rules
*
* @see https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction
*/
word?: ($: GrammarSymbols<RuleName | BaseGrammarRuleName>) => RuleOrLiteral;
}
type GrammarSchema<RuleName extends string> = {
[K in keyof Grammar<RuleName>]: K extends 'rules'
? Record<RuleName, Rule>
: Grammar<RuleName>[K];
};
/**
* Causes the given rule to appear with an alternative name in the syntax tree.
* For instance with `alias($.foo, 'bar')`, the aliased rule will appear as an
* anonymous node, as if the rule had been written as the simple string.
*
* @param rule rule that will be aliased
* @param name target name for the alias
*/
declare function alias(rule: RuleOrLiteral, name: string): AliasRule;
/**
* Causes the given rule to appear as an alternative named node, for instance
* with `alias($.foo, $.bar)`, the aliased rule `foo` will appear as a named
* node called `bar`.
*
* @param rule rule that will be aliased
* @param symbol target symbol for the alias
*/
declare function alias(
rule: RuleOrLiteral,
symbol: SymbolRule<string>,
): AliasRule;
/**
* Creates a blank rule, matching nothing.
*/
declare function blank(): BlankRule;
/**
* Assigns a field name to the child node(s) matched by the given rule.
* In the resulting syntax tree, you can then use that field name to
* access specific children.
*
* @param name name of the field
* @param rule rule the field should match
*/
declare function field(name: string, rule: RuleOrLiteral): FieldRule;
/**
* Creates a rule that matches one of a set of possible rules. The order
* of the arguments does not matter. This is analogous to the `|` (pipe)
* operator in EBNF notation.
*
* @param options possible rule choices
*/
declare function choice(...options: RuleOrLiteral[]): ChoiceRule;
/**
* Creates a rule that matches zero or one occurrence of a given rule.
* It is analogous to the `[x]` (square bracket) syntax in EBNF notation.
*
* @param rule rule to be made optional
*/
declare function optional(rule: RuleOrLiteral): ChoiceRule;
/**
* Marks the given rule with a precedence which will be used to resolve LR(1)
* conflicts at parser-generation time. When two rules overlap in a way that
* represents either a true ambiguity or a _local_ ambiguity given one token
* of lookahead, Tree-sitter will try to resolve the conflict by matching the
* rule with the higher precedence.
*
* Precedence values can either be strings or numbers. When comparing rules
* with numerical precedence, higher numbers indicate higher precedences. To
* compare rules with string precedence, Tree-sitter uses the grammar's `precedences`
* field.
*
* By default, the precedence of all rules is zero. This works similarly to the precedence directives in Yacc grammars.
*
* @param value precedence weight
* @param rule rule being weighted
*
* @see https://en.wikipedia.org/wiki/LR_parser#Conflicts_in_the_constructed_tables
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/
declare const prec: {
(value: String | number, rule: RuleOrLiteral): PrecRule;
/**
* Marks the given rule as left-associative (and optionally applies a
* numerical precedence). When an LR(1) conflict arises in which all of the
* rules have the same numerical precedence, Tree-sitter will consult the
* rules' associativity. If there is a left-associative rule, Tree-sitter
* will prefer matching a rule that ends _earlier_. This works similarly to
* associativity directives in Yacc grammars.
*
* @param value (optional) precedence weight
* @param rule rule to mark as left-associative
*
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/
left(rule: RuleOrLiteral): PrecLeftRule;
left(value: String | number, rule: RuleOrLiteral): PrecLeftRule;
/**
* Marks the given rule as right-associative (and optionally applies a
* numerical precedence). When an LR(1) conflict arises in which all of the
* rules have the same numerical precedence, Tree-sitter will consult the
* rules' associativity. If there is a right-associative rule, Tree-sitter
* will prefer matching a rule that ends _later_. This works similarly to
* associativity directives in Yacc grammars.
*
* @param value (optional) precedence weight
* @param rule rule to mark as right-associative
*
* @see https://docs.oracle.com/cd/E19504-01/802-5880/6i9k05dh3/index.html
*/
right(rule: RuleOrLiteral): PrecRightRule;
right(value: String | number, rule: RuleOrLiteral): PrecRightRule;
/**
* Marks the given rule with a numerical precedence which will be used to
* resolve LR(1) conflicts at _runtime_ instead of parser-generation time.
* This is only necessary when handling a conflict dynamically using the
* `conflicts` field in the grammar, and when there is a genuine _ambiguity_:
* multiple rules correctly match a given piece of code. In that event,
* Tree-sitter compares the total dynamic precedence associated with each
* rule, and selects the one with the highest total. This is similar to
* dynamic precedence directives in Bison grammars.
*
* @param value precedence weight
* @param rule rule being weighted
*
* @see https://www.gnu.org/software/bison/manual/html_node/Generalized-LR-Parsing.html
*/
dynamic(value: String | number, rule: RuleOrLiteral): PrecDynamicRule;
};
/**
* Creates a rule that matches _zero-or-more_ occurrences of a given rule.
* It is analogous to the `{x}` (curly brace) syntax in EBNF notation. This
* rule is implemented in terms of `repeat1` but is included because it
* is very commonly used.
*
* @param rule rule to repeat, zero or more times
*/
declare function repeat(rule: RuleOrLiteral): RepeatRule;
/**
* Creates a rule that matches one-or-more occurrences of a given rule.
*
* @param rule rule to repeat, one or more times
*/
declare function repeat1(rule: RuleOrLiteral): Repeat1Rule;
/**
* Creates a rule that matches any number of other rules, one after another.
* It is analogous to simply writing multiple symbols next to each other
* in EBNF notation.
*
* @param rules ordered rules that comprise the sequence
*/
declare function seq(...rules: RuleOrLiteral[]): SeqRule;
/**
* Creates a symbol rule, representing another rule in the grammar by name.
*
* @param name name of the target rule
*/
declare function sym<Name extends string>(name: Name): SymbolRule<Name>;
/**
* Marks the given rule as producing only a single token. Tree-sitter's
* default is to treat each String or RegExp literal in the grammar as a
* separate token. Each token is matched separately by the lexer and
* returned as its own leaf node in the tree. The token function allows
* you to express a complex rule using the DSL functions (rather
* than as a single regular expression) but still have Tree-sitter treat
* it as a single token.
*
* @param rule rule to represent as a single token
*/
declare const token: {
(rule: RuleOrLiteral): TokenRule;
/**
* Marks the given rule as producing an immediate token. This allows
* the parser to produce a different token based on whether or not
* there are `extras` preceding the token's main content. When there
* are _no_ leading `extras`, an immediate token is preferred over a
* normal token which would otherwise match.
*
* @param rule rule to represent as an immediate token
*/
immediate(rule: RuleOrLiteral): ImmediateTokenRule;
};
/**
* Creates a new language grammar with the provided schema.
*
* @param options grammar options
*/
declare function grammar<RuleName extends string>(
options: Grammar<RuleName>,
): GrammarSchema<RuleName>;
/**
* Extends an existing language grammar with the provided options,
* creating a new language.
*
* @param baseGrammar base grammar schema to extend from
* @param options grammar options for the new extended language
*/
declare function grammar<
BaseGrammarRuleName extends string,
RuleName extends string
>(
baseGrammar: GrammarSchema<BaseGrammarRuleName>,
options: Grammar<RuleName, BaseGrammarRuleName>,
): GrammarSchema<RuleName | BaseGrammarRuleName>;
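// Illustrative sketch (editor's addition, not part of the upstream file): a minimal
// grammar.js using the DSL declared above. All rule names here are hypothetical.
//
//   module.exports = grammar({
//     name: 'mini_lisp',
//     extras: $ => [/\s/, $.comment],
//     rules: {
//       source_file: $ => repeat($._form),
//       _form: $ => choice($.list, $.symbol, $.number),
//       list: $ => seq('(', field('head', optional($.symbol)), repeat($._form), ')'),
//       symbol: $ => /[a-zA-Z_][a-zA-Z0-9_!?-]*/,
//       number: $ => token(/[0-9]+/),
//       comment: $ => token(seq(';', /.*/)),
//     },
//   });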

93
third-party/tree-sitter/tree-sitter/cli/npm/install.js generated vendored Normal file
View File

@ -0,0 +1,93 @@
#!/usr/bin/env node
const fs = require('fs');
const zlib = require('zlib');
const http = require('http');
const https = require('https');
const packageJSON = require('./package.json');
// Determine the URL of the file.
const platformName = {
'darwin': 'macos',
'linux': 'linux',
'win32': 'windows'
}[process.platform];
let archName = {
'x64': 'x64',
'x86': 'x86',
'ia32': 'x86'
}[process.arch];
// ARM macs can run x64 binaries via Rosetta. Rely on that for now.
if (platformName === 'macos' && process.arch === 'arm64') {
archName = 'x64';
}
if (!platformName || !archName) {
console.error(
`Cannot install tree-sitter-cli for platform ${process.platform}, architecture ${process.arch}`
);
process.exit(1);
}
const releaseURL = `https://github.com/tree-sitter/tree-sitter/releases/download/v${packageJSON.version}`;
const assetName = `tree-sitter-${platformName}-${archName}.gz`;
const assetURL = `${releaseURL}/${assetName}`;
// Remove previously-downloaded files.
const executableName = process.platform === 'win32' ? 'tree-sitter.exe' : 'tree-sitter';
if (fs.existsSync(executableName)) {
fs.unlinkSync(executableName);
}
// Download the compressed file.
console.log(`Downloading ${assetURL}`);
const file = fs.createWriteStream(executableName);
get(assetURL, response => {
if (response.statusCode > 299) {
console.error([
'Download failed',
'',
`url: ${assetURL}`,
`status: ${response.statusCode}`,
`headers: ${JSON.stringify(response.headers, null, 2)}`,
'',
].join('\n'));
process.exit(1);
}
response.pipe(zlib.createGunzip()).pipe(file);
});
file.on('finish', () => {
fs.chmodSync(executableName, '755');
});
// Follow redirects.
function get(url, callback) {
const requestUrl = new URL(url)
let request = https
let requestConfig = requestUrl
const proxyEnv = process.env['HTTPS_PROXY'] || process.env['https_proxy']
if (proxyEnv) {
const proxyUrl = new URL(proxyEnv)
request = proxyUrl.protocol === 'https:' ? https : http
requestConfig = {
hostname: proxyUrl.hostname,
port: proxyUrl.port,
path: requestUrl.toString(),
headers: {
Host: requestUrl.hostname
}
}
}
request.get(requestConfig, response => {
if (response.statusCode === 301 || response.statusCode === 302) {
get(response.headers.location, callback);
} else {
callback(response);
}
});
}

View File

@ -0,0 +1,23 @@
{
"name": "tree-sitter-cli",
"version": "0.20.8",
"author": "Max Brunsfeld",
"license": "MIT",
"repository": {
"type": "git",
"url": "http://github.com/tree-sitter/tree-sitter.git"
},
"description": "CLI for generating fast incremental parsers",
"keywords": [
"parser",
"lexer"
],
"main": "lib/api/index.js",
"scripts": {
"install": "node install.js",
"prepack": "cp ../../LICENSE ."
},
"bin": {
"tree-sitter": "cli.js"
}
}

View File

@ -0,0 +1,154 @@
use super::write_file;
use anyhow::{Context, Result};
use std::path::{Path, PathBuf};
use std::{fs, str};
const BINDING_CC_TEMPLATE: &'static str = include_str!("./templates/binding.cc");
const BINDING_GYP_TEMPLATE: &'static str = include_str!("./templates/binding.gyp");
const INDEX_JS_TEMPLATE: &'static str = include_str!("./templates/index.js");
const LIB_RS_TEMPLATE: &'static str = include_str!("./templates/lib.rs");
const BUILD_RS_TEMPLATE: &'static str = include_str!("./templates/build.rs");
const CARGO_TOML_TEMPLATE: &'static str = include_str!("./templates/cargo.toml");
const PACKAGE_JSON_TEMPLATE: &'static str = include_str!("./templates/package.json");
const PARSER_NAME_PLACEHOLDER: &'static str = "PARSER_NAME";
const CLI_VERSION_PLACEHOLDER: &'static str = "CLI_VERSION";
const CLI_VERSION: &'static str = env!("CARGO_PKG_VERSION");
const RUST_BINDING_VERSION: &'static str = env!("RUST_BINDING_VERSION");
const RUST_BINDING_VERSION_PLACEHOLDER: &'static str = "RUST_BINDING_VERSION";
pub fn generate_binding_files(repo_path: &Path, language_name: &str) -> Result<()> {
let bindings_dir = repo_path.join("bindings");
let dashed_language_name = language_name.replace("_", "-");
let dashed_language_name = dashed_language_name.as_str();
// Generate rust bindings if needed.
let rust_binding_dir = bindings_dir.join("rust");
create_path(&rust_binding_dir, |path| create_dir(path))?;
create_path(&rust_binding_dir.join("lib.rs").to_owned(), |path| {
generate_file(path, LIB_RS_TEMPLATE, language_name)
})?;
create_path(&rust_binding_dir.join("build.rs").to_owned(), |path| {
generate_file(path, BUILD_RS_TEMPLATE, language_name)
})?;
create_path(&repo_path.join("Cargo.toml").to_owned(), |path| {
generate_file(path, CARGO_TOML_TEMPLATE, dashed_language_name)
})?;
// Generate node bindings
let node_binding_dir = bindings_dir.join("node");
create_path(&node_binding_dir, |path| create_dir(path))?;
create_path(&node_binding_dir.join("index.js").to_owned(), |path| {
generate_file(path, INDEX_JS_TEMPLATE, language_name)
})?;
create_path(&node_binding_dir.join("binding.cc").to_owned(), |path| {
generate_file(path, BINDING_CC_TEMPLATE, language_name)
})?;
// Create binding.gyp, or update it with new binding path.
let binding_gyp_path = repo_path.join("binding.gyp");
create_path_else(
&binding_gyp_path,
|path| generate_file(path, BINDING_GYP_TEMPLATE, language_name),
|path| {
let binding_gyp =
fs::read_to_string(path).with_context(|| "Failed to read binding.gyp")?;
let old_path = "\"src/binding.cc\"";
if binding_gyp.contains(old_path) {
eprintln!("Updating binding.gyp with new binding path");
let binding_gyp = binding_gyp.replace(old_path, "\"bindings/node/binding.cc\"");
write_file(path, binding_gyp)?;
}
Ok(())
},
)?;
// Create package.json, or update it with new binding path.
let package_json_path = repo_path.join("package.json");
create_path_else(
&package_json_path,
|path| generate_file(path, PACKAGE_JSON_TEMPLATE, dashed_language_name),
|path| {
let package_json_str =
fs::read_to_string(path).with_context(|| "Failed to read package.json")?;
let mut package_json =
serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&package_json_str,
)
.with_context(|| "Failed to parse package.json")?;
let package_json_main = package_json.get("main");
let package_json_needs_update = package_json_main.map_or(true, |v| {
let main_string = v.as_str();
main_string == Some("index.js") || main_string == Some("./index.js")
});
if package_json_needs_update {
eprintln!("Updating package.json with new binding path");
package_json.insert(
"main".to_string(),
serde_json::Value::String("bindings/node".to_string()),
);
let mut package_json_str = serde_json::to_string_pretty(&package_json)?;
package_json_str.push('\n');
write_file(path, package_json_str)?;
}
Ok(())
},
)?;
// Remove files from old node binding paths.
let old_index_js_path = repo_path.join("index.js");
let old_binding_cc_path = repo_path.join("src").join("binding.cc");
if old_index_js_path.exists() {
fs::remove_file(old_index_js_path).ok();
}
if old_binding_cc_path.exists() {
fs::remove_file(old_binding_cc_path).ok();
}
Ok(())
}
fn generate_file(path: &Path, template: &str, language_name: &str) -> Result<()> {
write_file(
path,
template
.replace(PARSER_NAME_PLACEHOLDER, language_name)
.replace(CLI_VERSION_PLACEHOLDER, CLI_VERSION)
.replace(RUST_BINDING_VERSION_PLACEHOLDER, RUST_BINDING_VERSION),
)
}
fn create_dir(path: &Path) -> Result<()> {
fs::create_dir_all(&path)
.with_context(|| format!("Failed to create {:?}", path.to_string_lossy()))
}
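// Runs `action` only when `path` does not exist yet; returns whether anything was created.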
fn create_path<F>(path: &PathBuf, action: F) -> Result<bool>
where
F: Fn(&PathBuf) -> Result<()>,
{
if !path.exists() {
action(path)?;
return Ok(true);
}
Ok(false)
}
fn create_path_else<T, F>(path: &PathBuf, action: T, else_action: F) -> Result<bool>
where
T: Fn(&PathBuf) -> Result<()>,
F: Fn(&PathBuf) -> Result<()>,
{
if !path.exists() {
action(path)?;
return Ok(true);
} else {
else_action(path)?;
}
Ok(false)
}

View File

@ -0,0 +1,379 @@
use super::coincident_tokens::CoincidentTokenIndex;
use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::NfaCursor;
use crate::generate::rules::{Symbol, TokenSet};
use crate::generate::tables::{AdvanceAction, LexState, LexTable, ParseStateId, ParseTable};
use log::info;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, VecDeque};
use std::mem;
pub(crate) fn build_lex_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
keywords: &TokenSet,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
) -> (LexTable, LexTable) {
let keyword_lex_table;
if syntax_grammar.word_token.is_some() {
let mut builder = LexTableBuilder::new(lexical_grammar);
builder.add_state_for_tokens(keywords);
keyword_lex_table = builder.table;
} else {
keyword_lex_table = LexTable::default();
}
let mut parse_state_ids_by_token_set: Vec<(TokenSet, Vec<ParseStateId>)> = Vec::new();
for (i, state) in parse_table.states.iter().enumerate() {
let tokens = state
.terminal_entries
.keys()
.filter_map(|token| {
if token.is_terminal() {
if keywords.contains(&token) {
syntax_grammar.word_token
} else {
Some(*token)
}
} else if token.is_eof() {
Some(*token)
} else {
None
}
})
.collect();
let mut did_merge = false;
for entry in parse_state_ids_by_token_set.iter_mut() {
if merge_token_set(
&mut entry.0,
&tokens,
lexical_grammar,
token_conflict_map,
coincident_token_index,
) {
did_merge = true;
entry.1.push(i);
break;
}
}
if !did_merge {
parse_state_ids_by_token_set.push((tokens, vec![i]));
}
}
let mut builder = LexTableBuilder::new(lexical_grammar);
for (tokens, parse_state_ids) in parse_state_ids_by_token_set {
let lex_state_id = builder.add_state_for_tokens(&tokens);
for id in parse_state_ids {
parse_table.states[id].lex_state_id = lex_state_id;
}
}
let mut table = builder.table;
minimize_lex_table(&mut table, parse_table);
sort_states(&mut table, parse_table);
(table, keyword_lex_table)
}
struct QueueEntry {
state_id: usize,
nfa_states: Vec<u32>,
eof_valid: bool,
}
struct LexTableBuilder<'a> {
lexical_grammar: &'a LexicalGrammar,
cursor: NfaCursor<'a>,
table: LexTable,
state_queue: VecDeque<QueueEntry>,
state_ids_by_nfa_state_set: HashMap<(Vec<u32>, bool), usize>,
}
impl<'a> LexTableBuilder<'a> {
fn new(lexical_grammar: &'a LexicalGrammar) -> Self {
Self {
lexical_grammar,
cursor: NfaCursor::new(&lexical_grammar.nfa, vec![]),
table: LexTable::default(),
state_queue: VecDeque::new(),
state_ids_by_nfa_state_set: HashMap::new(),
}
}
fn add_state_for_tokens(&mut self, tokens: &TokenSet) -> usize {
let mut eof_valid = false;
let nfa_states = tokens
.iter()
.filter_map(|token| {
if token.is_terminal() {
Some(self.lexical_grammar.variables[token.index].start_state)
} else {
eof_valid = true;
None
}
})
.collect();
let (state_id, is_new) = self.add_state(nfa_states, eof_valid);
if is_new {
info!(
"entry point state: {}, tokens: {:?}",
state_id,
tokens
.iter()
.map(|t| &self.lexical_grammar.variables[t.index].name)
.collect::<Vec<_>>()
);
}
while let Some(QueueEntry {
state_id,
nfa_states,
eof_valid,
}) = self.state_queue.pop_front()
{
self.populate_state(state_id, nfa_states, eof_valid);
}
state_id
}
fn add_state(&mut self, nfa_states: Vec<u32>, eof_valid: bool) -> (usize, bool) {
self.cursor.reset(nfa_states);
match self
.state_ids_by_nfa_state_set
.entry((self.cursor.state_ids.clone(), eof_valid))
{
Entry::Occupied(o) => (*o.get(), false),
Entry::Vacant(v) => {
let state_id = self.table.states.len();
self.table.states.push(LexState::default());
self.state_queue.push_back(QueueEntry {
state_id,
nfa_states: v.key().0.clone(),
eof_valid,
});
v.insert(state_id);
(state_id, true)
}
}
}
fn populate_state(&mut self, state_id: usize, nfa_states: Vec<u32>, eof_valid: bool) {
self.cursor.force_reset(nfa_states);
// The EOF state is represented as an empty list of NFA states.
let mut completion = None;
for (id, prec) in self.cursor.completions() {
if let Some((prev_id, prev_precedence)) = completion {
if TokenConflictMap::prefer_token(
self.lexical_grammar,
(prev_precedence, prev_id),
(prec, id),
) {
continue;
}
}
completion = Some((id, prec));
}
let transitions = self.cursor.transitions();
let has_sep = self.cursor.transition_chars().any(|(_, sep)| sep);
// If EOF is a valid lookahead token, add a transition predicated on the null
// character that leads to the empty set of NFA states.
if eof_valid {
let (next_state_id, _) = self.add_state(Vec::new(), false);
self.table.states[state_id].eof_action = Some(AdvanceAction {
state: next_state_id,
in_main_token: true,
});
}
for transition in transitions {
if let Some((completed_id, completed_precedence)) = completion {
if !TokenConflictMap::prefer_transition(
&self.lexical_grammar,
&transition,
completed_id,
completed_precedence,
has_sep,
) {
continue;
}
}
let (next_state_id, _) =
self.add_state(transition.states, eof_valid && transition.is_separator);
self.table.states[state_id].advance_actions.push((
transition.characters,
AdvanceAction {
state: next_state_id,
in_main_token: !transition.is_separator,
},
));
}
if let Some((complete_id, _)) = completion {
self.table.states[state_id].accept_action = Some(Symbol::terminal(complete_id));
} else if self.cursor.state_ids.is_empty() {
self.table.states[state_id].accept_action = Some(Symbol::end());
}
}
}
fn merge_token_set(
tokens: &mut TokenSet,
other: &TokenSet,
lexical_grammar: &LexicalGrammar,
token_conflict_map: &TokenConflictMap,
coincident_token_index: &CoincidentTokenIndex,
) -> bool {
for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i);
let set_without_terminal = match (tokens.contains_terminal(i), other.contains_terminal(i)) {
(true, false) => other,
(false, true) => tokens,
_ => continue,
};
for existing_token in set_without_terminal.terminals() {
if token_conflict_map.does_conflict(i, existing_token.index)
|| token_conflict_map.does_match_prefix(i, existing_token.index)
{
return false;
}
if !coincident_token_index.contains(symbol, existing_token) {
if token_conflict_map.does_overlap(existing_token.index, i)
|| token_conflict_map.does_overlap(i, existing_token.index)
{
return false;
}
}
}
}
tokens.insert_all(other);
true
}
fn minimize_lex_table(table: &mut LexTable, parse_table: &mut ParseTable) {
// Initially group the states by their accept action and their
// valid lookahead characters.
let mut state_ids_by_signature = HashMap::new();
for (i, state) in table.states.iter().enumerate() {
let signature = (
i == 0,
state.accept_action,
state.eof_action.is_some(),
state
.advance_actions
.iter()
.map(|(characters, action)| (characters.clone(), action.in_main_token))
.collect::<Vec<_>>(),
);
state_ids_by_signature
.entry(signature)
.or_insert(Vec::new())
.push(i);
}
let mut state_ids_by_group_id = state_ids_by_signature
.into_iter()
.map(|e| e.1)
.collect::<Vec<_>>();
state_ids_by_group_id.sort();
let error_group_index = state_ids_by_group_id
.iter()
.position(|g| g.contains(&0))
.unwrap();
state_ids_by_group_id.swap(error_group_index, 0);
let mut group_ids_by_state_id = vec![0; table.states.len()];
for (group_id, state_ids) in state_ids_by_group_id.iter().enumerate() {
for state_id in state_ids {
group_ids_by_state_id[*state_id] = group_id;
}
}
while split_state_id_groups(
&table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
1,
lex_states_differ,
) {
continue;
}
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id {
let mut new_state = LexState::default();
mem::swap(&mut new_state, &mut table.states[state_ids[0]]);
for (_, advance_action) in new_state.advance_actions.iter_mut() {
advance_action.state = group_ids_by_state_id[advance_action.state];
}
if let Some(eof_action) = &mut new_state.eof_action {
eof_action.state = group_ids_by_state_id[eof_action.state];
}
new_states.push(new_state);
}
for state in parse_table.states.iter_mut() {
state.lex_state_id = group_ids_by_state_id[state.lex_state_id];
}
table.states = new_states;
}
fn lex_states_differ(
left: &LexState,
right: &LexState,
group_ids_by_state_id: &Vec<usize>,
) -> bool {
left.advance_actions
.iter()
.zip(right.advance_actions.iter())
.any(|(left, right)| {
group_ids_by_state_id[left.1.state] != group_ids_by_state_id[right.1.state]
})
}
fn sort_states(table: &mut LexTable, parse_table: &mut ParseTable) {
// Get a mapping of old state index -> new_state_index
let mut old_ids_by_new_id = (0..table.states.len()).collect::<Vec<_>>();
old_ids_by_new_id[1..].sort_by_key(|id| &table.states[*id]);
// Get the inverse mapping
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
new_ids_by_old_id[*old_id] = id;
}
// Reorder the lex states and update their references to reflect
// the new ordering.
table.states = old_ids_by_new_id
.iter()
.map(|old_id| {
let mut state = LexState::default();
mem::swap(&mut state, &mut table.states[*old_id]);
for (_, advance_action) in state.advance_actions.iter_mut() {
advance_action.state = new_ids_by_old_id[advance_action.state];
}
if let Some(eof_action) = &mut state.eof_action {
eof_action.state = new_ids_by_old_id[eof_action.state];
}
state
})
.collect();
// Update the parse table's lex state references
for state in parse_table.states.iter_mut() {
state.lex_state_id = new_ids_by_old_id[state.lex_state_id];
}
}
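// A minimal sketch (as a runnable test) of the signature-based grouping that
// seeds `minimize_lex_table` above: states with identical signatures start in
// the same group and are renumbered to their group id. The signatures used
// here are hypothetical stand-ins, not real lex-state data.
#[test]
fn example_group_states_by_signature() {
    use std::collections::HashMap;
    // Pretend signature per state: (has accept action, lookahead character class).
    let signatures = vec![(false, 'a'), (true, 'b'), (false, 'a'), (true, 'b')];
    // Group state ids by signature, mirroring `state_ids_by_signature`.
    let mut state_ids_by_signature: HashMap<(bool, char), Vec<usize>> = HashMap::new();
    for (i, signature) in signatures.iter().enumerate() {
        state_ids_by_signature
            .entry(*signature)
            .or_insert(Vec::new())
            .push(i);
    }
    // Renumber every state to its group id, mirroring `group_ids_by_state_id`.
    let mut groups: Vec<Vec<usize>> = state_ids_by_signature.into_values().collect();
    groups.sort();
    let mut group_ids_by_state_id = vec![0; signatures.len()];
    for (group_id, state_ids) in groups.iter().enumerate() {
        for state_id in state_ids {
            group_ids_by_state_id[*state_id] = group_id;
        }
    }
    assert_eq!(group_ids_by_state_id, vec![0, 1, 0, 1]);
}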

View File

@ -0,0 +1,997 @@
use super::item::{ParseItem, ParseItemSet, ParseItemSetCore};
use super::item_set_builder::ParseItemSetBuilder;
use crate::generate::grammars::PrecedenceEntry;
use crate::generate::grammars::{
InlinedProductionMap, LexicalGrammar, SyntaxGrammar, VariableType,
};
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{
FieldLocation, GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
ProductionInfo, ProductionInfoId,
};
use anyhow::{anyhow, Result};
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Write;
use std::hash::BuildHasherDefault;
use std::u32;
use indexmap::{map::Entry, IndexMap};
use rustc_hash::FxHasher;
// For conflict reporting, each parse state is associated with an example
// sequence of symbols that could lead to that parse state.
type SymbolSequence = Vec<Symbol>;
type AuxiliarySymbolSequence = Vec<AuxiliarySymbolInfo>;
pub(crate) type ParseStateInfo<'a> = (SymbolSequence, ParseItemSet<'a>);
#[derive(Clone)]
struct AuxiliarySymbolInfo {
auxiliary_symbol: Symbol,
parent_symbols: Vec<Symbol>,
}
#[derive(Debug, Default)]
struct ReductionInfo {
precedence: Precedence,
symbols: Vec<Symbol>,
has_left_assoc: bool,
has_right_assoc: bool,
has_non_assoc: bool,
}
struct ParseStateQueueEntry {
state_id: ParseStateId,
preceding_auxiliary_symbols: AuxiliarySymbolSequence,
}
struct ParseTableBuilder<'a> {
item_set_builder: ParseItemSetBuilder<'a>,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
variable_info: &'a Vec<VariableInfo>,
core_ids_by_core: HashMap<ParseItemSetCore<'a>, usize>,
state_ids_by_item_set: IndexMap<ParseItemSet<'a>, ParseStateId, BuildHasherDefault<FxHasher>>,
parse_state_info_by_id: Vec<ParseStateInfo<'a>>,
parse_state_queue: VecDeque<ParseStateQueueEntry>,
non_terminal_extra_states: Vec<(Symbol, usize)>,
parse_table: ParseTable,
}
impl<'a> ParseTableBuilder<'a> {
fn build(mut self) -> Result<(ParseTable, Vec<ParseStateInfo<'a>>)> {
// Ensure that the empty alias sequence has index 0.
self.parse_table
.production_infos
.push(ProductionInfo::default());
// Add the error state at index 0.
self.add_parse_state(&Vec::new(), &Vec::new(), ParseItemSet::default());
// Add the starting state at index 1.
self.add_parse_state(
&Vec::new(),
&Vec::new(),
ParseItemSet::with(
[(
ParseItem::start(),
[Symbol::end()].iter().cloned().collect(),
)]
.iter()
.cloned(),
),
);
// Compute the possible item sets for non-terminal extras.
let mut non_terminal_extra_item_sets_by_first_terminal = BTreeMap::new();
for extra_non_terminal in self
.syntax_grammar
.extra_symbols
.iter()
.filter(|s| s.is_non_terminal())
{
let variable = &self.syntax_grammar.variables[extra_non_terminal.index];
for production in &variable.productions {
non_terminal_extra_item_sets_by_first_terminal
.entry(production.first_symbol().unwrap())
.or_insert(ParseItemSet::default())
.insert(
ParseItem {
variable_index: extra_non_terminal.index as u32,
production,
step_index: 1,
has_preceding_inherited_fields: false,
},
&[Symbol::end_of_nonterminal_extra()]
.iter()
.cloned()
.collect(),
);
}
}
// Add a state for each starting terminal of a non-terminal extra rule.
for (terminal, item_set) in non_terminal_extra_item_sets_by_first_terminal {
self.non_terminal_extra_states
.push((terminal, self.parse_table.states.len()));
self.add_parse_state(&Vec::new(), &Vec::new(), item_set);
}
while let Some(entry) = self.parse_state_queue.pop_front() {
let item_set = self
.item_set_builder
.transitive_closure(&self.parse_state_info_by_id[entry.state_id].1);
self.add_actions(
self.parse_state_info_by_id[entry.state_id].0.clone(),
entry.preceding_auxiliary_symbols,
entry.state_id,
item_set,
)?;
}
Ok((self.parse_table, self.parse_state_info_by_id))
}
fn add_parse_state(
&mut self,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &AuxiliarySymbolSequence,
item_set: ParseItemSet<'a>,
) -> ParseStateId {
match self.state_ids_by_item_set.entry(item_set) {
// If an equivalent item set has already been processed, then return
// the existing parse state index.
Entry::Occupied(o) => *o.get(),
// Otherwise, insert a new parse state and add it to the queue of
// parse states to populate.
Entry::Vacant(v) => {
let core = v.key().core();
let core_count = self.core_ids_by_core.len();
let core_id = *self.core_ids_by_core.entry(core).or_insert(core_count);
let state_id = self.parse_table.states.len();
self.parse_state_info_by_id
.push((preceding_symbols.clone(), v.key().clone()));
self.parse_table.states.push(ParseState {
id: state_id,
lex_state_id: 0,
external_lex_state_id: 0,
terminal_entries: IndexMap::default(),
nonterminal_entries: IndexMap::default(),
core_id,
});
self.parse_state_queue.push_back(ParseStateQueueEntry {
state_id,
preceding_auxiliary_symbols: preceding_auxiliary_symbols.clone(),
});
v.insert(state_id);
state_id
}
}
}
fn add_actions(
&mut self,
mut preceding_symbols: SymbolSequence,
mut preceding_auxiliary_symbols: Vec<AuxiliarySymbolInfo>,
state_id: ParseStateId,
item_set: ParseItemSet<'a>,
) -> Result<()> {
let mut terminal_successors = BTreeMap::new();
let mut non_terminal_successors = BTreeMap::new();
let mut lookaheads_with_conflicts = TokenSet::new();
let mut reduction_infos = HashMap::<Symbol, ReductionInfo>::new();
// Each item in the item set contributes to either a Shift action or a Reduce
// action in this state.
for (item, lookaheads) in &item_set.entries {
// If the item is unfinished, then this state has a transition for the item's
// next symbol. Advance the item to its next step and insert the resulting
// item into the successor item set.
if let Some(next_symbol) = item.symbol() {
let mut successor = item.successor();
if next_symbol.is_non_terminal() {
let variable = &self.syntax_grammar.variables[next_symbol.index];
// Keep track of where auxiliary non-terminals (repeat symbols) are
// used within visible symbols. This information may be needed later
// for conflict resolution.
if variable.is_auxiliary() {
preceding_auxiliary_symbols
.push(self.get_auxiliary_node_info(&item_set, next_symbol));
}
// For most parse items, the symbols associated with the preceding children
// don't matter: they have no effect on the REDUCE action that would be
// performed at the end of the item. But the symbols *do* matter for
// children that are hidden and have fields, because those fields are
// "inherited" by the parent node.
//
// If this item has consumed a hidden child with fields, then the symbols
// of its preceding children need to be taken into account when comparing
// it with other items.
if variable.is_hidden()
&& !self.variable_info[next_symbol.index].fields.is_empty()
{
successor.has_preceding_inherited_fields = true;
}
non_terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.insert(successor, lookaheads);
} else {
terminal_successors
.entry(next_symbol)
.or_insert_with(|| ParseItemSet::default())
.insert(successor, lookaheads);
}
}
// If the item is finished, then add a Reduce action to this state based
// on this item.
else {
let symbol = Symbol::non_terminal(item.variable_index as usize);
let action = if item.is_augmented() {
ParseAction::Accept
} else {
ParseAction::Reduce {
symbol,
child_count: item.step_index as usize,
dynamic_precedence: item.production.dynamic_precedence,
production_id: self.get_production_id(item),
}
};
let precedence = item.precedence();
let associativity = item.associativity();
for lookahead in lookaheads.iter() {
let table_entry = self.parse_table.states[state_id]
.terminal_entries
.entry(lookahead)
.or_insert_with(|| ParseTableEntry::new());
let reduction_info = reduction_infos.entry(lookahead).or_default();
// While inserting Reduce actions, eagerly resolve conflicts related
// to precedence: avoid inserting lower-precedence reductions, and
// clear the action list when inserting higher-precedence reductions.
if table_entry.actions.is_empty() {
table_entry.actions.push(action);
} else {
match Self::compare_precedence(
&self.syntax_grammar,
precedence,
&[symbol],
&reduction_info.precedence,
&reduction_info.symbols,
) {
Ordering::Greater => {
table_entry.actions.clear();
table_entry.actions.push(action);
lookaheads_with_conflicts.remove(&lookahead);
*reduction_info = ReductionInfo::default();
}
Ordering::Equal => {
table_entry.actions.push(action);
lookaheads_with_conflicts.insert(lookahead);
}
Ordering::Less => continue,
}
}
reduction_info.precedence = precedence.clone();
if let Err(i) = reduction_info.symbols.binary_search(&symbol) {
reduction_info.symbols.insert(i, symbol);
}
match associativity {
Some(Associativity::Left) => reduction_info.has_left_assoc = true,
Some(Associativity::Right) => reduction_info.has_right_assoc = true,
None => reduction_info.has_non_assoc = true,
}
}
}
}
// Having computed the successor item sets for each symbol, add a new
// parse state for each of these item sets, and add a corresponding Shift
// action to this state.
for (symbol, next_item_set) in terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
let entry = self.parse_table.states[state_id]
.terminal_entries
.entry(symbol);
if let Entry::Occupied(e) = &entry {
if !e.get().actions.is_empty() {
lookaheads_with_conflicts.insert(symbol);
}
}
entry
.or_insert_with(|| ParseTableEntry::new())
.actions
.push(ParseAction::Shift {
state: next_state_id,
is_repetition: false,
});
}
for (symbol, next_item_set) in non_terminal_successors {
preceding_symbols.push(symbol);
let next_state_id = self.add_parse_state(
&preceding_symbols,
&preceding_auxiliary_symbols,
next_item_set,
);
preceding_symbols.pop();
self.parse_table.states[state_id]
.nonterminal_entries
.insert(symbol, GotoAction::Goto(next_state_id));
}
// For any symbol with multiple actions, perform conflict resolution.
// This will either
// * choose one action over the others using precedence or associativity
// * keep multiple actions if this conflict has been whitelisted in the grammar
// * fail, terminating the parser generation process
for symbol in lookaheads_with_conflicts.iter() {
self.handle_conflict(
&item_set,
state_id,
&preceding_symbols,
&preceding_auxiliary_symbols,
symbol,
reduction_infos.get(&symbol).unwrap(),
)?;
}
// Finally, add actions for the grammar's `extra` symbols.
let state = &mut self.parse_table.states[state_id];
let is_end_of_non_terminal_extra = state.is_end_of_non_terminal_extra();
// If this state represents the end of a non-terminal extra rule, then make sure that
// it doesn't have other successor states. Non-terminal extra rules must have
// unambiguous endings.
if is_end_of_non_terminal_extra {
if state.terminal_entries.len() > 1 {
let parent_symbols = item_set
.entries
.iter()
.filter_map(|(item, _)| {
if !item.is_augmented() && item.step_index > 0 {
Some(item.variable_index)
} else {
None
}
})
.collect::<HashSet<_>>();
let mut message =
"Extra rules must have unambiguous endings. Conflicting rules: ".to_string();
for (i, variable_index) in parent_symbols.iter().enumerate() {
if i > 0 {
message += ", ";
}
message += &self.syntax_grammar.variables[*variable_index as usize].name;
}
return Err(anyhow!(message));
}
}
// Add actions for the start tokens of each non-terminal extra rule.
else {
for (terminal, state_id) in &self.non_terminal_extra_states {
state
.terminal_entries
.entry(*terminal)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::Shift {
state: *state_id,
is_repetition: false,
}],
});
}
// Add ShiftExtra actions for the terminal extra tokens. These actions
// are added to every state except for those at the ends of non-terminal
// extras.
for extra_token in &self.syntax_grammar.extra_symbols {
if extra_token.is_non_terminal() {
state
.nonterminal_entries
.insert(*extra_token, GotoAction::ShiftExtra);
} else {
state
.terminal_entries
.entry(*extra_token)
.or_insert(ParseTableEntry {
reusable: true,
actions: vec![ParseAction::ShiftExtra],
});
}
}
}
Ok(())
}
fn handle_conflict(
&mut self,
item_set: &ParseItemSet,
state_id: ParseStateId,
preceding_symbols: &SymbolSequence,
preceding_auxiliary_symbols: &Vec<AuxiliarySymbolInfo>,
conflicting_lookahead: Symbol,
reduction_info: &ReductionInfo,
) -> Result<()> {
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
// Determine which items in the set conflict with each other, and the
// precedences associated with SHIFT vs REDUCE actions. There won't
// be multiple REDUCE actions with different precedences; that is
// sorted out ahead of time in `add_actions`. But there can still be
// REDUCE-REDUCE conflicts where all actions have the *same*
// precedence, and there can still be SHIFT/REDUCE conflicts.
let mut considered_associativity = false;
let mut shift_precedence: Vec<(&Precedence, Symbol)> = Vec::new();
let mut conflicting_items = HashSet::new();
for (item, lookaheads) in &item_set.entries {
if let Some(step) = item.step() {
if item.step_index > 0 {
if self
.item_set_builder
.first_set(&step.symbol)
.contains(&conflicting_lookahead)
{
if item.variable_index != u32::MAX {
conflicting_items.insert(item);
}
let p = (
item.precedence(),
Symbol::non_terminal(item.variable_index as usize),
);
if let Err(i) = shift_precedence.binary_search(&p) {
shift_precedence.insert(i, p);
}
}
}
} else if lookaheads.contains(&conflicting_lookahead) {
if item.variable_index != u32::MAX {
conflicting_items.insert(item);
}
}
}
if let ParseAction::Shift { is_repetition, .. } = entry.actions.last_mut().unwrap() {
// If all of the items in the conflict have the same parent symbol,
// and that parent symbol is auxiliary, then this is just the intentional
// ambiguity associated with a repeat rule. Resolve that class of ambiguity
// by leaving it in the parse table, but marking the SHIFT action with
// an `is_repetition` flag.
let conflicting_variable_index =
conflicting_items.iter().next().unwrap().variable_index;
if self.syntax_grammar.variables[conflicting_variable_index as usize].is_auxiliary()
&& conflicting_items
.iter()
.all(|item| item.variable_index == conflicting_variable_index)
{
*is_repetition = true;
return Ok(());
}
// If the SHIFT action has higher precedence, remove all the REDUCE actions.
let mut shift_is_less = false;
let mut shift_is_more = false;
for p in shift_precedence {
match Self::compare_precedence(
&self.syntax_grammar,
p.0,
&[p.1],
&reduction_info.precedence,
&reduction_info.symbols,
) {
Ordering::Greater => shift_is_more = true,
Ordering::Less => shift_is_less = true,
Ordering::Equal => {}
}
}
if shift_is_more && !shift_is_less {
entry.actions.drain(0..entry.actions.len() - 1);
}
// If the REDUCE actions have higher precedence, remove the SHIFT action.
else if shift_is_less && !shift_is_more {
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
// If the SHIFT and REDUCE actions have the same precedence, consider
// the REDUCE actions' associativity.
else if !shift_is_less && !shift_is_more {
considered_associativity = true;
// If all Reduce actions are left associative, remove the SHIFT action.
// If all Reduce actions are right associative, remove the REDUCE actions.
match (
reduction_info.has_left_assoc,
reduction_info.has_non_assoc,
reduction_info.has_right_assoc,
) {
(true, false, false) => {
entry.actions.pop();
conflicting_items.retain(|item| item.is_done());
}
(false, false, true) => {
entry.actions.drain(0..entry.actions.len() - 1);
}
_ => {}
}
}
}
// If all of the actions but one have been eliminated, then there's no problem.
let entry = self.parse_table.states[state_id]
.terminal_entries
.get_mut(&conflicting_lookahead)
.unwrap();
if entry.actions.len() == 1 {
return Ok(());
}
// Determine the set of parent symbols involved in this conflict.
let mut actual_conflict = Vec::new();
for item in &conflicting_items {
let symbol = Symbol::non_terminal(item.variable_index as usize);
if self.syntax_grammar.variables[symbol.index].is_auxiliary() {
actual_conflict.extend(
preceding_auxiliary_symbols
.iter()
.rev()
.find_map(|info| {
if info.auxiliary_symbol == symbol {
Some(&info.parent_symbols)
} else {
None
}
})
.unwrap()
.iter(),
);
} else {
actual_conflict.push(symbol);
}
}
actual_conflict.sort_unstable();
actual_conflict.dedup();
// If this set of symbols has been whitelisted, then there's no error.
if self
.syntax_grammar
.expected_conflicts
.contains(&actual_conflict)
{
return Ok(());
}
let mut msg = "Unresolved conflict for symbol sequence:\n\n".to_string();
for symbol in preceding_symbols {
write!(&mut msg, " {}", self.symbol_name(symbol)).unwrap();
}
write!(
&mut msg,
" • {} …\n\n",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
write!(&mut msg, "Possible interpretations:\n\n").unwrap();
let mut interpretations = conflicting_items
.iter()
.map(|item| {
let mut line = String::new();
for preceding_symbol in preceding_symbols
.iter()
.take(preceding_symbols.len() - item.step_index as usize)
{
write!(&mut line, " {}", self.symbol_name(preceding_symbol)).unwrap();
}
write!(
&mut line,
" ({}",
&self.syntax_grammar.variables[item.variable_index as usize].name
)
.unwrap();
for (j, step) in item.production.steps.iter().enumerate() {
if j as u32 == item.step_index {
write!(&mut line, " •").unwrap();
}
write!(&mut line, " {}", self.symbol_name(&step.symbol)).unwrap();
}
write!(&mut line, ")").unwrap();
if item.is_done() {
write!(
&mut line,
" • {} …",
self.symbol_name(&conflicting_lookahead)
)
.unwrap();
}
let precedence = item.precedence();
let associativity = item.associativity();
let prec_line = if let Some(associativity) = associativity {
Some(format!(
"(precedence: {}, associativity: {:?})",
precedence, associativity
))
} else if !precedence.is_none() {
Some(format!("(precedence: {})", precedence))
} else {
None
};
(line, prec_line)
})
.collect::<Vec<_>>();
let max_interpretation_length = interpretations
.iter()
.map(|i| i.0.chars().count())
.max()
.unwrap();
interpretations.sort_unstable();
for (i, (line, prec_suffix)) in interpretations.into_iter().enumerate() {
write!(&mut msg, " {}:", i + 1).unwrap();
msg += &line;
if let Some(prec_suffix) = prec_suffix {
for _ in line.chars().count()..max_interpretation_length {
msg.push(' ');
}
msg += " ";
msg += &prec_suffix;
}
msg.push('\n');
}
let mut resolution_count = 0;
write!(&mut msg, "\nPossible resolutions:\n\n").unwrap();
let mut shift_items = Vec::new();
let mut reduce_items = Vec::new();
for item in conflicting_items {
if item.is_done() {
reduce_items.push(item);
} else {
shift_items.push(item);
}
}
shift_items.sort_unstable();
reduce_items.sort_unstable();
let list_rule_names = |mut msg: &mut String, items: &[&ParseItem]| {
let mut last_rule_id = None;
for item in items {
if last_rule_id == Some(item.variable_index) {
continue;
}
if last_rule_id.is_some() {
write!(&mut msg, " and").unwrap();
}
last_rule_id = Some(item.variable_index);
write!(
msg,
" `{}`",
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
};
if actual_conflict.len() > 1 {
if shift_items.len() > 0 {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in",
resolution_count
)
.unwrap();
list_rule_names(&mut msg, &shift_items);
write!(&mut msg, " than in the other rules.\n").unwrap();
}
for item in &reduce_items {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a higher precedence in `{}` than in the other rules.\n",
resolution_count,
self.symbol_name(&Symbol::non_terminal(item.variable_index as usize))
)
.unwrap();
}
}
if considered_associativity {
resolution_count += 1;
write!(
&mut msg,
" {}: Specify a left or right associativity in",
resolution_count
)
.unwrap();
list_rule_names(&mut msg, &reduce_items);
write!(&mut msg, "\n").unwrap();
}
resolution_count += 1;
write!(
&mut msg,
" {}: Add a conflict for these rules: ",
resolution_count
)
.unwrap();
for (i, symbol) in actual_conflict.iter().enumerate() {
if i > 0 {
write!(&mut msg, ", ").unwrap();
}
write!(&mut msg, "`{}`", self.symbol_name(symbol)).unwrap();
}
write!(&mut msg, "\n").unwrap();
Err(anyhow!(msg))
}
fn compare_precedence(
grammar: &SyntaxGrammar,
left: &Precedence,
left_symbols: &[Symbol],
right: &Precedence,
right_symbols: &[Symbol],
) -> Ordering {
let precedence_entry_matches =
|entry: &PrecedenceEntry, precedence: &Precedence, symbols: &[Symbol]| -> bool {
match entry {
PrecedenceEntry::Name(n) => {
if let Precedence::Name(p) = precedence {
n == p
} else {
false
}
}
PrecedenceEntry::Symbol(n) => symbols
.iter()
.any(|s| &grammar.variables[s.index].name == n),
}
};
match (left, right) {
// Integer precedences can be compared to other integer precedences,
// and to the default precedence, which is zero.
(Precedence::Integer(l), Precedence::Integer(r)) if *l != 0 || *r != 0 => l.cmp(r),
(Precedence::Integer(l), Precedence::None) if *l != 0 => l.cmp(&0),
(Precedence::None, Precedence::Integer(r)) if *r != 0 => 0.cmp(&r),
// Named precedences can be compared to other named precedences.
_ => grammar
.precedence_orderings
.iter()
.find_map(|list| {
let mut saw_left = false;
let mut saw_right = false;
for entry in list {
let matches_left = precedence_entry_matches(entry, left, left_symbols);
let matches_right = precedence_entry_matches(entry, right, right_symbols);
if matches_left {
saw_left = true;
if saw_right {
return Some(Ordering::Less);
}
} else if matches_right {
saw_right = true;
if saw_left {
return Some(Ordering::Greater);
}
}
}
None
})
.unwrap_or(Ordering::Equal),
}
}
fn get_auxiliary_node_info(
&self,
item_set: &ParseItemSet,
symbol: Symbol,
) -> AuxiliarySymbolInfo {
let parent_symbols = item_set
.entries
.iter()
.filter_map(|(item, _)| {
let variable_index = item.variable_index as usize;
if item.symbol() == Some(symbol)
&& !self.syntax_grammar.variables[variable_index].is_auxiliary()
{
Some(Symbol::non_terminal(variable_index))
} else {
None
}
})
.collect();
AuxiliarySymbolInfo {
auxiliary_symbol: symbol,
parent_symbols,
}
}
fn get_production_id(&mut self, item: &ParseItem) -> ProductionInfoId {
let mut production_info = ProductionInfo {
alias_sequence: Vec::new(),
field_map: BTreeMap::new(),
};
for (i, step) in item.production.steps.iter().enumerate() {
production_info.alias_sequence.push(step.alias.clone());
if let Some(field_name) = &step.field_name {
production_info
.field_map
.entry(field_name.clone())
.or_insert(Vec::new())
.push(FieldLocation {
index: i,
inherited: false,
});
}
if step.symbol.kind == SymbolType::NonTerminal
&& !self.syntax_grammar.variables[step.symbol.index]
.kind
.is_visible()
{
let info = &self.variable_info[step.symbol.index];
for (field_name, _) in &info.fields {
production_info
.field_map
.entry(field_name.clone())
.or_insert(Vec::new())
.push(FieldLocation {
index: i,
inherited: true,
});
}
}
}
while production_info.alias_sequence.last() == Some(&None) {
production_info.alias_sequence.pop();
}
if item.production.steps.len() > self.parse_table.max_aliased_production_length {
self.parse_table.max_aliased_production_length = item.production.steps.len()
}
if let Some(index) = self
.parse_table
.production_infos
.iter()
.position(|seq| *seq == production_info)
{
index
} else {
self.parse_table.production_infos.push(production_info);
self.parse_table.production_infos.len() - 1
}
}
fn symbol_name(&self, symbol: &Symbol) -> String {
match symbol.kind {
SymbolType::End | SymbolType::EndOfNonTerminalExtra => "EOF".to_string(),
SymbolType::External => self.syntax_grammar.external_tokens[symbol.index]
.name
.clone(),
SymbolType::NonTerminal => self.syntax_grammar.variables[symbol.index].name.clone(),
SymbolType::Terminal => {
let variable = &self.lexical_grammar.variables[symbol.index];
if variable.kind == VariableType::Named {
variable.name.clone()
} else {
format!("'{}'", &variable.name)
}
}
}
}
}
fn populate_following_tokens(
result: &mut Vec<TokenSet>,
grammar: &SyntaxGrammar,
inlines: &InlinedProductionMap,
builder: &ParseItemSetBuilder,
) {
let productions = grammar
.variables
.iter()
.flat_map(|v| &v.productions)
.chain(&inlines.productions);
let all_tokens = (0..result.len())
.into_iter()
.map(Symbol::terminal)
.collect::<TokenSet>();
for production in productions {
for i in 1..production.steps.len() {
let left_tokens = builder.last_set(&production.steps[i - 1].symbol);
let right_tokens = builder.first_set(&production.steps[i].symbol);
for left_token in left_tokens.iter() {
if left_token.is_terminal() {
result[left_token.index].insert_all_terminals(right_tokens);
}
}
}
}
for extra in &grammar.extra_symbols {
if extra.is_terminal() {
for entry in result.iter_mut() {
entry.insert(*extra);
}
result[extra.index] = all_tokens.clone();
}
}
}
pub(crate) fn build_parse_table<'a>(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
variable_info: &'a Vec<VariableInfo>,
) -> Result<(ParseTable, Vec<TokenSet>, Vec<ParseStateInfo<'a>>)> {
let item_set_builder = ParseItemSetBuilder::new(syntax_grammar, lexical_grammar, inlines);
let mut following_tokens = vec![TokenSet::new(); lexical_grammar.variables.len()];
populate_following_tokens(
&mut following_tokens,
syntax_grammar,
inlines,
&item_set_builder,
);
let (table, item_sets) = ParseTableBuilder {
syntax_grammar,
lexical_grammar,
item_set_builder,
variable_info,
non_terminal_extra_states: Vec::new(),
state_ids_by_item_set: IndexMap::default(),
core_ids_by_core: HashMap::new(),
parse_state_info_by_id: Vec::new(),
parse_state_queue: VecDeque::new(),
parse_table: ParseTable {
states: Vec::new(),
symbols: Vec::new(),
external_lex_states: Vec::new(),
production_infos: Vec::new(),
max_aliased_production_length: 1,
},
}
.build()?;
Ok((table, following_tokens, item_sets))
}
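// A minimal sketch (as a runnable test) of how a named-precedence ordering is
// consulted, in the spirit of `compare_precedence` above: within a single
// ordering list, earlier entries win. The list and the names in it are hypothetical.
#[test]
fn example_named_precedence_ordering() {
    use std::cmp::Ordering;
    fn compare_named(ordering: &[&str], left: &str, right: &str) -> Ordering {
        let mut saw_left = false;
        let mut saw_right = false;
        for name in ordering {
            if *name == left {
                saw_left = true;
                if saw_right {
                    return Ordering::Less;
                }
            } else if *name == right {
                saw_right = true;
                if saw_left {
                    return Ordering::Greater;
                }
            }
        }
        Ordering::Equal
    }
    let ordering = ["unary", "binary", "ternary"];
    // "unary" appears first in the list, so it compares greater than "binary".
    assert_eq!(compare_named(&ordering, "unary", "binary"), Ordering::Greater);
    assert_eq!(compare_named(&ordering, "ternary", "unary"), Ordering::Less);
    // Names that never appear in the list compare as equal.
    assert_eq!(compare_named(&ordering, "x", "y"), Ordering::Equal);
}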

View File

@ -0,0 +1,75 @@
use crate::generate::grammars::LexicalGrammar;
use crate::generate::rules::Symbol;
use crate::generate::tables::{ParseStateId, ParseTable};
use std::fmt;
pub(crate) struct CoincidentTokenIndex<'a> {
entries: Vec<Vec<ParseStateId>>,
grammar: &'a LexicalGrammar,
n: usize,
}
impl<'a> CoincidentTokenIndex<'a> {
pub fn new(table: &ParseTable, lexical_grammar: &'a LexicalGrammar) -> Self {
let n = lexical_grammar.variables.len();
let mut result = Self {
n,
grammar: lexical_grammar,
entries: vec![Vec::new(); n * n],
};
for (i, state) in table.states.iter().enumerate() {
for symbol in state.terminal_entries.keys() {
if symbol.is_terminal() {
for other_symbol in state.terminal_entries.keys() {
if other_symbol.is_terminal() {
let index = result.index(symbol.index, other_symbol.index);
if result.entries[index].last().cloned() != Some(i) {
result.entries[index].push(i);
}
}
}
}
}
}
result
}
pub fn states_with(&self, a: Symbol, b: Symbol) -> &Vec<ParseStateId> {
&self.entries[self.index(a.index, b.index)]
}
pub fn contains(&self, a: Symbol, b: Symbol) -> bool {
!self.entries[self.index(a.index, b.index)].is_empty()
}
fn index(&self, a: usize, b: usize) -> usize {
if a < b {
a * self.n + b
} else {
b * self.n + a
}
}
}
impl<'a> fmt::Debug for CoincidentTokenIndex<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "CoincidentTokenIndex {{\n")?;
write!(f, " entries: {{\n")?;
for i in 0..self.n {
write!(f, " {}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {}: {:?},\n",
self.grammar.variables[j].name,
self.entries[self.index(i, j)].len()
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
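// A minimal sketch (as a runnable test) of the symmetric pair index used by
// `CoincidentTokenIndex::index` above: the pairs (a, b) and (b, a) share one
// slot in a flat n*n table. The token count below is an arbitrary example value.
#[test]
fn example_symmetric_pair_index() {
    fn pair_index(n: usize, a: usize, b: usize) -> usize {
        if a < b {
            a * n + b
        } else {
            b * n + a
        }
    }
    let n = 4; // four tokens, indices 0..4
    assert_eq!(pair_index(n, 1, 3), pair_index(n, 3, 1)); // both map to 1 * 4 + 3
    assert_eq!(pair_index(n, 2, 2), 2 * n + 2);
}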

View File

@ -0,0 +1,416 @@
use crate::generate::grammars::{LexicalGrammar, Production, ProductionStep, SyntaxGrammar};
use crate::generate::rules::{Associativity, Precedence, Symbol, SymbolType, TokenSet};
use lazy_static::lazy_static;
use std::cmp::Ordering;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::u32;
lazy_static! {
static ref START_PRODUCTION: Production = Production {
dynamic_precedence: 0,
steps: vec![ProductionStep {
symbol: Symbol {
index: 0,
kind: SymbolType::NonTerminal,
},
precedence: Precedence::None,
associativity: None,
alias: None,
field_name: None,
}],
};
}
/// A ParseItem represents an in-progress match of a single production in a grammar.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ParseItem<'a> {
/// The index of the parent rule within the grammar.
pub variable_index: u32,
/// The number of symbols that have already been matched.
pub step_index: u32,
/// The production being matched.
pub production: &'a Production,
/// A boolean indicating whether any of the already-matched children were
/// hidden nodes and had fields. Ordinarily, a parse item's behavior is not
/// affected by the symbols of its preceding children; it only needs to
/// keep track of their fields and aliases.
///
/// Take for example these two items:
/// X -> a b • c
/// X -> a g • c
///
/// They can be considered equivalent, for the purposes of parse table
/// generation, because they entail the same actions. But if this flag is
/// true, then the item's set of inherited fields may depend on the specific
/// symbols of its preceding children.
pub has_preceding_inherited_fields: bool,
}
/// A ParseItemSet represents a set of in-progress matches of productions in a
/// grammar, and for each in-progress match, a set of "lookaheads" - tokens that
/// are allowed to *follow* the in-progress rule. This object corresponds directly
/// to a state in the final parse table.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSet<'a> {
pub entries: Vec<(ParseItem<'a>, TokenSet)>,
}
/// A ParseItemSetCore is like a ParseItemSet, but without the lookahead
/// information. Parse states with the same core are candidates for merging.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ParseItemSetCore<'a> {
pub entries: Vec<ParseItem<'a>>,
}
pub(crate) struct ParseItemDisplay<'a>(
pub &'a ParseItem<'a>,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
pub(crate) struct TokenSetDisplay<'a>(
pub &'a TokenSet,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
pub(crate) struct ParseItemSetDisplay<'a>(
pub &'a ParseItemSet<'a>,
pub &'a SyntaxGrammar,
pub &'a LexicalGrammar,
);
impl<'a> ParseItem<'a> {
pub fn start() -> Self {
ParseItem {
variable_index: u32::MAX,
production: &START_PRODUCTION,
step_index: 0,
has_preceding_inherited_fields: false,
}
}
pub fn step(&self) -> Option<&'a ProductionStep> {
self.production.steps.get(self.step_index as usize)
}
pub fn symbol(&self) -> Option<Symbol> {
self.step().map(|step| step.symbol)
}
pub fn associativity(&self) -> Option<Associativity> {
self.prev_step().and_then(|step| step.associativity)
}
pub fn precedence(&self) -> &Precedence {
self.prev_step()
.map_or(&Precedence::None, |step| &step.precedence)
}
pub fn prev_step(&self) -> Option<&'a ProductionStep> {
if self.step_index > 0 {
Some(&self.production.steps[self.step_index as usize - 1])
} else {
None
}
}
pub fn is_done(&self) -> bool {
self.step_index as usize == self.production.steps.len()
}
pub fn is_augmented(&self) -> bool {
self.variable_index == u32::MAX
}
/// Create an item like this one, but advanced by one step.
pub fn successor(&self) -> ParseItem<'a> {
ParseItem {
variable_index: self.variable_index,
production: self.production,
step_index: self.step_index + 1,
has_preceding_inherited_fields: self.has_preceding_inherited_fields,
}
}
/// Create an item identical to this one, but with a different production.
/// This is used when dynamically "inlining" certain symbols in a production.
pub fn substitute_production(&self, production: &'a Production) -> ParseItem<'a> {
let mut result = self.clone();
result.production = production;
result
}
}
impl<'a> ParseItemSet<'a> {
pub fn with(elements: impl IntoIterator<Item = (ParseItem<'a>, TokenSet)>) -> Self {
let mut result = Self::default();
for (item, lookaheads) in elements {
result.insert(item, &lookaheads);
}
result
}
pub fn insert(&mut self, item: ParseItem<'a>, lookaheads: &TokenSet) -> &mut TokenSet {
match self.entries.binary_search_by(|(i, _)| i.cmp(&item)) {
Err(i) => {
self.entries.insert(i, (item, lookaheads.clone()));
&mut self.entries[i].1
}
Ok(i) => {
self.entries[i].1.insert_all(lookaheads);
&mut self.entries[i].1
}
}
}
pub fn core(&self) -> ParseItemSetCore<'a> {
ParseItemSetCore {
entries: self.entries.iter().map(|e| e.0).collect(),
}
}
}
impl<'a> Default for ParseItemSet<'a> {
fn default() -> Self {
Self {
entries: Vec::new(),
}
}
}
impl<'a> fmt::Display for ParseItemDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if self.0.is_augmented() {
write!(f, "START →")?;
} else {
write!(
f,
"{} →",
&self.1.variables[self.0.variable_index as usize].name
)?;
}
for (i, step) in self.0.production.steps.iter().enumerate() {
if i == self.0.step_index as usize {
write!(f, " •")?;
if let Some(associativity) = step.associativity {
if !step.precedence.is_none() {
write!(f, " ({} {:?})", step.precedence, associativity)?;
} else {
write!(f, " ({:?})", associativity)?;
}
} else if !step.precedence.is_none() {
write!(f, " ({})", step.precedence)?;
}
}
write!(f, " ")?;
if step.symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(step.symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", step.symbol.index)?;
}
} else if step.symbol.is_external() {
write!(f, "{}", &self.1.external_tokens[step.symbol.index].name)?;
} else {
write!(f, "{}", &self.1.variables[step.symbol.index].name)?;
}
if let Some(alias) = &step.alias {
write!(f, "@{}", alias.value)?;
}
}
if self.0.is_done() {
write!(f, " •")?;
if let Some(step) = self.0.production.steps.last() {
if let Some(associativity) = step.associativity {
if !step.precedence.is_none() {
write!(f, " ({} {:?})", step.precedence, associativity)?;
} else {
write!(f, " ({:?})", associativity)?;
}
} else if !step.precedence.is_none() {
write!(f, " ({})", step.precedence)?;
}
}
}
Ok(())
}
}
impl<'a> fmt::Display for TokenSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "[")?;
for (i, symbol) in self.0.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
if symbol.is_terminal() {
if let Some(variable) = self.2.variables.get(symbol.index) {
write!(f, "{}", &variable.name)?;
} else {
write!(f, "{}-{}", "terminal", symbol.index)?;
}
} else if symbol.is_external() {
write!(f, "{}", &self.1.external_tokens[symbol.index].name)?;
} else {
write!(f, "{}", &self.1.variables[symbol.index].name)?;
}
}
write!(f, "]")?;
Ok(())
}
}
impl<'a> fmt::Display for ParseItemSetDisplay<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
for (item, lookaheads) in self.0.entries.iter() {
writeln!(
f,
"{}\t{}",
ParseItemDisplay(item, self.1, self.2),
TokenSetDisplay(lookaheads, self.1, self.2)
)?;
}
Ok(())
}
}
impl<'a> Hash for ParseItem<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_u32(self.variable_index);
hasher.write_u32(self.step_index);
hasher.write_i32(self.production.dynamic_precedence);
hasher.write_usize(self.production.steps.len());
hasher.write_i32(self.has_preceding_inherited_fields as i32);
self.precedence().hash(hasher);
self.associativity().hash(hasher);
// The already-matched children don't play any role in the parse state for
// this item, unless any of the following are true:
// * the children have fields
// * the children have aliases
// * the children are hidden and have fields
// See the docs for `has_preceding_inherited_fields`.
for step in &self.production.steps[0..self.step_index as usize] {
step.alias.hash(hasher);
step.field_name.hash(hasher);
if self.has_preceding_inherited_fields {
step.symbol.hash(hasher);
}
}
for step in &self.production.steps[self.step_index as usize..] {
step.hash(hasher);
}
}
}
impl<'a> PartialEq for ParseItem<'a> {
fn eq(&self, other: &Self) -> bool {
if self.variable_index != other.variable_index
|| self.step_index != other.step_index
|| self.production.dynamic_precedence != other.production.dynamic_precedence
|| self.production.steps.len() != other.production.steps.len()
|| self.precedence() != other.precedence()
|| self.associativity() != other.associativity()
|| self.has_preceding_inherited_fields != other.has_preceding_inherited_fields
{
return false;
}
for (i, step) in self.production.steps.iter().enumerate() {
// See the previous comment (in the `Hash::hash` impl) regarding comparisons
// of parse items' already-completed steps.
if i < self.step_index as usize {
if step.alias != other.production.steps[i].alias {
return false;
}
if step.field_name != other.production.steps[i].field_name {
return false;
}
if self.has_preceding_inherited_fields
&& step.symbol != other.production.steps[i].symbol
{
return false;
}
} else if *step != other.production.steps[i] {
return false;
}
}
return true;
}
}
impl<'a> Ord for ParseItem<'a> {
fn cmp(&self, other: &Self) -> Ordering {
self.step_index
.cmp(&other.step_index)
.then_with(|| self.variable_index.cmp(&other.variable_index))
.then_with(|| {
self.production
.dynamic_precedence
.cmp(&other.production.dynamic_precedence)
})
.then_with(|| {
self.production
.steps
.len()
.cmp(&other.production.steps.len())
})
.then_with(|| self.precedence().cmp(&other.precedence()))
.then_with(|| self.associativity().cmp(&other.associativity()))
.then_with(|| {
for (i, step) in self.production.steps.iter().enumerate() {
// See the previous comment (in the `Hash::hash` impl) regarding comparisons
// of parse items' already-completed steps.
let o = if i < self.step_index as usize {
step.alias
.cmp(&other.production.steps[i].alias)
.then_with(|| {
step.field_name.cmp(&other.production.steps[i].field_name)
})
} else {
step.cmp(&other.production.steps[i])
};
if o != Ordering::Equal {
return o;
}
}
return Ordering::Equal;
})
}
}
impl<'a> PartialOrd for ParseItem<'a> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'a> Eq for ParseItem<'a> {}
impl<'a> Hash for ParseItemSet<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len());
for (item, lookaheads) in self.entries.iter() {
item.hash(hasher);
lookaheads.hash(hasher);
}
}
}
impl<'a> Hash for ParseItemSetCore<'a> {
fn hash<H: Hasher>(&self, hasher: &mut H) {
hasher.write_usize(self.entries.len());
for item in &self.entries {
item.hash(hasher);
}
}
}
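// A minimal sketch (as a runnable test) of the insert-or-merge pattern used by
// `ParseItemSet::insert` above, on a sorted Vec keyed by an integer instead of
// a ParseItem; the HashSet lookaheads here are purely illustrative.
#[test]
fn example_sorted_insert_or_merge() {
    use std::collections::HashSet;
    fn insert(entries: &mut Vec<(u32, HashSet<u32>)>, key: u32, lookaheads: &HashSet<u32>) {
        match entries.binary_search_by(|(k, _)| k.cmp(&key)) {
            // Not present yet: insert at the position that keeps the Vec sorted.
            Err(i) => entries.insert(i, (key, lookaheads.clone())),
            // Already present: merge the lookaheads into the existing entry.
            Ok(i) => entries[i].1.extend(lookaheads.iter().copied()),
        }
    }
    let mut entries: Vec<(u32, HashSet<u32>)> = Vec::new();
    insert(&mut entries, 2, &HashSet::from([10]));
    insert(&mut entries, 1, &HashSet::from([20]));
    insert(&mut entries, 2, &HashSet::from([30]));
    // Keys stay sorted, and key 2 now carries both of its lookahead sets merged.
    assert_eq!(entries[0].0, 1);
    assert_eq!(entries[1].0, 2);
    assert!(entries[1].1.contains(&10) && entries[1].1.contains(&30));
}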

View File

@ -0,0 +1,347 @@
use super::item::{ParseItem, ParseItemDisplay, ParseItemSet, TokenSetDisplay};
use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Symbol, SymbolType, TokenSet};
use std::collections::{HashMap, HashSet};
use std::fmt;
#[derive(Clone, Debug, PartialEq, Eq)]
struct TransitiveClosureAddition<'a> {
item: ParseItem<'a>,
info: FollowSetInfo,
}
#[derive(Clone, Debug, PartialEq, Eq)]
struct FollowSetInfo {
lookaheads: TokenSet,
propagates_lookaheads: bool,
}
pub(crate) struct ParseItemSetBuilder<'a> {
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
first_sets: HashMap<Symbol, TokenSet>,
last_sets: HashMap<Symbol, TokenSet>,
inlines: &'a InlinedProductionMap,
transitive_closure_additions: Vec<Vec<TransitiveClosureAddition<'a>>>,
}
fn find_or_push<T: Eq>(vector: &mut Vec<T>, value: T) {
if !vector.contains(&value) {
vector.push(value);
}
}
impl<'a> ParseItemSetBuilder<'a> {
pub fn new(
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
inlines: &'a InlinedProductionMap,
) -> Self {
let mut result = Self {
syntax_grammar,
lexical_grammar,
first_sets: HashMap::new(),
last_sets: HashMap::new(),
inlines,
transitive_closure_additions: vec![Vec::new(); syntax_grammar.variables.len()],
};
// For each grammar symbol, populate the FIRST and LAST sets: the set of
// terminals that appear at the beginning and end of that symbol's productions,
// respectively.
//
// For a terminal symbol, the FIRST and LAST set just consists of the
// terminal itself.
for i in 0..lexical_grammar.variables.len() {
let symbol = Symbol::terminal(i);
let mut set = TokenSet::new();
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
}
for i in 0..syntax_grammar.external_tokens.len() {
let symbol = Symbol::external(i);
let mut set = TokenSet::new();
set.insert(symbol);
result.first_sets.insert(symbol, set.clone());
result.last_sets.insert(symbol, set);
}
// The FIRST set of a non-terminal `i` is the union of the following sets:
// * the set of all terminals that appear at the beginnings of i's productions
// * the FIRST sets of all the non-terminals that appear at the beginnings
// of i's productions
//
// Rather than computing these sets using recursion, we use an explicit stack
// called `symbols_to_process`.
let mut symbols_to_process = Vec::new();
let mut processed_non_terminals = HashSet::new();
for i in 0..syntax_grammar.variables.len() {
let symbol = Symbol::non_terminal(i);
let first_set = &mut result.first_sets.entry(symbol).or_insert(TokenSet::new());
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
first_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in syntax_grammar.variables[current_symbol.index]
.productions
.iter()
{
if let Some(step) = production.steps.first() {
symbols_to_process.push(step.symbol);
}
}
}
}
// The LAST set is defined in a similar way to the FIRST set.
let last_set = &mut result.last_sets.entry(symbol).or_insert(TokenSet::new());
processed_non_terminals.clear();
symbols_to_process.clear();
symbols_to_process.push(symbol);
while let Some(current_symbol) = symbols_to_process.pop() {
if current_symbol.is_terminal() || current_symbol.is_external() {
last_set.insert(current_symbol);
} else if processed_non_terminals.insert(current_symbol) {
for production in syntax_grammar.variables[current_symbol.index]
.productions
.iter()
{
if let Some(step) = production.steps.last() {
symbols_to_process.push(step.symbol);
}
}
}
}
}
// To compute an item set's transitive closure, we find each item in the set
// whose next symbol is a non-terminal, and we add new items to the set for
// each of that symbol's productions. These productions might themselves begin
// with non-terminals, so the process continues recursively. In this process,
// the total set of entries that get added depends only on two things:
// * the set of non-terminal symbols that occur at each item's current position
// * the set of terminals that can occur after each of these non-terminal symbols
//
// So we can avoid a lot of duplicated recursive work by precomputing, for each
// non-terminal symbol `i`, a final list of *additions* that must be made to an
// item set when `i` occurs as the next symbol in one of its core items. The
// structure of an *addition* is as follows:
// * `item` - the new item that must be added as part of the expansion of `i`
// * `lookaheads` - lookahead tokens that can always come after that item in
// the expansion of `i`
// * `propagates_lookaheads` - a boolean indicating whether or not `item` can
// occur at the *end* of the expansion of `i`, so that i's own current
// lookahead tokens can occur after `item`.
//
// Again, rather than computing these additions recursively, we use an explicit
// stack called `entries_to_process`.
for i in 0..syntax_grammar.variables.len() {
let empty_lookaheads = TokenSet::new();
let mut entries_to_process = vec![(i, &empty_lookaheads, true)];
// First, build up a map whose keys are all of the non-terminals that can
// appear at the beginning of non-terminal `i`, and whose values store
// information about the tokens that can follow each non-terminal.
let mut follow_set_info_by_non_terminal = HashMap::new();
while let Some(entry) = entries_to_process.pop() {
let (variable_index, lookaheads, propagates_lookaheads) = entry;
let existing_info = follow_set_info_by_non_terminal
.entry(variable_index)
.or_insert_with(|| FollowSetInfo {
lookaheads: TokenSet::new(),
propagates_lookaheads: false,
});
let did_add_follow_set_info;
if propagates_lookaheads {
did_add_follow_set_info = !existing_info.propagates_lookaheads;
existing_info.propagates_lookaheads = true;
} else {
did_add_follow_set_info = existing_info.lookaheads.insert_all(lookaheads);
}
if did_add_follow_set_info {
for production in &syntax_grammar.variables[variable_index].productions {
if let Some(symbol) = production.first_symbol() {
if symbol.is_non_terminal() {
if production.steps.len() == 1 {
entries_to_process.push((
symbol.index,
lookaheads,
propagates_lookaheads,
));
} else {
entries_to_process.push((
symbol.index,
&result.first_sets[&production.steps[1].symbol],
false,
));
}
}
}
}
}
}
// Store all of those non-terminals' productions, along with their associated
// lookahead info, as *additions* associated with non-terminal `i`.
let additions_for_non_terminal = &mut result.transitive_closure_additions[i];
for (variable_index, follow_set_info) in follow_set_info_by_non_terminal {
let variable = &syntax_grammar.variables[variable_index];
let non_terminal = Symbol::non_terminal(variable_index);
let variable_index = variable_index as u32;
if syntax_grammar.variables_to_inline.contains(&non_terminal) {
continue;
}
for production in &variable.productions {
let item = ParseItem {
variable_index,
production,
step_index: 0,
has_preceding_inherited_fields: false,
};
if let Some(inlined_productions) =
inlines.inlined_productions(item.production, item.step_index)
{
for production in inlined_productions {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item: item.substitute_production(production),
info: follow_set_info.clone(),
},
);
}
} else {
find_or_push(
additions_for_non_terminal,
TransitiveClosureAddition {
item,
info: follow_set_info.clone(),
},
);
}
}
}
}
result
}
pub(crate) fn transitive_closure(&mut self, item_set: &ParseItemSet<'a>) -> ParseItemSet<'a> {
let mut result = ParseItemSet::default();
for (item, lookaheads) in &item_set.entries {
if let Some(productions) = self
.inlines
.inlined_productions(item.production, item.step_index)
{
for production in productions {
self.add_item(
&mut result,
item.substitute_production(production),
lookaheads,
);
}
} else {
self.add_item(&mut result, *item, lookaheads);
}
}
result
}
pub fn first_set(&self, symbol: &Symbol) -> &TokenSet {
&self.first_sets[symbol]
}
pub fn last_set(&self, symbol: &Symbol) -> &TokenSet {
&self.last_sets[symbol]
}
fn add_item(&self, set: &mut ParseItemSet<'a>, item: ParseItem<'a>, lookaheads: &TokenSet) {
if let Some(step) = item.step() {
if step.symbol.is_non_terminal() {
let next_step = item.successor().step();
// Determine which tokens can follow this non-terminal.
let following_tokens = if let Some(next_step) = next_step {
self.first_sets.get(&next_step.symbol).unwrap()
} else {
&lookaheads
};
// Use the pre-computed *additions* to expand the non-terminal.
for addition in &self.transitive_closure_additions[step.symbol.index] {
let lookaheads = set.insert(addition.item, &addition.info.lookaheads);
if addition.info.propagates_lookaheads {
lookaheads.insert_all(following_tokens);
}
}
}
}
set.insert(item, lookaheads);
}
}
impl<'a> fmt::Debug for ParseItemSetBuilder<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "ParseItemSetBuilder {{\n")?;
write!(f, " first_sets: {{\n")?;
for (symbol, first_set) in &self.first_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End | SymbolType::EndOfNonTerminalExtra => "END",
};
write!(
f,
" first({:?}): {}\n",
name,
TokenSetDisplay(first_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " last_sets: {{\n")?;
for (symbol, last_set) in &self.last_sets {
let name = match symbol.kind {
SymbolType::NonTerminal => &self.syntax_grammar.variables[symbol.index].name,
SymbolType::External => &self.syntax_grammar.external_tokens[symbol.index].name,
SymbolType::Terminal => &self.lexical_grammar.variables[symbol.index].name,
SymbolType::End | SymbolType::EndOfNonTerminalExtra => "END",
};
write!(
f,
" last({:?}): {}\n",
name,
TokenSetDisplay(last_set, &self.syntax_grammar, &self.lexical_grammar)
)?;
}
write!(f, " }}\n")?;
write!(f, " additions: {{\n")?;
for (i, variable) in self.syntax_grammar.variables.iter().enumerate() {
write!(f, " {}: {{\n", variable.name)?;
for addition in &self.transitive_closure_additions[i] {
write!(
f,
" {}\n",
ParseItemDisplay(&addition.item, self.syntax_grammar, self.lexical_grammar)
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
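// A minimal, self-contained sketch of the FIRST-set fixed point that `first_sets` above
// stores. Toy `char` symbols (uppercase = non-terminal, lowercase = terminal) stand in for
// the crate's `Symbol` and `TokenSet` types; only the iterate-until-stable structure is shown.
#[cfg(test)]
mod first_set_sketch {
    use std::collections::{HashMap, HashSet};

    // For every non-terminal, collect the terminals that can begin one of its productions,
    // repeating passes until no set grows.
    fn first_sets(productions: &HashMap<char, Vec<Vec<char>>>) -> HashMap<char, HashSet<char>> {
        let mut result: HashMap<char, HashSet<char>> =
            productions.keys().map(|&nt| (nt, HashSet::new())).collect();
        let mut changed = true;
        while changed {
            changed = false;
            for (&non_terminal, rules) in productions {
                for rule in rules {
                    if let Some(&first_symbol) = rule.first() {
                        let additions: HashSet<char> = if first_symbol.is_uppercase() {
                            result.get(&first_symbol).cloned().unwrap_or_default()
                        } else {
                            std::iter::once(first_symbol).collect()
                        };
                        let set = result.get_mut(&non_terminal).unwrap();
                        for c in additions {
                            changed |= set.insert(c);
                        }
                    }
                }
            }
        }
        result
    }

    #[test]
    fn toy_first_sets() {
        // S -> A b | c,  A -> a
        let productions: HashMap<char, Vec<Vec<char>>> = vec![
            ('S', vec![vec!['A', 'b'], vec!['c']]),
            ('A', vec![vec!['a']]),
        ]
        .into_iter()
        .collect();
        let sets = first_sets(&productions);
        let expected_s: HashSet<char> = vec!['a', 'c'].into_iter().collect();
        let expected_a: HashSet<char> = vec!['a'].into_iter().collect();
        assert_eq!(sets[&'S'], expected_s);
        assert_eq!(sets[&'A'], expected_a);
    }
}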

View File

@ -0,0 +1,511 @@
use super::token_conflicts::TokenConflictMap;
use crate::generate::dedup::split_state_id_groups;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar, VariableType};
use crate::generate::rules::{AliasMap, Symbol, TokenSet};
use crate::generate::tables::{
GotoAction, ParseAction, ParseState, ParseStateId, ParseTable, ParseTableEntry,
};
use log::info;
use std::collections::{HashMap, HashSet};
use std::mem;
pub(crate) fn minimize_parse_table(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
simple_aliases: &AliasMap,
token_conflict_map: &TokenConflictMap,
keywords: &TokenSet,
) {
let mut minimizer = Minimizer {
parse_table,
syntax_grammar,
lexical_grammar,
token_conflict_map,
keywords,
simple_aliases,
};
minimizer.merge_compatible_states();
minimizer.remove_unit_reductions();
minimizer.remove_unused_states();
minimizer.reorder_states_by_descending_size();
}
struct Minimizer<'a> {
parse_table: &'a mut ParseTable,
syntax_grammar: &'a SyntaxGrammar,
lexical_grammar: &'a LexicalGrammar,
token_conflict_map: &'a TokenConflictMap<'a>,
keywords: &'a TokenSet,
simple_aliases: &'a AliasMap,
}
impl<'a> Minimizer<'a> {
fn remove_unit_reductions(&mut self) {
let mut aliased_symbols = HashSet::new();
for variable in &self.syntax_grammar.variables {
for production in &variable.productions {
for step in &production.steps {
if step.alias.is_some() {
aliased_symbols.insert(step.symbol);
}
}
}
}
let mut unit_reduction_symbols_by_state = HashMap::new();
for (i, state) in self.parse_table.states.iter().enumerate() {
let mut only_unit_reductions = true;
let mut unit_reduction_symbol = None;
for (_, entry) in &state.terminal_entries {
for action in &entry.actions {
match action {
ParseAction::ShiftExtra => continue,
ParseAction::Reduce {
child_count: 1,
production_id: 0,
symbol,
..
} => {
if !self.simple_aliases.contains_key(&symbol)
&& !self.syntax_grammar.supertype_symbols.contains(&symbol)
&& !aliased_symbols.contains(&symbol)
&& self.syntax_grammar.variables[symbol.index].kind
!= VariableType::Named
&& (unit_reduction_symbol.is_none()
|| unit_reduction_symbol == Some(symbol))
{
unit_reduction_symbol = Some(symbol);
continue;
}
}
_ => {}
}
only_unit_reductions = false;
break;
}
if !only_unit_reductions {
break;
}
}
if let Some(symbol) = unit_reduction_symbol {
if only_unit_reductions {
unit_reduction_symbols_by_state.insert(i, *symbol);
}
}
}
for state in self.parse_table.states.iter_mut() {
let mut done = false;
while !done {
done = true;
state.update_referenced_states(|other_state_id, state| {
if let Some(symbol) = unit_reduction_symbols_by_state.get(&other_state_id) {
done = false;
match state.nonterminal_entries.get(symbol) {
Some(GotoAction::Goto(state_id)) => *state_id,
_ => other_state_id,
}
} else {
other_state_id
}
})
}
}
}
fn merge_compatible_states(&mut self) {
let core_count = 1 + self
.parse_table
.states
.iter()
.map(|state| state.core_id)
.max()
.unwrap();
// Initially group the states by their parse item set core.
let mut group_ids_by_state_id = Vec::with_capacity(self.parse_table.states.len());
let mut state_ids_by_group_id = vec![Vec::<ParseStateId>::new(); core_count];
for (i, state) in self.parse_table.states.iter().enumerate() {
state_ids_by_group_id[state.core_id].push(i);
group_ids_by_state_id.push(state.core_id);
}
split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.states_conflict(left, right, groups),
);
while split_state_id_groups(
&self.parse_table.states,
&mut state_ids_by_group_id,
&mut group_ids_by_state_id,
0,
|left, right, groups| self.state_successors_differ(left, right, groups),
) {
continue;
}
let error_group_index = state_ids_by_group_id
.iter()
.position(|g| g.contains(&0))
.unwrap();
let start_group_index = state_ids_by_group_id
.iter()
.position(|g| g.contains(&1))
.unwrap();
state_ids_by_group_id.swap(error_group_index, 0);
state_ids_by_group_id.swap(start_group_index, 1);
// Create a list of new parse states: one state for each group of old states.
let mut new_states = Vec::with_capacity(state_ids_by_group_id.len());
for state_ids in &state_ids_by_group_id {
// Initialize the new state based on the first old state in the group.
let mut parse_state = ParseState::default();
mem::swap(&mut parse_state, &mut self.parse_table.states[state_ids[0]]);
// Extend the new state with all of the actions from the other old states
// in the group.
for state_id in &state_ids[1..] {
let mut other_parse_state = ParseState::default();
mem::swap(
&mut other_parse_state,
&mut self.parse_table.states[*state_id],
);
parse_state
.terminal_entries
.extend(other_parse_state.terminal_entries);
parse_state
.nonterminal_entries
.extend(other_parse_state.nonterminal_entries);
}
// Update the new state's outgoing references using the new grouping.
parse_state.update_referenced_states(|state_id, _| group_ids_by_state_id[state_id]);
new_states.push(parse_state);
}
self.parse_table.states = new_states;
}
fn states_conflict(
&self,
left_state: &ParseState,
right_state: &ParseState,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
for (token, left_entry) in &left_state.terminal_entries {
if let Some(right_entry) = right_state.terminal_entries.get(token) {
if self.entries_conflict(
left_state.id,
right_state.id,
token,
left_entry,
right_entry,
group_ids_by_state_id,
) {
return true;
}
} else if self.token_conflicts(
left_state.id,
right_state.id,
right_state.terminal_entries.keys(),
*token,
) {
return true;
}
}
for token in right_state.terminal_entries.keys() {
if !left_state.terminal_entries.contains_key(token) {
if self.token_conflicts(
left_state.id,
right_state.id,
left_state.terminal_entries.keys(),
*token,
) {
return true;
}
}
}
false
}
fn state_successors_differ(
&self,
state1: &ParseState,
state2: &ParseState,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
for (token, entry1) in &state1.terminal_entries {
if let ParseAction::Shift { state: s1, .. } = entry1.actions.last().unwrap() {
if let Some(entry2) = state2.terminal_entries.get(token) {
if let ParseAction::Shift { state: s2, .. } = entry2.actions.last().unwrap() {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(token),
s1,
s2,
);
return true;
}
}
}
}
}
for (symbol, s1) in &state1.nonterminal_entries {
if let Some(s2) = state2.nonterminal_entries.get(symbol) {
match (s1, s2) {
(GotoAction::ShiftExtra, GotoAction::ShiftExtra) => continue,
(GotoAction::Goto(s1), GotoAction::Goto(s2)) => {
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 != group2 {
info!(
"split states {} {} - successors for {} are split: {} {}",
state1.id,
state2.id,
self.symbol_name(symbol),
s1,
s2,
);
return true;
}
}
_ => return true,
}
}
}
false
}
fn entries_conflict(
&self,
state_id1: ParseStateId,
state_id2: ParseStateId,
token: &Symbol,
entry1: &ParseTableEntry,
entry2: &ParseTableEntry,
group_ids_by_state_id: &Vec<ParseStateId>,
) -> bool {
// To be compatible, entries need to have the same actions.
let actions1 = &entry1.actions;
let actions2 = &entry2.actions;
if actions1.len() != actions2.len() {
info!(
"split states {} {} - differing action counts for token {}",
state_id1,
state_id2,
self.symbol_name(token)
);
return true;
}
for (i, action1) in actions1.iter().enumerate() {
let action2 = &actions2[i];
// Two shift actions are equivalent if their destinations are in the same group.
if let (
ParseAction::Shift {
state: s1,
is_repetition: is_repetition1,
},
ParseAction::Shift {
state: s2,
is_repetition: is_repetition2,
},
) = (action1, action2)
{
let group1 = group_ids_by_state_id[*s1];
let group2 = group_ids_by_state_id[*s2];
if group1 == group2 && is_repetition1 == is_repetition2 {
continue;
} else {
info!(
"split states {} {} - successors for {} are split: {} {}",
state_id1,
state_id2,
self.symbol_name(token),
s1,
s2,
);
return true;
}
} else if action1 != action2 {
info!(
"split states {} {} - unequal actions for {}",
state_id1,
state_id2,
self.symbol_name(token),
);
return true;
}
}
false
}
fn token_conflicts<'b>(
&self,
left_id: ParseStateId,
right_id: ParseStateId,
existing_tokens: impl Iterator<Item = &'b Symbol>,
new_token: Symbol,
) -> bool {
if new_token == Symbol::end_of_nonterminal_extra() {
info!(
"split states {} {} - end of non-terminal extra",
left_id, right_id,
);
return true;
}
// Do not add external tokens; they could conflict lexically with any of the state's
// existing lookahead tokens.
if new_token.is_external() {
info!(
"split states {} {} - external token {}",
left_id,
right_id,
self.symbol_name(&new_token),
);
return true;
}
// Do not add tokens which are both internal and external. Their validity could
// influence the behavior of the external scanner.
if self
.syntax_grammar
.external_tokens
.iter()
.any(|external| external.corresponding_internal_token == Some(new_token))
{
info!(
"split states {} {} - internal/external token {}",
left_id,
right_id,
self.symbol_name(&new_token),
);
return true;
}
// Do not add a token if it conflicts with an existing token.
for token in existing_tokens {
if token.is_terminal() {
if !(self.syntax_grammar.word_token == Some(*token)
&& self.keywords.contains(&new_token))
&& !(self.syntax_grammar.word_token == Some(new_token)
&& self.keywords.contains(token))
&& (self
.token_conflict_map
.does_conflict(new_token.index, token.index)
|| self
.token_conflict_map
.does_match_same_string(new_token.index, token.index))
{
info!(
"split states {} {} - token {} conflicts with {}",
left_id,
right_id,
self.symbol_name(&new_token),
self.symbol_name(token),
);
return true;
}
}
}
false
}
fn symbol_name(&self, symbol: &Symbol) -> &String {
if symbol.is_non_terminal() {
&self.syntax_grammar.variables[symbol.index].name
} else if symbol.is_external() {
&self.syntax_grammar.external_tokens[symbol.index].name
} else {
&self.lexical_grammar.variables[symbol.index].name
}
}
fn remove_unused_states(&mut self) {
let mut state_usage_map = vec![false; self.parse_table.states.len()];
state_usage_map[0] = true;
state_usage_map[1] = true;
for state in &self.parse_table.states {
for referenced_state in state.referenced_states() {
state_usage_map[referenced_state] = true;
}
}
let mut removed_predecessor_count = 0;
let mut state_replacement_map = vec![0; self.parse_table.states.len()];
for state_id in 0..self.parse_table.states.len() {
state_replacement_map[state_id] = state_id - removed_predecessor_count;
if !state_usage_map[state_id] {
removed_predecessor_count += 1;
}
}
let mut state_id = 0;
let mut original_state_id = 0;
while state_id < self.parse_table.states.len() {
if state_usage_map[original_state_id] {
self.parse_table.states[state_id].update_referenced_states(|other_state_id, _| {
state_replacement_map[other_state_id]
});
state_id += 1;
} else {
self.parse_table.states.remove(state_id);
}
original_state_id += 1;
}
}
fn reorder_states_by_descending_size(&mut self) {
// Get a mapping of old state index -> new_state_index
let mut old_ids_by_new_id = (0..self.parse_table.states.len()).collect::<Vec<_>>();
old_ids_by_new_id.sort_unstable_by_key(|i| {
// Don't change states 0 (the error state) or 1 (the start state).
if *i <= 1 {
return *i as i64 - 1_000_000;
}
// Reorder all the other states by descending symbol count.
let state = &self.parse_table.states[*i];
-((state.terminal_entries.len() + state.nonterminal_entries.len()) as i64)
});
// Get the inverse mapping
let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
for (id, old_id) in old_ids_by_new_id.iter().enumerate() {
new_ids_by_old_id[*old_id] = id;
}
// Reorder the parse states and update their references to reflect
// the new ordering.
self.parse_table.states = old_ids_by_new_id
.iter()
.map(|old_id| {
let mut state = ParseState::default();
mem::swap(&mut state, &mut self.parse_table.states[*old_id]);
state.update_referenced_states(|id, _| new_ids_by_old_id[id]);
state
})
.collect();
}
}
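// A minimal sketch of the reordering trick used in `reorder_states_by_descending_size`:
// states 0 (the error state) and 1 (the start state) keep their positions via strongly
// negative sort keys, the remaining states are sorted by descending size, and the inverse
// map is what gets fed to `update_referenced_states`. The `sizes` vector is a stand-in for
// `terminal_entries.len() + nonterminal_entries.len()`.
#[cfg(test)]
mod reorder_sketch {
    #[test]
    fn pin_first_two_then_sort_by_descending_size() {
        let sizes: Vec<usize> = vec![3, 1, 2, 9, 5];
        let mut old_ids_by_new_id: Vec<usize> = (0..sizes.len()).collect();
        old_ids_by_new_id.sort_unstable_by_key(|&i| {
            if i <= 1 {
                return i as i64 - 1_000_000;
            }
            -(sizes[i] as i64)
        });
        assert_eq!(old_ids_by_new_id, vec![0, 1, 3, 4, 2]);

        // Invert the mapping: for each old id, the position it ended up in.
        let mut new_ids_by_old_id = vec![0; old_ids_by_new_id.len()];
        for (new_id, &old_id) in old_ids_by_new_id.iter().enumerate() {
            new_ids_by_old_id[old_id] = new_id;
        }
        assert_eq!(new_ids_by_old_id, vec![0, 1, 4, 2, 3]);
    }
}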

View File

@ -0,0 +1,479 @@
pub(crate) mod build_lex_table;
pub(crate) mod build_parse_table;
mod coincident_tokens;
mod item;
mod item_set_builder;
mod minimize_parse_table;
mod token_conflicts;
use self::build_lex_table::build_lex_table;
use self::build_parse_table::{build_parse_table, ParseStateInfo};
use self::coincident_tokens::CoincidentTokenIndex;
use self::minimize_parse_table::minimize_parse_table;
use self::token_conflicts::TokenConflictMap;
use crate::generate::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::NfaCursor;
use crate::generate::node_types::VariableInfo;
use crate::generate::rules::{AliasMap, Symbol, SymbolType, TokenSet};
use crate::generate::tables::{LexTable, ParseAction, ParseTable, ParseTableEntry};
use anyhow::Result;
use log::info;
use std::collections::{BTreeSet, HashMap};
pub(crate) fn build_tables(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
simple_aliases: &AliasMap,
variable_info: &Vec<VariableInfo>,
inlines: &InlinedProductionMap,
report_symbol_name: Option<&str>,
) -> Result<(ParseTable, LexTable, LexTable, Option<Symbol>)> {
let (mut parse_table, following_tokens, parse_state_info) =
build_parse_table(syntax_grammar, lexical_grammar, inlines, variable_info)?;
let token_conflict_map = TokenConflictMap::new(lexical_grammar, following_tokens);
let coincident_token_index = CoincidentTokenIndex::new(&parse_table, lexical_grammar);
let keywords = identify_keywords(
lexical_grammar,
&parse_table,
syntax_grammar.word_token,
&token_conflict_map,
&coincident_token_index,
);
populate_error_state(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&coincident_token_index,
&token_conflict_map,
&keywords,
);
populate_used_symbols(&mut parse_table, syntax_grammar, lexical_grammar);
minimize_parse_table(
&mut parse_table,
syntax_grammar,
lexical_grammar,
simple_aliases,
&token_conflict_map,
&keywords,
);
let (main_lex_table, keyword_lex_table) = build_lex_table(
&mut parse_table,
syntax_grammar,
lexical_grammar,
&keywords,
&coincident_token_index,
&token_conflict_map,
);
populate_external_lex_states(&mut parse_table, syntax_grammar);
mark_fragile_tokens(&mut parse_table, lexical_grammar, &token_conflict_map);
if let Some(report_symbol_name) = report_symbol_name {
report_state_info(
&syntax_grammar,
&lexical_grammar,
&parse_table,
&parse_state_info,
report_symbol_name,
);
}
Ok((
parse_table,
main_lex_table,
keyword_lex_table,
syntax_grammar.word_token,
))
}
fn populate_error_state(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
coincident_token_index: &CoincidentTokenIndex,
token_conflict_map: &TokenConflictMap,
keywords: &TokenSet,
) {
let state = &mut parse_table.states[0];
let n = lexical_grammar.variables.len();
// First identify the *conflict-free tokens*: tokens that do not overlap with
// any other token in any way, besides matching exactly the same string.
let conflict_free_tokens: TokenSet = (0..n)
.into_iter()
.filter_map(|i| {
let conflicts_with_other_tokens = (0..n).into_iter().any(|j| {
j != i
&& !coincident_token_index.contains(Symbol::terminal(i), Symbol::terminal(j))
&& token_conflict_map.does_match_shorter_or_longer(i, j)
});
if conflicts_with_other_tokens {
None
} else {
info!(
"error recovery - token {} has no conflicts",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
}
})
.collect();
let recover_entry = ParseTableEntry {
reusable: false,
actions: vec![ParseAction::Recover],
};
// Exclude from the error-recovery state any token that conflicts with one of
// the *conflict-free tokens* identified above.
for i in 0..n {
let symbol = Symbol::terminal(i);
if !conflict_free_tokens.contains(&symbol) && !keywords.contains(&symbol) {
if syntax_grammar.word_token != Some(symbol) {
if let Some(t) = conflict_free_tokens.iter().find(|t| {
!coincident_token_index.contains(symbol, *t)
&& token_conflict_map.does_conflict(symbol.index, t.index)
}) {
info!(
"error recovery - exclude token {} because of conflict with {}",
lexical_grammar.variables[i].name, lexical_grammar.variables[t.index].name
);
continue;
}
}
}
info!(
"error recovery - include token {}",
lexical_grammar.variables[i].name
);
state
.terminal_entries
.entry(symbol)
.or_insert_with(|| recover_entry.clone());
}
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
if external_token.corresponding_internal_token.is_none() {
state
.terminal_entries
.entry(Symbol::external(i))
.or_insert_with(|| recover_entry.clone());
}
}
state.terminal_entries.insert(Symbol::end(), recover_entry);
}
fn populate_used_symbols(
parse_table: &mut ParseTable,
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
) {
let mut terminal_usages = vec![false; lexical_grammar.variables.len()];
let mut non_terminal_usages = vec![false; syntax_grammar.variables.len()];
let mut external_usages = vec![false; syntax_grammar.external_tokens.len()];
for state in &parse_table.states {
for symbol in state.terminal_entries.keys() {
match symbol.kind {
SymbolType::Terminal => terminal_usages[symbol.index] = true,
SymbolType::External => external_usages[symbol.index] = true,
_ => {}
}
}
for symbol in state.nonterminal_entries.keys() {
non_terminal_usages[symbol.index] = true;
}
}
parse_table.symbols.push(Symbol::end());
for (i, value) in terminal_usages.into_iter().enumerate() {
if value {
// Assign the grammar's word token a low numerical index. This ensures that
// it can be stored in a subtree with no heap allocations, even for grammars with
// very large numbers of tokens. This is an optimization, but it's also important to
// ensure that a subtree's symbol can be successfully reassigned to the word token
// without having to move the subtree to the heap.
// See https://github.com/tree-sitter/tree-sitter/issues/258
if syntax_grammar.word_token.map_or(false, |t| t.index == i) {
parse_table.symbols.insert(1, Symbol::terminal(i));
} else {
parse_table.symbols.push(Symbol::terminal(i));
}
}
}
for (i, value) in external_usages.into_iter().enumerate() {
if value {
parse_table.symbols.push(Symbol::external(i));
}
}
for (i, value) in non_terminal_usages.into_iter().enumerate() {
if value {
parse_table.symbols.push(Symbol::non_terminal(i));
}
}
}
fn populate_external_lex_states(parse_table: &mut ParseTable, syntax_grammar: &SyntaxGrammar) {
let mut external_tokens_by_corresponding_internal_token = HashMap::new();
for (i, external_token) in syntax_grammar.external_tokens.iter().enumerate() {
if let Some(symbol) = external_token.corresponding_internal_token {
external_tokens_by_corresponding_internal_token.insert(symbol.index, i);
}
}
// Ensure that external lex state 0 represents the absence of any
// external tokens.
parse_table.external_lex_states.push(TokenSet::new());
for i in 0..parse_table.states.len() {
let mut external_tokens = TokenSet::new();
for token in parse_table.states[i].terminal_entries.keys() {
if token.is_external() {
external_tokens.insert(*token);
} else if token.is_terminal() {
if let Some(index) =
external_tokens_by_corresponding_internal_token.get(&token.index)
{
external_tokens.insert(Symbol::external(*index));
}
}
}
parse_table.states[i].external_lex_state_id = parse_table
.external_lex_states
.iter()
.position(|tokens| *tokens == external_tokens)
.unwrap_or_else(|| {
parse_table.external_lex_states.push(external_tokens);
parse_table.external_lex_states.len() - 1
});
}
}
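// A small sketch of the dedup-or-append idiom used just above for external lex states:
// reuse the index of an existing identical entry, otherwise push a new one. Plain strings
// stand in for `TokenSet`s here.
#[cfg(test)]
mod intern_sketch {
    fn intern(list: &mut Vec<String>, value: &str) -> usize {
        list.iter()
            .position(|existing| existing == value)
            .unwrap_or_else(|| {
                list.push(value.to_string());
                list.len() - 1
            })
    }

    #[test]
    fn identical_values_share_an_index() {
        let mut list = Vec::new();
        assert_eq!(intern(&mut list, "a"), 0);
        assert_eq!(intern(&mut list, "b"), 1);
        assert_eq!(intern(&mut list, "a"), 0);
        assert_eq!(list, vec!["a", "b"]);
    }
}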
fn identify_keywords(
lexical_grammar: &LexicalGrammar,
parse_table: &ParseTable,
word_token: Option<Symbol>,
token_conflict_map: &TokenConflictMap,
coincident_token_index: &CoincidentTokenIndex,
) -> TokenSet {
if word_token.is_none() {
return TokenSet::new();
}
let word_token = word_token.unwrap();
let mut cursor = NfaCursor::new(&lexical_grammar.nfa, Vec::new());
// First find all of the candidate keyword tokens: tokens that start with
// letters or underscore and can match the same string as a word token.
let keyword_candidates: TokenSet = lexical_grammar
.variables
.iter()
.enumerate()
.filter_map(|(i, variable)| {
cursor.reset(vec![variable.start_state]);
if all_chars_are_alphabetical(&cursor)
&& token_conflict_map.does_match_same_string(i, word_token.index)
&& !token_conflict_map.does_match_different_string(i, word_token.index)
{
info!(
"Keywords - add candidate {}",
lexical_grammar.variables[i].name
);
Some(Symbol::terminal(i))
} else {
None
}
})
.collect();
// Exclude keyword candidates that shadow another keyword candidate.
let keywords: TokenSet = keyword_candidates
.iter()
.filter(|token| {
for other_token in keyword_candidates.iter() {
if other_token != *token
&& token_conflict_map.does_match_same_string(other_token.index, token.index)
{
info!(
"Keywords - exclude {} because it matches the same string as {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_token.index].name
);
return false;
}
}
true
})
.collect();
// Exclude keyword candidates for which substituting the keyword capture
// token would introduce new lexical conflicts with other tokens.
let keywords = keywords
.iter()
.filter(|token| {
for other_index in 0..lexical_grammar.variables.len() {
if keyword_candidates.contains(&Symbol::terminal(other_index)) {
continue;
}
// If the word token was already valid in every state containing
// this keyword candidate, then substituting the word token won't
// introduce any new lexical conflicts.
if coincident_token_index
.states_with(*token, Symbol::terminal(other_index))
.iter()
.all(|state_id| {
parse_table.states[*state_id]
.terminal_entries
.contains_key(&word_token)
})
{
continue;
}
if !token_conflict_map.has_same_conflict_status(
token.index,
word_token.index,
other_index,
) {
info!(
"Keywords - exclude {} because of conflict with {}",
lexical_grammar.variables[token.index].name,
lexical_grammar.variables[other_index].name
);
return false;
}
}
info!(
"Keywords - include {}",
lexical_grammar.variables[token.index].name,
);
true
})
.collect();
keywords
}
fn mark_fragile_tokens(
parse_table: &mut ParseTable,
lexical_grammar: &LexicalGrammar,
token_conflict_map: &TokenConflictMap,
) {
let n = lexical_grammar.variables.len();
let mut valid_tokens_mask = Vec::with_capacity(n);
for state in parse_table.states.iter_mut() {
valid_tokens_mask.clear();
valid_tokens_mask.resize(n, false);
for token in state.terminal_entries.keys() {
if token.is_terminal() {
valid_tokens_mask[token.index] = true;
}
}
for (token, entry) in state.terminal_entries.iter_mut() {
if token.is_terminal() {
for (i, is_valid) in valid_tokens_mask.iter().enumerate() {
if *is_valid {
if token_conflict_map.does_overlap(i, token.index) {
entry.reusable = false;
break;
}
}
}
}
}
}
}
fn report_state_info<'a>(
syntax_grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
parse_table: &ParseTable,
parse_state_info: &Vec<ParseStateInfo<'a>>,
report_symbol_name: &'a str,
) {
let mut all_state_indices = BTreeSet::new();
let mut symbols_with_state_indices = (0..syntax_grammar.variables.len())
.map(|i| (Symbol::non_terminal(i), BTreeSet::new()))
.collect::<Vec<_>>();
for (i, state) in parse_table.states.iter().enumerate() {
all_state_indices.insert(i);
let item_set = &parse_state_info[state.id];
for (item, _) in item_set.1.entries.iter() {
if !item.is_augmented() {
symbols_with_state_indices[item.variable_index as usize]
.1
.insert(i);
}
}
}
symbols_with_state_indices.sort_unstable_by_key(|(_, states)| -(states.len() as i32));
let max_symbol_name_length = syntax_grammar
.variables
.iter()
.map(|v| v.name.len())
.max()
.unwrap();
for (symbol, states) in &symbols_with_state_indices {
eprintln!(
"{:width$}\t{}",
syntax_grammar.variables[symbol.index].name,
states.len(),
width = max_symbol_name_length
);
}
eprintln!("");
let state_indices = if report_symbol_name == "*" {
Some(&all_state_indices)
} else {
symbols_with_state_indices
.iter()
.find_map(|(symbol, state_indices)| {
if syntax_grammar.variables[symbol.index].name == report_symbol_name {
Some(state_indices)
} else {
None
}
})
};
if let Some(state_indices) = state_indices {
let mut state_indices = state_indices.into_iter().cloned().collect::<Vec<_>>();
state_indices.sort_unstable_by_key(|i| (parse_table.states[*i].core_id, *i));
for state_index in state_indices {
let id = parse_table.states[state_index].id;
let (preceding_symbols, item_set) = &parse_state_info[id];
eprintln!("state index: {}", state_index);
eprintln!("state id: {}", id);
eprint!("symbol sequence:");
for symbol in preceding_symbols {
let name = if symbol.is_terminal() {
&lexical_grammar.variables[symbol.index].name
} else if symbol.is_external() {
&syntax_grammar.external_tokens[symbol.index].name
} else {
&syntax_grammar.variables[symbol.index].name
};
eprint!(" {}", name);
}
eprintln!(
"\nitems:\n{}",
self::item::ParseItemSetDisplay(&item_set, syntax_grammar, lexical_grammar,),
);
}
}
}
fn all_chars_are_alphabetical(cursor: &NfaCursor) -> bool {
cursor.transition_chars().all(|(chars, is_sep)| {
if is_sep {
true
} else {
chars.chars().all(|c| c.is_alphabetic() || c == '_')
}
})
}
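// A minimal sketch of the keyword-capture idea that `identify_keywords` above supports:
// rather than giving every keyword its own lex path, the lexer matches the single word
// rule and the captured text is re-tagged by string comparison. The token names and the
// keyword list below are toy stand-ins, not the generated lexer's real representation.
#[cfg(test)]
mod keyword_capture_sketch {
    use std::collections::HashSet;

    fn classify(text: &str, keywords: &HashSet<&str>) -> &'static str {
        // The generic word rule already matched `text`; promote it when it is a keyword.
        if keywords.contains(text) {
            "keyword"
        } else {
            "identifier"
        }
    }

    #[test]
    fn word_matches_are_retagged_as_keywords() {
        let keywords: HashSet<&str> = vec!["in", "instanceof"].into_iter().collect();
        assert_eq!(classify("instanceof", &keywords), "keyword");
        assert_eq!(classify("instance", &keywords), "identifier");
    }
}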

View File

@ -0,0 +1,532 @@
use crate::generate::build_tables::item::TokenSetDisplay;
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::nfa::{CharacterSet, NfaCursor, NfaTransition};
use crate::generate::rules::TokenSet;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fmt;
#[derive(Clone, Debug, Default, PartialEq, Eq)]
struct TokenConflictStatus {
matches_prefix: bool,
does_match_continuation: bool,
does_match_valid_continuation: bool,
does_match_separators: bool,
matches_same_string: bool,
matches_different_string: bool,
}
pub(crate) struct TokenConflictMap<'a> {
n: usize,
status_matrix: Vec<TokenConflictStatus>,
following_tokens: Vec<TokenSet>,
starting_chars_by_index: Vec<CharacterSet>,
following_chars_by_index: Vec<CharacterSet>,
grammar: &'a LexicalGrammar,
}
impl<'a> TokenConflictMap<'a> {
/// Create a token conflict map based on a lexical grammar, which describes the structure
/// of each token, and a `following_tokens` map, which indicates which tokens may appear
/// immediately after each other token.
///
/// This analyzes the possible kinds of overlap between each pair of tokens and stores
/// them in a matrix.
pub fn new(grammar: &'a LexicalGrammar, following_tokens: Vec<TokenSet>) -> Self {
let mut cursor = NfaCursor::new(&grammar.nfa, Vec::new());
let starting_chars = get_starting_chars(&mut cursor, grammar);
let following_chars = get_following_chars(&starting_chars, &following_tokens);
let n = grammar.variables.len();
let mut status_matrix = vec![TokenConflictStatus::default(); n * n];
for i in 0..grammar.variables.len() {
for j in 0..i {
let status = compute_conflict_status(&mut cursor, grammar, &following_chars, i, j);
status_matrix[matrix_index(n, i, j)] = status.0;
status_matrix[matrix_index(n, j, i)] = status.1;
}
}
TokenConflictMap {
n,
status_matrix,
following_tokens,
starting_chars_by_index: starting_chars,
following_chars_by_index: following_chars,
grammar,
}
}
/// Do tokens `a` and `b` have the same kind of conflict with the token `other`?
pub fn has_same_conflict_status(&self, a: usize, b: usize, other: usize) -> bool {
let left = &self.status_matrix[matrix_index(self.n, a, other)];
let right = &self.status_matrix[matrix_index(self.n, b, other)];
left == right
}
/// Does token `i` match any strings that token `j` does *not* match?
pub fn does_match_different_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_different_string
}
/// Does token `i` match any strings that token `j` also matches, where
/// token `i` is preferred over token `j`?
pub fn does_match_same_string(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_same_string
}
pub fn does_conflict(&self, i: usize, j: usize) -> bool {
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
entry.does_match_valid_continuation
|| entry.does_match_separators
|| entry.matches_same_string
}
/// Does token `i` match any strings that are *prefixes* of strings matched by `j`?
pub fn does_match_prefix(&self, i: usize, j: usize) -> bool {
self.status_matrix[matrix_index(self.n, i, j)].matches_prefix
}
pub fn does_match_shorter_or_longer(&self, i: usize, j: usize) -> bool {
let entry = &self.status_matrix[matrix_index(self.n, i, j)];
let reverse_entry = &self.status_matrix[matrix_index(self.n, j, i)];
(entry.does_match_valid_continuation || entry.does_match_separators)
&& !reverse_entry.does_match_separators
}
pub fn does_overlap(&self, i: usize, j: usize) -> bool {
let status = &self.status_matrix[matrix_index(self.n, i, j)];
status.does_match_separators
|| status.matches_prefix
|| status.matches_same_string
|| status.does_match_continuation
}
pub fn prefer_token(grammar: &LexicalGrammar, left: (i32, usize), right: (i32, usize)) -> bool {
if left.0 > right.0 {
return true;
} else if left.0 < right.0 {
return false;
}
match grammar.variables[left.1]
.implicit_precedence
.cmp(&grammar.variables[right.1].implicit_precedence)
{
Ordering::Less => false,
Ordering::Greater => true,
Ordering::Equal => left.1 < right.1,
}
}
pub fn prefer_transition(
grammar: &LexicalGrammar,
t: &NfaTransition,
completed_id: usize,
completed_precedence: i32,
has_separator_transitions: bool,
) -> bool {
if t.precedence < completed_precedence {
return false;
}
if t.precedence == completed_precedence {
if t.is_separator {
return false;
}
if has_separator_transitions
&& grammar
.variable_indices_for_nfa_states(&t.states)
.position(|i| i == completed_id)
.is_none()
{
return false;
}
}
true
}
}
impl<'a> fmt::Debug for TokenConflictMap<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "TokenConflictMap {{\n")?;
let syntax_grammar = SyntaxGrammar::default();
write!(f, " following_tokens: {{\n")?;
for (i, following_tokens) in self.following_tokens.iter().enumerate() {
write!(
f,
" follow({:?}): {},\n",
self.grammar.variables[i].name,
TokenSetDisplay(following_tokens, &syntax_grammar, &self.grammar)
)?;
}
write!(f, " }},\n")?;
write!(f, " starting_characters: {{\n")?;
for i in 0..self.n {
write!(
f,
" {:?}: {:?},\n",
self.grammar.variables[i].name, self.starting_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
write!(f, " following_characters: {{\n")?;
for i in 0..self.n {
write!(
f,
" {:?}: {:?},\n",
self.grammar.variables[i].name, self.following_chars_by_index[i]
)?;
}
write!(f, " }},\n")?;
write!(f, " status_matrix: {{\n")?;
for i in 0..self.n {
write!(f, " {:?}: {{\n", self.grammar.variables[i].name)?;
for j in 0..self.n {
write!(
f,
" {:?}: {:?},\n",
self.grammar.variables[j].name,
self.status_matrix[matrix_index(self.n, i, j)]
)?;
}
write!(f, " }},\n")?;
}
write!(f, " }},")?;
write!(f, "}}")?;
Ok(())
}
}
fn matrix_index(variable_count: usize, i: usize, j: usize) -> usize {
variable_count * i + j
}
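// A small sketch of the row-major layout behind `status_matrix`: with `n` variables,
// entry (i, j) lives at `n * i + j`, so all `n * n` conflict statuses pack into one flat vector.
#[cfg(test)]
mod matrix_index_sketch {
    use super::matrix_index;

    #[test]
    fn flat_row_major_layout() {
        let n = 3;
        let mut seen = vec![false; n * n];
        for i in 0..n {
            for j in 0..n {
                let index = matrix_index(n, i, j);
                assert!(!seen[index], "every (i, j) pair maps to a distinct slot");
                seen[index] = true;
            }
        }
        assert!(seen.into_iter().all(|was_hit| was_hit));
    }
}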
fn get_starting_chars(cursor: &mut NfaCursor, grammar: &LexicalGrammar) -> Vec<CharacterSet> {
let mut result = Vec::with_capacity(grammar.variables.len());
for variable in &grammar.variables {
cursor.reset(vec![variable.start_state]);
let mut all_chars = CharacterSet::empty();
for (chars, _) in cursor.transition_chars() {
all_chars = all_chars.add(chars);
}
result.push(all_chars);
}
result
}
fn get_following_chars(
starting_chars: &Vec<CharacterSet>,
following_tokens: &Vec<TokenSet>,
) -> Vec<CharacterSet> {
following_tokens
.iter()
.map(|following_tokens| {
let mut chars = CharacterSet::empty();
for token in following_tokens.iter() {
if token.is_terminal() {
chars = chars.add(&starting_chars[token.index]);
}
}
chars
})
.collect()
}
fn compute_conflict_status(
cursor: &mut NfaCursor,
grammar: &LexicalGrammar,
following_chars: &Vec<CharacterSet>,
i: usize,
j: usize,
) -> (TokenConflictStatus, TokenConflictStatus) {
let mut visited_state_sets = HashSet::new();
let mut state_set_queue = vec![vec![
grammar.variables[i].start_state,
grammar.variables[j].start_state,
]];
let mut result = (
TokenConflictStatus::default(),
TokenConflictStatus::default(),
);
while let Some(state_set) = state_set_queue.pop() {
let mut live_variable_indices = grammar.variable_indices_for_nfa_states(&state_set);
// If only one of the two tokens could possibly match from this state, then
// there is no reason to analyze any of its successors. Just record the fact
// that the token matches a string that the other token does not match.
let first_live_variable_index = live_variable_indices.next().unwrap();
if live_variable_indices.count() == 0 {
if first_live_variable_index == i {
result.0.matches_different_string = true;
} else {
result.1.matches_different_string = true;
}
continue;
}
// Don't pursue states where there's no potential for conflict.
cursor.reset(state_set);
let within_separator = cursor.transition_chars().any(|(_, sep)| sep);
// Examine each possible completed token in this state.
let mut completion = None;
for (id, precedence) in cursor.completions() {
if within_separator {
if id == i {
result.0.does_match_separators = true;
} else {
result.1.does_match_separators = true;
}
}
// If the other token has already completed, then this is
// a same-string conflict.
if let Some((prev_id, prev_precedence)) = completion {
if id == prev_id {
continue;
}
// Determine which of the two tokens is preferred.
let preferred_id;
if TokenConflictMap::prefer_token(
grammar,
(prev_precedence, prev_id),
(precedence, id),
) {
preferred_id = prev_id;
} else {
preferred_id = id;
completion = Some((id, precedence));
}
if preferred_id == i {
result.0.matches_same_string = true;
} else {
result.1.matches_same_string = true;
}
} else {
completion = Some((id, precedence));
}
}
// Examine each possible transition from this state to detect substring conflicts.
for transition in cursor.transitions() {
let mut can_advance = true;
// If there is already a completed token in this state, then determine
// if the next state can also match the completed token. If so, then
// this is *not* a conflict.
if let Some((completed_id, completed_precedence)) = completion {
let mut advanced_id = None;
let mut successor_contains_completed_id = false;
for variable_id in grammar.variable_indices_for_nfa_states(&transition.states) {
if variable_id == completed_id {
successor_contains_completed_id = true;
break;
} else {
advanced_id = Some(variable_id);
}
}
// Determine which action is preferred: matching the already-completed token,
// or continuing on to try to match the other, longer token.
if let (Some(advanced_id), false) = (advanced_id, successor_contains_completed_id) {
if TokenConflictMap::prefer_transition(
grammar,
&transition,
completed_id,
completed_precedence,
within_separator,
) {
can_advance = true;
if advanced_id == i {
result.0.does_match_continuation = true;
if transition.characters.does_intersect(&following_chars[j]) {
result.0.does_match_valid_continuation = true;
}
} else {
result.1.does_match_continuation = true;
if transition.characters.does_intersect(&following_chars[i]) {
result.1.does_match_valid_continuation = true;
}
}
} else {
can_advance = false;
if completed_id == i {
result.0.matches_prefix = true;
} else {
result.1.matches_prefix = true;
}
}
}
}
if can_advance && visited_state_sets.insert(transition.states.clone()) {
state_set_queue.push(transition.states);
}
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::{Variable, VariableType};
use crate::generate::prepare_grammar::{expand_tokens, ExtractedLexicalGrammar};
use crate::generate::rules::{Precedence, Rule, Symbol};
#[test]
fn test_starting_characters() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: Vec::new(),
variables: vec![
Variable {
name: "token_0".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("[a-f]1|0x\\d"),
},
Variable {
name: "token_1".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("d*ef"),
},
],
})
.unwrap();
let token_map = TokenConflictMap::new(&grammar, Vec::new());
assert_eq!(
token_map.starting_chars_by_index[0],
CharacterSet::empty().add_range('a', 'f').add_char('0')
);
assert_eq!(
token_map.starting_chars_by_index[1],
CharacterSet::empty().add_range('d', 'e')
);
}
#[test]
fn test_token_conflicts() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: Vec::new(),
variables: vec![
Variable {
name: "in".to_string(),
kind: VariableType::Named,
rule: Rule::string("in"),
},
Variable {
name: "identifier".to_string(),
kind: VariableType::Named,
rule: Rule::pattern("\\w+"),
},
Variable {
name: "instanceof".to_string(),
kind: VariableType::Named,
rule: Rule::string("instanceof"),
},
],
})
.unwrap();
let var = |name| index_of_var(&grammar, name);
let token_map = TokenConflictMap::new(
&grammar,
vec![
[Symbol::terminal(var("identifier"))]
.iter()
.cloned()
.collect(),
[Symbol::terminal(var("in"))].iter().cloned().collect(),
[Symbol::terminal(var("identifier"))]
.iter()
.cloned()
.collect(),
],
);
// Given the string "in", the `in` token is preferred over the `identifier` token
assert!(token_map.does_match_same_string(var("in"), var("identifier")));
assert!(!token_map.does_match_same_string(var("identifier"), var("in")));
// Depending on what character follows, the string "in" may be treated as part of an
// `identifier` token.
assert!(token_map.does_conflict(var("identifier"), var("in")));
// Depending on what character follows, the string "instanceof" may be treated as part of
// an `identifier` token.
assert!(token_map.does_conflict(var("identifier"), var("instanceof")));
assert!(token_map.does_conflict(var("instanceof"), var("in")));
}
#[test]
fn test_token_conflicts_with_separators() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![Rule::pattern("\\s")],
variables: vec![
Variable {
name: "x".to_string(),
kind: VariableType::Named,
rule: Rule::string("x"),
},
Variable {
name: "newline".to_string(),
kind: VariableType::Named,
rule: Rule::string("\n"),
},
],
})
.unwrap();
let var = |name| index_of_var(&grammar, name);
let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]);
assert!(token_map.does_conflict(var("newline"), var("x")));
assert!(!token_map.does_conflict(var("x"), var("newline")));
}
#[test]
fn test_token_conflicts_with_open_ended_tokens() {
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: vec![Rule::pattern("\\s")],
variables: vec![
Variable {
name: "x".to_string(),
kind: VariableType::Named,
rule: Rule::string("x"),
},
Variable {
name: "anything".to_string(),
kind: VariableType::Named,
rule: Rule::prec(Precedence::Integer(-1), Rule::pattern(".*")),
},
],
})
.unwrap();
let var = |name| index_of_var(&grammar, name);
let token_map = TokenConflictMap::new(&grammar, vec![TokenSet::new(); 4]);
assert!(token_map.does_match_shorter_or_longer(var("anything"), var("x")));
assert!(!token_map.does_match_shorter_or_longer(var("x"), var("anything")));
}
fn index_of_var(grammar: &LexicalGrammar, name: &str) -> usize {
grammar
.variables
.iter()
.position(|v| v.name == name)
.unwrap()
}
}

View File

@ -0,0 +1,133 @@
use std::ops::Range;
/// A set of characters represented as a balanced binary tree of comparisons.
/// This is used as an intermediate step in generating efficient code for
/// matching a given character set.
#[derive(PartialEq, Eq)]
pub enum CharacterTree {
Yes,
Compare {
value: char,
operator: Comparator,
consequence: Option<Box<CharacterTree>>,
alternative: Option<Box<CharacterTree>>,
},
}
#[derive(PartialEq, Eq)]
pub enum Comparator {
Less,
LessOrEqual,
Equal,
GreaterOrEqual,
}
impl CharacterTree {
pub fn from_ranges(ranges: &[Range<char>]) -> Option<Self> {
match ranges.len() {
0 => None,
1 => {
let range = &ranges[0];
if range.start == range.end {
Some(CharacterTree::Compare {
operator: Comparator::Equal,
value: range.start,
consequence: Some(Box::new(CharacterTree::Yes)),
alternative: None,
})
} else {
Some(CharacterTree::Compare {
operator: Comparator::GreaterOrEqual,
value: range.start,
consequence: Some(Box::new(CharacterTree::Compare {
operator: Comparator::LessOrEqual,
value: range.end,
consequence: Some(Box::new(CharacterTree::Yes)),
alternative: None,
})),
alternative: None,
})
}
}
len => {
let mid = len / 2;
let mid_range = &ranges[mid];
Some(CharacterTree::Compare {
operator: Comparator::Less,
value: mid_range.start,
consequence: Self::from_ranges(&ranges[0..mid]).map(Box::new),
alternative: Some(Box::new(CharacterTree::Compare {
operator: Comparator::LessOrEqual,
value: mid_range.end,
consequence: Some(Box::new(CharacterTree::Yes)),
alternative: Self::from_ranges(&ranges[(mid + 1)..]).map(Box::new),
})),
})
}
}
}
#[cfg(test)]
fn contains(&self, c: char) -> bool {
match self {
CharacterTree::Yes => true,
CharacterTree::Compare {
value,
operator,
alternative,
consequence,
} => {
let condition = match operator {
Comparator::Less => c < *value,
Comparator::LessOrEqual => c <= *value,
Comparator::Equal => c == *value,
Comparator::GreaterOrEqual => c >= *value,
};
if condition { consequence } else { alternative }
.as_ref()
.map_or(false, |a| a.contains(c))
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_character_tree_simple() {
let tree = CharacterTree::from_ranges(&['a'..'d', 'h'..'l', 'p'..'r', 'u'..'u', 'z'..'z'])
.unwrap();
assert!(tree.contains('a'));
assert!(tree.contains('b'));
assert!(tree.contains('c'));
assert!(tree.contains('d'));
assert!(!tree.contains('e'));
assert!(!tree.contains('f'));
assert!(!tree.contains('g'));
assert!(tree.contains('h'));
assert!(tree.contains('i'));
assert!(tree.contains('j'));
assert!(tree.contains('k'));
assert!(tree.contains('l'));
assert!(!tree.contains('m'));
assert!(!tree.contains('n'));
assert!(!tree.contains('o'));
assert!(tree.contains('p'));
assert!(tree.contains('q'));
assert!(tree.contains('r'));
assert!(!tree.contains('s'));
assert!(!tree.contains('t'));
assert!(tree.contains('u'));
assert!(!tree.contains('v'));
}
}

View File

@ -0,0 +1,63 @@
pub(crate) fn split_state_id_groups<S>(
states: &Vec<S>,
state_ids_by_group_id: &mut Vec<Vec<usize>>,
group_ids_by_state_id: &mut Vec<usize>,
start_group_id: usize,
mut f: impl FnMut(&S, &S, &Vec<usize>) -> bool,
) -> bool {
let mut result = false;
let mut group_id = start_group_id;
while group_id < state_ids_by_group_id.len() {
let state_ids = &state_ids_by_group_id[group_id];
let mut split_state_ids = Vec::new();
let mut i = 0;
while i < state_ids.len() {
let left_state_id = state_ids[i];
if split_state_ids.contains(&left_state_id) {
i += 1;
continue;
}
let left_state = &states[left_state_id];
// Identify all of the other states in the group that are incompatible with
// this state.
let mut j = i + 1;
while j < state_ids.len() {
let right_state_id = state_ids[j];
if split_state_ids.contains(&right_state_id) {
j += 1;
continue;
}
let right_state = &states[right_state_id];
if f(left_state, right_state, &group_ids_by_state_id) {
split_state_ids.push(right_state_id);
}
j += 1;
}
i += 1;
}
// If any states were removed from the group, add them all as a new group.
if split_state_ids.len() > 0 {
result = true;
state_ids_by_group_id[group_id].retain(|i| !split_state_ids.contains(&i));
let new_group_id = state_ids_by_group_id.len();
for id in &split_state_ids {
group_ids_by_state_id[*id] = new_group_id;
}
state_ids_by_group_id.push(split_state_ids);
}
group_id += 1;
}
result
}
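// A minimal sketch of how the helper above refines groups: start with a single group holding
// every state, split whenever the predicate reports an incompatibility, and call again until
// a pass changes nothing. Plain `i32` states and an inequality predicate stand in for
// `ParseState` and the real conflict checks.
#[cfg(test)]
mod split_groups_sketch {
    use super::split_state_id_groups;

    #[test]
    fn splits_one_group_into_compatible_subgroups() {
        let states = vec![1, 1, 2, 2];
        let mut state_ids_by_group_id = vec![vec![0, 1, 2, 3]];
        let mut group_ids_by_state_id = vec![0, 0, 0, 0];

        let did_split = split_state_id_groups(
            &states,
            &mut state_ids_by_group_id,
            &mut group_ids_by_state_id,
            0,
            |left, right, _groups| left != right,
        );
        assert!(did_split);
        assert_eq!(state_ids_by_group_id, vec![vec![0, 1], vec![2, 3]]);
        assert_eq!(group_ids_by_state_id, vec![0, 0, 1, 1]);

        // A second pass finds nothing left to split.
        let did_split_again = split_state_id_groups(
            &states,
            &mut state_ids_by_group_id,
            &mut group_ids_by_state_id,
            0,
            |left, right, _groups| left != right,
        );
        assert!(!did_split_again);
    }
}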

View File

@ -0,0 +1,418 @@
function alias(rule, value) {
const result = {
type: "ALIAS",
content: normalize(rule),
named: false,
value: null
};
switch (value.constructor) {
case String:
result.named = false;
result.value = value;
return result;
case ReferenceError:
result.named = true;
result.value = value.symbol.name;
return result;
case Object:
if (typeof value.type === 'string' && value.type === 'SYMBOL') {
result.named = true;
result.value = value.name;
return result;
}
}
throw new Error('Invalid alias value ' + value);
}
function blank() {
return {
type: "BLANK"
};
}
function field(name, rule) {
return {
type: "FIELD",
name: name,
content: normalize(rule)
}
}
function choice(...elements) {
return {
type: "CHOICE",
members: elements.map(normalize)
};
}
function optional(value) {
checkArguments(arguments.length, optional, 'optional');
return choice(value, blank());
}
function prec(number, rule) {
checkPrecedence(number);
checkArguments(
arguments.length - 1,
prec,
'prec',
' and a precedence argument'
);
return {
type: "PREC",
value: number,
content: normalize(rule)
};
}
prec.left = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
checkPrecedence(number);
checkArguments(
arguments.length - 1,
prec.left,
'prec.left',
' and an optional precedence argument'
);
return {
type: "PREC_LEFT",
value: number,
content: normalize(rule)
};
}
prec.right = function(number, rule) {
if (rule == null) {
rule = number;
number = 0;
}
checkPrecedence(number);
checkArguments(
arguments.length - 1,
prec.right,
'prec.right',
' and an optional precedence argument'
);
return {
type: "PREC_RIGHT",
value: number,
content: normalize(rule)
};
}
prec.dynamic = function(number, rule) {
checkPrecedence(number);
checkArguments(
arguments.length - 1,
prec.dynamic,
'prec.dynamic',
' and a precedence argument'
);
return {
type: "PREC_DYNAMIC",
value: number,
content: normalize(rule)
};
}
function repeat(rule) {
checkArguments(arguments.length, repeat, 'repeat');
return {
type: "REPEAT",
content: normalize(rule)
};
}
function repeat1(rule) {
checkArguments(arguments.length, repeat1, 'repeat1');
return {
type: "REPEAT1",
content: normalize(rule)
};
}
function seq(...elements) {
return {
type: "SEQ",
members: elements.map(normalize)
};
}
function sym(name) {
return {
type: "SYMBOL",
name: name
};
}
function token(value) {
return {
type: "TOKEN",
content: normalize(value)
};
}
token.immediate = function(value) {
return {
type: "IMMEDIATE_TOKEN",
content: normalize(value)
};
}
function normalize(value) {
if (typeof value == "undefined")
throw new Error("Undefined symbol");
switch (value.constructor) {
case String:
return {
type: 'STRING',
value
};
case RegExp:
return {
type: 'PATTERN',
value: value.source
};
case ReferenceError:
throw value
default:
if (typeof value.type === 'string') {
return value;
} else {
throw new TypeError("Invalid rule: " + value.toString());
}
}
}
function RuleBuilder(ruleMap) {
return new Proxy({}, {
get(target, propertyName) {
const symbol = sym(propertyName);
if (!ruleMap || ruleMap.hasOwnProperty(propertyName)) {
return symbol;
} else {
const error = new ReferenceError(`Undefined symbol '${propertyName}'`);
error.symbol = symbol;
return error;
}
}
})
}
function grammar(baseGrammar, options) {
if (!options) {
options = baseGrammar;
baseGrammar = {
name: null,
rules: {},
extras: [normalize(/\s/)],
conflicts: [],
externals: [],
inline: [],
supertypes: [],
precedences: [],
};
}
let externals = baseGrammar.externals;
if (options.externals) {
if (typeof options.externals !== "function") {
throw new Error("Grammar's 'externals' property must be a function.");
}
const externalsRuleBuilder = RuleBuilder(null)
const externalRules = options.externals.call(externalsRuleBuilder, externalsRuleBuilder, baseGrammar.externals);
if (!Array.isArray(externalRules)) {
throw new Error("Grammar's 'externals' property must return an array of rules.");
}
externals = externalRules.map(normalize);
}
const ruleMap = {};
for (const key in options.rules) {
ruleMap[key] = true;
}
for (const key in baseGrammar.rules) {
ruleMap[key] = true;
}
for (const external of externals) {
if (typeof external.name === 'string') {
ruleMap[external.name] = true;
}
}
const ruleBuilder = RuleBuilder(ruleMap);
const name = options.name;
if (typeof name !== "string") {
throw new Error("Grammar's 'name' property must be a string.");
}
if (!/^[a-zA-Z_]\w*$/.test(name)) {
throw new Error("Grammar's 'name' property must not start with a digit and cannot contain non-word characters.");
}
let rules = Object.assign({}, baseGrammar.rules);
if (options.rules) {
if (typeof options.rules !== "object") {
throw new Error("Grammar's 'rules' property must be an object.");
}
for (const ruleName in options.rules) {
const ruleFn = options.rules[ruleName];
if (typeof ruleFn !== "function") {
throw new Error("Grammar rules must all be functions. '" + ruleName + "' rule is not.");
}
rules[ruleName] = normalize(ruleFn.call(ruleBuilder, ruleBuilder, baseGrammar.rules[ruleName]));
}
}
let extras = baseGrammar.extras.slice();
if (options.extras) {
if (typeof options.extras !== "function") {
throw new Error("Grammar's 'extras' property must be a function.");
}
extras = options.extras
.call(ruleBuilder, ruleBuilder, baseGrammar.extras)
if (!Array.isArray(extras)) {
throw new Error("Grammar's 'extras' function must return an array.")
}
extras = extras.map(normalize);
}
let word = baseGrammar.word;
if (options.word) {
word = options.word.call(ruleBuilder, ruleBuilder).name;
if (typeof word != 'string') {
throw new Error("Grammar's 'word' property must be a named rule.");
}
}
let conflicts = baseGrammar.conflicts;
if (options.conflicts) {
if (typeof options.conflicts !== "function") {
throw new Error("Grammar's 'conflicts' property must be a function.");
}
const baseConflictRules = baseGrammar.conflicts.map(conflict => conflict.map(sym));
const conflictRules = options.conflicts.call(ruleBuilder, ruleBuilder, baseConflictRules);
if (!Array.isArray(conflictRules)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
conflicts = conflictRules.map(conflictSet => {
if (!Array.isArray(conflictSet)) {
throw new Error("Grammar's conflicts must be an array of arrays of rules.");
}
return conflictSet.map(symbol => normalize(symbol).name);
});
}
let inline = baseGrammar.inline;
if (options.inline) {
if (typeof options.inline !== "function") {
throw new Error("Grammar's 'inline' property must be a function.");
}
const baseInlineRules = baseGrammar.inline.map(sym);
const inlineRules = options.inline.call(ruleBuilder, ruleBuilder, baseInlineRules);
if (!Array.isArray(inlineRules)) {
throw new Error("Grammar's inline must be an array of rules.");
}
inline = inlineRules.map(symbol => symbol.name);
}
let supertypes = baseGrammar.supertypes;
if (options.supertypes) {
if (typeof options.supertypes !== "function") {
throw new Error("Grammar's 'supertypes' property must be a function.");
}
const baseSupertypeRules = baseGrammar.supertypes.map(sym);
const supertypeRules = options.supertypes.call(ruleBuilder, ruleBuilder, baseSupertypeRules);
if (!Array.isArray(supertypeRules)) {
throw new Error("Grammar's supertypes must be an array of rules.");
}
supertypes = supertypeRules.map(symbol => symbol.name);
}
let precedences = baseGrammar.precedences;
if (options.precedences) {
if (typeof options.precedences !== "function") {
throw new Error("Grammar's 'precedences' property must be a function");
}
precedences = options.precedences.call(ruleBuilder, ruleBuilder, baseGrammar.precedences);
if (!Array.isArray(precedences)) {
throw new Error("Grammar's precedences must be an array of arrays of rules.");
}
precedences = precedences.map(list => {
if (!Array.isArray(list)) {
throw new Error("Grammar's precedences must be an array of arrays of rules.");
}
return list.map(normalize);
});
}
if (Object.keys(rules).length == 0) {
throw new Error("Grammar must have at least one rule.");
}
return {name, word, rules, extras, conflicts, precedences, externals, inline, supertypes};
}
function checkArguments(ruleCount, caller, callerName, suffix = '') {
if (ruleCount > 1) {
const error = new Error([
`The \`${callerName}\` function only takes one rule argument${suffix}.`,
'You passed multiple rules. Did you mean to call `seq`?\n'
].join('\n'));
Error.captureStackTrace(error, caller);
throw error
}
}
function checkPrecedence(value) {
if (value == null) {
throw new Error('Missing precedence value');
}
}
global.alias = alias;
global.blank = blank;
global.choice = choice;
global.optional = optional;
global.prec = prec;
global.repeat = repeat;
global.repeat1 = repeat1;
global.seq = seq;
global.sym = sym;
global.token = token;
global.grammar = grammar;
global.field = field;
const result = require(process.env.TREE_SITTER_GRAMMAR_PATH);
console.log(JSON.stringify(result, null, 2));
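// A minimal sketch of the grammar definition shape this DSL consumes. The names below
// (`toy`, `source_file`, `definition`, `identifier`) are illustrative only; a real grammar
// lives in its own grammar.js, exports the object returned by `grammar(...)` through
// `module.exports`, and is loaded via TREE_SITTER_GRAMMAR_PATH as above.
const exampleGrammarShape = grammar({
  name: 'toy',
  word: $ => $.identifier,
  rules: {
    source_file: $ => repeat($.definition),
    definition: $ => seq('(', 'def', field('name', $.identifier), ')'),
    identifier: $ => /[a-zA-Z_]\w*/,
  },
});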

View File

@ -0,0 +1,269 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "tree-sitter grammar specification",
"type": "object",
"required": ["name", "rules"],
"additionalProperties": false,
"properties": {
"name": {
"description": "the name of the grammar",
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
"rules": {
"type": "object",
"patternProperties": {
"^[a-zA-Z_]\\w*$": {
"$ref": "#/definitions/rule"
}
},
"additionalProperties": false
},
"extras": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
},
"externals": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
},
"inline": {
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*$"
}
},
"conflicts": {
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*$"
}
}
},
"word": {
"type": "string",
"pattern": "^[a-zA-Z_]\\w*"
},
"supertypes": {
"description": "A list of hidden rule names that should be considered supertypes in the generated node types file. See http://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types.",
"type": "array",
"items": {
"description": "the name of a rule in `rules` or `extras`",
"type": "string"
}
}
},
"definitions": {
"blank-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^BLANK$"
}
},
"required": ["type"]
},
"string-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^STRING$"
},
"value": {
"type": "string"
}
},
"required": ["type", "value"]
},
"pattern-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^PATTERN$"
},
"value": { "type": "string" }
},
"required": ["type", "value"]
},
"symbol-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SYMBOL$"
},
"name": { "type": "string" }
},
"required": ["type", "name"]
},
"seq-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^SEQ$"
},
"members": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
}
},
"required": ["type", "members"]
},
"choice-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^CHOICE$"
},
"members": {
"type": "array",
"items": {
"$ref": "#/definitions/rule"
}
}
},
"required": ["type", "members"]
},
"alias-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^ALIAS$"
},
"value": {
"type": "string"
},
"named": {
"type": "boolean"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "named", "content", "value"]
},
"repeat-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^REPEAT$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"repeat1-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^REPEAT1$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"token-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^(TOKEN|IMMEDIATE_TOKEN)$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content"]
},
"field-rule": {
"properties": {
"name": { "type": "string" },
"type": {
"type": "string",
"pattern": "^FIELD$"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["name", "type", "content"]
},
"prec-rule": {
"type": "object",
"properties": {
"type": {
"type": "string",
"pattern": "^(PREC|PREC_LEFT|PREC_RIGHT|PREC_DYNAMIC)$"
},
"value": {
"type": "integer"
},
"content": {
"$ref": "#/definitions/rule"
}
},
"required": ["type", "content", "value"]
},
"rule": {
"oneOf": [
{ "$ref": "#/definitions/alias-rule" },
{ "$ref": "#/definitions/blank-rule" },
{ "$ref": "#/definitions/string-rule" },
{ "$ref": "#/definitions/pattern-rule" },
{ "$ref": "#/definitions/symbol-rule" },
{ "$ref": "#/definitions/seq-rule" },
{ "$ref": "#/definitions/choice-rule" },
{ "$ref": "#/definitions/repeat1-rule" },
{ "$ref": "#/definitions/repeat-rule" },
{ "$ref": "#/definitions/token-rule" },
{ "$ref": "#/definitions/field-rule" },
{ "$ref": "#/definitions/prec-rule" }
]
}
}
}
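For orientation, a minimal grammar document that satisfies this schema could look like the following (a hedged sketch; the rule names and string value are invented for illustration):

{
  "name": "my_lang",
  "rules": {
    "source_file": {
      "type": "REPEAT1",
      "content": { "type": "SYMBOL", "name": "statement" }
    },
    "statement": { "type": "STRING", "value": "foo" }
  }
}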

View File

@ -0,0 +1,262 @@
use super::nfa::Nfa;
use super::rules::{Alias, Associativity, Precedence, Rule, Symbol};
use std::collections::HashMap;
use std::fmt;
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum VariableType {
Hidden,
Auxiliary,
Anonymous,
Named,
}
// Input grammar
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct Variable {
pub name: String,
pub kind: VariableType,
pub rule: Rule,
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum PrecedenceEntry {
Name(String),
Symbol(String),
}
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct InputGrammar {
pub name: String,
pub variables: Vec<Variable>,
pub extra_symbols: Vec<Rule>,
pub expected_conflicts: Vec<Vec<String>>,
pub precedence_orderings: Vec<Vec<PrecedenceEntry>>,
pub external_tokens: Vec<Rule>,
pub variables_to_inline: Vec<String>,
pub supertype_symbols: Vec<String>,
pub word_token: Option<String>,
}
// Extracted lexical grammar
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexicalVariable {
pub name: String,
pub kind: VariableType,
pub implicit_precedence: i32,
pub start_state: u32,
}
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct LexicalGrammar {
pub nfa: Nfa,
pub variables: Vec<LexicalVariable>,
}
// Extracted syntax grammar
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct ProductionStep {
pub symbol: Symbol,
pub precedence: Precedence,
pub associativity: Option<Associativity>,
pub alias: Option<Alias>,
pub field_name: Option<String>,
}
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct Production {
pub steps: Vec<ProductionStep>,
pub dynamic_precedence: i32,
}
#[derive(Default)]
pub(crate) struct InlinedProductionMap {
pub productions: Vec<Production>,
pub production_map: HashMap<(*const Production, u32), Vec<usize>>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct SyntaxVariable {
pub name: String,
pub kind: VariableType,
pub productions: Vec<Production>,
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ExternalToken {
pub name: String,
pub kind: VariableType,
pub corresponding_internal_token: Option<Symbol>,
}
#[derive(Debug, Default)]
pub(crate) struct SyntaxGrammar {
pub variables: Vec<SyntaxVariable>,
pub extra_symbols: Vec<Symbol>,
pub expected_conflicts: Vec<Vec<Symbol>>,
pub external_tokens: Vec<ExternalToken>,
pub supertype_symbols: Vec<Symbol>,
pub variables_to_inline: Vec<Symbol>,
pub word_token: Option<Symbol>,
pub precedence_orderings: Vec<Vec<PrecedenceEntry>>,
}
#[cfg(test)]
impl ProductionStep {
pub(crate) fn new(symbol: Symbol) -> Self {
Self {
symbol,
precedence: Precedence::None,
associativity: None,
alias: None,
field_name: None,
}
}
pub(crate) fn with_prec(
self,
precedence: Precedence,
associativity: Option<Associativity>,
) -> Self {
Self {
symbol: self.symbol,
precedence,
associativity,
alias: self.alias,
field_name: self.field_name,
}
}
pub(crate) fn with_alias(self, value: &str, is_named: bool) -> Self {
Self {
symbol: self.symbol,
precedence: self.precedence,
associativity: self.associativity,
alias: Some(Alias {
value: value.to_string(),
is_named,
}),
field_name: self.field_name,
}
}
pub(crate) fn with_field_name(self, name: &str) -> Self {
Self {
symbol: self.symbol,
precedence: self.precedence,
associativity: self.associativity,
alias: self.alias,
field_name: Some(name.to_string()),
}
}
}
impl Production {
pub fn first_symbol(&self) -> Option<Symbol> {
self.steps.first().map(|s| s.symbol.clone())
}
}
#[cfg(test)]
impl Variable {
pub fn named(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Named,
rule,
}
}
pub fn auxiliary(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Auxiliary,
rule,
}
}
pub fn hidden(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Hidden,
rule,
}
}
pub fn anonymous(name: &str, rule: Rule) -> Self {
Self {
name: name.to_string(),
kind: VariableType::Anonymous,
rule,
}
}
}
impl VariableType {
pub fn is_visible(&self) -> bool {
*self == VariableType::Named || *self == VariableType::Anonymous
}
}
impl LexicalGrammar {
pub fn variable_indices_for_nfa_states<'a>(
&'a self,
state_ids: &'a Vec<u32>,
) -> impl Iterator<Item = usize> + 'a {
let mut prev = None;
state_ids.iter().filter_map(move |state_id| {
let variable_id = self.variable_index_for_nfa_state(*state_id);
if prev != Some(variable_id) {
prev = Some(variable_id);
prev
} else {
None
}
})
}
pub fn variable_index_for_nfa_state(&self, state_id: u32) -> usize {
self.variables
.iter()
.position(|v| v.start_state >= state_id)
.unwrap()
}
}
impl SyntaxVariable {
pub fn is_auxiliary(&self) -> bool {
self.kind == VariableType::Auxiliary
}
pub fn is_hidden(&self) -> bool {
self.kind == VariableType::Hidden || self.kind == VariableType::Auxiliary
}
}
impl InlinedProductionMap {
pub fn inlined_productions<'a>(
&'a self,
production: &Production,
step_index: u32,
) -> Option<impl Iterator<Item = &'a Production> + 'a> {
self.production_map
.get(&(production as *const Production, step_index))
.map(|production_indices| {
production_indices
.iter()
.cloned()
.map(move |index| &self.productions[index])
})
}
}
impl fmt::Display for PrecedenceEntry {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
PrecedenceEntry::Name(n) => write!(f, "'{}'", n),
PrecedenceEntry::Symbol(s) => write!(f, "$.{}", s),
}
}
}

View File

@ -0,0 +1,214 @@
mod binding_files;
mod build_tables;
mod char_tree;
mod dedup;
mod grammars;
mod nfa;
mod node_types;
pub mod parse_grammar;
mod prepare_grammar;
mod render;
mod rules;
mod tables;
use self::build_tables::build_tables;
use self::grammars::{InlinedProductionMap, LexicalGrammar, SyntaxGrammar};
use self::parse_grammar::parse_grammar;
use self::prepare_grammar::prepare_grammar;
use self::render::render_c_code;
use self::rules::AliasMap;
use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use semver::Version;
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
lazy_static! {
static ref JSON_COMMENT_REGEX: Regex = RegexBuilder::new("^\\s*//.*")
.multi_line(true)
.build()
.unwrap();
}
struct GeneratedParser {
c_code: String,
node_types_json: String,
}
pub fn generate_parser_in_directory(
repo_path: &PathBuf,
grammar_path: Option<&str>,
abi_version: usize,
generate_bindings: bool,
report_symbol_name: Option<&str>,
) -> Result<()> {
let src_path = repo_path.join("src");
let header_path = src_path.join("tree_sitter");
// Ensure that the output directories exist.
fs::create_dir_all(&src_path)?;
fs::create_dir_all(&header_path)?;
// Read the grammar.json.
let grammar_json;
match grammar_path {
Some(path) => {
grammar_json = load_grammar_file(path.as_ref())?;
}
None => {
let grammar_js_path = grammar_path.map_or(repo_path.join("grammar.js"), |s| s.into());
grammar_json = load_grammar_file(&grammar_js_path)?;
fs::write(&src_path.join("grammar.json"), &grammar_json)?;
}
}
// Parse and preprocess the grammar.
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let language_name = input_grammar.name;
// Generate the parser and related files.
let GeneratedParser {
c_code,
node_types_json,
} = generate_parser_for_grammar_with_opts(
&language_name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
abi_version,
report_symbol_name,
)?;
write_file(&src_path.join("parser.c"), c_code)?;
write_file(&src_path.join("node-types.json"), node_types_json)?;
write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
if generate_bindings {
binding_files::generate_binding_files(&repo_path, &language_name)?;
}
Ok(())
}
pub fn generate_parser_for_grammar(grammar_json: &str) -> Result<(String, String)> {
let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
let input_grammar = parse_grammar(&grammar_json)?;
let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
prepare_grammar(&input_grammar)?;
let parser = generate_parser_for_grammar_with_opts(
&input_grammar.name,
syntax_grammar,
lexical_grammar,
inlines,
simple_aliases,
tree_sitter::LANGUAGE_VERSION,
None,
)?;
Ok((input_grammar.name, parser.c_code))
}
fn generate_parser_for_grammar_with_opts(
name: &String,
syntax_grammar: SyntaxGrammar,
lexical_grammar: LexicalGrammar,
inlines: InlinedProductionMap,
simple_aliases: AliasMap,
abi_version: usize,
report_symbol_name: Option<&str>,
) -> Result<GeneratedParser> {
let variable_info =
node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
let node_types_json = node_types::generate_node_types_json(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
);
let (parse_table, main_lex_table, keyword_lex_table, keyword_capture_token) = build_tables(
&syntax_grammar,
&lexical_grammar,
&simple_aliases,
&variable_info,
&inlines,
report_symbol_name,
)?;
let c_code = render_c_code(
name,
parse_table,
main_lex_table,
keyword_lex_table,
keyword_capture_token,
syntax_grammar,
lexical_grammar,
simple_aliases,
abi_version,
);
Ok(GeneratedParser {
c_code,
node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
})
}
pub fn load_grammar_file(grammar_path: &Path) -> Result<String> {
match grammar_path.extension().and_then(|e| e.to_str()) {
Some("js") => Ok(load_js_grammar_file(grammar_path)?),
Some("json") => Ok(fs::read_to_string(grammar_path)?),
_ => Err(anyhow!(
"Unknown grammar file extension: {:?}",
grammar_path
)),
}
}
fn load_js_grammar_file(grammar_path: &Path) -> Result<String> {
let grammar_path = fs::canonicalize(grammar_path)?;
let mut node_process = Command::new("node")
.env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.expect("Failed to run `node`");
let mut node_stdin = node_process
.stdin
.take()
.expect("Failed to open stdin for node");
let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))
.expect("Could not parse this package's version as semver.");
write!(
node_stdin,
"global.TREE_SITTER_CLI_VERSION_MAJOR = {};
global.TREE_SITTER_CLI_VERSION_MINOR = {};
global.TREE_SITTER_CLI_VERSION_PATCH = {};",
cli_version.major, cli_version.minor, cli_version.patch,
)
.expect("Failed to write tree-sitter version to node's stdin");
let javascript_code = include_bytes!("./dsl.js");
node_stdin
.write(javascript_code)
.expect("Failed to write grammar dsl to node's stdin");
drop(node_stdin);
let output = node_process
.wait_with_output()
.expect("Failed to read output from node");
match output.status.code() {
None => panic!("Node process was killed"),
Some(0) => {}
Some(code) => return Err(anyhow!("Node process exited with status {}", code)),
}
let mut result = String::from_utf8(output.stdout).expect("Got invalid UTF8 from node");
result.push('\n');
Ok(result)
}
fn write_file(path: &Path, body: impl AsRef<[u8]>) -> Result<()> {
fs::write(path, body)
.with_context(|| format!("Failed to write {:?}", path.file_name().unwrap()))
}
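As a rough usage sketch (assuming it is called from inside this crate, with `generate_parser_for_grammar` from the module above in scope), generating a parser for a trivial grammar could look like this:

use anyhow::Result;

fn generate_tiny_parser() -> Result<()> {
    // A deliberately tiny, invented grammar: one rule matching the literal "hello".
    let grammar_json = r#"{
        "name": "tiny",
        "rules": {
            "source_file": { "type": "STRING", "value": "hello" }
        }
    }"#;
    // Strips `//` comments, parses and prepares the grammar, then renders the
    // C source for the parser at the current tree_sitter::LANGUAGE_VERSION.
    let (name, c_code) = generate_parser_for_grammar(grammar_json)?;
    assert_eq!(name, "tiny");
    println!("generated {} bytes of parser.c for `{}`", c_code.len(), name);
    Ok(())
}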

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,222 @@
use super::grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType};
use super::rules::{Precedence, Rule};
use anyhow::{anyhow, Result};
use serde::Deserialize;
use serde_json::{Map, Value};
#[derive(Deserialize)]
#[serde(tag = "type")]
#[allow(non_camel_case_types)]
enum RuleJSON {
ALIAS {
content: Box<RuleJSON>,
named: bool,
value: String,
},
BLANK,
STRING {
value: String,
},
PATTERN {
value: String,
},
SYMBOL {
name: String,
},
CHOICE {
members: Vec<RuleJSON>,
},
FIELD {
name: String,
content: Box<RuleJSON>,
},
SEQ {
members: Vec<RuleJSON>,
},
REPEAT {
content: Box<RuleJSON>,
},
REPEAT1 {
content: Box<RuleJSON>,
},
PREC_DYNAMIC {
value: i32,
content: Box<RuleJSON>,
},
PREC_LEFT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC_RIGHT {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
PREC {
value: PrecedenceValueJSON,
content: Box<RuleJSON>,
},
TOKEN {
content: Box<RuleJSON>,
},
IMMEDIATE_TOKEN {
content: Box<RuleJSON>,
},
}
#[derive(Deserialize)]
#[serde(untagged)]
enum PrecedenceValueJSON {
Integer(i32),
Name(String),
}
#[derive(Deserialize)]
pub(crate) struct GrammarJSON {
pub(crate) name: String,
rules: Map<String, Value>,
#[serde(default)]
precedences: Vec<Vec<RuleJSON>>,
#[serde(default)]
conflicts: Vec<Vec<String>>,
#[serde(default)]
externals: Vec<RuleJSON>,
#[serde(default)]
extras: Vec<RuleJSON>,
#[serde(default)]
inline: Vec<String>,
#[serde(default)]
supertypes: Vec<String>,
word: Option<String>,
}
pub(crate) fn parse_grammar(input: &str) -> Result<InputGrammar> {
let grammar_json: GrammarJSON = serde_json::from_str(&input)?;
let mut variables = Vec::with_capacity(grammar_json.rules.len());
for (name, value) in grammar_json.rules {
variables.push(Variable {
name: name.to_owned(),
kind: VariableType::Named,
rule: parse_rule(serde_json::from_value(value)?),
})
}
let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
for list in grammar_json.precedences {
let mut ordering = Vec::with_capacity(list.len());
for entry in list {
ordering.push(match entry {
RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
_ => {
return Err(anyhow!(
"Invalid rule in precedences array. Only strings and symbols are allowed"
))
}
})
}
precedence_orderings.push(ordering);
}
let extra_symbols = grammar_json.extras.into_iter().map(parse_rule).collect();
let external_tokens = grammar_json.externals.into_iter().map(parse_rule).collect();
Ok(InputGrammar {
name: grammar_json.name,
word_token: grammar_json.word,
expected_conflicts: grammar_json.conflicts,
supertype_symbols: grammar_json.supertypes,
variables_to_inline: grammar_json.inline,
precedence_orderings,
variables,
extra_symbols,
external_tokens,
})
}
fn parse_rule(json: RuleJSON) -> Rule {
match json {
RuleJSON::ALIAS {
content,
value,
named,
} => Rule::alias(parse_rule(*content), value, named),
RuleJSON::BLANK => Rule::Blank,
RuleJSON::STRING { value } => Rule::String(value),
RuleJSON::PATTERN { value } => Rule::Pattern(value),
RuleJSON::SYMBOL { name } => Rule::NamedSymbol(name),
RuleJSON::CHOICE { members } => Rule::choice(members.into_iter().map(parse_rule).collect()),
RuleJSON::FIELD { content, name } => Rule::field(name, parse_rule(*content)),
RuleJSON::SEQ { members } => Rule::seq(members.into_iter().map(parse_rule).collect()),
RuleJSON::REPEAT1 { content } => Rule::repeat(parse_rule(*content)),
RuleJSON::REPEAT { content } => {
Rule::choice(vec![Rule::repeat(parse_rule(*content)), Rule::Blank])
}
RuleJSON::PREC { value, content } => Rule::prec(value.into(), parse_rule(*content)),
RuleJSON::PREC_LEFT { value, content } => {
Rule::prec_left(value.into(), parse_rule(*content))
}
RuleJSON::PREC_RIGHT { value, content } => {
Rule::prec_right(value.into(), parse_rule(*content))
}
RuleJSON::PREC_DYNAMIC { value, content } => {
Rule::prec_dynamic(value, parse_rule(*content))
}
RuleJSON::TOKEN { content } => Rule::token(parse_rule(*content)),
RuleJSON::IMMEDIATE_TOKEN { content } => Rule::immediate_token(parse_rule(*content)),
}
}
impl Into<Precedence> for PrecedenceValueJSON {
fn into(self) -> Precedence {
match self {
PrecedenceValueJSON::Integer(i) => Precedence::Integer(i),
PrecedenceValueJSON::Name(i) => Precedence::Name(i),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_grammar() {
let grammar = parse_grammar(
r#"{
"name": "my_lang",
"rules": {
"file": {
"type": "REPEAT1",
"content": {
"type": "SYMBOL",
"name": "statement"
}
},
"statement": {
"type": "STRING",
"value": "foo"
}
}
}"#,
)
.unwrap();
assert_eq!(grammar.name, "my_lang");
assert_eq!(
grammar.variables,
vec![
Variable {
name: "file".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::NamedSymbol("statement".to_string()))
},
Variable {
name: "statement".to_string(),
kind: VariableType::Named,
rule: Rule::String("foo".to_string())
},
]
);
}
}

View File

@ -0,0 +1,289 @@
use super::ExtractedSyntaxGrammar;
use crate::generate::grammars::{Variable, VariableType};
use crate::generate::rules::{Rule, Symbol};
use std::collections::HashMap;
use std::mem;
struct Expander {
variable_name: String,
repeat_count_in_variable: usize,
preceding_symbol_count: usize,
auxiliary_variables: Vec<Variable>,
existing_repeats: HashMap<Rule, Symbol>,
}
impl Expander {
fn expand_variable(&mut self, index: usize, variable: &mut Variable) -> bool {
self.variable_name.clear();
self.variable_name.push_str(&variable.name);
self.repeat_count_in_variable = 0;
let mut rule = Rule::Blank;
mem::swap(&mut rule, &mut variable.rule);
// In the special case of a hidden variable with a repetition at its top level,
// convert that rule itself into a binary tree structure instead of introducing
// another auxiliary rule.
if let (VariableType::Hidden, Rule::Repeat(repeated_content)) = (variable.kind, &rule) {
let inner_rule = self.expand_rule(&repeated_content);
variable.rule = self.wrap_rule_in_binary_tree(Symbol::non_terminal(index), inner_rule);
variable.kind = VariableType::Auxiliary;
return true;
}
variable.rule = self.expand_rule(&rule);
false
}
fn expand_rule(&mut self, rule: &Rule) -> Rule {
match rule {
// For choices, sequences, and metadata, descend into the child rules,
// replacing any nested repetitions.
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|element| self.expand_rule(element))
.collect(),
),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|element| self.expand_rule(element))
.collect(),
),
Rule::Metadata { rule, params } => Rule::Metadata {
rule: Box::new(self.expand_rule(rule)),
params: params.clone(),
},
// For repetitions, introduce an auxiliary rule that contains the
// repeated content, but can also contain a recursive binary tree structure.
Rule::Repeat(content) => {
let inner_rule = self.expand_rule(content);
if let Some(existing_symbol) = self.existing_repeats.get(&inner_rule) {
return Rule::Symbol(*existing_symbol);
}
self.repeat_count_in_variable += 1;
let rule_name = format!(
"{}_repeat{}",
self.variable_name, self.repeat_count_in_variable
);
let repeat_symbol = Symbol::non_terminal(
self.preceding_symbol_count + self.auxiliary_variables.len(),
);
self.existing_repeats
.insert(inner_rule.clone(), repeat_symbol);
self.auxiliary_variables.push(Variable {
name: rule_name,
kind: VariableType::Auxiliary,
rule: self.wrap_rule_in_binary_tree(repeat_symbol, inner_rule),
});
Rule::Symbol(repeat_symbol)
}
// For primitive rules, don't change anything.
_ => rule.clone(),
}
}
fn wrap_rule_in_binary_tree(&self, symbol: Symbol, rule: Rule) -> Rule {
Rule::choice(vec![
Rule::Seq(vec![Rule::Symbol(symbol), Rule::Symbol(symbol)]),
rule,
])
}
}
pub(super) fn expand_repeats(mut grammar: ExtractedSyntaxGrammar) -> ExtractedSyntaxGrammar {
let mut expander = Expander {
variable_name: String::new(),
repeat_count_in_variable: 0,
preceding_symbol_count: grammar.variables.len(),
auxiliary_variables: Vec::new(),
existing_repeats: HashMap::new(),
};
for (i, mut variable) in grammar.variables.iter_mut().enumerate() {
let expanded_top_level_repetition = expander.expand_variable(i, &mut variable);
// If a hidden variable had a top-level repetition and it was converted to
// a recursive rule, then it can't be inlined.
if expanded_top_level_repetition {
grammar
.variables_to_inline
.retain(|symbol| *symbol != Symbol::non_terminal(i));
}
}
grammar
.variables
.extend(expander.auxiliary_variables.into_iter());
grammar
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_repeat_expansion() {
// Repeats nested inside of sequences and choices are expanded.
let grammar = expand_repeats(build_grammar(vec![Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::choice(vec![
Rule::repeat(Rule::terminal(11)),
Rule::repeat(Rule::terminal(12)),
]),
Rule::terminal(13),
]),
)]));
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),]),
Rule::terminal(13),
])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
Rule::terminal(11),
])
),
Variable::auxiliary(
"rule0_repeat2",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::terminal(12),
])
),
]
);
}
#[test]
fn test_repeat_deduplication() {
// Terminal 4 appears inside of a repeat in three different places.
let grammar = expand_repeats(build_grammar(vec![
Variable::named(
"rule0",
Rule::choice(vec![
Rule::seq(vec![Rule::terminal(1), Rule::repeat(Rule::terminal(4))]),
Rule::seq(vec![Rule::terminal(2), Rule::repeat(Rule::terminal(4))]),
]),
),
Variable::named(
"rule1",
Rule::seq(vec![Rule::terminal(3), Rule::repeat(Rule::terminal(4))]),
),
]));
// Only one auxiliary rule is created for repeating terminal 4.
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::choice(vec![
Rule::seq(vec![Rule::terminal(1), Rule::non_terminal(2)]),
Rule::seq(vec![Rule::terminal(2), Rule::non_terminal(2)]),
])
),
Variable::named(
"rule1",
Rule::seq(vec![Rule::terminal(3), Rule::non_terminal(2),])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::terminal(4),
])
)
]
);
}
#[test]
fn test_expansion_of_nested_repeats() {
let grammar = expand_repeats(build_grammar(vec![Variable::named(
"rule0",
Rule::seq(vec![
Rule::terminal(10),
Rule::repeat(Rule::seq(vec![
Rule::terminal(11),
Rule::repeat(Rule::terminal(12)),
])),
]),
)]));
assert_eq!(
grammar.variables,
vec![
Variable::named(
"rule0",
Rule::seq(vec![Rule::terminal(10), Rule::non_terminal(2),])
),
Variable::auxiliary(
"rule0_repeat1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1),]),
Rule::terminal(12),
])
),
Variable::auxiliary(
"rule0_repeat2",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2),]),
Rule::seq(vec![Rule::terminal(11), Rule::non_terminal(1),]),
])
),
]
);
}
#[test]
fn test_expansion_of_repeats_at_top_of_hidden_rules() {
let grammar = expand_repeats(build_grammar(vec![
Variable::named("rule0", Rule::non_terminal(1)),
Variable::hidden(
"_rule1",
Rule::repeat(Rule::choice(vec![Rule::terminal(11), Rule::terminal(12)])),
),
]));
assert_eq!(
grammar.variables,
vec![
Variable::named("rule0", Rule::non_terminal(1),),
Variable::auxiliary(
"_rule1",
Rule::choice(vec![
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(1)]),
Rule::terminal(11),
Rule::terminal(12),
]),
),
]
);
}
fn build_grammar(variables: Vec<Variable>) -> ExtractedSyntaxGrammar {
ExtractedSyntaxGrammar {
variables,
..Default::default()
}
}
}

View File

@ -0,0 +1,903 @@
use super::ExtractedLexicalGrammar;
use crate::generate::grammars::{LexicalGrammar, LexicalVariable};
use crate::generate::nfa::{CharacterSet, Nfa, NfaState};
use crate::generate::rules::{Precedence, Rule};
use anyhow::{anyhow, Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use regex_syntax::ast::{
parse, Ast, Class, ClassPerlKind, ClassSet, ClassSetBinaryOpKind, ClassSetItem,
ClassUnicodeKind, RepetitionKind, RepetitionRange,
};
use std::collections::HashMap;
use std::i32;
lazy_static! {
static ref CURLY_BRACE_REGEX: Regex =
Regex::new(r#"(^|[^\\pP])\{([^}]*[^0-9A-Fa-f,}][^}]*)\}"#).unwrap();
static ref UNICODE_CATEGORIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_CATEGORIES_JSON).unwrap();
static ref UNICODE_PROPERTIES: HashMap<&'static str, Vec<u32>> =
serde_json::from_str(UNICODE_PROPERTIES_JSON).unwrap();
static ref UNICODE_CATEGORY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_CATEGORY_ALIASES_JSON).unwrap();
static ref UNICODE_PROPERTY_ALIASES: HashMap<&'static str, String> =
serde_json::from_str(UNICODE_PROPERTY_ALIASES_JSON).unwrap();
}
const UNICODE_CATEGORIES_JSON: &'static str = include_str!("./unicode-categories.json");
const UNICODE_PROPERTIES_JSON: &'static str = include_str!("./unicode-properties.json");
const UNICODE_CATEGORY_ALIASES_JSON: &'static str = include_str!("./unicode-category-aliases.json");
const UNICODE_PROPERTY_ALIASES_JSON: &'static str = include_str!("./unicode-property-aliases.json");
const ALLOWED_REDUNDANT_ESCAPED_CHARS: [char; 4] = ['!', '\'', '"', '/'];
struct NfaBuilder {
nfa: Nfa,
is_sep: bool,
precedence_stack: Vec<i32>,
}
fn get_implicit_precedence(rule: &Rule) -> i32 {
match rule {
Rule::String(_) => 2,
Rule::Metadata { rule, params } => {
if params.is_main_token {
get_implicit_precedence(rule) + 1
} else {
get_implicit_precedence(rule)
}
}
_ => 0,
}
}
fn get_completion_precedence(rule: &Rule) -> i32 {
if let Rule::Metadata { params, .. } = rule {
if let Precedence::Integer(p) = params.precedence {
return p;
}
}
0
}
fn preprocess_regex(content: &str) -> String {
let content = CURLY_BRACE_REGEX.replace(content, "$1\\{$2\\}");
let mut result = String::with_capacity(content.len());
let mut is_escaped = false;
for c in content.chars() {
if is_escaped {
if ALLOWED_REDUNDANT_ESCAPED_CHARS.contains(&c) {
result.push(c);
} else {
result.push('\\');
result.push(c);
}
is_escaped = false;
} else if c == '\\' {
is_escaped = true;
} else {
result.push(c);
}
}
if is_escaped {
result.push('\\');
}
result
}
pub(crate) fn expand_tokens(mut grammar: ExtractedLexicalGrammar) -> Result<LexicalGrammar> {
let mut builder = NfaBuilder {
nfa: Nfa::new(),
is_sep: true,
precedence_stack: vec![0],
};
let separator_rule = if grammar.separators.len() > 0 {
grammar.separators.push(Rule::Blank);
Rule::repeat(Rule::choice(grammar.separators))
} else {
Rule::Blank
};
let mut variables = Vec::new();
for (i, variable) in grammar.variables.into_iter().enumerate() {
let is_immediate_token = match &variable.rule {
Rule::Metadata { params, .. } => params.is_main_token,
_ => false,
};
builder.is_sep = false;
builder.nfa.states.push(NfaState::Accept {
variable_index: i,
precedence: get_completion_precedence(&variable.rule),
});
let last_state_id = builder.nfa.last_state_id();
builder
.expand_rule(&variable.rule, last_state_id)
.with_context(|| format!("Error processing rule {}", variable.name))?;
if !is_immediate_token {
builder.is_sep = true;
let last_state_id = builder.nfa.last_state_id();
builder.expand_rule(&separator_rule, last_state_id)?;
}
variables.push(LexicalVariable {
name: variable.name,
kind: variable.kind,
implicit_precedence: get_implicit_precedence(&variable.rule),
start_state: builder.nfa.last_state_id(),
});
}
Ok(LexicalGrammar {
nfa: builder.nfa,
variables,
})
}
impl NfaBuilder {
fn expand_rule(&mut self, rule: &Rule, mut next_state_id: u32) -> Result<bool> {
match rule {
Rule::Pattern(s) => {
let s = preprocess_regex(s);
let ast = parse::Parser::new().parse(&s)?;
self.expand_regex(&ast, next_state_id)
}
Rule::String(s) => {
for c in s.chars().rev() {
self.push_advance(CharacterSet::empty().add_char(c), next_state_id);
next_state_id = self.nfa.last_state_id();
}
Ok(s.len() > 0)
}
Rule::Choice(elements) => {
let mut alternative_state_ids = Vec::new();
for element in elements {
if self.expand_rule(element, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id);
}
Ok(true)
}
Rule::Seq(elements) => {
let mut result = false;
for element in elements.into_iter().rev() {
if self.expand_rule(element, next_state_id)? {
result = true;
}
next_state_id = self.nfa.last_state_id();
}
Ok(result)
}
Rule::Repeat(rule) => {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_rule(rule, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
Ok(false)
}
}
Rule::Metadata { rule, params } => {
let has_precedence = if let Precedence::Integer(precedence) = &params.precedence {
self.precedence_stack.push(*precedence);
true
} else {
false
};
let result = self.expand_rule(rule, next_state_id);
if has_precedence {
self.precedence_stack.pop();
}
result
}
Rule::Blank => Ok(false),
_ => Err(anyhow!("Grammar error: Unexpected rule {:?}", rule)),
}
}
fn expand_regex(&mut self, ast: &Ast, mut next_state_id: u32) -> Result<bool> {
match ast {
Ast::Empty(_) => Ok(false),
Ast::Flags(_) => Err(anyhow!("Regex error: Flags are not supported")),
Ast::Literal(literal) => {
self.push_advance(CharacterSet::from_char(literal.c), next_state_id);
Ok(true)
}
Ast::Dot(_) => {
self.push_advance(CharacterSet::from_char('\n').negate(), next_state_id);
Ok(true)
}
Ast::Assertion(_) => Err(anyhow!("Regex error: Assertions are not supported")),
Ast::Class(class) => match class {
Class::Unicode(class) => {
let mut chars = self.expand_unicode_character_class(&class.kind)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Perl(class) => {
let mut chars = self.expand_perl_character_class(&class.kind);
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
Class::Bracketed(class) => {
let mut chars = self.translate_class_set(&class.kind)?;
if class.negated {
chars = chars.negate();
}
self.push_advance(chars, next_state_id);
Ok(true)
}
},
Ast::Repetition(repetition) => match repetition.op.kind {
RepetitionKind::ZeroOrOne => {
self.expand_zero_or_one(&repetition.ast, next_state_id)
}
RepetitionKind::OneOrMore => {
self.expand_one_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::ZeroOrMore => {
self.expand_zero_or_more(&repetition.ast, next_state_id)
}
RepetitionKind::Range(RepetitionRange::Exactly(count)) => {
self.expand_count(&repetition.ast, count, next_state_id)
}
RepetitionKind::Range(RepetitionRange::AtLeast(min)) => {
if self.expand_zero_or_more(&repetition.ast, next_state_id)? {
self.expand_count(&repetition.ast, min, next_state_id)
} else {
Ok(false)
}
}
RepetitionKind::Range(RepetitionRange::Bounded(min, max)) => {
let mut result = self.expand_count(&repetition.ast, min, next_state_id)?;
for _ in min..max {
if result {
next_state_id = self.nfa.last_state_id();
}
if self.expand_zero_or_one(&repetition.ast, next_state_id)? {
result = true;
}
}
Ok(result)
}
},
Ast::Group(group) => self.expand_regex(&group.ast, next_state_id),
Ast::Alternation(alternation) => {
let mut alternative_state_ids = Vec::new();
for ast in alternation.asts.iter() {
if self.expand_regex(&ast, next_state_id)? {
alternative_state_ids.push(self.nfa.last_state_id());
} else {
alternative_state_ids.push(next_state_id);
}
}
alternative_state_ids.sort_unstable();
alternative_state_ids.dedup();
alternative_state_ids.retain(|i| *i != self.nfa.last_state_id());
for alternative_state_id in alternative_state_ids {
self.push_split(alternative_state_id);
}
Ok(true)
}
Ast::Concat(concat) => {
let mut result = false;
for ast in concat.asts.iter().rev() {
if self.expand_regex(&ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
}
}
fn translate_class_set(&self, class_set: &ClassSet) -> Result<CharacterSet> {
match &class_set {
ClassSet::Item(item) => self.expand_character_class(&item),
ClassSet::BinaryOp(binary_op) => {
let mut lhs_char_class = self.translate_class_set(&binary_op.lhs)?;
let mut rhs_char_class = self.translate_class_set(&binary_op.rhs)?;
match binary_op.kind {
ClassSetBinaryOpKind::Intersection => {
Ok(lhs_char_class.remove_intersection(&mut rhs_char_class))
}
ClassSetBinaryOpKind::Difference => {
Ok(lhs_char_class.difference(rhs_char_class))
}
ClassSetBinaryOpKind::SymmetricDifference => {
Ok(lhs_char_class.symmetric_difference(rhs_char_class))
}
}
}
}
}
fn expand_one_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
self.nfa.states.push(NfaState::Accept {
variable_index: 0,
precedence: 0,
}); // Placeholder for split
let split_state_id = self.nfa.last_state_id();
if self.expand_regex(&ast, split_state_id)? {
self.nfa.states[split_state_id as usize] =
NfaState::Split(self.nfa.last_state_id(), next_state_id);
Ok(true)
} else {
self.nfa.states.pop();
Ok(false)
}
}
fn expand_zero_or_one(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_regex(ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_zero_or_more(&mut self, ast: &Ast, next_state_id: u32) -> Result<bool> {
if self.expand_one_or_more(&ast, next_state_id)? {
self.push_split(next_state_id);
Ok(true)
} else {
Ok(false)
}
}
fn expand_count(&mut self, ast: &Ast, count: u32, mut next_state_id: u32) -> Result<bool> {
let mut result = false;
for _ in 0..count {
if self.expand_regex(ast, next_state_id)? {
result = true;
next_state_id = self.nfa.last_state_id();
}
}
Ok(result)
}
fn expand_character_class(&self, item: &ClassSetItem) -> Result<CharacterSet> {
match item {
ClassSetItem::Empty(_) => Ok(CharacterSet::empty()),
ClassSetItem::Literal(literal) => Ok(CharacterSet::from_char(literal.c)),
ClassSetItem::Range(range) => Ok(CharacterSet::from_range(range.start.c, range.end.c)),
ClassSetItem::Union(union) => {
let mut result = CharacterSet::empty();
for item in &union.items {
result = result.add(&self.expand_character_class(&item)?);
}
Ok(result)
}
ClassSetItem::Perl(class) => Ok(self.expand_perl_character_class(&class.kind)),
ClassSetItem::Unicode(class) => {
let mut set = self.expand_unicode_character_class(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
ClassSetItem::Bracketed(class) => {
let mut set = self.translate_class_set(&class.kind)?;
if class.negated {
set = set.negate();
}
Ok(set)
}
_ => Err(anyhow!(
"Regex error: Unsupported character class syntax {:?}",
item
)),
}
}
fn expand_unicode_character_class(&self, class: &ClassUnicodeKind) -> Result<CharacterSet> {
let mut chars = CharacterSet::empty();
let category_letter;
match class {
ClassUnicodeKind::OneLetter(le) => {
category_letter = le.to_string();
}
ClassUnicodeKind::Named(class_name) => {
let actual_class_name = UNICODE_CATEGORY_ALIASES
.get(class_name.as_str())
.or_else(|| UNICODE_PROPERTY_ALIASES.get(class_name.as_str()))
.unwrap_or(class_name);
if actual_class_name.len() == 1 {
category_letter = actual_class_name.clone();
} else {
let code_points = UNICODE_CATEGORIES
.get(actual_class_name.as_str())
.or_else(|| UNICODE_PROPERTIES.get(actual_class_name.as_str()))
.ok_or_else(|| {
anyhow!(
"Regex error: Unsupported unicode character class {}",
class_name
)
})?;
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
return Ok(chars);
}
}
ClassUnicodeKind::NamedValue { .. } => {
return Err(anyhow!(
"Regex error: Key-value unicode properties are not supported"
))
}
}
for (category, code_points) in UNICODE_CATEGORIES.iter() {
if category.starts_with(&category_letter) {
for c in code_points {
if let Some(c) = std::char::from_u32(*c) {
chars = chars.add_char(c);
}
}
}
}
Ok(chars)
}
fn expand_perl_character_class(&self, item: &ClassPerlKind) -> CharacterSet {
match item {
ClassPerlKind::Digit => CharacterSet::from_range('0', '9'),
ClassPerlKind::Space => CharacterSet::empty()
.add_char(' ')
.add_char('\t')
.add_char('\r')
.add_char('\n'),
ClassPerlKind::Word => CharacterSet::empty()
.add_char('_')
.add_range('A', 'Z')
.add_range('a', 'z')
.add_range('0', '9'),
}
}
fn push_advance(&mut self, chars: CharacterSet, state_id: u32) {
let precedence = *self.precedence_stack.last().unwrap();
self.nfa.states.push(NfaState::Advance {
chars,
state_id,
precedence,
is_sep: self.is_sep,
});
}
fn push_split(&mut self, state_id: u32) {
let last_state_id = self.nfa.last_state_id();
self.nfa
.states
.push(NfaState::Split(state_id, last_state_id));
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::Variable;
use crate::generate::nfa::{NfaCursor, NfaTransition};
fn simulate_nfa<'a>(grammar: &'a LexicalGrammar, s: &'a str) -> Option<(usize, &'a str)> {
let start_states = grammar.variables.iter().map(|v| v.start_state).collect();
let mut cursor = NfaCursor::new(&grammar.nfa, start_states);
let mut result = None;
let mut result_precedence = i32::MIN;
let mut start_char = 0;
let mut end_char = 0;
for c in s.chars() {
for (id, precedence) in cursor.completions() {
if result.is_none() || result_precedence <= precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = precedence;
}
}
if let Some(NfaTransition {
states,
is_separator,
..
}) = cursor
.transitions()
.into_iter()
.find(|t| t.characters.contains(c) && t.precedence >= result_precedence)
{
cursor.reset(states);
end_char += c.len_utf8();
if is_separator {
start_char = end_char;
}
} else {
break;
}
}
for (id, precedence) in cursor.completions() {
if result.is_none() || result_precedence <= precedence {
result = Some((id, &s[start_char..end_char]));
result_precedence = precedence;
}
}
result
}
#[test]
fn test_rule_expansion() {
struct Row {
rules: Vec<Rule>,
separators: Vec<Rule>,
examples: Vec<(&'static str, Option<(usize, &'static str)>)>,
}
let table = [
// regex with sequences and alternatives
Row {
rules: vec![Rule::pattern("(a|b|c)d(e|f|g)h?")],
separators: vec![],
examples: vec![
("ade1", Some((0, "ade"))),
("bdf1", Some((0, "bdf"))),
("bdfh1", Some((0, "bdfh"))),
("ad1", None),
],
},
// regex with repeats
Row {
rules: vec![Rule::pattern("a*")],
separators: vec![],
examples: vec![("aaa1", Some((0, "aaa"))), ("b", Some((0, "")))],
},
// regex with repeats in sequences
Row {
rules: vec![Rule::pattern("a((bc)+|(de)*)f")],
separators: vec![],
examples: vec![
("af1", Some((0, "af"))),
("adedef1", Some((0, "adedef"))),
("abcbcbcf1", Some((0, "abcbcbcf"))),
("a", None),
],
},
// regex with character ranges
Row {
rules: vec![Rule::pattern("[a-fA-F0-9]+")],
separators: vec![],
examples: vec![("A1ff0.", Some((0, "A1ff0")))],
},
// regex with perl character classes
Row {
rules: vec![Rule::pattern("\\w\\d\\s")],
separators: vec![],
examples: vec![("_0 ", Some((0, "_0 ")))],
},
// string
Row {
rules: vec![Rule::string("abc")],
separators: vec![],
examples: vec![("abcd", Some((0, "abc"))), ("ab", None)],
},
// complex rule containing strings and regexes
Row {
rules: vec![Rule::repeat(Rule::seq(vec![
Rule::string("{"),
Rule::pattern("[a-f]+"),
Rule::string("}"),
]))],
separators: vec![],
examples: vec![
("{a}{", Some((0, "{a}"))),
("{a}{d", Some((0, "{a}"))),
("ab", None),
],
},
// longest match rule
Row {
rules: vec![
Rule::pattern("a|bc"),
Rule::pattern("aa"),
Rule::pattern("bcd"),
],
separators: vec![],
examples: vec![
("a.", Some((0, "a"))),
("bc.", Some((0, "bc"))),
("aa.", Some((1, "aa"))),
("bcd?", Some((2, "bcd"))),
("b.", None),
("c.", None),
],
},
// regex with an alternative including the empty string
Row {
rules: vec![Rule::pattern("a(b|)+c")],
separators: vec![],
examples: vec![
("ac.", Some((0, "ac"))),
("abc.", Some((0, "abc"))),
("abbc.", Some((0, "abbc"))),
],
},
// separators
Row {
rules: vec![Rule::pattern("[a-f]+")],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
(" a", Some((0, "a"))),
(" \nb", Some((0, "b"))),
(" \\a", None),
(" \\\na", Some((0, "a"))),
],
},
// shorter tokens with higher precedence
Row {
rules: vec![
Rule::prec(Precedence::Integer(2), Rule::pattern("abc")),
Rule::prec(Precedence::Integer(1), Rule::pattern("ab[cd]e")),
Rule::pattern("[a-e]+"),
],
separators: vec![Rule::string("\\\n"), Rule::pattern("\\s")],
examples: vec![
("abceef", Some((0, "abc"))),
("abdeef", Some((1, "abde"))),
("aeeeef", Some((2, "aeeee"))),
],
},
// immediate tokens with higher precedence
Row {
rules: vec![
Rule::prec(Precedence::Integer(1), Rule::pattern("[^a]+")),
Rule::immediate_token(Rule::prec(
Precedence::Integer(2),
Rule::pattern("[^ab]+"),
)),
],
separators: vec![Rule::pattern("\\s")],
examples: vec![("cccb", Some((1, "ccc")))],
},
Row {
rules: vec![Rule::seq(vec![
Rule::string("a"),
Rule::choice(vec![Rule::string("b"), Rule::string("c")]),
Rule::string("d"),
])],
separators: vec![],
examples: vec![
("abd", Some((0, "abd"))),
("acd", Some((0, "acd"))),
("abc", None),
("ad", None),
("d", None),
("a", None),
],
},
// nested choices within sequences
Row {
rules: vec![Rule::seq(vec![
Rule::pattern("[0-9]+"),
Rule::choice(vec![
Rule::Blank,
Rule::choice(vec![Rule::seq(vec![
Rule::choice(vec![Rule::string("e"), Rule::string("E")]),
Rule::choice(vec![
Rule::Blank,
Rule::choice(vec![Rule::string("+"), Rule::string("-")]),
]),
Rule::pattern("[0-9]+"),
])]),
]),
])],
separators: vec![],
examples: vec![
("12", Some((0, "12"))),
("12e", Some((0, "12"))),
("12g", Some((0, "12"))),
("12e3", Some((0, "12e3"))),
("12e+", Some((0, "12"))),
("12E+34 +", Some((0, "12E+34"))),
("12e34", Some((0, "12e34"))),
],
},
// nested groups
Row {
rules: vec![Rule::seq(vec![Rule::pattern(r#"([^x\\]|\\(.|\n))+"#)])],
separators: vec![],
examples: vec![("abcx", Some((0, "abc"))), ("abc\\0x", Some((0, "abc\\0")))],
},
// allowing unrecognized escape sequences
Row {
rules: vec![
// Escaped forward slash (used in JS because '/' is the regex delimiter)
Rule::pattern(r#"\/"#),
// Escaped quotes
Rule::pattern(r#"\"\'"#),
// Quote preceded by a literal backslash
Rule::pattern(r#"[\\']+"#),
],
separators: vec![],
examples: vec![
("/", Some((0, "/"))),
("\"\'", Some((1, "\"\'"))),
(r#"'\'a"#, Some((2, r#"'\'"#))),
],
},
// unicode property escapes
Row {
rules: vec![
Rule::pattern(r#"\p{L}+\P{L}+"#),
Rule::pattern(r#"\p{White_Space}+\P{White_Space}+[\p{White_Space}]*"#),
],
separators: vec![],
examples: vec![
(" 123 abc", Some((1, " 123 "))),
("ბΨƁ___ƀƔ", Some((0, "ბΨƁ___"))),
],
},
// unicode property escapes in bracketed sets
Row {
rules: vec![Rule::pattern(r#"[\p{L}\p{Nd}]+"#)],
separators: vec![],
examples: vec![("abΨ12٣٣, ok", Some((0, "abΨ12٣٣")))],
},
// unicode character escapes
Row {
rules: vec![
Rule::pattern(r#"\u{00dc}"#),
Rule::pattern(r#"\U{000000dd}"#),
Rule::pattern(r#"\u00de"#),
Rule::pattern(r#"\U000000df"#),
],
separators: vec![],
examples: vec![
("\u{00dc}", Some((0, "\u{00dc}"))),
("\u{00dd}", Some((1, "\u{00dd}"))),
("\u{00de}", Some((2, "\u{00de}"))),
("\u{00df}", Some((3, "\u{00df}"))),
],
},
// allowing un-escaped curly braces
Row {
rules: vec![
// Un-escaped curly braces
Rule::pattern(r#"u{[0-9a-fA-F]+}"#),
// Already-escaped curly braces
Rule::pattern(r#"\{[ab]{3}\}"#),
// Unicode codepoints
Rule::pattern(r#"\u{1000A}"#),
// Unicode codepoints (lowercase)
Rule::pattern(r#"\u{1000b}"#),
],
separators: vec![],
examples: vec![
("u{1234} ok", Some((0, "u{1234}"))),
("{aba}}", Some((1, "{aba}"))),
("\u{1000A}", Some((2, "\u{1000A}"))),
("\u{1000b}", Some((3, "\u{1000b}"))),
],
},
// Emojis
Row {
rules: vec![Rule::pattern(r"\p{Emoji}+")],
separators: vec![],
examples: vec![
("🐎", Some((0, "🐎"))),
("🐴🐴", Some((0, "🐴🐴"))),
("#0", Some((0, "#0"))), // These chars are technically emojis!
("", None),
("", None),
("horse", None),
],
},
// Intersection
Row {
rules: vec![Rule::pattern(r"[[0-7]&&[4-9]]+")],
separators: vec![],
examples: vec![
("456", Some((0, "456"))),
("64", Some((0, "64"))),
("452", Some((0, "45"))),
("91", None),
("8", None),
("3", None),
],
},
// Difference
Row {
rules: vec![Rule::pattern(r"[[0-9]--[4-7]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Symmetric difference
Row {
rules: vec![Rule::pattern(r"[[0-7]~~[4-9]]+")],
separators: vec![],
examples: vec![
("123", Some((0, "123"))),
("83", Some((0, "83"))),
("9", Some((0, "9"))),
("124", Some((0, "12"))),
("67", None),
("4", None),
],
},
// Nested set operations
Row {
// Digits matched by each sub-pattern (0-9):
// [0-5]          -> 0 1 2 3 4 5
// [2-4]          -> 2 3 4
// [0-5]--[2-4]   -> 0 1 5
// [3-9]          -> 3 4 5 6 7 8 9
// [6-7]          -> 6 7
// [3-9]--[6-7]   -> 3 4 5 8 9
// final regex    -> 0 1 3 4 8 9
rules: vec![Rule::pattern(r"[[[0-5]--[2-4]]~~[[3-9]--[6-7]]]+")],
separators: vec![],
examples: vec![
("01", Some((0, "01"))),
("432", Some((0, "43"))),
("8", Some((0, "8"))),
("9", Some((0, "9"))),
("2", None),
("567", None),
],
},
];
for Row {
rules,
separators,
examples,
} in &table
{
let grammar = expand_tokens(ExtractedLexicalGrammar {
separators: separators.clone(),
variables: rules
.into_iter()
.map(|rule| Variable::named("", rule.clone()))
.collect(),
})
.unwrap();
for (haystack, needle) in examples.iter() {
assert_eq!(simulate_nfa(&grammar, haystack), *needle);
}
}
}
}

View File

@ -0,0 +1,304 @@
use crate::generate::grammars::{LexicalGrammar, SyntaxGrammar};
use crate::generate::rules::{Alias, AliasMap, Symbol, SymbolType};
#[derive(Clone, Default)]
struct SymbolStatus {
aliases: Vec<(Alias, usize)>,
appears_unaliased: bool,
}
// Update the grammar by finding symbols that are always aliased, and for each such symbol,
// promoting one of its aliases to a "default alias", which is applied globally instead
// of in a context-specific way.
//
// This has two benefits:
// * It reduces the overhead of storing production-specific alias info in the parse table.
// * Within an `ERROR` node, no context-specific aliases will be applied. This transformation
// ensures that the children of an `ERROR` node have symbols that are consistent with the
// way that they would appear in a valid syntax tree.
pub(super) fn extract_default_aliases(
syntax_grammar: &mut SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
) -> AliasMap {
let mut terminal_status_list = vec![SymbolStatus::default(); lexical_grammar.variables.len()];
let mut non_terminal_status_list =
vec![SymbolStatus::default(); syntax_grammar.variables.len()];
let mut external_status_list =
vec![SymbolStatus::default(); syntax_grammar.external_tokens.len()];
// For each grammar symbol, find all of the aliases under which the symbol appears,
// and determine whether or not the symbol ever appears *unaliased*.
for variable in syntax_grammar.variables.iter() {
for production in variable.productions.iter() {
for step in production.steps.iter() {
let mut status = match step.symbol.kind {
SymbolType::External => &mut external_status_list[step.symbol.index],
SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
SymbolType::End | SymbolType::EndOfNonTerminalExtra => {
panic!("Unexpected end token")
}
};
// Default aliases don't work for inlined variables.
if syntax_grammar.variables_to_inline.contains(&step.symbol) {
continue;
}
if let Some(alias) = &step.alias {
if let Some(count_for_alias) = status
.aliases
.iter_mut()
.find_map(|(a, count)| if a == alias { Some(count) } else { None })
{
*count_for_alias += 1;
} else {
status.aliases.push((alias.clone(), 1));
}
} else {
status.appears_unaliased = true;
}
}
}
}
for symbol in syntax_grammar.extra_symbols.iter() {
let mut status = match symbol.kind {
SymbolType::External => &mut external_status_list[symbol.index],
SymbolType::NonTerminal => &mut non_terminal_status_list[symbol.index],
SymbolType::Terminal => &mut terminal_status_list[symbol.index],
SymbolType::End | SymbolType::EndOfNonTerminalExtra => {
panic!("Unexpected end token")
}
};
status.appears_unaliased = true;
}
let symbols_with_statuses = (terminal_status_list
.iter_mut()
.enumerate()
.map(|(i, status)| (Symbol::terminal(i), status)))
.chain(
non_terminal_status_list
.iter_mut()
.enumerate()
.map(|(i, status)| (Symbol::non_terminal(i), status)),
)
.chain(
external_status_list
.iter_mut()
.enumerate()
.map(|(i, status)| (Symbol::external(i), status)),
);
// For each symbol that always appears aliased, find the alias that occurs most often,
// and designate that alias as the symbol's "default alias". Store all of these
// default aliases in a map that will be returned.
let mut result = AliasMap::new();
for (symbol, status) in symbols_with_statuses {
if status.appears_unaliased {
status.aliases.clear();
} else {
if let Some(default_entry) = status
.aliases
.iter()
.enumerate()
.max_by_key(|(i, (_, count))| (count, -(*i as i64)))
.map(|(_, entry)| entry.clone())
{
status.aliases.clear();
status.aliases.push(default_entry.clone());
result.insert(symbol, default_entry.0);
}
}
}
// Wherever a symbol is aliased as its default alias, remove the usage of the alias,
// because it will now be redundant.
let mut alias_positions_to_clear = Vec::new();
for variable in syntax_grammar.variables.iter_mut() {
alias_positions_to_clear.clear();
for (i, production) in variable.productions.iter().enumerate() {
for (j, step) in production.steps.iter().enumerate() {
let status = match step.symbol.kind {
SymbolType::External => &mut external_status_list[step.symbol.index],
SymbolType::NonTerminal => &mut non_terminal_status_list[step.symbol.index],
SymbolType::Terminal => &mut terminal_status_list[step.symbol.index],
SymbolType::End | SymbolType::EndOfNonTerminalExtra => {
panic!("Unexpected end token")
}
};
// If this step is aliased as the symbol's default alias, then remove that alias.
if step.alias.is_some()
&& step.alias.as_ref() == status.aliases.get(0).map(|t| &t.0)
{
let mut other_productions_must_use_this_alias_at_this_index = false;
for (other_i, other_production) in variable.productions.iter().enumerate() {
if other_i != i
&& other_production.steps.len() > j
&& other_production.steps[j].alias == step.alias
&& result.get(&other_production.steps[j].symbol) != step.alias.as_ref()
{
other_productions_must_use_this_alias_at_this_index = true;
break;
}
}
if !other_productions_must_use_this_alias_at_this_index {
alias_positions_to_clear.push((i, j));
}
}
}
}
for (production_index, step_index) in &alias_positions_to_clear {
variable.productions[*production_index].steps[*step_index].alias = None;
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::{
LexicalVariable, Production, ProductionStep, SyntaxVariable, VariableType,
};
use crate::generate::nfa::Nfa;
#[test]
fn test_extract_simple_aliases() {
let mut syntax_grammar = SyntaxGrammar {
variables: vec![
SyntaxVariable {
name: "v1".to_owned(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
ProductionStep::new(Symbol::terminal(2)).with_alias("a3", true),
ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
],
}],
},
SyntaxVariable {
name: "v2".to_owned(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
// Token 0 is always aliased as "a1".
ProductionStep::new(Symbol::terminal(0)).with_alias("a1", true),
// Token 1 is aliased within rule `v1` above, but not here.
ProductionStep::new(Symbol::terminal(1)),
// Token 2 is aliased differently here than in `v1`. The alias from
// `v1` should be promoted to the default alias, because `v1` appears
// first in the grammar.
ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
// Token 3 is also aliased differently here than in `v1`. In this case,
// this alias should be promoted to the default alias, because it is
// used a greater number of times (twice).
ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
ProductionStep::new(Symbol::terminal(3)).with_alias("a6", true),
],
}],
},
],
..Default::default()
};
let lexical_grammar = LexicalGrammar {
nfa: Nfa::new(),
variables: vec![
LexicalVariable {
name: "t0".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
LexicalVariable {
name: "t1".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
LexicalVariable {
name: "t2".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
LexicalVariable {
name: "t3".to_string(),
kind: VariableType::Anonymous,
implicit_precedence: 0,
start_state: 0,
},
],
};
let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);
assert_eq!(default_aliases.len(), 3);
assert_eq!(
default_aliases.get(&Symbol::terminal(0)),
Some(&Alias {
value: "a1".to_string(),
is_named: true,
})
);
assert_eq!(
default_aliases.get(&Symbol::terminal(2)),
Some(&Alias {
value: "a3".to_string(),
is_named: true,
})
);
assert_eq!(
default_aliases.get(&Symbol::terminal(3)),
Some(&Alias {
value: "a6".to_string(),
is_named: true,
})
);
assert_eq!(default_aliases.get(&Symbol::terminal(1)), None);
assert_eq!(
syntax_grammar.variables,
vec![
SyntaxVariable {
name: "v1".to_owned(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(0)),
ProductionStep::new(Symbol::terminal(1)).with_alias("a2", true),
ProductionStep::new(Symbol::terminal(2)),
ProductionStep::new(Symbol::terminal(3)).with_alias("a4", true),
],
},],
},
SyntaxVariable {
name: "v2".to_owned(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(0)),
ProductionStep::new(Symbol::terminal(1)),
ProductionStep::new(Symbol::terminal(2)).with_alias("a5", true),
ProductionStep::new(Symbol::terminal(3)),
ProductionStep::new(Symbol::terminal(3)),
],
},],
},
]
);
}
}

View File

@ -0,0 +1,499 @@
use super::{ExtractedLexicalGrammar, ExtractedSyntaxGrammar, InternedGrammar};
use crate::generate::grammars::{ExternalToken, Variable, VariableType};
use crate::generate::rules::{MetadataParams, Rule, Symbol, SymbolType};
use anyhow::{anyhow, Result};
use std::collections::HashMap;
use std::mem;
pub(super) fn extract_tokens(
mut grammar: InternedGrammar,
) -> Result<(ExtractedSyntaxGrammar, ExtractedLexicalGrammar)> {
let mut extractor = TokenExtractor {
current_variable_name: String::new(),
current_variable_token_count: 0,
extracted_variables: Vec::new(),
extracted_usage_counts: Vec::new(),
};
for mut variable in grammar.variables.iter_mut() {
extractor.extract_tokens_in_variable(&mut variable);
}
for mut variable in grammar.external_tokens.iter_mut() {
extractor.extract_tokens_in_variable(&mut variable);
}
let mut lexical_variables = Vec::with_capacity(extractor.extracted_variables.len());
for variable in extractor.extracted_variables {
lexical_variables.push(Variable {
name: variable.name,
kind: variable.kind,
rule: variable.rule,
});
}
// If a variable's entire rule was extracted as a token and that token didn't
// appear within any other rule, then remove that variable from the syntax
// grammar, giving its name to the token in the lexical grammar. Any symbols
// that pointed to that variable will need to be updated to point to the
// variable in the lexical grammar. Symbols that pointed to later variables
// will need to have their indices decremented.
let mut variables = Vec::new();
let mut symbol_replacer = SymbolReplacer {
replacements: HashMap::new(),
};
for (i, variable) in grammar.variables.into_iter().enumerate() {
if let Rule::Symbol(Symbol {
kind: SymbolType::Terminal,
index,
}) = variable.rule
{
if i > 0 && extractor.extracted_usage_counts[index] == 1 {
let mut lexical_variable = &mut lexical_variables[index];
lexical_variable.kind = variable.kind;
lexical_variable.name = variable.name;
symbol_replacer.replacements.insert(i, index);
continue;
}
}
variables.push(variable);
}
for variable in variables.iter_mut() {
variable.rule = symbol_replacer.replace_symbols_in_rule(&variable.rule);
}
let expected_conflicts = grammar
.expected_conflicts
.into_iter()
.map(|conflict| {
let mut result: Vec<_> = conflict
.iter()
.map(|symbol| symbol_replacer.replace_symbol(*symbol))
.collect();
result.sort_unstable();
result.dedup();
result
})
.collect();
let supertype_symbols = grammar
.supertype_symbols
.into_iter()
.map(|symbol| symbol_replacer.replace_symbol(symbol))
.collect();
let variables_to_inline = grammar
.variables_to_inline
.into_iter()
.map(|symbol| symbol_replacer.replace_symbol(symbol))
.collect();
let mut separators = Vec::new();
let mut extra_symbols = Vec::new();
for rule in grammar.extra_symbols {
if let Rule::Symbol(symbol) = rule {
extra_symbols.push(symbol_replacer.replace_symbol(symbol));
} else {
if let Some(index) = lexical_variables.iter().position(|v| v.rule == rule) {
extra_symbols.push(Symbol::terminal(index));
} else {
separators.push(rule);
}
}
}
let mut external_tokens = Vec::new();
for external_token in grammar.external_tokens {
let rule = symbol_replacer.replace_symbols_in_rule(&external_token.rule);
if let Rule::Symbol(symbol) = rule {
if symbol.is_non_terminal() {
return Err(anyhow!(
"Rule '{}' cannot be used as both an external token and a non-terminal rule",
&variables[symbol.index].name,
));
}
if symbol.is_external() {
external_tokens.push(ExternalToken {
name: external_token.name,
kind: external_token.kind,
corresponding_internal_token: None,
})
} else {
external_tokens.push(ExternalToken {
name: lexical_variables[symbol.index].name.clone(),
kind: external_token.kind,
corresponding_internal_token: Some(symbol),
})
}
} else {
return Err(anyhow!(
"Non-symbol rules cannot be used as external tokens"
));
}
}
let mut word_token = None;
if let Some(token) = grammar.word_token {
let token = symbol_replacer.replace_symbol(token);
if token.is_non_terminal() {
return Err(anyhow!(
"Non-terminal symbol '{}' cannot be used as the word token",
&variables[token.index].name
));
}
word_token = Some(token);
}
Ok((
ExtractedSyntaxGrammar {
variables,
expected_conflicts,
extra_symbols,
variables_to_inline,
supertype_symbols,
external_tokens,
word_token,
precedence_orderings: grammar.precedence_orderings,
},
ExtractedLexicalGrammar {
variables: lexical_variables,
separators,
},
))
}
struct TokenExtractor {
current_variable_name: String,
current_variable_token_count: usize,
extracted_variables: Vec<Variable>,
extracted_usage_counts: Vec<usize>,
}
struct SymbolReplacer {
replacements: HashMap<usize, usize>,
}
impl TokenExtractor {
fn extract_tokens_in_variable(&mut self, variable: &mut Variable) {
self.current_variable_name.clear();
self.current_variable_name.push_str(&variable.name);
self.current_variable_token_count = 0;
let mut rule = Rule::Blank;
mem::swap(&mut rule, &mut variable.rule);
variable.rule = self.extract_tokens_in_rule(&rule);
}
fn extract_tokens_in_rule(&mut self, input: &Rule) -> Rule {
match input {
Rule::String(name) => self.extract_token(input, Some(name)).into(),
Rule::Pattern(..) => self.extract_token(input, None).into(),
Rule::Metadata { params, rule } => {
if params.is_token {
let mut params = params.clone();
params.is_token = false;
let mut string_value = None;
if let Rule::String(value) = rule.as_ref() {
string_value = Some(value);
}
let rule_to_extract = if params == MetadataParams::default() {
rule.as_ref()
} else {
input
};
self.extract_token(rule_to_extract, string_value).into()
} else {
Rule::Metadata {
params: params.clone(),
rule: Box::new(self.extract_tokens_in_rule((&rule).clone())),
}
}
}
Rule::Repeat(content) => Rule::Repeat(Box::new(self.extract_tokens_in_rule(content))),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|e| self.extract_tokens_in_rule(e))
.collect(),
),
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|e| self.extract_tokens_in_rule(e))
.collect(),
),
_ => input.clone(),
}
}
fn extract_token(&mut self, rule: &Rule, string_value: Option<&String>) -> Symbol {
for (i, variable) in self.extracted_variables.iter_mut().enumerate() {
if variable.rule == *rule {
self.extracted_usage_counts[i] += 1;
return Symbol::terminal(i);
}
}
let index = self.extracted_variables.len();
let variable = if let Some(string_value) = string_value {
Variable {
name: string_value.clone(),
kind: VariableType::Anonymous,
rule: rule.clone(),
}
} else {
self.current_variable_token_count += 1;
Variable {
name: format!(
"{}_token{}",
&self.current_variable_name, self.current_variable_token_count
),
kind: VariableType::Auxiliary,
rule: rule.clone(),
}
};
self.extracted_variables.push(variable);
self.extracted_usage_counts.push(1);
Symbol::terminal(index)
}
}
impl SymbolReplacer {
fn replace_symbols_in_rule(&mut self, rule: &Rule) -> Rule {
match rule {
Rule::Symbol(symbol) => self.replace_symbol(*symbol).into(),
Rule::Choice(elements) => Rule::Choice(
elements
.iter()
.map(|e| self.replace_symbols_in_rule(e))
.collect(),
),
Rule::Seq(elements) => Rule::Seq(
elements
.iter()
.map(|e| self.replace_symbols_in_rule(e))
.collect(),
),
Rule::Repeat(content) => Rule::Repeat(Box::new(self.replace_symbols_in_rule(content))),
Rule::Metadata { rule, params } => Rule::Metadata {
params: params.clone(),
rule: Box::new(self.replace_symbols_in_rule(rule)),
},
_ => rule.clone(),
}
}
fn replace_symbol(&self, symbol: Symbol) -> Symbol {
if !symbol.is_non_terminal() {
return symbol;
}
if let Some(replacement) = self.replacements.get(&symbol.index) {
return Symbol::terminal(*replacement);
}
let mut adjusted_index = symbol.index;
for (replaced_index, _) in self.replacements.iter() {
if *replaced_index < symbol.index {
adjusted_index -= 1;
}
}
return Symbol::non_terminal(adjusted_index);
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::generate::grammars::VariableType;
#[test]
fn test_extraction() {
let (syntax_grammar, lexical_grammar) = extract_tokens(build_grammar(vec![
Variable::named(
"rule_0",
Rule::repeat(Rule::seq(vec![
Rule::string("a"),
Rule::pattern("b"),
Rule::choice(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
Rule::token(Rule::repeat(Rule::choice(vec![
Rule::string("c"),
Rule::string("d"),
]))),
]),
])),
),
Variable::named("rule_1", Rule::pattern("e")),
Variable::named("rule_2", Rule::pattern("b")),
Variable::named(
"rule_3",
Rule::seq(vec![Rule::non_terminal(2), Rule::Blank]),
),
]))
.unwrap();
assert_eq!(
syntax_grammar.variables,
vec![
Variable::named(
"rule_0",
Rule::repeat(Rule::seq(vec![
// The string "a" was replaced by a symbol referencing the lexical grammar
Rule::terminal(0),
// The pattern "b" was replaced by a symbol referencing the lexical grammar
Rule::terminal(1),
Rule::choice(vec![
// The symbol referencing `rule_1` was replaced by a symbol referencing
// the lexical grammar.
Rule::terminal(3),
// The symbol referencing `rule_2` had its index decremented because
// `rule_1` was moved to the lexical grammar.
Rule::non_terminal(1),
// The rule wrapped in `token` was replaced by a symbol referencing
// the lexical grammar.
Rule::terminal(2),
])
]))
),
            // The pattern "e" was only used in one place: as the definition of `rule_1`,
// so that rule was moved to the lexical grammar. The pattern "b" appeared in
// two places, so it was not moved into the lexical grammar.
Variable::named("rule_2", Rule::terminal(1)),
Variable::named(
"rule_3",
Rule::seq(vec![Rule::non_terminal(1), Rule::Blank,])
),
]
);
assert_eq!(
lexical_grammar.variables,
vec![
Variable::anonymous("a", Rule::string("a")),
Variable::auxiliary("rule_0_token1", Rule::pattern("b")),
Variable::auxiliary(
"rule_0_token2",
Rule::repeat(Rule::choice(vec![Rule::string("c"), Rule::string("d"),]))
),
Variable::named("rule_1", Rule::pattern("e")),
]
);
}
#[test]
fn test_start_rule_is_token() {
let (syntax_grammar, lexical_grammar) =
extract_tokens(build_grammar(vec![Variable::named(
"rule_0",
Rule::string("hello"),
)]))
.unwrap();
assert_eq!(
syntax_grammar.variables,
vec![Variable::named("rule_0", Rule::terminal(0)),]
);
assert_eq!(
lexical_grammar.variables,
vec![Variable::anonymous("hello", Rule::string("hello")),]
)
}
#[test]
fn test_extracting_extra_symbols() {
let mut grammar = build_grammar(vec![
Variable::named("rule_0", Rule::string("x")),
Variable::named("comment", Rule::pattern("//.*")),
]);
grammar.extra_symbols = vec![Rule::string(" "), Rule::non_terminal(1)];
let (syntax_grammar, lexical_grammar) = extract_tokens(grammar).unwrap();
assert_eq!(syntax_grammar.extra_symbols, vec![Symbol::terminal(1),]);
assert_eq!(lexical_grammar.separators, vec![Rule::string(" "),]);
}
#[test]
fn test_extract_externals() {
let mut grammar = build_grammar(vec![
Variable::named(
"rule_0",
Rule::seq(vec![
Rule::external(0),
Rule::string("a"),
Rule::non_terminal(1),
Rule::non_terminal(2),
]),
),
Variable::named("rule_1", Rule::string("b")),
Variable::named("rule_2", Rule::string("c")),
]);
grammar.external_tokens = vec![
Variable::named("external_0", Rule::external(0)),
Variable::anonymous("a", Rule::string("a")),
Variable::named("rule_2", Rule::non_terminal(2)),
];
let (syntax_grammar, _) = extract_tokens(grammar).unwrap();
assert_eq!(
syntax_grammar.external_tokens,
vec![
ExternalToken {
name: "external_0".to_string(),
kind: VariableType::Named,
corresponding_internal_token: None,
},
ExternalToken {
name: "a".to_string(),
kind: VariableType::Anonymous,
corresponding_internal_token: Some(Symbol::terminal(0)),
},
ExternalToken {
name: "rule_2".to_string(),
kind: VariableType::Named,
corresponding_internal_token: Some(Symbol::terminal(2)),
},
]
);
}
#[test]
fn test_error_on_external_with_same_name_as_non_terminal() {
let mut grammar = build_grammar(vec![
Variable::named(
"rule_0",
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
),
Variable::named(
"rule_1",
Rule::seq(vec![Rule::non_terminal(2), Rule::non_terminal(2)]),
),
Variable::named("rule_2", Rule::string("a")),
]);
grammar.external_tokens = vec![Variable::named("rule_1", Rule::non_terminal(1))];
match extract_tokens(grammar) {
Err(e) => {
assert_eq!(e.to_string(), "Rule 'rule_1' cannot be used as both an external token and a non-terminal rule");
}
_ => {
panic!("Expected an error but got no error");
}
}
}
fn build_grammar(variables: Vec<Variable>) -> InternedGrammar {
InternedGrammar {
variables,
..Default::default()
}
}
}

View File

@ -0,0 +1,420 @@
use super::ExtractedSyntaxGrammar;
use crate::generate::grammars::{
Production, ProductionStep, SyntaxGrammar, SyntaxVariable, Variable,
};
use crate::generate::rules::{Alias, Associativity, Precedence, Rule, Symbol};
use anyhow::{anyhow, Result};
struct RuleFlattener {
production: Production,
precedence_stack: Vec<Precedence>,
associativity_stack: Vec<Associativity>,
alias_stack: Vec<Alias>,
field_name_stack: Vec<String>,
}
impl RuleFlattener {
fn new() -> Self {
Self {
production: Production {
steps: Vec::new(),
dynamic_precedence: 0,
},
precedence_stack: Vec::new(),
associativity_stack: Vec::new(),
alias_stack: Vec::new(),
field_name_stack: Vec::new(),
}
}
fn flatten(mut self, rule: Rule) -> Production {
self.apply(rule, true);
self.production
}
fn apply(&mut self, rule: Rule, at_end: bool) -> bool {
match rule {
Rule::Seq(members) => {
let mut result = false;
let last_index = members.len() - 1;
for (i, member) in members.into_iter().enumerate() {
result |= self.apply(member, i == last_index && at_end);
}
result
}
Rule::Metadata { rule, params } => {
let mut has_precedence = false;
if !params.precedence.is_none() {
has_precedence = true;
self.precedence_stack.push(params.precedence);
}
let mut has_associativity = false;
if let Some(associativity) = params.associativity {
has_associativity = true;
self.associativity_stack.push(associativity);
}
let mut has_alias = false;
if let Some(alias) = params.alias {
has_alias = true;
self.alias_stack.push(alias);
}
let mut has_field_name = false;
if let Some(field_name) = params.field_name {
has_field_name = true;
self.field_name_stack.push(field_name);
}
if params.dynamic_precedence.abs() > self.production.dynamic_precedence.abs() {
self.production.dynamic_precedence = params.dynamic_precedence;
}
let did_push = self.apply(*rule, at_end);
if has_precedence {
self.precedence_stack.pop();
if did_push && !at_end {
self.production.steps.last_mut().unwrap().precedence = self
.precedence_stack
.last()
.cloned()
.unwrap_or(Precedence::None);
}
}
if has_associativity {
self.associativity_stack.pop();
if did_push && !at_end {
self.production.steps.last_mut().unwrap().associativity =
self.associativity_stack.last().cloned();
}
}
if has_alias {
self.alias_stack.pop();
}
if has_field_name {
self.field_name_stack.pop();
}
did_push
}
Rule::Symbol(symbol) => {
self.production.steps.push(ProductionStep {
symbol,
precedence: self
.precedence_stack
.last()
.cloned()
.unwrap_or(Precedence::None),
associativity: self.associativity_stack.last().cloned(),
alias: self.alias_stack.last().cloned(),
field_name: self.field_name_stack.last().cloned(),
});
true
}
_ => false,
}
}
}
fn extract_choices(rule: Rule) -> Vec<Rule> {
match rule {
Rule::Seq(elements) => {
let mut result = vec![Rule::Blank];
for element in elements {
let extraction = extract_choices(element);
let mut next_result = Vec::new();
for entry in result {
for extraction_entry in extraction.iter() {
next_result.push(Rule::Seq(vec![entry.clone(), extraction_entry.clone()]));
}
}
result = next_result;
}
result
}
Rule::Choice(elements) => {
let mut result = Vec::new();
for element in elements {
for rule in extract_choices(element) {
result.push(rule);
}
}
result
}
Rule::Metadata { rule, params } => extract_choices(*rule)
.into_iter()
.map(|rule| Rule::Metadata {
rule: Box::new(rule),
params: params.clone(),
})
.collect(),
_ => vec![rule],
}
}
fn flatten_variable(variable: Variable) -> Result<SyntaxVariable> {
let mut productions = Vec::new();
for rule in extract_choices(variable.rule) {
let production = RuleFlattener::new().flatten(rule);
if !productions.contains(&production) {
productions.push(production);
}
}
Ok(SyntaxVariable {
name: variable.name,
kind: variable.kind,
productions,
})
}
fn symbol_is_used(variables: &Vec<SyntaxVariable>, symbol: Symbol) -> bool {
for variable in variables {
for production in &variable.productions {
for step in &production.steps {
if step.symbol == symbol {
return true;
}
}
}
}
false
}
pub(super) fn flatten_grammar(grammar: ExtractedSyntaxGrammar) -> Result<SyntaxGrammar> {
let mut variables = Vec::new();
for variable in grammar.variables {
variables.push(flatten_variable(variable)?);
}
for (i, variable) in variables.iter().enumerate() {
for production in &variable.productions {
if production.steps.is_empty() && symbol_is_used(&variables, Symbol::non_terminal(i)) {
return Err(anyhow!(
"The rule `{}` matches the empty string.
Tree-sitter does not support syntactic rules that match the empty string
unless they are used only as the grammar's start rule.
",
variable.name
));
}
}
}
Ok(SyntaxGrammar {
extra_symbols: grammar.extra_symbols,
expected_conflicts: grammar.expected_conflicts,
variables_to_inline: grammar.variables_to_inline,
precedence_orderings: grammar.precedence_orderings,
external_tokens: grammar.external_tokens,
supertype_symbols: grammar.supertype_symbols,
word_token: grammar.word_token,
variables,
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::VariableType;
use crate::generate::rules::Symbol;
#[test]
fn test_flatten_grammar() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::non_terminal(1),
Rule::prec_left(
Precedence::Integer(101),
Rule::seq(vec![
Rule::non_terminal(2),
Rule::choice(vec![
Rule::prec_right(
Precedence::Integer(102),
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]),
),
Rule::non_terminal(7),
]),
})
.unwrap();
assert_eq!(
result.productions,
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(3))
.with_prec(Precedence::Integer(102), Some(Associativity::Right)),
ProductionStep::new(Symbol::non_terminal(4))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
]
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(5))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
]
},
]
);
}
#[test]
fn test_flatten_grammar_with_maximum_dynamic_precedence() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::non_terminal(1),
Rule::prec_dynamic(
101,
Rule::seq(vec![
Rule::non_terminal(2),
Rule::choice(vec![
Rule::prec_dynamic(
102,
Rule::seq(vec![Rule::non_terminal(3), Rule::non_terminal(4)]),
),
Rule::non_terminal(5),
]),
Rule::non_terminal(6),
]),
),
Rule::non_terminal(7),
]),
})
.unwrap();
assert_eq!(
result.productions,
vec![
Production {
dynamic_precedence: 102,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::non_terminal(3)),
ProductionStep::new(Symbol::non_terminal(4)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
],
},
Production {
dynamic_precedence: 101,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::non_terminal(5)),
ProductionStep::new(Symbol::non_terminal(6)),
ProductionStep::new(Symbol::non_terminal(7)),
],
},
]
);
}
#[test]
fn test_flatten_grammar_with_final_precedence() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::prec_left(
Precedence::Integer(101),
Rule::seq(vec![Rule::non_terminal(1), Rule::non_terminal(2)]),
),
})
.unwrap();
assert_eq!(
result.productions,
vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(1))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
ProductionStep::new(Symbol::non_terminal(2))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),
]
}]
);
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::prec_left(
Precedence::Integer(101),
Rule::seq(vec![Rule::non_terminal(1)]),
),
})
.unwrap();
assert_eq!(
result.productions,
vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::non_terminal(1))
.with_prec(Precedence::Integer(101), Some(Associativity::Left)),]
}]
);
}
#[test]
fn test_flatten_grammar_with_field_names() {
let result = flatten_variable(Variable {
name: "test".to_string(),
kind: VariableType::Named,
rule: Rule::seq(vec![
Rule::field("first-thing".to_string(), Rule::terminal(1)),
Rule::terminal(2),
Rule::choice(vec![
Rule::Blank,
Rule::field("second-thing".to_string(), Rule::terminal(3)),
]),
]),
})
.unwrap();
assert_eq!(
result.productions,
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(1)).with_field_name("first-thing"),
ProductionStep::new(Symbol::terminal(2))
]
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(1)).with_field_name("first-thing"),
ProductionStep::new(Symbol::terminal(2)),
ProductionStep::new(Symbol::terminal(3)).with_field_name("second-thing"),
]
},
]
);
}
}

View File

@ -0,0 +1,249 @@
use super::InternedGrammar;
use crate::generate::grammars::{InputGrammar, Variable, VariableType};
use crate::generate::rules::{Rule, Symbol};
use anyhow::{anyhow, Result};
pub(super) fn intern_symbols(grammar: &InputGrammar) -> Result<InternedGrammar> {
let interner = Interner { grammar };
if variable_type_for_name(&grammar.variables[0].name) == VariableType::Hidden {
return Err(anyhow!("A grammar's start rule must be visible."));
}
let mut variables = Vec::with_capacity(grammar.variables.len());
for variable in grammar.variables.iter() {
variables.push(Variable {
name: variable.name.clone(),
kind: variable_type_for_name(&variable.name),
rule: interner.intern_rule(&variable.rule)?,
});
}
let mut external_tokens = Vec::with_capacity(grammar.external_tokens.len());
for external_token in grammar.external_tokens.iter() {
let rule = interner.intern_rule(&external_token)?;
let (name, kind) = if let Rule::NamedSymbol(name) = external_token {
(name.clone(), variable_type_for_name(&name))
} else {
(String::new(), VariableType::Anonymous)
};
external_tokens.push(Variable { name, kind, rule });
}
let mut extra_symbols = Vec::with_capacity(grammar.extra_symbols.len());
for extra_token in grammar.extra_symbols.iter() {
extra_symbols.push(interner.intern_rule(extra_token)?);
}
let mut supertype_symbols = Vec::with_capacity(grammar.supertype_symbols.len());
for supertype_symbol_name in grammar.supertype_symbols.iter() {
supertype_symbols.push(
interner
.intern_name(supertype_symbol_name)
.ok_or_else(|| anyhow!("Undefined symbol `{}`", supertype_symbol_name))?,
);
}
let mut expected_conflicts = Vec::new();
for conflict in grammar.expected_conflicts.iter() {
let mut interned_conflict = Vec::with_capacity(conflict.len());
for name in conflict {
interned_conflict.push(
interner
.intern_name(&name)
.ok_or_else(|| anyhow!("Undefined symbol `{}`", name))?,
);
}
expected_conflicts.push(interned_conflict);
}
let mut variables_to_inline = Vec::new();
for name in grammar.variables_to_inline.iter() {
if let Some(symbol) = interner.intern_name(&name) {
variables_to_inline.push(symbol);
}
}
let mut word_token = None;
if let Some(name) = grammar.word_token.as_ref() {
word_token = Some(
interner
.intern_name(&name)
.ok_or_else(|| anyhow!("Undefined symbol `{}`", &name))?,
);
}
for (i, variable) in variables.iter_mut().enumerate() {
if supertype_symbols.contains(&Symbol::non_terminal(i)) {
variable.kind = VariableType::Hidden;
}
}
Ok(InternedGrammar {
variables,
external_tokens,
extra_symbols,
expected_conflicts,
variables_to_inline,
supertype_symbols,
word_token,
precedence_orderings: grammar.precedence_orderings.clone(),
})
}
struct Interner<'a> {
grammar: &'a InputGrammar,
}
impl<'a> Interner<'a> {
fn intern_rule(&self, rule: &Rule) -> Result<Rule> {
match rule {
Rule::Choice(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Choice(result))
}
Rule::Seq(elements) => {
let mut result = Vec::with_capacity(elements.len());
for element in elements {
result.push(self.intern_rule(element)?);
}
Ok(Rule::Seq(result))
}
Rule::Repeat(content) => Ok(Rule::Repeat(Box::new(self.intern_rule(content)?))),
Rule::Metadata { rule, params } => Ok(Rule::Metadata {
rule: Box::new(self.intern_rule(rule)?),
params: params.clone(),
}),
Rule::NamedSymbol(name) => {
if let Some(symbol) = self.intern_name(&name) {
Ok(Rule::Symbol(symbol))
} else {
Err(anyhow!("Undefined symbol `{}`", name))
}
}
_ => Ok(rule.clone()),
}
}
fn intern_name(&self, symbol: &str) -> Option<Symbol> {
for (i, variable) in self.grammar.variables.iter().enumerate() {
if variable.name == symbol {
return Some(Symbol::non_terminal(i));
}
}
for (i, external_token) in self.grammar.external_tokens.iter().enumerate() {
if let Rule::NamedSymbol(name) = external_token {
if name == symbol {
return Some(Symbol::external(i));
}
}
}
return None;
}
}
fn variable_type_for_name(name: &str) -> VariableType {
if name.starts_with("_") {
VariableType::Hidden
} else {
VariableType::Named
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_repeat_expansion() {
let grammar = intern_symbols(&build_grammar(vec![
Variable::named("x", Rule::choice(vec![Rule::named("y"), Rule::named("_z")])),
Variable::named("y", Rule::named("_z")),
Variable::named("_z", Rule::string("a")),
]))
.unwrap();
assert_eq!(
grammar.variables,
vec![
Variable::named(
"x",
Rule::choice(vec![Rule::non_terminal(1), Rule::non_terminal(2),])
),
Variable::named("y", Rule::non_terminal(2)),
Variable::hidden("_z", Rule::string("a")),
]
);
}
#[test]
fn test_interning_external_token_names() {
// Variable `y` is both an internal and an external token.
// Variable `z` is just an external token.
let mut input_grammar = build_grammar(vec![
Variable::named(
"w",
Rule::choice(vec![Rule::named("x"), Rule::named("y"), Rule::named("z")]),
),
Variable::named("x", Rule::string("a")),
Variable::named("y", Rule::string("b")),
]);
input_grammar
.external_tokens
.extend(vec![Rule::named("y"), Rule::named("z")]);
let grammar = intern_symbols(&input_grammar).unwrap();
// Variable `y` is referred to by its internal index.
// Variable `z` is referred to by its external index.
assert_eq!(
grammar.variables,
vec![
Variable::named(
"w",
Rule::choice(vec![
Rule::non_terminal(1),
Rule::non_terminal(2),
Rule::external(1),
])
),
Variable::named("x", Rule::string("a")),
Variable::named("y", Rule::string("b")),
]
);
// The external token for `y` refers back to its internal index.
assert_eq!(
grammar.external_tokens,
vec![
Variable::named("y", Rule::non_terminal(2)),
Variable::named("z", Rule::external(1)),
]
);
}
#[test]
fn test_grammar_with_undefined_symbols() {
let result = intern_symbols(&build_grammar(vec![Variable::named("x", Rule::named("y"))]));
match result {
Err(e) => assert_eq!(e.to_string(), "Undefined symbol `y`"),
_ => panic!("Expected an error but got none"),
}
}
fn build_grammar(variables: Vec<Variable>) -> InputGrammar {
InputGrammar {
variables,
name: "the_language".to_string(),
..Default::default()
}
}
}

View File

@ -0,0 +1,256 @@
mod expand_repeats;
mod expand_tokens;
mod extract_default_aliases;
mod extract_tokens;
mod flatten_grammar;
mod intern_symbols;
mod process_inlines;
pub(crate) use self::expand_tokens::expand_tokens;
use self::expand_repeats::expand_repeats;
use self::extract_default_aliases::extract_default_aliases;
use self::extract_tokens::extract_tokens;
use self::flatten_grammar::flatten_grammar;
use self::intern_symbols::intern_symbols;
use self::process_inlines::process_inlines;
use super::grammars::{
ExternalToken, InlinedProductionMap, InputGrammar, LexicalGrammar, PrecedenceEntry,
SyntaxGrammar, Variable,
};
use super::rules::{AliasMap, Precedence, Rule, Symbol};
use anyhow::{anyhow, Result};
use std::{
cmp::Ordering,
collections::{hash_map, HashMap, HashSet},
mem,
};
pub(crate) struct IntermediateGrammar<T, U> {
variables: Vec<Variable>,
extra_symbols: Vec<T>,
expected_conflicts: Vec<Vec<Symbol>>,
precedence_orderings: Vec<Vec<PrecedenceEntry>>,
external_tokens: Vec<U>,
variables_to_inline: Vec<Symbol>,
supertype_symbols: Vec<Symbol>,
word_token: Option<Symbol>,
}
pub(crate) type InternedGrammar = IntermediateGrammar<Rule, Variable>;
pub(crate) type ExtractedSyntaxGrammar = IntermediateGrammar<Symbol, ExternalToken>;
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ExtractedLexicalGrammar {
pub variables: Vec<Variable>,
pub separators: Vec<Rule>,
}
impl<T, U> Default for IntermediateGrammar<T, U> {
fn default() -> Self {
Self {
variables: Default::default(),
extra_symbols: Default::default(),
expected_conflicts: Default::default(),
precedence_orderings: Default::default(),
external_tokens: Default::default(),
variables_to_inline: Default::default(),
supertype_symbols: Default::default(),
word_token: Default::default(),
}
}
}
/// Transform an input grammar into separate components that are ready
/// for parse table construction.
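///
/// The passes below run in order: validate the precedence orderings, intern
/// symbol names, split the grammar into syntax and lexical parts, expand
/// repeats, flatten rules into productions, expand tokens, extract default
/// aliases, and process inline rules.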
pub(crate) fn prepare_grammar(
input_grammar: &InputGrammar,
) -> Result<(
SyntaxGrammar,
LexicalGrammar,
InlinedProductionMap,
AliasMap,
)> {
validate_precedences(input_grammar)?;
let interned_grammar = intern_symbols(input_grammar)?;
let (syntax_grammar, lexical_grammar) = extract_tokens(interned_grammar)?;
let syntax_grammar = expand_repeats(syntax_grammar);
let mut syntax_grammar = flatten_grammar(syntax_grammar)?;
let lexical_grammar = expand_tokens(lexical_grammar)?;
let default_aliases = extract_default_aliases(&mut syntax_grammar, &lexical_grammar);
let inlines = process_inlines(&syntax_grammar, &lexical_grammar)?;
Ok((syntax_grammar, lexical_grammar, inlines, default_aliases))
}
/// Check that all of the named precedences used in the grammar are declared
/// within the `precedences` lists, and also that there are no conflicting
/// precedence orderings declared in those lists.
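///
/// For example, the orderings `['a', 'b']` and `['b', 'c', 'a']` conflict,
/// because the first places 'a' before 'b' while the second places 'b' before
/// 'a' (see `test_validate_precedences_with_conflicting_order` below).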
fn validate_precedences(grammar: &InputGrammar) -> Result<()> {
// For any two precedence names `a` and `b`, if `a` comes before `b`
// in some list, then it cannot come *after* `b` in any list.
let mut pairs = HashMap::new();
for list in &grammar.precedence_orderings {
for (i, mut entry1) in list.iter().enumerate() {
for mut entry2 in list.iter().skip(i + 1) {
if entry2 == entry1 {
continue;
}
let mut ordering = Ordering::Greater;
if entry1 > entry2 {
ordering = Ordering::Less;
mem::swap(&mut entry1, &mut entry2);
}
match pairs.entry((entry1, entry2)) {
hash_map::Entry::Vacant(e) => {
e.insert(ordering);
}
hash_map::Entry::Occupied(e) => {
if e.get() != &ordering {
return Err(anyhow!(
"Conflicting orderings for precedences {} and {}",
entry1,
entry2
));
}
}
}
}
}
}
// Check that no rule contains a named precedence that is not present in
// any of the `precedences` lists.
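    // For example, `prec.left('omg', ...)` is rejected when 'omg' never appears
    // in a `precedences` list (see the undeclared-precedence test below).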
fn validate(rule_name: &str, rule: &Rule, names: &HashSet<&String>) -> Result<()> {
match rule {
Rule::Repeat(rule) => validate(rule_name, rule, names),
Rule::Seq(elements) | Rule::Choice(elements) => elements
.iter()
.map(|e| validate(rule_name, e, names))
.collect(),
Rule::Metadata { rule, params } => {
if let Precedence::Name(n) = &params.precedence {
if !names.contains(n) {
return Err(anyhow!(
"Undeclared precedence '{}' in rule '{}'",
n,
rule_name
));
}
}
validate(rule_name, rule, names)?;
Ok(())
}
_ => Ok(()),
}
}
let precedence_names = grammar
.precedence_orderings
.iter()
.flat_map(|l| l.iter())
.filter_map(|p| {
if let PrecedenceEntry::Name(n) = p {
Some(n)
} else {
None
}
})
.collect::<HashSet<&String>>();
for variable in &grammar.variables {
validate(&variable.name, &variable.rule, &precedence_names)?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::{InputGrammar, Variable, VariableType};
#[test]
fn test_validate_precedences_with_undeclared_precedence() {
let grammar = InputGrammar {
precedence_orderings: vec![
vec![
PrecedenceEntry::Name("a".to_string()),
PrecedenceEntry::Name("b".to_string()),
],
vec![
PrecedenceEntry::Name("b".to_string()),
PrecedenceEntry::Name("c".to_string()),
PrecedenceEntry::Name("d".to_string()),
],
],
variables: vec![
Variable {
name: "v1".to_string(),
kind: VariableType::Named,
rule: Rule::Seq(vec![
Rule::prec_left(Precedence::Name("b".to_string()), Rule::string("w")),
Rule::prec(Precedence::Name("c".to_string()), Rule::string("x")),
]),
},
Variable {
name: "v2".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::Choice(vec![
Rule::prec_left(Precedence::Name("omg".to_string()), Rule::string("y")),
Rule::prec(Precedence::Name("c".to_string()), Rule::string("z")),
])),
},
],
..Default::default()
};
let result = validate_precedences(&grammar);
assert_eq!(
result.unwrap_err().to_string(),
"Undeclared precedence 'omg' in rule 'v2'",
);
}
#[test]
fn test_validate_precedences_with_conflicting_order() {
let grammar = InputGrammar {
precedence_orderings: vec![
vec![
PrecedenceEntry::Name("a".to_string()),
PrecedenceEntry::Name("b".to_string()),
],
vec![
PrecedenceEntry::Name("b".to_string()),
PrecedenceEntry::Name("c".to_string()),
PrecedenceEntry::Name("a".to_string()),
],
],
variables: vec![
Variable {
name: "v1".to_string(),
kind: VariableType::Named,
rule: Rule::Seq(vec![
Rule::prec_left(Precedence::Name("b".to_string()), Rule::string("w")),
Rule::prec(Precedence::Name("c".to_string()), Rule::string("x")),
]),
},
Variable {
name: "v2".to_string(),
kind: VariableType::Named,
rule: Rule::repeat(Rule::Choice(vec![
Rule::prec_left(Precedence::Name("a".to_string()), Rule::string("y")),
Rule::prec(Precedence::Name("c".to_string()), Rule::string("z")),
])),
},
],
..Default::default()
};
let result = validate_precedences(&grammar);
assert_eq!(
result.unwrap_err().to_string(),
"Conflicting orderings for precedences 'a' and 'b'",
);
}
}

View File

@ -0,0 +1,539 @@
use crate::generate::{
grammars::{InlinedProductionMap, LexicalGrammar, Production, ProductionStep, SyntaxGrammar},
rules::SymbolType,
};
use anyhow::{anyhow, Result};
use std::collections::HashMap;
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct ProductionStepId {
// A `None` value here means that the production itself was produced via inlining,
    // and is stored in the builder's `productions` vector, as opposed to being
// stored in one of the grammar's variables.
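    // (Such derived productions are appended by `inline_production_at_step`,
    // which also records their indices per original step.)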
variable_index: Option<usize>,
production_index: usize,
step_index: usize,
}
struct InlinedProductionMapBuilder {
production_indices_by_step_id: HashMap<ProductionStepId, Vec<usize>>,
productions: Vec<Production>,
}
impl InlinedProductionMapBuilder {
fn build<'a>(mut self, grammar: &'a SyntaxGrammar) -> InlinedProductionMap {
let mut step_ids_to_process = Vec::new();
for (variable_index, variable) in grammar.variables.iter().enumerate() {
for production_index in 0..variable.productions.len() {
step_ids_to_process.push(ProductionStepId {
variable_index: Some(variable_index),
production_index,
step_index: 0,
});
while !step_ids_to_process.is_empty() {
let mut i = 0;
while i < step_ids_to_process.len() {
let step_id = step_ids_to_process[i];
if let Some(step) = self.production_step_for_id(step_id, grammar) {
if grammar.variables_to_inline.contains(&step.symbol) {
let inlined_step_ids = self
.inline_production_at_step(step_id, grammar)
.into_iter()
.cloned()
.map(|production_index| ProductionStepId {
variable_index: None,
production_index,
step_index: step_id.step_index,
});
step_ids_to_process.splice(i..i + 1, inlined_step_ids);
} else {
step_ids_to_process[i] = ProductionStepId {
variable_index: step_id.variable_index,
production_index: step_id.production_index,
step_index: step_id.step_index + 1,
};
i += 1;
}
} else {
step_ids_to_process.remove(i);
}
}
}
}
}
let productions = self.productions;
let production_indices_by_step_id = self.production_indices_by_step_id;
let production_map = production_indices_by_step_id
.into_iter()
.map(|(step_id, production_indices)| {
let production = if let Some(variable_index) = step_id.variable_index {
&grammar.variables[variable_index].productions[step_id.production_index]
} else {
&productions[step_id.production_index]
} as *const Production;
((production, step_id.step_index as u32), production_indices)
})
.collect();
InlinedProductionMap {
productions,
production_map,
}
}
fn inline_production_at_step<'a>(
&'a mut self,
step_id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> &'a Vec<usize> {
// Build a list of productions produced by inlining rules.
let mut i = 0;
let step_index = step_id.step_index;
let mut productions_to_add = vec![self.production_for_id(step_id, grammar).clone()];
while i < productions_to_add.len() {
if let Some(step) = productions_to_add[i].steps.get(step_index) {
let symbol = step.symbol.clone();
if grammar.variables_to_inline.contains(&symbol) {
// Remove the production from the vector, replacing it with a placeholder.
let production = productions_to_add
.splice(i..i + 1, [Production::default()].iter().cloned())
.next()
.unwrap();
// Replace the placeholder with the inlined productions.
productions_to_add.splice(
i..i + 1,
grammar.variables[symbol.index].productions.iter().map(|p| {
let mut production = production.clone();
let removed_step = production
.steps
.splice(step_index..(step_index + 1), p.steps.iter().cloned())
.next()
.unwrap();
let inserted_steps =
&mut production.steps[step_index..(step_index + p.steps.len())];
if let Some(alias) = removed_step.alias {
for inserted_step in inserted_steps.iter_mut() {
inserted_step.alias = Some(alias.clone());
}
}
if let Some(field_name) = removed_step.field_name {
for inserted_step in inserted_steps.iter_mut() {
inserted_step.field_name = Some(field_name.clone());
}
}
if let Some(last_inserted_step) = inserted_steps.last_mut() {
if last_inserted_step.precedence.is_none() {
last_inserted_step.precedence = removed_step.precedence;
}
if last_inserted_step.associativity == None {
last_inserted_step.associativity = removed_step.associativity;
}
}
if p.dynamic_precedence.abs() > production.dynamic_precedence.abs() {
production.dynamic_precedence = p.dynamic_precedence;
}
production
}),
);
continue;
}
}
i += 1;
}
// Store all the computed productions.
let result = productions_to_add
.into_iter()
.map(|production| {
self.productions
.iter()
.position(|p| *p == production)
.unwrap_or({
self.productions.push(production);
self.productions.len() - 1
})
})
.collect();
// Cache these productions based on the original production step.
self.production_indices_by_step_id
.entry(step_id)
.or_insert(result)
}
fn production_for_id<'a>(
&'a self,
id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> &'a Production {
if let Some(variable_index) = id.variable_index {
&grammar.variables[variable_index].productions[id.production_index]
} else {
&self.productions[id.production_index]
}
}
fn production_step_for_id<'a>(
&'a self,
id: ProductionStepId,
grammar: &'a SyntaxGrammar,
) -> Option<&'a ProductionStep> {
self.production_for_id(id, grammar).steps.get(id.step_index)
}
}
pub(super) fn process_inlines(
grammar: &SyntaxGrammar,
lexical_grammar: &LexicalGrammar,
) -> Result<InlinedProductionMap> {
for symbol in &grammar.variables_to_inline {
match symbol.kind {
SymbolType::External => {
return Err(anyhow!(
"External token `{}` cannot be inlined",
grammar.external_tokens[symbol.index].name
))
}
SymbolType::Terminal => {
return Err(anyhow!(
"Token `{}` cannot be inlined",
lexical_grammar.variables[symbol.index].name,
))
}
_ => {}
}
}
Ok(InlinedProductionMapBuilder {
productions: Vec::new(),
production_indices_by_step_id: HashMap::new(),
}
.build(grammar))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::generate::grammars::{
LexicalVariable, ProductionStep, SyntaxVariable, VariableType,
};
use crate::generate::rules::{Associativity, Precedence, Symbol};
#[test]
fn test_basic_inlining() {
let grammar = SyntaxGrammar {
variables_to_inline: vec![Symbol::non_terminal(1)],
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(1)), // inlined
ProductionStep::new(Symbol::terminal(11)),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(12)),
ProductionStep::new(Symbol::terminal(13)),
],
},
Production {
dynamic_precedence: -2,
steps: vec![ProductionStep::new(Symbol::terminal(14))],
},
],
},
],
..Default::default()
};
let inline_map = process_inlines(&grammar, &Default::default()).unwrap();
// Nothing to inline at step 0.
assert!(inline_map
.inlined_productions(&grammar.variables[0].productions[0], 0)
.is_none());
// Inlining variable 1 yields two productions.
assert_eq!(
inline_map
.inlined_productions(&grammar.variables[0].productions[0], 1)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(12)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
],
},
Production {
dynamic_precedence: -2,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(14)),
ProductionStep::new(Symbol::terminal(11)),
],
},
]
);
}
#[test]
fn test_nested_inlining() {
let grammar = SyntaxGrammar {
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(1)), // inlined
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)), // inlined
ProductionStep::new(Symbol::terminal(12)),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![
Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(13))],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::non_terminal(3)), // inlined
ProductionStep::new(Symbol::terminal(14)),
],
},
],
},
SyntaxVariable {
name: "non-terminal-2".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(15))],
}],
},
SyntaxVariable {
name: "non-terminal-3".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(16))],
}],
},
],
variables_to_inline: vec![
Symbol::non_terminal(1),
Symbol::non_terminal(2),
Symbol::non_terminal(3),
],
..Default::default()
};
let inline_map = process_inlines(&grammar, &Default::default()).unwrap();
let productions: Vec<&Production> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 1)
.unwrap()
.collect();
assert_eq!(
productions.iter().cloned().cloned().collect::<Vec<_>>(),
vec![
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::terminal(12)),
],
},
Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(16)),
ProductionStep::new(Symbol::terminal(14)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::non_terminal(2)),
ProductionStep::new(Symbol::terminal(12)),
],
},
]
);
assert_eq!(
inline_map
.inlined_productions(productions[0], 3)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::terminal(13)),
ProductionStep::new(Symbol::terminal(11)),
ProductionStep::new(Symbol::terminal(15)),
ProductionStep::new(Symbol::terminal(12)),
],
},]
);
}
#[test]
fn test_inlining_with_precedence_and_alias() {
let grammar = SyntaxGrammar {
variables_to_inline: vec![Symbol::non_terminal(1), Symbol::non_terminal(2)],
variables: vec![
SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
// inlined
ProductionStep::new(Symbol::non_terminal(1))
.with_prec(Precedence::Integer(1), Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
// inlined
ProductionStep::new(Symbol::non_terminal(2))
.with_alias("outer_alias", true),
],
}],
},
SyntaxVariable {
name: "non-terminal-1".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(11))
.with_prec(Precedence::Integer(2), None)
.with_alias("inner_alias", true),
ProductionStep::new(Symbol::terminal(12)),
],
}],
},
SyntaxVariable {
name: "non-terminal-2".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(13))],
}],
},
],
..Default::default()
};
let inline_map = process_inlines(&grammar, &Default::default()).unwrap();
let productions: Vec<_> = inline_map
.inlined_productions(&grammar.variables[0].productions[0], 0)
.unwrap()
.collect();
assert_eq!(
productions.iter().cloned().cloned().collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
// The first step in the inlined production retains its precedence
// and alias.
ProductionStep::new(Symbol::terminal(11))
.with_prec(Precedence::Integer(2), None)
.with_alias("inner_alias", true),
// The final step of the inlined production inherits the precedence of
// the inlined step.
ProductionStep::new(Symbol::terminal(12))
.with_prec(Precedence::Integer(1), Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
ProductionStep::new(Symbol::non_terminal(2)).with_alias("outer_alias", true),
]
}],
);
assert_eq!(
inline_map
.inlined_productions(productions[0], 3)
.unwrap()
.cloned()
.collect::<Vec<_>>(),
vec![Production {
dynamic_precedence: 0,
steps: vec![
ProductionStep::new(Symbol::terminal(11))
.with_prec(Precedence::Integer(2), None)
.with_alias("inner_alias", true),
ProductionStep::new(Symbol::terminal(12))
.with_prec(Precedence::Integer(1), Some(Associativity::Left)),
ProductionStep::new(Symbol::terminal(10)),
// All steps of the inlined production inherit their alias from the
// inlined step.
ProductionStep::new(Symbol::terminal(13)).with_alias("outer_alias", true),
]
}],
);
}
#[test]
fn test_error_when_inlining_tokens() {
let lexical_grammar = LexicalGrammar {
variables: vec![LexicalVariable {
name: "something".to_string(),
kind: VariableType::Named,
implicit_precedence: 0,
start_state: 0,
}],
..Default::default()
};
let grammar = SyntaxGrammar {
variables_to_inline: vec![Symbol::terminal(0)],
variables: vec![SyntaxVariable {
name: "non-terminal-0".to_string(),
kind: VariableType::Named,
productions: vec![Production {
dynamic_precedence: 0,
steps: vec![ProductionStep::new(Symbol::terminal(0))],
}],
}],
..Default::default()
};
if let Err(error) = process_inlines(&grammar, &lexical_grammar) {
assert_eq!(error.to_string(), "Token `something` cannot be inlined");
} else {
panic!("expected an error, but got none");
}
}
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"Other":"C","Control":"Cc","cntrl":"Cc","Format":"Cf","Unassigned":"Cn","Private_Use":"Co","Surrogate":"Cs","Letter":"L","Cased_Letter":"LC","Lowercase_Letter":"Ll","Modifier_Letter":"Lm","Other_Letter":"Lo","Titlecase_Letter":"Lt","Uppercase_Letter":"Lu","Mark":"M","Combining_Mark":"M","Spacing_Mark":"Mc","Enclosing_Mark":"Me","Nonspacing_Mark":"Mn","Number":"N","Decimal_Number":"Nd","digit":"Nd","Letter_Number":"Nl","Other_Number":"No","Punctuation":"P","punct":"P","Connector_Punctuation":"Pc","Dash_Punctuation":"Pd","Close_Punctuation":"Pe","Final_Punctuation":"Pf","Initial_Punctuation":"Pi","Other_Punctuation":"Po","Open_Punctuation":"Ps","Symbol":"S","Currency_Symbol":"Sc","Modifier_Symbol":"Sk","Math_Symbol":"Sm","Other_Symbol":"So","Separator":"Z","Line_Separator":"Zl","Paragraph_Separator":"Zp","Space_Separator":"Zs"}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
{"cjkAccountingNumeric":"kAccountingNumeric","cjkOtherNumeric":"kOtherNumeric","cjkPrimaryNumeric":"kPrimaryNumeric","nv":"Numeric_Value","cf":"Case_Folding","cjkCompatibilityVariant":"kCompatibilityVariant","dm":"Decomposition_Mapping","FC_NFKC":"FC_NFKC_Closure","lc":"Lowercase_Mapping","NFKC_CF":"NFKC_Casefold","scf":"Simple_Case_Folding","sfc":"Simple_Case_Folding","slc":"Simple_Lowercase_Mapping","stc":"Simple_Titlecase_Mapping","suc":"Simple_Uppercase_Mapping","tc":"Titlecase_Mapping","uc":"Uppercase_Mapping","bmg":"Bidi_Mirroring_Glyph","bpb":"Bidi_Paired_Bracket","cjkIICore":"kIICore","cjkIRG_GSource":"kIRG_GSource","cjkIRG_HSource":"kIRG_HSource","cjkIRG_JSource":"kIRG_JSource","cjkIRG_KPSource":"kIRG_KPSource","cjkIRG_KSource":"kIRG_KSource","cjkIRG_MSource":"kIRG_MSource","cjkIRG_SSource":"kIRG_SSource","cjkIRG_TSource":"kIRG_TSource","cjkIRG_UKSource":"kIRG_UKSource","cjkIRG_USource":"kIRG_USource","cjkIRG_VSource":"kIRG_VSource","cjkRSUnicode":"kRSUnicode","Unicode_Radical_Stroke":"kRSUnicode","URS":"kRSUnicode","EqUIdeo":"Equivalent_Unified_Ideograph","isc":"ISO_Comment","JSN":"Jamo_Short_Name","na":"Name","na1":"Unicode_1_Name","Name_Alias":"Name_Alias","scx":"Script_Extensions","age":"Age","blk":"Block","sc":"Script","bc":"Bidi_Class","bpt":"Bidi_Paired_Bracket_Type","ccc":"Canonical_Combining_Class","dt":"Decomposition_Type","ea":"East_Asian_Width","gc":"General_Category","GCB":"Grapheme_Cluster_Break","hst":"Hangul_Syllable_Type","InPC":"Indic_Positional_Category","InSC":"Indic_Syllabic_Category","jg":"Joining_Group","jt":"Joining_Type","lb":"Line_Break","NFC_QC":"NFC_Quick_Check","NFD_QC":"NFD_Quick_Check","NFKC_QC":"NFKC_Quick_Check","NFKD_QC":"NFKD_Quick_Check","nt":"Numeric_Type","SB":"Sentence_Break","vo":"Vertical_Orientation","WB":"Word_Break","AHex":"ASCII_Hex_Digit","Alpha":"Alphabetic","Bidi_C":"Bidi_Control","Bidi_M":"Bidi_Mirrored","Cased":"Cased","CE":"Composition_Exclusion","CI":"Case_Ignorable","Comp_Ex":"Full_Composition_Exclusion","CWCF":"Changes_When_Casefolded","CWCM":"Changes_When_Casemapped","CWKCF":"Changes_When_NFKC_Casefolded","CWL":"Changes_When_Lowercased","CWT":"Changes_When_Titlecased","CWU":"Changes_When_Uppercased","Dash":"Dash","Dep":"Deprecated","DI":"Default_Ignorable_Code_Point","Dia":"Diacritic","EBase":"Emoji_Modifier_Base","EComp":"Emoji_Component","EMod":"Emoji_Modifier","Emoji":"Emoji","EPres":"Emoji_Presentation","Ext":"Extender","ExtPict":"Extended_Pictographic","Gr_Base":"Grapheme_Base","Gr_Ext":"Grapheme_Extend","Gr_Link":"Grapheme_Link","Hex":"Hex_Digit","Hyphen":"Hyphen","IDC":"ID_Continue","Ideo":"Ideographic","IDS":"ID_Start","IDSB":"IDS_Binary_Operator","IDST":"IDS_Trinary_Operator","Join_C":"Join_Control","LOE":"Logical_Order_Exception","Lower":"Lowercase","Math":"Math","NChar":"Noncharacter_Code_Point","OAlpha":"Other_Alphabetic","ODI":"Other_Default_Ignorable_Code_Point","OGr_Ext":"Other_Grapheme_Extend","OIDC":"Other_ID_Continue","OIDS":"Other_ID_Start","OLower":"Other_Lowercase","OMath":"Other_Math","OUpper":"Other_Uppercase","Pat_Syn":"Pattern_Syntax","Pat_WS":"Pattern_White_Space","PCM":"Prepended_Concatenation_Mark","QMark":"Quotation_Mark","Radical":"Radical","RI":"Regional_Indicator","SD":"Soft_Dotted","STerm":"Sentence_Terminal","Term":"Terminal_Punctuation","UIdeo":"Unified_Ideograph","Upper":"Uppercase","VS":"Variation_Selector","WSpace":"White_Space","space":"White_Space","XIDC":"XID_Continue","XIDS":"XID_Start","XO_NFC":"Expands_On_NFC","XO_NFD":"Expands_On_NFD","XO_NFKC":"Expands_On_NFKC","XO_NFKD":"Expands_On
_NFKD"}

File diff suppressed because it is too large

View File

@ -0,0 +1,480 @@
use super::grammars::VariableType;
use smallbitvec::SmallBitVec;
use std::iter::FromIterator;
use std::{collections::HashMap, fmt};
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum SymbolType {
External,
End,
EndOfNonTerminalExtra,
Terminal,
NonTerminal,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) enum Associativity {
Left,
Right,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Alias {
pub value: String,
pub is_named: bool,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum Precedence {
None,
Integer(i32),
Name(String),
}
pub(crate) type AliasMap = HashMap<Symbol, Alias>;
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash)]
pub(crate) struct MetadataParams {
pub precedence: Precedence,
pub dynamic_precedence: i32,
pub associativity: Option<Associativity>,
pub is_token: bool,
pub is_string: bool,
pub is_active: bool,
pub is_main_token: bool,
pub alias: Option<Alias>,
pub field_name: Option<String>,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub(crate) struct Symbol {
pub kind: SymbolType,
pub index: usize,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) enum Rule {
Blank,
String(String),
Pattern(String),
NamedSymbol(String),
Symbol(Symbol),
Choice(Vec<Rule>),
Metadata {
params: MetadataParams,
rule: Box<Rule>,
},
Repeat(Box<Rule>),
Seq(Vec<Rule>),
}
// Because tokens are represented as small (~400 max) unsigned integers,
// sets of tokens can be efficiently represented as bit vectors with each
// index corresponding to a token, and each value representing whether or not
// the token is present in the set.
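// For example, inserting `Symbol::terminal(5)` grows `terminal_bits` to at
// least six bits and sets bit 5; `contains` then just reads that bit back.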
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct TokenSet {
terminal_bits: SmallBitVec,
external_bits: SmallBitVec,
eof: bool,
end_of_nonterminal_extra: bool,
}
impl Rule {
pub fn field(name: String, content: Rule) -> Self {
add_metadata(content, move |params| {
params.field_name = Some(name);
})
}
pub fn alias(content: Rule, value: String, is_named: bool) -> Self {
add_metadata(content, move |params| {
params.alias = Some(Alias { is_named, value });
})
}
pub fn token(content: Rule) -> Self {
add_metadata(content, |params| {
params.is_token = true;
})
}
pub fn immediate_token(content: Rule) -> Self {
add_metadata(content, |params| {
params.is_token = true;
params.is_main_token = true;
})
}
pub fn prec(value: Precedence, content: Rule) -> Self {
add_metadata(content, |params| {
params.precedence = value;
})
}
pub fn prec_left(value: Precedence, content: Rule) -> Self {
add_metadata(content, |params| {
params.associativity = Some(Associativity::Left);
params.precedence = value;
})
}
pub fn prec_right(value: Precedence, content: Rule) -> Self {
add_metadata(content, |params| {
params.associativity = Some(Associativity::Right);
params.precedence = value;
})
}
pub fn prec_dynamic(value: i32, content: Rule) -> Self {
add_metadata(content, |params| {
params.dynamic_precedence = value;
})
}
pub fn repeat(rule: Rule) -> Self {
Rule::Repeat(Box::new(rule))
}
pub fn choice(rules: Vec<Rule>) -> Self {
let mut elements = Vec::with_capacity(rules.len());
for rule in rules {
choice_helper(&mut elements, rule);
}
Rule::Choice(elements)
}
pub fn seq(rules: Vec<Rule>) -> Self {
Rule::Seq(rules)
}
}
impl Alias {
pub fn kind(&self) -> VariableType {
if self.is_named {
VariableType::Named
} else {
VariableType::Anonymous
}
}
}
impl Precedence {
pub fn is_none(&self) -> bool {
matches!(self, Precedence::None)
}
}
#[cfg(test)]
impl Rule {
pub fn terminal(index: usize) -> Self {
Rule::Symbol(Symbol::terminal(index))
}
pub fn non_terminal(index: usize) -> Self {
Rule::Symbol(Symbol::non_terminal(index))
}
pub fn external(index: usize) -> Self {
Rule::Symbol(Symbol::external(index))
}
pub fn named(name: &'static str) -> Self {
Rule::NamedSymbol(name.to_string())
}
pub fn string(value: &'static str) -> Self {
Rule::String(value.to_string())
}
pub fn pattern(value: &'static str) -> Self {
Rule::Pattern(value.to_string())
}
}
impl Symbol {
pub fn is_terminal(&self) -> bool {
self.kind == SymbolType::Terminal
}
pub fn is_non_terminal(&self) -> bool {
self.kind == SymbolType::NonTerminal
}
pub fn is_external(&self) -> bool {
self.kind == SymbolType::External
}
pub fn is_eof(&self) -> bool {
self.kind == SymbolType::End
}
pub fn non_terminal(index: usize) -> Self {
Symbol {
kind: SymbolType::NonTerminal,
index,
}
}
pub fn terminal(index: usize) -> Self {
Symbol {
kind: SymbolType::Terminal,
index,
}
}
pub fn external(index: usize) -> Self {
Symbol {
kind: SymbolType::External,
index,
}
}
pub fn end() -> Self {
Symbol {
kind: SymbolType::End,
index: 0,
}
}
pub fn end_of_nonterminal_extra() -> Self {
Symbol {
kind: SymbolType::EndOfNonTerminalExtra,
index: 0,
}
}
}
impl From<Symbol> for Rule {
fn from(symbol: Symbol) -> Self {
Rule::Symbol(symbol)
}
}
impl TokenSet {
pub fn new() -> Self {
Self {
terminal_bits: SmallBitVec::new(),
external_bits: SmallBitVec::new(),
eof: false,
end_of_nonterminal_extra: false,
}
}
pub fn iter<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
self.terminal_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::terminal(i))
} else {
None
}
})
.chain(
self.external_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::external(i))
} else {
None
}
}),
)
.chain(if self.eof { Some(Symbol::end()) } else { None })
.chain(if self.end_of_nonterminal_extra {
Some(Symbol::end_of_nonterminal_extra())
} else {
None
})
}
pub fn terminals<'a>(&'a self) -> impl Iterator<Item = Symbol> + 'a {
self.terminal_bits
.iter()
.enumerate()
.filter_map(|(i, value)| {
if value {
Some(Symbol::terminal(i))
} else {
None
}
})
}
pub fn contains(&self, symbol: &Symbol) -> bool {
match symbol.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"),
SymbolType::Terminal => self.terminal_bits.get(symbol.index).unwrap_or(false),
SymbolType::External => self.external_bits.get(symbol.index).unwrap_or(false),
SymbolType::End => self.eof,
SymbolType::EndOfNonTerminalExtra => self.end_of_nonterminal_extra,
}
}
pub fn contains_terminal(&self, index: usize) -> bool {
self.terminal_bits.get(index).unwrap_or(false)
}
pub fn insert(&mut self, other: Symbol) {
let vec = match other.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"),
SymbolType::Terminal => &mut self.terminal_bits,
SymbolType::External => &mut self.external_bits,
SymbolType::End => {
self.eof = true;
return;
}
SymbolType::EndOfNonTerminalExtra => {
self.end_of_nonterminal_extra = true;
return;
}
};
if other.index >= vec.len() {
vec.resize(other.index + 1, false);
}
vec.set(other.index, true);
}
pub fn remove(&mut self, other: &Symbol) -> bool {
let vec = match other.kind {
SymbolType::NonTerminal => panic!("Cannot store non-terminals in a TokenSet"),
SymbolType::Terminal => &mut self.terminal_bits,
SymbolType::External => &mut self.external_bits,
SymbolType::End => {
return if self.eof {
self.eof = false;
true
} else {
false
}
}
SymbolType::EndOfNonTerminalExtra => {
return if self.end_of_nonterminal_extra {
self.end_of_nonterminal_extra = false;
true
} else {
false
};
}
};
if other.index < vec.len() && vec[other.index] {
vec.set(other.index, false);
return true;
}
false
}
pub fn is_empty(&self) -> bool {
!self.eof
&& !self.end_of_nonterminal_extra
&& !self.terminal_bits.iter().any(|a| a)
&& !self.external_bits.iter().any(|a| a)
}
pub fn insert_all_terminals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.terminal_bits.len() > self.terminal_bits.len() {
self.terminal_bits.resize(other.terminal_bits.len(), false);
}
for (i, element) in other.terminal_bits.iter().enumerate() {
if element {
result |= !self.terminal_bits[i];
self.terminal_bits.set(i, element);
}
}
result
}
fn insert_all_externals(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.external_bits.len() > self.external_bits.len() {
self.external_bits.resize(other.external_bits.len(), false);
}
for (i, element) in other.external_bits.iter().enumerate() {
if element {
result |= !self.external_bits[i];
self.external_bits.set(i, element);
}
}
result
}
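/// Union another `TokenSet` into this one, returning true if any new symbols were added.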
pub fn insert_all(&mut self, other: &TokenSet) -> bool {
let mut result = false;
if other.eof {
result |= !self.eof;
self.eof = true;
}
if other.end_of_nonterminal_extra {
result |= !self.end_of_nonterminal_extra;
self.end_of_nonterminal_extra = true;
}
result |= self.insert_all_terminals(other);
result |= self.insert_all_externals(other);
result
}
}
impl FromIterator<Symbol> for TokenSet {
fn from_iter<T: IntoIterator<Item = Symbol>>(iter: T) -> Self {
let mut result = Self::new();
for symbol in iter {
result.insert(symbol);
}
result
}
}
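/// Wrap a rule in a `Metadata` node, merging the new params into an existing non-token `Metadata` wrapper when possible.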
fn add_metadata<T: FnOnce(&mut MetadataParams)>(input: Rule, f: T) -> Rule {
match input {
Rule::Metadata { rule, mut params } if !params.is_token => {
f(&mut params);
Rule::Metadata { rule, params }
}
_ => {
let mut params = MetadataParams::default();
f(&mut params);
Rule::Metadata {
rule: Box::new(input),
params,
}
}
}
}
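/// Recursively flatten nested `Choice` rules into `result`, skipping duplicate alternatives.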
fn choice_helper(result: &mut Vec<Rule>, rule: Rule) {
match rule {
Rule::Choice(elements) => {
for element in elements {
choice_helper(result, element);
}
}
_ => {
if !result.contains(&rule) {
result.push(rule);
}
}
}
}
impl fmt::Display for Precedence {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Precedence::Integer(i) => write!(f, "{}", i),
Precedence::Name(s) => write!(f, "'{}'", s),
Precedence::None => write!(f, "none"),
}
}
}
impl Default for Precedence {
fn default() -> Self {
Precedence::None
}
}

View File

@ -0,0 +1,168 @@
use super::nfa::CharacterSet;
use super::rules::{Alias, Symbol, TokenSet};
use std::collections::BTreeMap;
pub(crate) type ProductionInfoId = usize;
pub(crate) type ParseStateId = usize;
pub(crate) type LexStateId = usize;
use std::hash::BuildHasherDefault;
use indexmap::IndexMap;
use rustc_hash::FxHasher;
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum ParseAction {
Accept,
Shift {
state: ParseStateId,
is_repetition: bool,
},
ShiftExtra,
Recover,
Reduce {
symbol: Symbol,
child_count: usize,
dynamic_precedence: i32,
production_id: ProductionInfoId,
},
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum GotoAction {
Goto(ParseStateId),
ShiftExtra,
}
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) struct ParseTableEntry {
pub actions: Vec<ParseAction>,
pub reusable: bool,
}
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub(crate) struct ParseState {
pub id: ParseStateId,
pub terminal_entries: IndexMap<Symbol, ParseTableEntry, BuildHasherDefault<FxHasher>>,
pub nonterminal_entries: IndexMap<Symbol, GotoAction, BuildHasherDefault<FxHasher>>,
pub lex_state_id: usize,
pub external_lex_state_id: usize,
pub core_id: usize,
}
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub(crate) struct FieldLocation {
pub index: usize,
pub inherited: bool,
}
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct ProductionInfo {
pub alias_sequence: Vec<Option<Alias>>,
pub field_map: BTreeMap<String, Vec<FieldLocation>>,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct ParseTable {
pub states: Vec<ParseState>,
pub symbols: Vec<Symbol>,
pub production_infos: Vec<ProductionInfo>,
pub max_aliased_production_length: usize,
pub external_lex_states: Vec<TokenSet>,
}
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct AdvanceAction {
pub state: LexStateId,
pub in_main_token: bool,
}
#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct LexState {
pub accept_action: Option<Symbol>,
pub eof_action: Option<AdvanceAction>,
pub advance_actions: Vec<(CharacterSet, AdvanceAction)>,
}
#[derive(Debug, PartialEq, Eq)]
pub(crate) struct LexTable {
pub states: Vec<LexState>,
}
impl ParseTableEntry {
pub fn new() -> Self {
Self {
reusable: true,
actions: Vec::new(),
}
}
}
impl Default for LexTable {
fn default() -> Self {
LexTable { states: Vec::new() }
}
}
impl ParseState {
pub fn is_end_of_non_terminal_extra(&self) -> bool {
self.terminal_entries
.contains_key(&Symbol::end_of_nonterminal_extra())
}
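/// Iterate over the ids of every parse state reachable from this one via a Shift action or a Goto entry.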
pub fn referenced_states<'a>(&'a self) -> impl Iterator<Item = ParseStateId> + 'a {
self.terminal_entries
.iter()
.flat_map(|(_, entry)| {
entry.actions.iter().filter_map(|action| match action {
ParseAction::Shift { state, .. } => Some(*state),
_ => None,
})
})
.chain(self.nonterminal_entries.iter().filter_map(|(_, action)| {
if let GotoAction::Goto(state) = action {
Some(*state)
} else {
None
}
}))
}
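/// Apply `f` to each referenced state id, rewriting any Shift or Goto target whose id changes.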
pub fn update_referenced_states<F>(&mut self, mut f: F)
where
F: FnMut(usize, &ParseState) -> usize,
{
let mut updates = Vec::new();
for (symbol, entry) in &self.terminal_entries {
for (i, action) in entry.actions.iter().enumerate() {
if let ParseAction::Shift { state, .. } = action {
let result = f(*state, self);
if result != *state {
updates.push((*symbol, i, result));
}
}
}
}
for (symbol, action) in &self.nonterminal_entries {
if let GotoAction::Goto(other_state) = action {
let result = f(*other_state, self);
if result != *other_state {
updates.push((*symbol, 0, result));
}
}
}
for (symbol, action_index, new_state) in updates {
if symbol.is_non_terminal() {
self.nonterminal_entries
.insert(symbol, GotoAction::Goto(new_state));
} else {
let entry = self.terminal_entries.get_mut(&symbol).unwrap();
if let ParseAction::Shift { is_repetition, .. } = entry.actions[action_index] {
entry.actions[action_index] = ParseAction::Shift {
state: new_state,
is_repetition,
};
}
}
}
}
}

View File

@ -0,0 +1,28 @@
#include "tree_sitter/parser.h"
#include <node.h>
#include "nan.h"
using namespace v8;
extern "C" TSLanguage * tree_sitter_PARSER_NAME();
namespace {
NAN_METHOD(New) {}
void Init(Local<Object> exports, Local<Object> module) {
Local<FunctionTemplate> tpl = Nan::New<FunctionTemplate>(New);
tpl->SetClassName(Nan::New("Language").ToLocalChecked());
tpl->InstanceTemplate()->SetInternalFieldCount(1);
Local<Function> constructor = Nan::GetFunction(tpl).ToLocalChecked();
Local<Object> instance = constructor->NewInstance(Nan::GetCurrentContext()).ToLocalChecked();
Nan::SetInternalFieldPointer(instance, 0, tree_sitter_PARSER_NAME());
Nan::Set(instance, Nan::New("name").ToLocalChecked(), Nan::New("PARSER_NAME").ToLocalChecked());
Nan::Set(module, Nan::New("exports").ToLocalChecked(), instance);
}
NODE_MODULE(tree_sitter_PARSER_NAME_binding, Init)
} // namespace

View File

@ -0,0 +1,19 @@
{
"targets": [
{
"target_name": "tree_sitter_PARSER_NAME_binding",
"include_dirs": [
"<!(node -e \"require('nan')\")",
"src"
],
"sources": [
"bindings/node/binding.cc",
"src/parser.c",
# If your language uses an external scanner, add it here.
],
"cflags_c": [
"-std=c99",
]
}
]
}

View File

@ -0,0 +1,40 @@
fn main() {
let src_dir = std::path::Path::new("src");
let mut c_config = cc::Build::new();
c_config.include(&src_dir);
c_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable")
.flag_if_supported("-Wno-trigraphs");
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
// If your language uses an external scanner written in C,
// then include this block of code:
/*
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
// If your language uses an external scanner written in C++,
// then include this block of code:
/*
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
cpp_config
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");
let scanner_path = src_dir.join("scanner.cc");
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
}

View File

@ -0,0 +1,26 @@
[package]
name = "tree-sitter-PARSER_NAME"
description = "PARSER_NAME grammar for the tree-sitter parsing library"
version = "0.0.1"
keywords = ["incremental", "parsing", "PARSER_NAME"]
categories = ["parsing", "text-editors"]
repository = "https://github.com/tree-sitter/tree-sitter-PARSER_NAME"
edition = "2018"
license = "MIT"
build = "bindings/rust/build.rs"
include = [
"bindings/rust/*",
"grammar.js",
"queries/*",
"src/*",
]
[lib]
path = "bindings/rust/lib.rs"
[dependencies]
tree-sitter = "~RUST_BINDING_VERSION"
[build-dependencies]
cc = "1.0"

View File

@ -0,0 +1,19 @@
try {
module.exports = require("../../build/Release/tree_sitter_PARSER_NAME_binding");
} catch (error1) {
if (error1.code !== 'MODULE_NOT_FOUND') {
throw error1;
}
try {
module.exports = require("../../build/Debug/tree_sitter_PARSER_NAME_binding");
} catch (error2) {
if (error2.code !== 'MODULE_NOT_FOUND') {
throw error2;
}
throw error1;
}
}
try {
module.exports.nodeTypeInfo = require("../../src/node-types.json");
} catch (_) {}

View File

@ -0,0 +1,52 @@
//! This crate provides PARSER_NAME language support for the [tree-sitter][] parsing library.
//!
//! Typically, you will use the [language][language func] function to add this language to a
//! tree-sitter [Parser][], and then use the parser to parse some code:
//!
//! ```
//! let code = "";
//! let mut parser = tree_sitter::Parser::new();
//! parser.set_language(tree_sitter_PARSER_NAME::language()).expect("Error loading PARSER_NAME grammar");
//! let tree = parser.parse(code, None).unwrap();
//! ```
//!
//! [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
//! [language func]: fn.language.html
//! [Parser]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Parser.html
//! [tree-sitter]: https://tree-sitter.github.io/
use tree_sitter::Language;
extern "C" {
fn tree_sitter_PARSER_NAME() -> Language;
}
/// Get the tree-sitter [Language][] for this grammar.
///
/// [Language]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Language.html
pub fn language() -> Language {
unsafe { tree_sitter_PARSER_NAME() }
}
/// The content of the [`node-types.json`][] file for this grammar.
///
/// [`node-types.json`]: https://tree-sitter.github.io/tree-sitter/using-parsers#static-node-types
pub const NODE_TYPES: &'static str = include_str!("../../src/node-types.json");
// Uncomment these to include any queries that this grammar contains
// pub const HIGHLIGHTS_QUERY: &'static str = include_str!("../../queries/highlights.scm");
// pub const INJECTIONS_QUERY: &'static str = include_str!("../../queries/injections.scm");
// pub const LOCALS_QUERY: &'static str = include_str!("../../queries/locals.scm");
// pub const TAGS_QUERY: &'static str = include_str!("../../queries/tags.scm");
#[cfg(test)]
mod tests {
#[test]
fn test_can_load_grammar() {
let mut parser = tree_sitter::Parser::new();
parser
.set_language(super::language())
.expect("Error loading PARSER_NAME language");
}
}

View File

@ -0,0 +1,19 @@
{
"name": "tree-sitter-PARSER_NAME",
"version": "0.0.1",
"description": "PARSER_NAME grammar for tree-sitter",
"main": "bindings/node",
"keywords": [
"parsing",
"incremental"
],
"dependencies": {
"nan": "^2.12.1"
},
"devDependencies": {
"tree-sitter-cli": "^CLI_VERSION"
},
"scripts": {
"test": "tree-sitter test"
}
}

Some files were not shown because too many files have changed in this diff.