diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index b7d810777e..a2e7ac8ce4 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -33,7 +33,9 @@ add_library(common dma/dma.cpp dma/gs.cpp formatter/formatter.cpp + formatter/formatting_rules.cpp formatter/formatter_tree.cpp + formatter/config/rule_config.cpp global_profiler/GlobalProfiler.cpp goos/Interpreter.cpp goos/Object.cpp diff --git a/common/formatter/config/rule_config.cpp b/common/formatter/config/rule_config.cpp new file mode 100644 index 0000000000..fe3cfed94c --- /dev/null +++ b/common/formatter/config/rule_config.cpp @@ -0,0 +1,14 @@ +#include "rule_config.h" + +namespace formatter { + +// TODO - these are eventually going to need a refactor too, my depth is based on the true +// tree-depth, in reality we want it to be relative to the forms defined here because imagine +// multiple `let`s in the same form, their depths will be different but you want them relatively +// formatted the same + +// TODO - populate these more + +const std::unordered_map>> opengoal_rules = + {{"defun", {std::make_shared(1)}}}; +} // namespace formatter diff --git a/common/formatter/config/rule_config.h b/common/formatter/config/rule_config.h new file mode 100644 index 0000000000..16f2c2a8a6 --- /dev/null +++ b/common/formatter/config/rule_config.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include +#include + +#include "common/formatter/formatting_rules.h" + +namespace formatter { +extern const std::unordered_map>> + opengoal_rules; +} diff --git a/common/formatter/formatter.cpp b/common/formatter/formatter.cpp index 079fc0fc55..03bb6f71a9 100644 --- a/common/formatter/formatter.cpp +++ b/common/formatter/formatter.cpp @@ -15,67 +15,72 @@ extern "C" { extern const TSLanguage* tree_sitter_opengoal(); } -std::string align_form(const std::string& form, int alignment_width) { - const auto lines = str_util::split(form); - std::string aligned_form = ""; - for (int i = 0; i < lines.size(); i++) { - aligned_form += str_util::repeat(alignment_width, " ") + lines.at(i); - if (i != lines.size() - 1) { - aligned_form += "\n"; - } - } - return aligned_form; -} +// TODO - incoporate some rules from zprint +// https://github.com/kkinnear/zprint/blob/main/doc/types/classic.md +// as well as maybe adjust the default rules to incorporate line length +// https://github.com/kkinnear/zprint/blob/main/doc/options/indent.md +// TODO - block comments seem to have an issue being parsed properly, also it basically needs the +// code for flexibly wrapping a block of code in configurable symbols (parens, block comment braces, +// etc) -std::string apply_formatting(const FormatterTree::Node& curr_node, +std::string apply_formatting(const FormatterTreeNode& curr_node, std::string output, int tree_depth = 0) { if (!curr_node.token && curr_node.refs.empty()) { return output; } std::string curr_form = ""; + // Print the token if (curr_node.token) { + // TODO - perhaps unneeded + curr_node.get_formatting_rule(tree_depth, -1) + ->indent_token(curr_form, curr_node, curr_node, tree_depth, -1); curr_form += curr_node.token.value(); return curr_form; } + // TODO - this might have some issues for non-list top level elements (ie. comments) if (!curr_node.metadata.is_root) { curr_form += "("; } + // Iterate the form for (int i = 0; i < curr_node.refs.size(); i++) { const auto& ref = curr_node.refs.at(i); - // TODO - abstract these into formatting rules - if (!curr_node.metadata.is_root && curr_node.metadata.multiple_elements_first_line) { - if (i > 1) { - // TODO - kinda unsafe - // Trim the current form before applying a new-line - curr_form = str_util::rtrim(curr_form) + "\n"; - if (ref.token) { - curr_form += str_util::repeat(curr_node.refs.at(0).token.value().length() + 2, " "); - } - } - } else if (!curr_node.metadata.is_root) { - if (i > 0) { - // Trim the current form before applying a new-line - curr_form = str_util::rtrim(curr_form) + "\n"; - curr_form += str_util::repeat(tree_depth, " "); - } + // Append a newline if relevant + if (!curr_node.metadata.is_root) { + curr_node.get_formatting_rule(tree_depth, i) + ->append_newline(curr_form, ref, curr_node, tree_depth, i); } + // Either print the element's token, or recursively format it as well if (ref.token) { - curr_form += ref.token.value() + " "; - } else { - if (!curr_node.metadata.is_root && curr_node.metadata.multiple_elements_first_line) { - // align returned form's lines with this forms lines - // TODO - kinda unsafe - curr_form += align_form(apply_formatting(ref, "", tree_depth + 1), - curr_node.refs.at(0).token.value().length() + 2); - } else { - curr_form += apply_formatting(ref, "", tree_depth + 1); + curr_node.get_formatting_rule(tree_depth, i) + ->indent_token(curr_form, ref, curr_node, 1, + i); // TODO depth hard-coded to 1, i think this can be removed, since + // forms are always done bottom-top recursively, they always act + // independently as if it was the shallowest depth + curr_form += ref.token.value(); + if (!curr_node.metadata.is_root) { + curr_form += " "; } + } else { + auto formatted_form = apply_formatting(ref, "", tree_depth + 1); + if (!curr_node.metadata.is_root) { + curr_node.get_formatting_rule(tree_depth, i) + ->align_form_lines(formatted_form, ref, curr_node); + } + curr_form += formatted_form; } + // Separate top-level elements with a new-line + // TODO - move this into a top-level rule + // TODO - temporary hack for top level comments, but need a better strategy for leaving them + // where they were originally if (curr_node.metadata.is_root && i < curr_node.refs.size() - 1) { - curr_form += "\n\n"; + curr_form += "\n"; + if (!ref.token || !str_util::starts_with(ref.token.value(), ";")) { + curr_form += "\n"; + } } } + // TODO - similar fear to issues as above if (!curr_node.metadata.is_root) { curr_form = str_util::rtrim(curr_form) + ")"; } diff --git a/common/formatter/formatter.h b/common/formatter/formatter.h index bb9e3aa0ba..58484cedc4 100644 --- a/common/formatter/formatter.h +++ b/common/formatter/formatter.h @@ -6,6 +6,7 @@ #include "tree_sitter/api.h" namespace formatter { + struct TreeSitterParserDeleter { void operator()(TSParser* ptr) const { ts_parser_delete(ptr); } }; diff --git a/common/formatter/formatter_tree.cpp b/common/formatter/formatter_tree.cpp index 054519310b..d8e7328e74 100644 --- a/common/formatter/formatter_tree.cpp +++ b/common/formatter/formatter_tree.cpp @@ -1,5 +1,22 @@ #include "formatter_tree.h" +#include "common/util/string_util.h" + +#include "config/rule_config.h" + +namespace formatter { +const std::shared_ptr default_rule = std::make_shared(); +} + +std::shared_ptr FormatterTreeNode::get_formatting_rule(const int depth, + const int index) const { + // TODO - really lazy for now + if (!rules.empty()) { + return rules.at(0); + } + return formatter::default_rule; +} + // Check if the original source only has whitespace up to a new-line after it's token bool node_followed_by_only_whitespace(const std::string& source, const TSNode& node) { uint32_t pos = ts_node_end_byte(node); @@ -16,14 +33,23 @@ bool node_followed_by_only_whitespace(const std::string& source, const TSNode& n return true; } +bool nodes_on_same_line(const std::string& source, const TSNode& n1, const TSNode& n2) { + // Get the source between the two lines, if there are any new-lines, the answer is NO + uint32_t start = ts_node_start_byte(n1); + uint32_t end = ts_node_end_byte(n2); + const auto code_between = source.substr(start, end - start); + return !str_util::contains(code_between, "\n"); +} + std::string get_source_code(const std::string& source, const TSNode& node) { uint32_t start = ts_node_start_byte(node); uint32_t end = ts_node_end_byte(node); - return source.substr(start, end - start); + // TODO - comments end with a \n, this is likely a tree-sitter grammar problem + return str_util::rtrim(source.substr(start, end - start)); } FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) { - root = FormatterTree::Node(); + root = FormatterTreeNode(); root.metadata.is_root = true; construct_formatter_tree_recursive(source, root_node, root); } @@ -31,15 +57,15 @@ FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) // TODO make an imperative version eventually void FormatterTree::construct_formatter_tree_recursive(const std::string& source, TSNode curr_node, - Node& tree_node) { + FormatterTreeNode& tree_node) { if (ts_node_child_count(curr_node) == 0) { - tree_node.refs.push_back(FormatterTree::Node(get_source_code(source, curr_node))); + tree_node.refs.push_back(FormatterTreeNode(get_source_code(source, curr_node))); return; } const std::string curr_node_type = ts_node_type(curr_node); - FormatterTree::Node list_node; + FormatterTreeNode list_node; if (curr_node_type == "list_lit") { - list_node = FormatterTree::Node(); + list_node = FormatterTreeNode(); } for (size_t i = 0; i < ts_node_child_count(curr_node); i++) { const auto child_node = ts_node_child(curr_node, i); @@ -53,8 +79,16 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source if (i == 1) { list_node.metadata.multiple_elements_first_line = !node_followed_by_only_whitespace(source, child_node); + // Peek at the first element of the list to determine formatting rules + if (formatter::opengoal_rules.find(contents) != formatter::opengoal_rules.end()) { + list_node.rules = formatter::opengoal_rules.at(contents); + } } construct_formatter_tree_recursive(source, child_node, list_node); + // Check if the node that was recursively added to the list was on the same line + auto& new_node = list_node.refs.at(list_node.refs.size() - 1); + new_node.metadata.was_on_first_line_of_form = + nodes_on_same_line(source, curr_node, child_node); } else { construct_formatter_tree_recursive(source, child_node, tree_node); } diff --git a/common/formatter/formatter_tree.h b/common/formatter/formatter_tree.h index ee9497d3bc..bc93c30bc5 100644 --- a/common/formatter/formatter_tree.h +++ b/common/formatter/formatter_tree.h @@ -1,11 +1,16 @@ #pragma once +#include #include #include #include +#include "formatting_rules.h" + #include "tree_sitter/api.h" +class FormattingRule; + // Treesitter is fantastic for validating and parsing our code into a structured tree format without // whitespace so we can do that ourselves (formatting) However, the treesitter AST is a bit too // detailed for purposes of formatting. @@ -22,39 +27,50 @@ // Pass 1 - convert the AST into a simplified FormatterTree // Pass 2 - use the simplified tree to output the final code -// A FormatterTree has a very simple and crude tree structure where: -// Nodes are essentially forms, which contain in-order tokens or references to nested forms -// Nodes can have associated metadata, often related to their context in the original code -class FormatterTree { +namespace formatter { +extern const std::shared_ptr default_rule; +} + +class FormatterTreeNode { public: - struct NodeMetadata { + struct Metadata { bool is_root = false; // Whether the form had more than 1 element on the first line // (println // "test") // vs // (println "test") - bool multiple_elements_first_line; + bool multiple_elements_first_line = false; + bool was_on_first_line_of_form = false; }; + std::vector refs; + Metadata metadata; + // The token is optional because list nodes do not contain a token, they just contain a bunch of + // eventually token node refs + std::optional token; + std::vector> rules = {}; - class Node { - public: - std::vector refs; - NodeMetadata metadata; - // The token is optional because list nodes do not contain a token, they just contain a bunch of - // eventually token node refs - std::optional token; + FormatterTreeNode() = default; + FormatterTreeNode(const std::string& _token) : token(_token){}; + FormatterTreeNode(const Metadata& _metadata) : metadata(_metadata){}; - Node() = default; - Node(const std::string& _token) : token(_token){}; - Node(const NodeMetadata& _metadata) : metadata(_metadata){}; - }; + // Considers the input to determine the most relevant formatting rule for the given node + // if none are applicable, returns `formatter::default_rule` + std::shared_ptr get_formatting_rule(const int depth, const int index) const; +}; +// A FormatterTree has a very simple and crude tree structure where: +// - Nodes are essentially forms, which contain in-order tokens or references to nested forms +// - Nodes can have associated metadata, often related to their context in the original code +// - Nodes can also have multiple formatting rules associated with them. Often this is the default +// rule or based on pre-configured overrides due to the head of the form, ex. 'defun' +class FormatterTree { + public: FormatterTree(const std::string& source, const TSNode& root_node); - Node root; + FormatterTreeNode root; private: void construct_formatter_tree_recursive(const std::string& source, TSNode curr_node, - Node& tree_node); + FormatterTreeNode& tree_node); }; diff --git a/common/formatter/formatting_rules.cpp b/common/formatter/formatting_rules.cpp new file mode 100644 index 0000000000..179bc1f055 --- /dev/null +++ b/common/formatter/formatting_rules.cpp @@ -0,0 +1,104 @@ +#include "formatting_rules.h" + +#include "common/util/string_util.h" + +void FormattingRule::append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { + if (index <= 0 && !containing_node.metadata.multiple_elements_first_line || + index <= 1 && containing_node.metadata.multiple_elements_first_line) { + return; + } + curr_text = str_util::rtrim(curr_text) + "\n"; +} + +void FormattingRule::indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { + if (node.metadata.is_root) { + return; + } + if (containing_node.metadata.multiple_elements_first_line) { + if (index > 1) { + // Only apply indentation if we are about to print a normal text token + // TODO - unsafe + if (node.token.has_value()) { + curr_text += str_util::repeat(containing_node.refs.at(0).token.value().length() + 2, " "); + } + } + } else { + if (index > 0) { + curr_text += str_util::repeat(depth, " "); + } + } +} + +void FormattingRule::align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node) { + const auto lines = str_util::split(text); + // TODO - unsafe (breaks on a list of lists) + int alignment_width = 1; + if (containing_node.metadata.multiple_elements_first_line) { + alignment_width = containing_node.refs.at(0).token.value().length() + 2; + } + std::string aligned_form = ""; + for (int i = 0; i < lines.size(); i++) { + aligned_form += str_util::repeat(alignment_width, " ") + lines.at(i); + if (i != lines.size() - 1) { + aligned_form += "\n"; + } + } + text = aligned_form; +} + +void InnerFormattingRule::append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { + if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) { + return; + } + if (!node.metadata.was_on_first_line_of_form) { + curr_text = str_util::rtrim(curr_text) + "\n"; + } +} + +void InnerFormattingRule::indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { + if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) { + return; + } + // We only new-line elements if they were not originally on the first line + if (!node.metadata.was_on_first_line_of_form) { + curr_text += str_util::repeat(depth * 2, " "); + } +} + +void InnerFormattingRule::align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node) { + if (node.metadata.was_on_first_line_of_form) { + return; + } + const auto lines = str_util::split(text); + // TODO - unsafe (breaks on a list of lists) + int alignment_width = 2; + std::string aligned_form = ""; + for (int i = 0; i < lines.size(); i++) { + aligned_form += str_util::repeat(alignment_width, " ") + lines.at(i); + if (i != lines.size() - 1) { + aligned_form += "\n"; + } + } + text = aligned_form; + return; +} diff --git a/common/formatter/formatting_rules.h b/common/formatter/formatting_rules.h new file mode 100644 index 0000000000..0347ee01e6 --- /dev/null +++ b/common/formatter/formatting_rules.h @@ -0,0 +1,87 @@ +#pragma once + +#include + +#include "formatter_tree.h" + +class FormatterTreeNode; + +// Indentation rules are heavily inspired by the descriptions here +// https://github.com/weavejester/cljfmt/blob/master/docs/INDENTS.md +// +// This style of formatting assumes the code coming in is already reasonably formatted, aka was +// written by an actual human and isn't just a single line of minified code +// +// If it _is_ though, the formatting rules will still be able to do a somewhat decent job, as +// certain forms are configured to format a specific way, but it probably won't be broadly +// consistent with code written normally +// +// cljfmt observations: +// - a form that starts on the first line but spans multiple lines (it doesn't really handle this +// well) ex. (println (hello +// world) ye) ;; you'd expect the 'ye' to be aligned with `(h...` +// - vector lists are treated differently from paren lists (seems to leave them inline or default +// indent them if they span multiple lines) ex. [hello world +// what] + +// The default rule that is used if no other rule applies to the given form +// +// For lists this will format like so: +// - If the only element on the first line is the head of the form, every subsequent element is +// indented with 1 space on a new line +// (println ; <= one or fewer elements on first line +// "hello" +// "world") +// - otherwise, every element after the 2nd is on a new line and aligned to that 1st list arg. +// (println "hello" ; <= more than one element on first line +// "world") +class FormattingRule { + public: + virtual void append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index); + virtual void indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index); + virtual void align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node); +}; + +// Inner indentation always indents by 2 spaces for every line after the first regardless of the +// number of elements `depth` defines at what depth the rule should be applied, and optionally +// `index` narrows this down further to a given index at that depth +// +// Some simple examples: +// (defn greet [name] +// (println "Hello" name)) +// +// (defn dismiss +// [name] +// (println "Goodbye" name)) +class InnerFormattingRule : public FormattingRule { + private: + int m_depth; + std::optional m_index; + + public: + InnerFormattingRule(int depth) : m_depth(depth){}; + InnerFormattingRule(int depth, int index) : m_depth(depth), m_index(index){}; + virtual void append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) override; + virtual void indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) override; + void align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node) override; +}; diff --git a/common/util/string_util.cpp b/common/util/string_util.cpp index dafddf6798..aa2a0e7a98 100644 --- a/common/util/string_util.cpp +++ b/common/util/string_util.cpp @@ -27,6 +27,8 @@ std::string ltrim(const std::string& s) { return (start == std::string::npos) ? "" : s.substr(start); } +// TODO - used a lot in formatting, and its slow because i bet it iterates from the start and not +// the end std::string rtrim(const std::string& s) { size_t end = s.find_last_not_of(WHITESPACE); return (end == std::string::npos) ? "" : s.substr(0, end + 1); diff --git a/test/common/formatter/corpus/comments.test.gc b/test/common/formatter/corpus/comments.test.gc new file mode 100644 index 0000000000..992ef3203e --- /dev/null +++ b/test/common/formatter/corpus/comments.test.gc @@ -0,0 +1,44 @@ +=== +Top-Level Comment +=== + +;; test +(println "test") + +--- + +;; test +(println "test") + +=== +TODO - Inline Comment +=== + +(println "test") ;; test + +--- + +(println "test") + + ;; test + +=== +TODO - Block Comment +=== + +#| + block comment + test +|# + +(println "test") + +--- + +#| + + |# + +(println "test") + + diff --git a/test/common/formatter/corpus/default-intent.test.gc b/test/common/formatter/corpus/default.test.gc similarity index 100% rename from test/common/formatter/corpus/default-intent.test.gc rename to test/common/formatter/corpus/default.test.gc index 2da5ef353e..016bef6f33 100644 --- a/test/common/formatter/corpus/default-intent.test.gc +++ b/test/common/formatter/corpus/default.test.gc @@ -1,3 +1,15 @@ +=== +All Alignment +=== + +(println "hello" (println "world" "world2")) + +--- + +(println "hello" + (println "world" + "world2")) + === Multiple Top Level Forms === @@ -12,18 +24,6 @@ Multiple Top Level Forms (println "hello" "world") -=== -All Alignment -=== - -(println "hello" (println "world" "world2")) - ---- - -(println "hello" - (println "world" - "world2")) - === All Indented === diff --git a/test/common/formatter/corpus/inner.test.gc b/test/common/formatter/corpus/inner.test.gc new file mode 100644 index 0000000000..0f050963ee --- /dev/null +++ b/test/common/formatter/corpus/inner.test.gc @@ -0,0 +1,19 @@ +=== +Basic Inner Indentation +=== + +(defun greet (name string) +(println "Hello" name) + (println "Hello" name) + (println "Hello" name) +) + +--- + +(defun greet (name string) + (println "Hello" + name) + (println "Hello" + name) + (println "Hello" + name)) diff --git a/test/common/formatter/corpus/top-level.test.gc b/test/common/formatter/corpus/top-level.test.gc new file mode 100644 index 0000000000..ddde173a97 --- /dev/null +++ b/test/common/formatter/corpus/top-level.test.gc @@ -0,0 +1,14 @@ +=== +Top Level Elements +=== + +(println "test") +(println "test") (println "test") + +--- + +(println "test") + +(println "test") + +(println "test") diff --git a/test/common/formatter/test_formatter.cpp b/test/common/formatter/test_formatter.cpp index a8742c0271..fc9f0ed372 100644 --- a/test/common/formatter/test_formatter.cpp +++ b/test/common/formatter/test_formatter.cpp @@ -26,6 +26,7 @@ EXPECTED OUTPUT #include "gtest/gtest.h" +#include "third-party/fmt/color.h" #include "third-party/fmt/core.h" struct TestDefinition { @@ -34,7 +35,7 @@ struct TestDefinition { std::string output; }; -bool run_tests(fs::path file_path) { +bool run_tests(const fs::path& file_path, const bool only_important_tests) { // Read in the file, and run the test const auto contents = str_util::split(file_util::read_text_file(file_path)); std::vector tests; @@ -75,8 +76,12 @@ bool run_tests(fs::path file_path) { } // Run the tests, report successes and failures bool test_failed = false; - fmt::print("{}:\n", file_util::base_name(file_path.string())); + fmt::print("{}:\n", fmt::styled(file_util::base_name(file_path.string()), + fmt::emphasis::bold | fg(fmt::color::cyan))); for (const auto& test : tests) { + if (only_important_tests && !str_util::starts_with(test.name, "!")) { + continue; + } const auto formatted_result = formatter::format_code(test.input); if (!formatted_result) { // Unable to parse, was that expected? @@ -98,17 +103,26 @@ bool run_tests(fs::path file_path) { return test_failed; } -bool find_and_run_tests() { +bool find_and_run_tests(const bool only_important_tests) { // Enumerate test files const auto test_files = file_util::find_files_recursively( file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\.test.gc$")); bool failed = false; for (const auto& file : test_files) { - failed = run_tests(file); + // don't fail fast, but any failure means we return false + if (failed) { + run_tests(file, only_important_tests); + } else { + failed = run_tests(file, only_important_tests); + } } return !failed; } TEST(Formatter, FormatterTests) { - EXPECT_TRUE(find_and_run_tests()); + // TODO - when i get annoyed enough, make this not a manual change and pre-scan the tests to see + // if there are any flagged tests + // + // This is to make it easier to run an individual test when debugging + EXPECT_TRUE(find_and_run_tests(false)); }