From 3dbaee1eccc67287cdb6d3976fbc8c56100c80e1 Mon Sep 17 00:00:00 2001 From: Tyler Wilding Date: Tue, 6 Jun 2023 19:34:50 -0500 Subject: [PATCH] formatter: handle top level blank lines and better handle comments (#2702) --- common/formatter/config/rule_config.cpp | 4 +- common/formatter/config/rule_config.h | 4 +- common/formatter/formatter.cpp | 29 +++---- common/formatter/formatter.h | 4 + common/formatter/formatter_tree.cpp | 63 +++++++++++---- common/formatter/formatter_tree.h | 15 ++-- common/formatter/formatting_rules.cpp | 79 ++++++++++++------- common/formatter/formatting_rules.h | 39 +++++++-- .../formatter/corpus/blank-lines.test.gc | 14 ++++ test/common/formatter/corpus/comments.test.gc | 6 +- test/common/formatter/test_formatter.cpp | 33 ++++++-- 11 files changed, 202 insertions(+), 88 deletions(-) create mode 100644 test/common/formatter/corpus/blank-lines.test.gc diff --git a/common/formatter/config/rule_config.cpp b/common/formatter/config/rule_config.cpp index fe3cfed94c..4cf20ecf20 100644 --- a/common/formatter/config/rule_config.cpp +++ b/common/formatter/config/rule_config.cpp @@ -9,6 +9,6 @@ namespace formatter { // TODO - populate these more -const std::unordered_map>> opengoal_rules = - {{"defun", {std::make_shared(1)}}}; +const std::unordered_map>> + opengoal_indentation_rules = {{"defun", {std::make_shared(1)}}}; } // namespace formatter diff --git a/common/formatter/config/rule_config.h b/common/formatter/config/rule_config.h index 16f2c2a8a6..b495353528 100644 --- a/common/formatter/config/rule_config.h +++ b/common/formatter/config/rule_config.h @@ -8,6 +8,6 @@ #include "common/formatter/formatting_rules.h" namespace formatter { -extern const std::unordered_map>> - opengoal_rules; +extern const std::unordered_map>> + opengoal_indentation_rules; } diff --git a/common/formatter/formatter.cpp b/common/formatter/formatter.cpp index 03bb6f71a9..304d006e04 100644 --- a/common/formatter/formatter.cpp +++ b/common/formatter/formatter.cpp @@ -39,17 +39,15 @@ std::string apply_formatting(const FormatterTreeNode& curr_node, return curr_form; } // TODO - this might have some issues for non-list top level elements (ie. comments) - if (!curr_node.metadata.is_root) { + if (!curr_node.metadata.is_top_level) { curr_form += "("; } // Iterate the form for (int i = 0; i < curr_node.refs.size(); i++) { const auto& ref = curr_node.refs.at(i); - // Append a newline if relevant - if (!curr_node.metadata.is_root) { - curr_node.get_formatting_rule(tree_depth, i) - ->append_newline(curr_form, ref, curr_node, tree_depth, i); - } + // Append a newline if needed + curr_node.get_formatting_rule(tree_depth, i) + ->append_newline(curr_form, ref, curr_node, tree_depth, i); // Either print the element's token, or recursively format it as well if (ref.token) { curr_node.get_formatting_rule(tree_depth, i) @@ -58,30 +56,21 @@ std::string apply_formatting(const FormatterTreeNode& curr_node, // forms are always done bottom-top recursively, they always act // independently as if it was the shallowest depth curr_form += ref.token.value(); - if (!curr_node.metadata.is_root) { + if (!curr_node.metadata.is_top_level) { curr_form += " "; } } else { auto formatted_form = apply_formatting(ref, "", tree_depth + 1); - if (!curr_node.metadata.is_root) { + if (!curr_node.metadata.is_top_level) { curr_node.get_formatting_rule(tree_depth, i) ->align_form_lines(formatted_form, ref, curr_node); } curr_form += formatted_form; } - // Separate top-level elements with a new-line - // TODO - move this into a top-level rule - // TODO - temporary hack for top level comments, but need a better strategy for leaving them - // where they were originally - if (curr_node.metadata.is_root && i < curr_node.refs.size() - 1) { - curr_form += "\n"; - if (!ref.token || !str_util::starts_with(ref.token.value(), ";")) { - curr_form += "\n"; - } - } + // Handle blank lines at the top level, skip if it's the final element + formatter_rules::blank_lines::separate_by_newline(curr_form, curr_node, ref, i); } - // TODO - similar fear to issues as above - if (!curr_node.metadata.is_root) { + if (!curr_node.metadata.is_top_level) { curr_form = str_util::rtrim(curr_form) + ")"; } return curr_form; diff --git a/common/formatter/formatter.h b/common/formatter/formatter.h index 58484cedc4..83a332a785 100644 --- a/common/formatter/formatter.h +++ b/common/formatter/formatter.h @@ -3,8 +3,12 @@ #include #include +#include "formatting_rules.h" + #include "tree_sitter/api.h" +// TODO: +// - Considering _eventually_ adding line-length heuristics namespace formatter { struct TreeSitterParserDeleter { diff --git a/common/formatter/formatter_tree.cpp b/common/formatter/formatter_tree.cpp index d8e7328e74..4e0989743a 100644 --- a/common/formatter/formatter_tree.cpp +++ b/common/formatter/formatter_tree.cpp @@ -5,11 +5,52 @@ #include "config/rule_config.h" namespace formatter { -const std::shared_ptr default_rule = std::make_shared(); +const std::shared_ptr default_rule = std::make_shared(); } -std::shared_ptr FormatterTreeNode::get_formatting_rule(const int depth, - const int index) const { +std::string get_source_code(const std::string& source, const TSNode& node) { + uint32_t start = ts_node_start_byte(node); + uint32_t end = ts_node_end_byte(node); + // TODO - comments end with a \n, this is likely a tree-sitter grammar problem + return str_util::rtrim(source.substr(start, end - start)); +} + +int num_blank_lines_following_node(const std::string& source, const TSNode& node) { + int num_lines = 0; + uint32_t cursor = ts_node_end_byte(node); + while (cursor < source.length() && source.at(cursor) == '\n') { + num_lines++; + cursor++; + } + return num_lines; +} + +// Check if the original source only has whitespace up to a new-line after it's token +bool node_preceeded_by_only_whitespace(const std::string& source, const TSNode& node) { + uint32_t pos = ts_node_start_byte(node); + while (pos > 0) { + const auto& c = source.at(pos); + if (c == '\n') { + return true; + } else if (c == ' ' || c == '\t') { + pos--; + continue; + } + return false; + } + return true; +} + +FormatterTreeNode::FormatterTreeNode(const std::string& source, const TSNode& node) + : token(get_source_code(source, node)) { + // Set any metadata based on the value of the token + metadata.is_comment = str_util::starts_with(str_util::ltrim(token.value()), ";"); + metadata.num_blank_lines_following = num_blank_lines_following_node(source, node); + metadata.is_inline = !node_preceeded_by_only_whitespace(source, node); +}; + +std::shared_ptr FormatterTreeNode::get_formatting_rule(const int depth, + const int index) const { // TODO - really lazy for now if (!rules.empty()) { return rules.at(0); @@ -41,16 +82,9 @@ bool nodes_on_same_line(const std::string& source, const TSNode& n1, const TSNod return !str_util::contains(code_between, "\n"); } -std::string get_source_code(const std::string& source, const TSNode& node) { - uint32_t start = ts_node_start_byte(node); - uint32_t end = ts_node_end_byte(node); - // TODO - comments end with a \n, this is likely a tree-sitter grammar problem - return str_util::rtrim(source.substr(start, end - start)); -} - FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) { root = FormatterTreeNode(); - root.metadata.is_root = true; + root.metadata.is_top_level = true; construct_formatter_tree_recursive(source, root_node, root); } @@ -59,7 +93,7 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source TSNode curr_node, FormatterTreeNode& tree_node) { if (ts_node_child_count(curr_node) == 0) { - tree_node.refs.push_back(FormatterTreeNode(get_source_code(source, curr_node))); + tree_node.refs.push_back(FormatterTreeNode(source, curr_node)); return; } const std::string curr_node_type = ts_node_type(curr_node); @@ -80,8 +114,9 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source list_node.metadata.multiple_elements_first_line = !node_followed_by_only_whitespace(source, child_node); // Peek at the first element of the list to determine formatting rules - if (formatter::opengoal_rules.find(contents) != formatter::opengoal_rules.end()) { - list_node.rules = formatter::opengoal_rules.at(contents); + if (formatter::opengoal_indentation_rules.find(contents) != + formatter::opengoal_indentation_rules.end()) { + list_node.rules = formatter::opengoal_indentation_rules.at(contents); } } construct_formatter_tree_recursive(source, child_node, list_node); diff --git a/common/formatter/formatter_tree.h b/common/formatter/formatter_tree.h index bc93c30bc5..ee245ec662 100644 --- a/common/formatter/formatter_tree.h +++ b/common/formatter/formatter_tree.h @@ -9,7 +9,7 @@ #include "tree_sitter/api.h" -class FormattingRule; +class IndentationRule; // Treesitter is fantastic for validating and parsing our code into a structured tree format without // whitespace so we can do that ourselves (formatting) However, the treesitter AST is a bit too @@ -28,13 +28,16 @@ class FormattingRule; // Pass 2 - use the simplified tree to output the final code namespace formatter { -extern const std::shared_ptr default_rule; +extern const std::shared_ptr default_indentation_rule; } class FormatterTreeNode { public: struct Metadata { - bool is_root = false; + bool is_top_level = false; + bool is_comment = false; + bool is_inline = false; + int num_blank_lines_following = 0; // Whether the form had more than 1 element on the first line // (println // "test") @@ -48,15 +51,15 @@ class FormatterTreeNode { // The token is optional because list nodes do not contain a token, they just contain a bunch of // eventually token node refs std::optional token; - std::vector> rules = {}; + std::vector> rules = {}; FormatterTreeNode() = default; - FormatterTreeNode(const std::string& _token) : token(_token){}; + FormatterTreeNode(const std::string& source, const TSNode& node); FormatterTreeNode(const Metadata& _metadata) : metadata(_metadata){}; // Considers the input to determine the most relevant formatting rule for the given node // if none are applicable, returns `formatter::default_rule` - std::shared_ptr get_formatting_rule(const int depth, const int index) const; + std::shared_ptr get_formatting_rule(const int depth, const int index) const; }; // A FormatterTree has a very simple and crude tree structure where: diff --git a/common/formatter/formatting_rules.cpp b/common/formatter/formatting_rules.cpp index 179bc1f055..d1d44376e9 100644 --- a/common/formatter/formatting_rules.cpp +++ b/common/formatter/formatting_rules.cpp @@ -2,24 +2,47 @@ #include "common/util/string_util.h" -void FormattingRule::append_newline(std::string& curr_text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node, - const int depth, - const int index) { +void formatter_rules::blank_lines::separate_by_newline(std::string& curr_text, + const FormatterTreeNode& containing_node, + const FormatterTreeNode& node, + const int index) { + // We only are concerned with top level forms or elements + // Skip the last element, no trailing new-lines (let the editors handle this!) + // Also peek ahead to see if there was a comment on this line, if so don't separate things! + if (!containing_node.metadata.is_top_level || index >= containing_node.refs.size() - 1 || + (containing_node.refs.at(index + 1).metadata.is_comment && + containing_node.refs.at(index + 1).metadata.is_inline)) { + return; + } + curr_text += "\n"; + // If it's a comment, but has no following blank lines, dont insert a blank line + if (node.metadata.is_comment && node.metadata.num_blank_lines_following == 0) { + return; + } + // Otherwise, add only 1 blank line + curr_text += "\n"; +} + +void IndentationRule::append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { if (index <= 0 && !containing_node.metadata.multiple_elements_first_line || - index <= 1 && containing_node.metadata.multiple_elements_first_line) { + index <= 1 && containing_node.metadata.multiple_elements_first_line || + containing_node.metadata.is_top_level || + (node.metadata.is_comment && node.metadata.is_inline)) { return; } curr_text = str_util::rtrim(curr_text) + "\n"; } -void FormattingRule::indent_token(std::string& curr_text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node, - const int depth, - const int index) { - if (node.metadata.is_root) { +void IndentationRule::indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { + if (node.metadata.is_top_level) { return; } if (containing_node.metadata.multiple_elements_first_line) { @@ -37,9 +60,9 @@ void FormattingRule::indent_token(std::string& curr_text, } } -void FormattingRule::align_form_lines(std::string& text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node) { +void IndentationRule::align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node) { const auto lines = str_util::split(text); // TODO - unsafe (breaks on a list of lists) int alignment_width = 1; @@ -56,11 +79,11 @@ void FormattingRule::align_form_lines(std::string& text, text = aligned_form; } -void InnerFormattingRule::append_newline(std::string& curr_text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node, - const int depth, - const int index) { +void InnerIndentationRule::append_newline(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) { return; } @@ -69,11 +92,11 @@ void InnerFormattingRule::append_newline(std::string& curr_text, } } -void InnerFormattingRule::indent_token(std::string& curr_text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node, - const int depth, - const int index) { +void InnerIndentationRule::indent_token(std::string& curr_text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node, + const int depth, + const int index) { if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) { return; } @@ -83,9 +106,9 @@ void InnerFormattingRule::indent_token(std::string& curr_text, } } -void InnerFormattingRule::align_form_lines(std::string& text, - const FormatterTreeNode& node, - const FormatterTreeNode& containing_node) { +void InnerIndentationRule::align_form_lines(std::string& text, + const FormatterTreeNode& node, + const FormatterTreeNode& containing_node) { if (node.metadata.was_on_first_line_of_form) { return; } diff --git a/common/formatter/formatting_rules.h b/common/formatter/formatting_rules.h index 25453de659..372bc735ea 100644 --- a/common/formatter/formatting_rules.h +++ b/common/formatter/formatting_rules.h @@ -6,6 +6,35 @@ class FormatterTreeNode; +namespace formatter_rules { +// The formatter will try to collapse as much space as possible in the top-level, this means +// separating forms by a single empty blank line +// +// The exception is comments, top level comments will retain their following blank lines from the +// original source +// - this could be none, in the case where a comment is directly next to a form (like this one!) +// - you don't want them to be separated! +// - or this could be top level comments / comment blocks documenting things but not being +// associated with a form +// - in this case, you want them to remain separated +// +// Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/blank.md +namespace blank_lines { +void separate_by_newline(std::string& curr_text, + const FormatterTreeNode& containing_node, + const FormatterTreeNode& node, + const int index); +} + +// TODO - nothing here yet, in the future: +// - if/when the formatter is concerned with line length, there are implications here +// - align consecutive comment lines +// +// Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/comments.md +namespace comments {} + +} // namespace formatter_rules + // Indentation rules are heavily inspired by the descriptions here // https://github.com/weavejester/cljfmt/blob/master/docs/INDENTS.md // @@ -35,9 +64,9 @@ class FormatterTreeNode; // - otherwise, every element after the 2nd is on a new line and aligned to that 1st list arg. // (println "hello" ; <= more than one element on first line // "world") -class FormattingRule { +class IndentationRule { public: - virtual ~FormattingRule() = default; + virtual ~IndentationRule() = default; virtual void append_newline(std::string& curr_text, const FormatterTreeNode& node, const FormatterTreeNode& containing_node, @@ -64,14 +93,14 @@ class FormattingRule { // (defn dismiss // [name] // (println "Goodbye" name)) -class InnerFormattingRule : public FormattingRule { +class InnerIndentationRule : public IndentationRule { private: int m_depth; std::optional m_index; public: - InnerFormattingRule(int depth) : m_depth(depth){}; - InnerFormattingRule(int depth, int index) : m_depth(depth), m_index(index){}; + InnerIndentationRule(int depth) : m_depth(depth){}; + InnerIndentationRule(int depth, int index) : m_depth(depth), m_index(index){}; virtual void append_newline(std::string& curr_text, const FormatterTreeNode& node, const FormatterTreeNode& containing_node, diff --git a/test/common/formatter/corpus/blank-lines.test.gc b/test/common/formatter/corpus/blank-lines.test.gc new file mode 100644 index 0000000000..6198cd2703 --- /dev/null +++ b/test/common/formatter/corpus/blank-lines.test.gc @@ -0,0 +1,14 @@ +=== +Separate Forms +=== + +(println "test") +(println "test") (println "test") + +--- + +(println "test") + +(println "test") + +(println "test") diff --git a/test/common/formatter/corpus/comments.test.gc b/test/common/formatter/corpus/comments.test.gc index 992ef3203e..e04c98889a 100644 --- a/test/common/formatter/corpus/comments.test.gc +++ b/test/common/formatter/corpus/comments.test.gc @@ -11,16 +11,14 @@ Top-Level Comment (println "test") === -TODO - Inline Comment +Inline Comment === (println "test") ;; test --- -(println "test") - - ;; test +(println "test") ;; test === TODO - Block Comment diff --git a/test/common/formatter/test_formatter.cpp b/test/common/formatter/test_formatter.cpp index fc9f0ed372..d8bcd246f9 100644 --- a/test/common/formatter/test_formatter.cpp +++ b/test/common/formatter/test_formatter.cpp @@ -35,7 +35,7 @@ struct TestDefinition { std::string output; }; -bool run_tests(const fs::path& file_path, const bool only_important_tests) { +std::vector get_test_definitions(const fs::path& file_path) { // Read in the file, and run the test const auto contents = str_util::split(file_util::read_text_file(file_path)); std::vector tests; @@ -74,6 +74,21 @@ bool run_tests(const fs::path& file_path, const bool only_important_tests) { continue; } } + return tests; +} + +bool has_important_tests(const fs::path& file_path) { + const auto& tests = get_test_definitions(file_path); + for (const auto& test : tests) { + if (str_util::starts_with(test.name, "!")) { + return true; + } + } + return false; +} + +bool run_tests(const fs::path& file_path, const bool only_important_tests) { + const auto& tests = get_test_definitions(file_path); // Run the tests, report successes and failures bool test_failed = false; fmt::print("{}:\n", fmt::styled(file_util::base_name(file_path.string()), @@ -103,11 +118,19 @@ bool run_tests(const fs::path& file_path, const bool only_important_tests) { return test_failed; } -bool find_and_run_tests(const bool only_important_tests) { +bool find_and_run_tests() { // Enumerate test files const auto test_files = file_util::find_files_recursively( file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\.test.gc$")); bool failed = false; + // First do a pass to see if any tests are meant to be prioritized for debugging + bool only_important_tests = false; + for (const auto& file : test_files) { + only_important_tests = has_important_tests(file); + if (only_important_tests) { + break; + } + } for (const auto& file : test_files) { // don't fail fast, but any failure means we return false if (failed) { @@ -120,9 +143,5 @@ bool find_and_run_tests(const bool only_important_tests) { } TEST(Formatter, FormatterTests) { - // TODO - when i get annoyed enough, make this not a manual change and pre-scan the tests to see - // if there are any flagged tests - // - // This is to make it easier to run an individual test when debugging - EXPECT_TRUE(find_and_run_tests(false)); + EXPECT_TRUE(find_and_run_tests()); }