formatter: handle top level blank lines and better handle comments (#2702)

This commit is contained in:
Tyler Wilding
2023-06-06 19:34:50 -05:00
committed by GitHub
parent 3ecb3e4bc8
commit 3dbaee1ecc
11 changed files with 202 additions and 88 deletions
+2 -2
View File
@@ -9,6 +9,6 @@ namespace formatter {
// TODO - populate these more
const std::unordered_map<std::string, std::vector<std::shared_ptr<FormattingRule>>> opengoal_rules =
{{"defun", {std::make_shared<InnerFormattingRule>(1)}}};
const std::unordered_map<std::string, std::vector<std::shared_ptr<IndentationRule>>>
opengoal_indentation_rules = {{"defun", {std::make_shared<InnerIndentationRule>(1)}}};
} // namespace formatter
+2 -2
View File
@@ -8,6 +8,6 @@
#include "common/formatter/formatting_rules.h"
namespace formatter {
extern const std::unordered_map<std::string, std::vector<std::shared_ptr<FormattingRule>>>
opengoal_rules;
extern const std::unordered_map<std::string, std::vector<std::shared_ptr<IndentationRule>>>
opengoal_indentation_rules;
}
+9 -20
View File
@@ -39,17 +39,15 @@ std::string apply_formatting(const FormatterTreeNode& curr_node,
return curr_form;
}
// TODO - this might have some issues for non-list top level elements (ie. comments)
if (!curr_node.metadata.is_root) {
if (!curr_node.metadata.is_top_level) {
curr_form += "(";
}
// Iterate the form
for (int i = 0; i < curr_node.refs.size(); i++) {
const auto& ref = curr_node.refs.at(i);
// Append a newline if relevant
if (!curr_node.metadata.is_root) {
curr_node.get_formatting_rule(tree_depth, i)
->append_newline(curr_form, ref, curr_node, tree_depth, i);
}
// Append a newline if needed
curr_node.get_formatting_rule(tree_depth, i)
->append_newline(curr_form, ref, curr_node, tree_depth, i);
// Either print the element's token, or recursively format it as well
if (ref.token) {
curr_node.get_formatting_rule(tree_depth, i)
@@ -58,30 +56,21 @@ std::string apply_formatting(const FormatterTreeNode& curr_node,
// forms are always done bottom-top recursively, they always act
// independently as if it was the shallowest depth
curr_form += ref.token.value();
if (!curr_node.metadata.is_root) {
if (!curr_node.metadata.is_top_level) {
curr_form += " ";
}
} else {
auto formatted_form = apply_formatting(ref, "", tree_depth + 1);
if (!curr_node.metadata.is_root) {
if (!curr_node.metadata.is_top_level) {
curr_node.get_formatting_rule(tree_depth, i)
->align_form_lines(formatted_form, ref, curr_node);
}
curr_form += formatted_form;
}
// Separate top-level elements with a new-line
// TODO - move this into a top-level rule
// TODO - temporary hack for top level comments, but need a better strategy for leaving them
// where they were originally
if (curr_node.metadata.is_root && i < curr_node.refs.size() - 1) {
curr_form += "\n";
if (!ref.token || !str_util::starts_with(ref.token.value(), ";")) {
curr_form += "\n";
}
}
// Handle blank lines at the top level, skip if it's the final element
formatter_rules::blank_lines::separate_by_newline(curr_form, curr_node, ref, i);
}
// TODO - similar fear to issues as above
if (!curr_node.metadata.is_root) {
if (!curr_node.metadata.is_top_level) {
curr_form = str_util::rtrim(curr_form) + ")";
}
return curr_form;
+4
View File
@@ -3,8 +3,12 @@
#include <optional>
#include <string>
#include "formatting_rules.h"
#include "tree_sitter/api.h"
// TODO:
// - Considering _eventually_ adding line-length heuristics
namespace formatter {
struct TreeSitterParserDeleter {
+49 -14
View File
@@ -5,11 +5,52 @@
#include "config/rule_config.h"
namespace formatter {
const std::shared_ptr<FormattingRule> default_rule = std::make_shared<FormattingRule>();
const std::shared_ptr<IndentationRule> default_rule = std::make_shared<IndentationRule>();
}
std::shared_ptr<FormattingRule> FormatterTreeNode::get_formatting_rule(const int depth,
const int index) const {
std::string get_source_code(const std::string& source, const TSNode& node) {
uint32_t start = ts_node_start_byte(node);
uint32_t end = ts_node_end_byte(node);
// TODO - comments end with a \n, this is likely a tree-sitter grammar problem
return str_util::rtrim(source.substr(start, end - start));
}
int num_blank_lines_following_node(const std::string& source, const TSNode& node) {
int num_lines = 0;
uint32_t cursor = ts_node_end_byte(node);
while (cursor < source.length() && source.at(cursor) == '\n') {
num_lines++;
cursor++;
}
return num_lines;
}
// Check if the original source only has whitespace up to a new-line after it's token
bool node_preceeded_by_only_whitespace(const std::string& source, const TSNode& node) {
uint32_t pos = ts_node_start_byte(node);
while (pos > 0) {
const auto& c = source.at(pos);
if (c == '\n') {
return true;
} else if (c == ' ' || c == '\t') {
pos--;
continue;
}
return false;
}
return true;
}
FormatterTreeNode::FormatterTreeNode(const std::string& source, const TSNode& node)
: token(get_source_code(source, node)) {
// Set any metadata based on the value of the token
metadata.is_comment = str_util::starts_with(str_util::ltrim(token.value()), ";");
metadata.num_blank_lines_following = num_blank_lines_following_node(source, node);
metadata.is_inline = !node_preceeded_by_only_whitespace(source, node);
};
std::shared_ptr<IndentationRule> FormatterTreeNode::get_formatting_rule(const int depth,
const int index) const {
// TODO - really lazy for now
if (!rules.empty()) {
return rules.at(0);
@@ -41,16 +82,9 @@ bool nodes_on_same_line(const std::string& source, const TSNode& n1, const TSNod
return !str_util::contains(code_between, "\n");
}
std::string get_source_code(const std::string& source, const TSNode& node) {
uint32_t start = ts_node_start_byte(node);
uint32_t end = ts_node_end_byte(node);
// TODO - comments end with a \n, this is likely a tree-sitter grammar problem
return str_util::rtrim(source.substr(start, end - start));
}
FormatterTree::FormatterTree(const std::string& source, const TSNode& root_node) {
root = FormatterTreeNode();
root.metadata.is_root = true;
root.metadata.is_top_level = true;
construct_formatter_tree_recursive(source, root_node, root);
}
@@ -59,7 +93,7 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source
TSNode curr_node,
FormatterTreeNode& tree_node) {
if (ts_node_child_count(curr_node) == 0) {
tree_node.refs.push_back(FormatterTreeNode(get_source_code(source, curr_node)));
tree_node.refs.push_back(FormatterTreeNode(source, curr_node));
return;
}
const std::string curr_node_type = ts_node_type(curr_node);
@@ -80,8 +114,9 @@ void FormatterTree::construct_formatter_tree_recursive(const std::string& source
list_node.metadata.multiple_elements_first_line =
!node_followed_by_only_whitespace(source, child_node);
// Peek at the first element of the list to determine formatting rules
if (formatter::opengoal_rules.find(contents) != formatter::opengoal_rules.end()) {
list_node.rules = formatter::opengoal_rules.at(contents);
if (formatter::opengoal_indentation_rules.find(contents) !=
formatter::opengoal_indentation_rules.end()) {
list_node.rules = formatter::opengoal_indentation_rules.at(contents);
}
}
construct_formatter_tree_recursive(source, child_node, list_node);
+9 -6
View File
@@ -9,7 +9,7 @@
#include "tree_sitter/api.h"
class FormattingRule;
class IndentationRule;
// Treesitter is fantastic for validating and parsing our code into a structured tree format without
// whitespace so we can do that ourselves (formatting) However, the treesitter AST is a bit too
@@ -28,13 +28,16 @@ class FormattingRule;
// Pass 2 - use the simplified tree to output the final code
namespace formatter {
extern const std::shared_ptr<FormattingRule> default_rule;
extern const std::shared_ptr<IndentationRule> default_indentation_rule;
}
class FormatterTreeNode {
public:
struct Metadata {
bool is_root = false;
bool is_top_level = false;
bool is_comment = false;
bool is_inline = false;
int num_blank_lines_following = 0;
// Whether the form had more than 1 element on the first line
// (println
// "test")
@@ -48,15 +51,15 @@ class FormatterTreeNode {
// The token is optional because list nodes do not contain a token, they just contain a bunch of
// eventually token node refs
std::optional<std::string> token;
std::vector<std::shared_ptr<FormattingRule>> rules = {};
std::vector<std::shared_ptr<IndentationRule>> rules = {};
FormatterTreeNode() = default;
FormatterTreeNode(const std::string& _token) : token(_token){};
FormatterTreeNode(const std::string& source, const TSNode& node);
FormatterTreeNode(const Metadata& _metadata) : metadata(_metadata){};
// Considers the input to determine the most relevant formatting rule for the given node
// if none are applicable, returns `formatter::default_rule`
std::shared_ptr<FormattingRule> get_formatting_rule(const int depth, const int index) const;
std::shared_ptr<IndentationRule> get_formatting_rule(const int depth, const int index) const;
};
// A FormatterTree has a very simple and crude tree structure where:
+51 -28
View File
@@ -2,24 +2,47 @@
#include "common/util/string_util.h"
void FormattingRule::append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
void formatter_rules::blank_lines::separate_by_newline(std::string& curr_text,
const FormatterTreeNode& containing_node,
const FormatterTreeNode& node,
const int index) {
// We only are concerned with top level forms or elements
// Skip the last element, no trailing new-lines (let the editors handle this!)
// Also peek ahead to see if there was a comment on this line, if so don't separate things!
if (!containing_node.metadata.is_top_level || index >= containing_node.refs.size() - 1 ||
(containing_node.refs.at(index + 1).metadata.is_comment &&
containing_node.refs.at(index + 1).metadata.is_inline)) {
return;
}
curr_text += "\n";
// If it's a comment, but has no following blank lines, dont insert a blank line
if (node.metadata.is_comment && node.metadata.num_blank_lines_following == 0) {
return;
}
// Otherwise, add only 1 blank line
curr_text += "\n";
}
void IndentationRule::append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
if (index <= 0 && !containing_node.metadata.multiple_elements_first_line ||
index <= 1 && containing_node.metadata.multiple_elements_first_line) {
index <= 1 && containing_node.metadata.multiple_elements_first_line ||
containing_node.metadata.is_top_level ||
(node.metadata.is_comment && node.metadata.is_inline)) {
return;
}
curr_text = str_util::rtrim(curr_text) + "\n";
}
void FormattingRule::indent_token(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
if (node.metadata.is_root) {
void IndentationRule::indent_token(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
if (node.metadata.is_top_level) {
return;
}
if (containing_node.metadata.multiple_elements_first_line) {
@@ -37,9 +60,9 @@ void FormattingRule::indent_token(std::string& curr_text,
}
}
void FormattingRule::align_form_lines(std::string& text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node) {
void IndentationRule::align_form_lines(std::string& text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node) {
const auto lines = str_util::split(text);
// TODO - unsafe (breaks on a list of lists)
int alignment_width = 1;
@@ -56,11 +79,11 @@ void FormattingRule::align_form_lines(std::string& text,
text = aligned_form;
}
void InnerFormattingRule::append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
void InnerIndentationRule::append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) {
return;
}
@@ -69,11 +92,11 @@ void InnerFormattingRule::append_newline(std::string& curr_text,
}
}
void InnerFormattingRule::indent_token(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
void InnerIndentationRule::indent_token(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
const int depth,
const int index) {
if (index < 1 || (m_depth != depth || m_index && m_index.value() != index)) {
return;
}
@@ -83,9 +106,9 @@ void InnerFormattingRule::indent_token(std::string& curr_text,
}
}
void InnerFormattingRule::align_form_lines(std::string& text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node) {
void InnerIndentationRule::align_form_lines(std::string& text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node) {
if (node.metadata.was_on_first_line_of_form) {
return;
}
+34 -5
View File
@@ -6,6 +6,35 @@
class FormatterTreeNode;
namespace formatter_rules {
// The formatter will try to collapse as much space as possible in the top-level, this means
// separating forms by a single empty blank line
//
// The exception is comments, top level comments will retain their following blank lines from the
// original source
// - this could be none, in the case where a comment is directly next to a form (like this one!)
// - you don't want them to be separated!
// - or this could be top level comments / comment blocks documenting things but not being
// associated with a form
// - in this case, you want them to remain separated
//
// Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/blank.md
namespace blank_lines {
void separate_by_newline(std::string& curr_text,
const FormatterTreeNode& containing_node,
const FormatterTreeNode& node,
const int index);
}
// TODO - nothing here yet, in the future:
// - if/when the formatter is concerned with line length, there are implications here
// - align consecutive comment lines
//
// Reference - https://github.com/kkinnear/zprint/blob/main/doc/options/comments.md
namespace comments {}
} // namespace formatter_rules
// Indentation rules are heavily inspired by the descriptions here
// https://github.com/weavejester/cljfmt/blob/master/docs/INDENTS.md
//
@@ -35,9 +64,9 @@ class FormatterTreeNode;
// - otherwise, every element after the 2nd is on a new line and aligned to that 1st list arg.
// (println "hello" ; <= more than one element on first line
// "world")
class FormattingRule {
class IndentationRule {
public:
virtual ~FormattingRule() = default;
virtual ~IndentationRule() = default;
virtual void append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
@@ -64,14 +93,14 @@ class FormattingRule {
// (defn dismiss
// [name]
// (println "Goodbye" name))
class InnerFormattingRule : public FormattingRule {
class InnerIndentationRule : public IndentationRule {
private:
int m_depth;
std::optional<int> m_index;
public:
InnerFormattingRule(int depth) : m_depth(depth){};
InnerFormattingRule(int depth, int index) : m_depth(depth), m_index(index){};
InnerIndentationRule(int depth) : m_depth(depth){};
InnerIndentationRule(int depth, int index) : m_depth(depth), m_index(index){};
virtual void append_newline(std::string& curr_text,
const FormatterTreeNode& node,
const FormatterTreeNode& containing_node,
@@ -0,0 +1,14 @@
===
Separate Forms
===
(println "test")
(println "test") (println "test")
---
(println "test")
(println "test")
(println "test")
@@ -11,16 +11,14 @@ Top-Level Comment
(println "test")
===
TODO - Inline Comment
Inline Comment
===
(println "test") ;; test
---
(println "test")
;; test
(println "test") ;; test
===
TODO - Block Comment
+26 -7
View File
@@ -35,7 +35,7 @@ struct TestDefinition {
std::string output;
};
bool run_tests(const fs::path& file_path, const bool only_important_tests) {
std::vector<TestDefinition> get_test_definitions(const fs::path& file_path) {
// Read in the file, and run the test
const auto contents = str_util::split(file_util::read_text_file(file_path));
std::vector<TestDefinition> tests;
@@ -74,6 +74,21 @@ bool run_tests(const fs::path& file_path, const bool only_important_tests) {
continue;
}
}
return tests;
}
bool has_important_tests(const fs::path& file_path) {
const auto& tests = get_test_definitions(file_path);
for (const auto& test : tests) {
if (str_util::starts_with(test.name, "!")) {
return true;
}
}
return false;
}
bool run_tests(const fs::path& file_path, const bool only_important_tests) {
const auto& tests = get_test_definitions(file_path);
// Run the tests, report successes and failures
bool test_failed = false;
fmt::print("{}:\n", fmt::styled(file_util::base_name(file_path.string()),
@@ -103,11 +118,19 @@ bool run_tests(const fs::path& file_path, const bool only_important_tests) {
return test_failed;
}
bool find_and_run_tests(const bool only_important_tests) {
bool find_and_run_tests() {
// Enumerate test files
const auto test_files = file_util::find_files_recursively(
file_util::get_file_path({"test/common/formatter/corpus"}), std::regex("^.*\.test.gc$"));
bool failed = false;
// First do a pass to see if any tests are meant to be prioritized for debugging
bool only_important_tests = false;
for (const auto& file : test_files) {
only_important_tests = has_important_tests(file);
if (only_important_tests) {
break;
}
}
for (const auto& file : test_files) {
// don't fail fast, but any failure means we return false
if (failed) {
@@ -120,9 +143,5 @@ bool find_and_run_tests(const bool only_important_tests) {
}
TEST(Formatter, FormatterTests) {
// TODO - when i get annoyed enough, make this not a manual change and pre-scan the tests to see
// if there are any flagged tests
//
// This is to make it easier to run an individual test when debugging
EXPECT_TRUE(find_and_run_tests(false));
EXPECT_TRUE(find_and_run_tests());
}