Write an extractor to generate AST from tree-sitter

This commit is contained in:
Charlie Marsh 2022-10-05 17:51:45 -04:00
parent 40ab5d353b
commit c8dad90115
3 changed files with 561 additions and 131 deletions

115
Cargo.lock generated
View File

@ -404,9 +404,9 @@ checksum = "fff857943da45f546682664a79488be82e69e43c1a7a2307679ab9afb3a66d2e"
[[package]]
name = "clap"
version = "4.0.9"
version = "4.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30607dd93c420c6f1f80b544be522a0238a7db35e6a12968d28910983fee0df0"
checksum = "4ed45cc2c62a3eff523e718d8576ba762c83a3146151093283ac62ae11933a73"
dependencies = [
"atty",
"bitflags",
@ -419,9 +419,9 @@ dependencies = [
[[package]]
name = "clap_derive"
version = "4.0.9"
version = "4.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a307492e1a34939f79d3b6b9650bd2b971513cd775436bf2b78defeb5af00b"
checksum = "db342ce9fda24fb191e2ed4e102055a4d381c1086a06630174cd8da8d5d917ce"
dependencies = [
"heck",
"proc-macro-error",
@ -452,6 +452,16 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "codespan-reporting"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
dependencies = [
"termcolor",
"unicode-width",
]
[[package]]
name = "colored"
version = "2.0.0"
@ -574,6 +584,50 @@ dependencies = [
"syn",
]
[[package]]
name = "cxx"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19f39818dcfc97d45b03953c1292efc4e80954e1583c4aa770bac1383e2310a4"
dependencies = [
"cc",
"cxxbridge-flags",
"cxxbridge-macro",
"link-cplusplus",
]
[[package]]
name = "cxx-build"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e580d70777c116df50c390d1211993f62d40302881e54d4b79727acb83d0199"
dependencies = [
"cc",
"codespan-reporting",
"once_cell",
"proc-macro2",
"quote",
"scratch",
"syn",
]
[[package]]
name = "cxxbridge-flags"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56a46460b88d1cec95112c8c363f0e2c39afdb237f60583b0b36343bf627ea9c"
[[package]]
name = "cxxbridge-macro"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747b608fecf06b0d72d440f27acc99288207324b793be2c17991839f3d4995ea"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "diff"
version = "0.1.13"
@ -991,17 +1045,28 @@ checksum = "dfa686283ad6dd069f105e5ab091b04c62850d3e4cf5d67debad1933f55023df"
[[package]]
name = "iana-time-zone"
version = "0.1.50"
version = "0.1.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd911b35d940d2bd0bea0f9100068e5b97b51a1cbe13d13382f132e0365257a0"
checksum = "f5a6ef98976b22b3b7f2f3a806f858cb862044cfa66805aa3ad84cb3d3b785ed"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"winapi 0.3.9",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fde6edd6cef363e9359ed3c98ba64590ba9eecba2293eb5a723ab32aee8926aa"
dependencies = [
"cxx",
"cxx-build",
]
[[package]]
name = "idna"
version = "0.3.0"
@ -1085,9 +1150,9 @@ dependencies = [
[[package]]
name = "itoa"
version = "1.0.3"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc"
[[package]]
name = "js-sys"
@ -1221,6 +1286,15 @@ dependencies = [
"syn",
]
[[package]]
name = "link-cplusplus"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9272ab7b96c9046fbc5bc56c06c117cb639fe2d509df0c421cad82d2915cf369"
dependencies = [
"cc",
]
[[package]]
name = "linked-hash-map"
version = "0.5.6"
@ -1928,6 +2002,7 @@ dependencies = [
"log",
"notify",
"num-bigint",
"num-traits",
"once_cell",
"path-absolutize",
"rayon",
@ -2059,6 +2134,12 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "scratch"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8132065adcfd6e02db789d9285a0deb2f3fcb04002865ab67d5fb103533898"
[[package]]
name = "sct"
version = "0.7.0"
@ -2097,9 +2178,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.85"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e55a28e3aaef9d5ce0506d0a14dbba8054ddc7e499ef522dd8b26859ec9d4a44"
checksum = "41feea4228a6f1cd09ec7a3593a682276702cd67b5273544757dae23c096f074"
dependencies = [
"itoa",
"ryu",
@ -2282,9 +2363,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.101"
version = "1.0.102"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e90cde112c4b9690b8cbe810cba9ddd8bc1d7472e2cae317b69e9438c1cba7d2"
checksum = "3fcd952facd492f9be3ef0d0b7032a6e442ee9b361d4acc2b1d0c4aaa5f613a1"
dependencies = [
"proc-macro2",
"quote",
@ -2520,9 +2601,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
[[package]]
name = "unicode-ident"
version = "1.0.4"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcc811dc4066ac62f84f11307873c4850cb653bfa9b1719cee2bd2204a4bc5dd"
checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3"
[[package]]
name = "unicode-normalization"
@ -2533,6 +2614,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
[[package]]
name = "unicode-xid"
version = "0.2.4"

View File

@ -7,7 +7,7 @@ edition = "2021"
name = "ruff"
[build-dependencies]
cc="*"
cc = "*"
[dependencies]
anyhow = { version = "1.0.60" }
@ -27,23 +27,24 @@ itertools = { version = "0.10.5" }
libcst = { git = "https://github.com/charliermarsh/LibCST", rev = "32a044c127668df44582f85699358e67803b0d73" }
log = { version = "0.4.17" }
notify = { version = "4.0.17" }
num-bigint = { version = "0.4.3" }
num-traits = { version = "0.2.15" }
once_cell = { version = "1.13.1" }
path-absolutize = { version = "3.0.13", features = ["once_cell_cache"] }
rayon = { version = "1.5.3" }
regex = { version = "1.6.0" }
rustpython-ast = { features = ["unparse"], git = "https://github.com/charliermarsh/RustPython.git", rev = "778ae2aeb521d0438d2a91bd11238bb5c2bf9d4f" }
rustpython-parser = { features = ["lalrpop"], git = "https://github.com/charliermarsh/RustPython.git", rev = "778ae2aeb521d0438d2a91bd11238bb5c2bf9d4f" }
rustpython-common = { git = "https://github.com/charliermarsh/RustPython.git", rev = "778ae2aeb521d0438d2a91bd11238bb5c2bf9d4f" }
rustpython-parser = { features = ["lalrpop"], git = "https://github.com/charliermarsh/RustPython.git", rev = "778ae2aeb521d0438d2a91bd11238bb5c2bf9d4f" }
serde = { version = "1.0.143", features = ["derive"] }
serde_json = { version = "1.0.83" }
strum = { version = "0.24.1", features = ["strum_macros"] }
strum_macros = { version = "0.24.3" }
toml = { version = "0.5.9" }
tree-sitter = "0.20.9"
tree-sitter-python = "0.20.2"
tree-sitter = { version = "0.20.9" }
tree-sitter-python = { version = "0.20.2" }
update-informer = { version = "0.5.0", default_features = false, features = ["pypi"], optional = true }
walkdir = { version = "2.3.2" }
strum = { version = "0.24.1", features = ["strum_macros"] }
strum_macros = "0.24.3"
num-bigint = "0.4.3"
[dev-dependencies]
insta = { version = "1.19.1", features = ["yaml"] }

View File

@ -1,10 +1,431 @@
use anyhow::Result;
use tree_sitter::{Parser, Query, QueryCursor};
extern crate core;
enum Action {
Up,
Down,
Right,
use anyhow::Result;
use num_bigint::BigInt;
use num_traits::{float, Num};
use rustpython_ast::{
Arguments, Constant, Expr, ExprContext, ExprKind, Keyword, KeywordData, Location, Operator,
Stmt, StmtKind, Withitem,
};
use tree_sitter::{Node, Parser, Point};
/// Converts a tree-sitter `Point` into an AST `Location`, adding one to both the row and the
/// column reported by tree-sitter.
fn to_location(point: Point) -> Location {
    let row = point.row + 1;
    let column = point.column + 1;
    Location::new(row, column)
}
/// Debugging aid: prints a node's start row and column (exactly as tree-sitter reports them),
/// its kind, and the slice of `source` that the node spans.
///
/// # Panics
///
/// Panics if the spanned bytes are not valid UTF-8.
fn print_node(node: Node, source: &[u8]) {
    let span = node.range();
    let snippet = &source[span.start_byte..span.end_byte];
    let start = span.start_point;
    println!(
        "[Line: {}, Col: {}] {}: `{}`",
        start.row,
        start.column,
        node.kind(),
        std::str::from_utf8(snippet).unwrap()
    );
}
/// Extracts one statement for every child of the module (root) node.
///
/// Nodes that tree-sitter marks as "extra" (such as comments) are skipped: they are not
/// statements, and handing them to `extract_statement` would hit its catch-all panic.
fn extract_module(node: Node, source: &[u8]) -> Vec<Stmt> {
    let mut cursor = node.walk();
    node.children(&mut cursor)
        .filter(|child| !child.is_extra())
        .map(|child| extract_statement(child, source))
        .collect()
}
/// Extracts one statement for every child of a suite (statement body) node.
///
/// Nodes that tree-sitter marks as "extra" (such as comments) are skipped: they are not
/// statements, and handing them to `extract_statement` would hit its catch-all panic.
fn extract_suite(node: Node, source: &[u8]) -> Vec<Stmt> {
    let mut cursor = node.walk();
    node.children(&mut cursor)
        .filter(|child| !child.is_extra())
        .map(|child| extract_statement(child, source))
        .collect()
}
/// Returns an owned copy of the source text spanned by `node`.
///
/// # Panics
///
/// Panics if the spanned bytes are not valid UTF-8.
fn extract_text(node: Node, source: &[u8]) -> String {
    let span = node.range();
    let bytes = &source[span.start_byte..span.end_byte];
    String::from(std::str::from_utf8(bytes).unwrap())
}
/// Maps the operator token of an augmented assignment (for example `+=`) to its `Operator`.
///
/// The token is identified by the node's kind; `source` is not consulted.
///
/// # Panics
///
/// Panics if the node's kind is not one of the augmented-assignment operators listed below.
fn extract_augmented_operator(node: Node, source: &[u8]) -> Operator {
    let operator = match node.kind() {
        // Arithmetic.
        "+=" => Some(Operator::Add),
        "-=" => Some(Operator::Sub),
        "*=" => Some(Operator::Mult),
        "@=" => Some(Operator::MatMult),
        "/=" => Some(Operator::Div),
        "%=" => Some(Operator::Mod),
        "**=" => Some(Operator::Pow),
        // Shifts and bitwise operators.
        "<<=" => Some(Operator::LShift),
        ">>=" => Some(Operator::RShift),
        "|=" => Some(Operator::BitOr),
        "^=" => Some(Operator::BitXor),
        "&=" => Some(Operator::BitAnd),
        "//=" => Some(Operator::FloorDiv),
        _ => None,
    };
    operator.unwrap_or_else(|| panic!("Invalid operator: {:?}", node))
}
/// Maps the operator token of a binary expression (for example `+`) to its `Operator`.
///
/// The token is identified by the node's kind; `source` is not consulted.
///
/// # Panics
///
/// Panics if the node's kind is not one of the binary operators listed below.
fn extract_operator(node: Node, source: &[u8]) -> Operator {
    let operator = match node.kind() {
        // Arithmetic.
        "+" => Some(Operator::Add),
        "-" => Some(Operator::Sub),
        "*" => Some(Operator::Mult),
        "@" => Some(Operator::MatMult),
        "/" => Some(Operator::Div),
        "%" => Some(Operator::Mod),
        "**" => Some(Operator::Pow),
        // Shifts and bitwise operators.
        "<<" => Some(Operator::LShift),
        ">>" => Some(Operator::RShift),
        "|" => Some(Operator::BitOr),
        "^" => Some(Operator::BitXor),
        "&" => Some(Operator::BitAnd),
        "//" => Some(Operator::FloorDiv),
        _ => None,
    };
    operator.unwrap_or_else(|| panic!("Invalid operator: {:?}", node))
}
/// Placeholder: returns an `Arguments` value with every list empty and every optional field
/// unset. Neither `node` nor `source` is inspected yet.
fn extract_arguments(node: Node, source: &[u8]) -> Arguments {
    Arguments {
        posonlyargs: Vec::new(),
        args: Vec::new(),
        vararg: None,
        kwonlyargs: Vec::new(),
        kw_defaults: Vec::new(),
        kwarg: None,
        defaults: Vec::new(),
    }
}
/// Placeholder: debug-prints every child of the given node and returns no with-items.
fn extract_with_clause(node: Node, source: &[u8]) -> Vec<Withitem> {
    let mut cursor = node.walk();
    node.children(&mut cursor)
        .for_each(|child| print_node(child, source));
    Vec::new()
}
fn extract_statement(node: Node, source: &[u8]) -> Stmt {
match node.kind() {
"for_statement" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::For {
target: Box::new(extract_expression(
node.child_by_field_name("left").unwrap(),
source,
)),
iter: Box::new(extract_expression(
node.child_by_field_name("right").unwrap(),
source,
)),
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
// STOPSHIP(charlie): Unimplemented.
orelse: vec![],
type_comment: None,
},
),
"while_statement" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::While {
test: Box::new(extract_expression(
node.child_by_field_name("condition").unwrap(),
source,
)),
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
// STOPSHIP(charlie): Unimplemented.
orelse: vec![],
},
),
"with_statement" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::With {
// TODO(charlie): If async, this will be 2? Also, we need to iterate until we find
// this, probably.
items: extract_with_clause(node.child(1).unwrap(), source),
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
type_comment: None,
},
),
"class_definition" => {
if let Some((bases, keywords)) = node
.child_by_field_name("superclasses")
.map(|node| extract_argument_list(node, source))
{
Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::ClassDef {
name: extract_text(node.child_by_field_name("name").unwrap(), source),
bases,
keywords,
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
// TODO(charlie): How do I access these? Probably need to pass them down or
// recurse.
decorator_list: vec![],
},
)
} else {
Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::ClassDef {
name: extract_text(node.child_by_field_name("name").unwrap(), source),
bases: vec![],
keywords: vec![],
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
// TODO(charlie): How do I access these? Probably need to pass them down or
// recurse.
decorator_list: vec![],
},
)
}
}
"function_definition" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::FunctionDef {
name: extract_text(node.child(1).unwrap(), source),
args: Box::new(extract_arguments(node.child(2).unwrap(), source)),
body: extract_suite(node.child_by_field_name("body").unwrap(), source),
decorator_list: vec![],
returns: None,
type_comment: None,
},
),
"return_statement" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::Return {
value: node
.child(1)
.map(|node| Box::new(extract_expression(node, source))),
},
),
"pass_statement" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::Pass,
),
"expression_statement" => {
let node = node.child(0).unwrap();
match node.kind() {
"assignment" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::Assign {
targets: vec![],
value: Box::new(extract_expression(node.child(2).unwrap(), source)),
type_comment: None,
},
),
"augmented_assignment" => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::AugAssign {
target: Box::new(extract_expression(
node.child_by_field_name("left").unwrap(),
source,
)),
value: Box::new(extract_expression(
node.child_by_field_name("right").unwrap(),
source,
)),
op: extract_augmented_operator(
node.child_by_field_name("operator").unwrap(),
source,
),
},
),
_ => Stmt::new(
to_location(node.start_position()),
to_location(node.end_position()),
StmtKind::Expr {
value: Box::new(extract_expression(node, source)),
},
),
}
}
_ => panic!("Unhandled node: {}", node.kind()),
}
}
/// Extracts an expression from every child of `node` except the `(`, `)` and `,` tokens.
fn extract_expression_list(node: Node, source: &[u8]) -> Vec<Expr> {
    let mut cursor = node.walk();
    node.children(&mut cursor)
        .filter(|child| !matches!(child.kind(), "(" | ")" | ","))
        .map(|child| extract_expression(child, source))
        .collect()
}
fn extract_keyword_argument(node: Node, source: &[u8]) -> Keyword {
Keyword::new(
Default::default(),
Default::default(),
KeywordData {
arg: Some(extract_text(
node.child_by_field_name("name").unwrap(),
source,
)),
value: Box::new(extract_expression(
node.child_by_field_name("value").unwrap(),
source,
)),
},
)
}
/// Splits the children of an argument-list node into positional arguments and keywords.
///
/// `keyword_argument` children become keywords. Every other named, non-extra child is treated
/// as a positional argument and converted with `extract_expression`, so arguments such as
/// strings, calls or binary operations are no longer silently dropped. Anonymous tokens
/// (parentheses, commas) and extra nodes such as comments are ignored.
///
/// # Panics
///
/// Panics (inside `extract_expression`) if a positional argument is of a node kind that
/// `extract_expression` does not handle.
fn extract_argument_list(node: Node, source: &[u8]) -> (Vec<Expr>, Vec<Keyword>) {
    let mut args = vec![];
    let mut keywords = vec![];
    for child in node.children(&mut node.walk()) {
        if !child.is_named() || child.is_extra() {
            continue;
        }
        if child.kind() == "keyword_argument" {
            keywords.push(extract_keyword_argument(child, source));
        } else {
            args.push(extract_expression(child, source));
        }
    }
    (args, keywords)
}
fn extract_expression(node: Node, source: &[u8]) -> Expr {
match node.kind() {
"integer" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Int(
BigInt::from_str_radix(&extract_text(node, source), 10).unwrap(),
),
kind: None,
},
),
"float" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Float(extract_text(node, source).parse::<f64>().unwrap()),
kind: None,
},
),
"string" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Str(extract_text(node, source)),
kind: None,
},
),
"tuple" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Tuple {
elts: extract_expression_list(node, source),
ctx: ExprContext::Load,
},
),
"identifier" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Name {
id: std::str::from_utf8(&source[node.range().start_byte..node.range().end_byte])
.unwrap()
.to_string(),
ctx: ExprContext::Load,
},
),
"call" => {
let argument_list =
extract_argument_list(node.child_by_field_name("arguments").unwrap(), source);
Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Call {
func: Box::new(extract_expression(
node.child_by_field_name("function").unwrap(),
source,
)),
args: argument_list.0,
keywords: argument_list.1,
},
)
}
"binary_operator" => {
print_node(node, source);
Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::BinOp {
left: Box::new(extract_expression(
node.child_by_field_name("left").unwrap(),
source,
)),
op: extract_operator(node.child_by_field_name("operator").unwrap(), source),
right: Box::new(extract_expression(
node.child_by_field_name("right").unwrap(),
source,
)),
},
)
}
"true" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Bool(true),
kind: None,
},
),
"false" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Bool(false),
kind: None,
},
),
"ellipsis" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Constant {
value: Constant::Ellipsis,
kind: None,
},
),
"yield" => match node.child(1) {
None => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Yield { value: None },
),
Some(node) => match node.kind() {
"from" => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::YieldFrom {
value: Box::new(extract_expression(node.next_sibling().unwrap(), source)),
},
),
_ => Expr::new(
to_location(node.start_position()),
to_location(node.end_position()),
ExprKind::Yield {
value: Some(Box::new(extract_expression(node, source))),
},
),
},
},
_ => {
print_node(node, source);
panic!("Unhandled node: {}", node.kind())
}
}
}
fn main() -> Result<()> {
@ -13,117 +434,38 @@ def double(x):
# Return a double.
return x * 2
x = double(1)
y = (f"{x}" "b")
x = (double(500), double(2, z=1))
x += 1
class Foo:
pass
for x in range(5):
yield x
yield from x
x = True
x = b"abc"
while True:
pass
with (
foo as bar,
baz as wop):
pass
"#;
let mut parser = Parser::new();
parser
.set_language(tree_sitter_python::language())
.expect("Error loading Python grammar");
let parse_tree = parser.parse(src, None);
let parse_tree = parser.parse(src.as_bytes(), None);
if let Some(parse_tree) = &parse_tree {
// Check for comments.
let query = Query::new(tree_sitter_python::language(), "(comment) @capture")?;
let mut query_cursor = QueryCursor::new();
let all_matches = query_cursor.matches(&query, parse_tree.root_node(), src.as_bytes());
for each_match in all_matches {
for capture in each_match.captures.iter() {
let range = capture.node.range();
let text = &src[range.start_byte..range.end_byte];
let line = range.start_point.row;
let col = range.start_point.column;
println!(
"[Line: {}, Col: {}] Offending source code: `{}`",
line, col, text
);
}
}
// Check for string concatenations.
let query = Query::new(
tree_sitter_python::language(),
"(concatenated_string) @capture",
)?;
let mut query_cursor = QueryCursor::new();
let all_matches = query_cursor.matches(&query, parse_tree.root_node(), src.as_bytes());
for each_match in all_matches {
for capture in each_match.captures.iter() {
let range = capture.node.range();
let text = &src[range.start_byte..range.end_byte];
let line = range.start_point.row;
let col = range.start_point.column;
println!(
"[Line: {}, Col: {}] Offending source code: `{}`",
line, col, text
);
}
}
// Walk the tree.
let mut cursor = parse_tree.walk();
let mut action = Action::Down;
loop {
match action {
Action::Up => {
if cursor.goto_next_sibling() {
action = Action::Right;
} else if cursor.goto_parent() {
action = Action::Up;
} else {
break;
}
}
Action::Down => {
let range = cursor.node().range();
let text = &src[range.start_byte..range.end_byte];
let line = range.start_point.row;
let col = range.start_point.column;
println!(
"[Line: {}, Col: {}] {}: `{}`",
line,
col,
cursor.node().kind(),
text
);
if cursor.goto_first_child() {
action = Action::Down;
} else if cursor.goto_next_sibling() {
action = Action::Right;
} else if cursor.goto_parent() {
action = Action::Up;
} else {
break;
}
}
Action::Right => {
let range = cursor.node().range();
let text = &src[range.start_byte..range.end_byte];
let line = range.start_point.row;
let col = range.start_point.column;
println!(
"[Line: {}, Col: {}] {}: `{}`",
line,
col,
cursor.node().kind(),
text
);
if cursor.goto_first_child() {
action = Action::Down;
} else if cursor.goto_next_sibling() {
action = Action::Right;
} else if cursor.goto_parent() {
action = Action::Up;
} else {
break;
}
}
}
}
let _ = extract_module(parse_tree.root_node(), src.as_bytes());
// println!(
// "{:#?}",
// extract_module(parse_tree.root_node(), src.as_bytes())
// );
}
Ok(())