From 55da004d1bf528a4e791e80749b23851f5972e57 Mon Sep 17 00:00:00 2001 From: Hexalotl <15166449+Hexalotl@users.noreply.github.com> Date: Fri, 19 Jan 2024 03:57:10 -0800 Subject: [PATCH] Decomp Ghidra Improvements (#220) * Remove trailing commas from initializers for Ghidra * Substituting enums for their numeric values when initializing arrays * Adding option to copy context to clipboard when built --- requirements.txt | 1 + tools/decompctx.py | 292 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 232 insertions(+), 61 deletions(-) diff --git a/requirements.txt b/requirements.txt index cef1205a..e2e083ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ python-Levenshtein watchdog pyjkernel pcpp +pyperclip \ No newline at end of file diff --git a/tools/decompctx.py b/tools/decompctx.py index 56499aa4..7f8ee4ec 100644 --- a/tools/decompctx.py +++ b/tools/decompctx.py @@ -1,17 +1,40 @@ # This script makes leaves most of the heavy lifting to pcpp which does preprocessing and expansion of files: # https://github.com/ned14/pcpp # To use it make sure you run 'pip install pcpp' +# +# This script also optionally uses pyperclip to conveniently copy the context to the clipboard: +# https://github.com/asweigart/pyperclip +# Install via `pip install pyperclip` + import os import re import argparse +import pyperclip from io import StringIO +from pcpp import Preprocessor from pcpp import CmdPreprocessor from contextlib import redirect_stdout +#region Context Options +class ContextGenerationOptions: + should_strip_attributes = False + should_strip_at_address = False + should_convert_binary_literals = False + should_replace_enums_in_initializers = False + should_strip_initializer_trailing_commas = False +#endregion + #region Regex Patterns at_address_pattern = re.compile(r"(?:.*?)(?:[a-zA-Z_$][\w$]*\s*\*?\s[a-zA-Z_$][\w$]*)\s*((?:AT_ADDRESS|:)(?:\s*\(?\s*)(0x[0-9a-fA-F]+|[a-zA-Z_$][\w$]*)\)?);") attribute_pattern = re.compile(r"(__attribute__)") 
binary_literal_pattern = re.compile(r"\b(0b[01]+)\b") +trailing_initializer_pattern = re.compile(r"^.*?=\s*\{(?:.|\s)+?(,)\s*(?:\/\/.*?|\/\*.*?\*\/)*\s*?\}\s*;", re.MULTILINE) +enum_array_size_initializer_pattern = re.compile(r"\[\s*([a-zA-Z_$][\w$]*)\s*\]\s*;") +enum_declaration_pattern = re.compile(r"^.*(?:typedef\s+)*enum\s(?:[a-zA-Z_$][\w$]*)*\s*\{\s*((?:.|\s)*?)\}\s*(?:[a-zA-Z_$][\w$]*)*\s*;", re.MULTILINE) +enum_value_pattern = re.compile(r"([a-zA-Z_$][\w$]*)\s*(?:=\s*(.*))*") +word_pattern = re.compile(r"\b([a-zA-Z_][\w]*)\b") +white_space_pattern = re.compile(r"\s+") +cast_patterns = re.compile(r"\(int\)") #endregion #region Defaults @@ -34,6 +57,22 @@ default_include_directories: list[str] = [ default_output_filename = "ctx.h" #endregion +#region N64 SDK +def get_n64_sdk(sdk_argument: str)->str: + if sdk_argument: + return sdk_argument + + # No sdk path provided. Try to use default + sdk_argument = os.environ['N64_SDK'] + if not sdk_argument: + return None + + # Since we don't want the user to have to type the full path, all they need + # is to provide the top-level folder for the SDK + sdk_argument = os.path.join(sdk_argument, "ultra/usr/include") + return sdk_argument +#endregion + #region Attribute Stripping def strip_attributes(text_to_strip: str)->str: if not text_to_strip: @@ -112,20 +151,164 @@ def convert_binary_literals(text_to_strip: str) -> str: return text_to_strip #endregion -#region N64 SDK -def get_n64_sdk(sdk_argument: str)->str: - if sdk_argument: - return sdk_argument +#region Strip Trailing Commas +def strip_initializer_trailing_commas(text_to_strip: str) -> str: + if not text_to_strip: + return text_to_strip - # No sdk path provided. 
Try to use default - sdk_argument = os.environ['N64_SDK'] - if not sdk_argument: - return None + trailing_comma_matches = reversed(list(re.finditer(trailing_initializer_pattern, text_to_strip))) + for attribute_match in trailing_comma_matches: + # Create the substring + match_span = attribute_match.span(1) + start_index = match_span[0] + end_index = match_span[1] + prefix = text_to_strip[0:start_index] + postfix = text_to_strip[end_index:len(text_to_strip)] + text_to_strip = prefix + postfix - # Since we don't want the user to have to type the full path, all they need - # is to provide the top-level folder for the SDK - sdk_argument = os.path.join(sdk_argument, "ultra/usr/include") - return sdk_argument + return text_to_strip +#endregion + +#region Enums +def replace_enums_with_numeric_values(text_to_strip: str)->str: + if not text_to_strip: + return text_to_strip + + # Check if there are any uses of enums to initialize arrays + enum_array_size_initializer_matches = list(re.finditer(enum_array_size_initializer_pattern, text_to_strip)) + if len(enum_array_size_initializer_matches) == 0: + # None found, so no need to evaluate the enums + return text_to_strip + + # We need to replace enums. But to do so we need to gather all of the enum values from the context thus far + enum_declarations = list(re.finditer(enum_declaration_pattern, text_to_strip)) + if len(enum_declarations) == 0: + return text_to_strip + + preprocessor = Preprocessor() + enum_to_numeric_dict : dict[str, int] = {} + for enum_declaration in enum_declarations: + enum_members = enum_declaration[1] + split_enum_members = enum_members.split(",") + + enum_numeric_value = 0 + for split_member in split_enum_members: + split_member = re.sub(white_space_pattern, "", split_member) + if not split_member or split_member.isspace(): + continue + + enum_value_match = re.match(enum_value_pattern, split_member) + enum_member_name = enum_value_match[1] + + # Does the enum have an explicit value assigned? 
+ if enum_value_match[2]: + assigned_value = enum_value_match[2] + try: + # Replace usages of enum with numeric value + numeric_expression = enum_value_match[2] + + # Remove casts + numeric_expression = re.sub(cast_patterns, "", numeric_expression) + + # Replace enum names with numerical values + for word_match in reversed(list(re.finditer(word_pattern, numeric_expression))): + word = word_match[1] + if word not in enum_to_numeric_dict: + continue + + word_span = word_match.span(1) + numeric_expression = numeric_expression[0:word_span[0]] + str(enum_to_numeric_dict[word]) + numeric_expression[word_span[1]:len(numeric_expression)] + + # Try to parse it out + tokens = preprocessor.tokenize(numeric_expression) + evaluation = preprocessor.evalexpr(tokens) + assigned_value = evaluation[0] + except Exception as e: + # Can't parse. Might be another enum + print(e) + + # Convert to int + enum_numeric_value = int(assigned_value) + + # Record the value + enum_to_numeric_dict[enum_member_name] = enum_numeric_value + + # By default the enum increases by 1 + enum_numeric_value += 1 + + # With the enum map built we can now replace the usages with the numeric values + enum_array_size_initializer_matches_reversed = reversed(enum_array_size_initializer_matches) + for array_size_initializer_match in enum_array_size_initializer_matches_reversed: + # Does this use a known enum? 
+ enum_name = array_size_initializer_match[1] + if enum_name not in enum_to_numeric_dict: + continue + + enum_numeric_value = enum_to_numeric_dict[enum_name] + + # Create the substring + match_span = array_size_initializer_match.span(1) + start_index = match_span[0] + end_index = match_span[1] + + prefix = text_to_strip[0:start_index] + postfix = text_to_strip[end_index:len(text_to_strip)] + text_to_strip = prefix + str(enum_numeric_value) + postfix + + return text_to_strip +#endregion + +#region Preprocessing +def generate_context(preprocessor_arguments: list[str], context_options: ContextGenerationOptions)->str: + # Create the temp string writer to pass to the preprocessor since we still want to modify + # the contents for project-specific conditions + with StringIO() as preprocessor_string_writer: + with redirect_stdout(preprocessor_string_writer): + # Parse the target file: + CmdPreprocessor(preprocessor_arguments) + + # Check if empty + string_writer_position = preprocessor_string_writer.tell() + if string_writer_position == 0: + return None + + # Do we need to sanitize this further? 
+ if not context_options.should_strip_attributes and not context_options.should_strip_at_address and not context_options.should_strip_initializer_trailing_commas and not context_options.should_convert_binary_literals: + # No sanitization needed, so write the entire file out + return preprocessor_string_writer.getvalue() + + # Sanitize/change the file depending on the context options + with StringIO() as context_string_writer: + # Sanitize line-by-line for easier parsing + preprocessor_string_writer.seek(0) + while True: + line_to_write = preprocessor_string_writer.readline() + if not line_to_write: + break + + if context_options.should_strip_attributes: + line_to_write = strip_attributes(line_to_write) + + if context_options.should_strip_at_address: + line_to_write = strip_at_address(line_to_write) + + if context_options.should_convert_binary_literals: + line_to_write = convert_binary_literals(line_to_write) + + context_string_writer.writelines(line_to_write) + + # Single line cleanup completed + generated_context = context_string_writer.getvalue() + + # Search for multi-line cleanup + if context_options.should_strip_initializer_trailing_commas or context_options.should_replace_enums_in_initializers: + if context_options.should_strip_initializer_trailing_commas: + generated_context = strip_initializer_trailing_commas(generated_context) + + if context_options.should_replace_enums_in_initializers: + generated_context = replace_enums_with_numeric_values(generated_context) + + return generated_context #endregion #region Main @@ -136,9 +319,12 @@ def main(): parser.add_argument("-h", "-help", "--help", dest="help", action="store_true") parser.add_argument("-n64", "--n64-sdk", dest="n64_sdk", help="Path to the N64 SDK top level directory", action="store") parser.add_argument('-D', dest = 'defines', metavar = 'macro[=val]', nargs = 1, action = 'append', help = 'Predefine name as a macro [with value]') - parser.add_argument("--strip-attributes", dest="strip_attributes", 
help="If __attribute__(()) string should be stripped", action="store_true", default=True) - parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or : formatted string should be stripped", action="store_true", default=True) - parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=True) + parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) string should be stripped", action="store_true", default=False) + parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or : formatted string should be stripped", action="store_true", default=False) + parser.add_argument("--strip-initializer_trailing_commas", dest="strip_initializer_trailing_commas", help="If trailing commas in initializers should be stripped", action="store_true", default=False) + parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=False) + parser.add_argument("--replace-enums-in-initializers", dest="replace_enums_in_initializers", help="If enums should be replaced by its numeric value in initializers", action="store_true", default=False) + parser.add_argument("--clipboard", dest="copy_to_clipboard", help="If the context should be copied to the clipboard", action="store_true", default=False) # For the output path, we either want to be explicit or relative, but not both output_target_group = parser.add_mutually_exclusive_group() @@ -161,7 +347,7 @@ def main(): # pcpp preprocessor to show its full list of arguments parser.print_help() preprocessor_arguments.append("--help") - CmdPreprocessor(preprocessor_arguments) + CmdPreprocessor(preprocessor_arguments).tokenize return # Append in the default include directories @@ -197,6 +383,12 @@ def main(): # If not targeting 
Ghidra or m2c we can include more in if not known_args.ghidra and not known_args.m2c: preprocessor_arguments.append("--passthru-defines") + else: + # Don't include the line directives if targeting Ghidra/m2c + preprocessor_arguments.append("--line-directive") + + # For debugging purposes, include unfound includes in output to mark errors + preprocessor_arguments.append("--passthru-unfound-includes") # Compress to minimize whitespace preprocessor_arguments.append("--compress") @@ -210,54 +402,32 @@ def main(): preprocessor_arguments.append(known_args.c_file) # Check if we need to do further conversions after the file is preprocessed - should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c - should_strip_attributes = known_args.strip_attributes or known_args.ghidra or known_args.m2c - should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra + context_options = ContextGenerationOptions() + context_options.should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c + context_options.should_strip_attributes = known_args.strip_attributes or known_args.ghidra or known_args.m2c + context_options.should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra + context_options.should_strip_initializer_trailing_commas = known_args.strip_initializer_trailing_commas or known_args.ghidra + context_options.should_replace_enums_in_initializers = known_args.replace_enums_in_initializers or known_args.ghidra - # Create the temp string writer to pass to the preprocessor since we still want to modify - # the contents for project-specific conditions - with StringIO() as file_string_writer: - with redirect_stdout(file_string_writer): - # Parse the target file: - CmdPreprocessor(preprocessor_arguments) - - # Check if empty - string_writer_position = file_string_writer.tell() - if string_writer_position == 0: - return - - # Write to file - target_file_name = 
None - if known_args.output_path: - target_file_name = known_args.output_path - elif known_args.relative: - target_file_name = f"{c_file}.ctx" - else: - target_file_name = os.path.join(os.getcwd(), default_output_filename) + # Generate the context + generated_context = generate_context(preprocessor_arguments, context_options) - with open(target_file_name, "w", encoding="utf-8", newline="\n") as f: - # Do we need to sanitize this further? - if not should_strip_attributes and not should_strip_at_address and not should_convert_binary_literals: - f.write(file_string_writer.getvalue()) - return - - # Sanitize line-by line for easier parsing - file_string_writer.seek(0) - while True: - line_to_write = file_string_writer.readline() - if not line_to_write: - break + # Determine the file to write to + target_file_name = None + if known_args.output_path: + target_file_name = known_args.output_path + elif known_args.relative: + target_file_name = f"{c_file}.ctx" + else: + target_file_name = os.path.join(os.getcwd(), default_output_filename) + + # Write the generated context to the file + with open(target_file_name, "w", encoding="utf-8", newline="\n") as file_writer: + file_writer.write(generated_context) - if should_strip_attributes: - line_to_write = strip_attributes(line_to_write) - - if should_strip_at_address: - line_to_write = strip_at_address(line_to_write) - - if should_convert_binary_literals: - line_to_write = convert_binary_literals(line_to_write) - - f.writelines(line_to_write) + # Check if we also want to copy to the clipboard + if known_args.copy_to_clipboard: + pyperclip.copy(generated_context) #endregion if __name__ == "__main__":