# This script leaves most of the heavy lifting to pcpp, which does the
# preprocessing and expansion of files:
# https://github.com/ned14/pcpp
# To use it make sure you run 'pip install pcpp'
#
# This script also optionally uses pyperclip to conveniently copy the context
# to the clipboard:
# https://github.com/asweigart/pyperclip
# Install via `pip install pyperclip`

import os
import re
import typing
import argparse
try:
    import pyperclip
except ImportError:
    # pyperclip is only needed when copying the context to the clipboard.
    # Binding the name to None keeps the rest of the script usable when the
    # package is not installed.
    pyperclip = None
from glob import glob
from re import Pattern
from io import StringIO
from pcpp import Preprocessor
from pcpp import CmdPreprocessor
from contextlib import redirect_stdout

#region Context Options
class ContextGenerationOptions:
    # Each flag enables one post-processing step on the preprocessed output.
    # All steps are disabled by default.

    # Remove `__declspec(...)` occurrences
    should_strip_declspec: bool = False
    # Remove `__attribute__((...))` occurrences
    should_strip_attributes: bool = False
    # Remove `AT_ADDRESS(...)` / `: address` suffixes from declarations
    should_strip_at_address: bool = False
    # Rewrite binary literals (0bxxxx) as decimal numbers
    should_convert_binary_literals: bool = False
    # Replace enum names used as array sizes with their numeric value
    should_replace_enums_in_initializers: bool = False
    # Remove the trailing comma at the end of `= { ... };` initializers
    should_strip_initializer_trailing_commas: bool = False
#endregion

#region Regex Patterns
# Group 1 captures the `AT_ADDRESS(...)` / `: address` part of a declaration
at_address_pattern: Pattern = re.compile(r"(?:.*?)(?:[a-zA-Z_$][\w$]*\s*\*?\s[a-zA-Z_$][\w$\[\]]*)\s*((?:AT_ADDRESS|:)(?:\s*\(?\s*)(0x[0-9a-fA-F]+|[a-zA-Z_$][\w$]*)\)?);")
attribute_pattern: Pattern = re.compile(r"(__attribute__)")
declspec_pattern: Pattern = re.compile(r"(__declspec)")
binary_literal_pattern: Pattern = re.compile(r"\b(0b[01]+)\b")
# Group 1 captures the optional trailing comma of a `= { ... };` initializer
trailing_initializer_pattern: Pattern = re.compile(r"^.*?=\s*\{(?:.|\s)+?(,)?\s*(?:\/\/.*?|\/\*.*?\*\/)*\s*?\}\s*;", re.MULTILINE)
# Group 1 captures an identifier used as an array size: `[NAME];`
enum_array_size_initializer_pattern: Pattern = re.compile(r"\[\s*([a-zA-Z_$][\w$]*)\s*\]\s*;")
# Group 1 captures the member list between the braces of an enum declaration
enum_declaration_pattern: Pattern = re.compile(r"^.*(?:typedef\s+)*enum\s(?:[a-zA-Z_$][\w$]*)*\s*\{\s*((?:.|\s)*?)\}\s*(?:[a-zA-Z_$][\w$]*)*\s*;", re.MULTILINE)
# Group 1 is the member name, group 2 the optional explicit value expression
enum_value_pattern: Pattern = re.compile(r"([a-zA-Z_$][\w$]*)\s*(?:=\s*(.*))*")
word_pattern: Pattern = re.compile(r"\b([a-zA-Z_][\w]*)\b")
white_space_pattern: Pattern = re.compile(r"\s+")
cast_patterns: Pattern = re.compile(r"\(int\)")
#endregion

#region Defaults
# Macros that are predefined for the preprocessor
default_defines: typing.Dict[str, str] = {"__MWERKS__" : "1", "_LANGUAGE_C": "1", "F3DEX_GBI_2": "1"}
src_dir = "src"
include_dir = "include"
cwd_dir = os.getcwd()
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.abspath(os.path.join(script_dir, ".."))
# Directories that are always passed to the preprocessor as include paths:
# the `src` and `include` folders next to the script's parent directory, next
# to the script itself, and in the current working directory.
default_include_directories: typing.List[str] = [
    os.path.join(root_dir, src_dir),
    os.path.join(root_dir, include_dir),
    os.path.join(script_dir, src_dir),
    os.path.join(script_dir, include_dir),
    os.path.join(cwd_dir, src_dir),
    os.path.join(cwd_dir, include_dir),
]
default_output_filename = "ctx.h"
#endregion

#region N64 SDK
def get_n64_sdk(sdk_argument: str) -> typing.Optional[str]:
    """Return the N64 SDK include directory to use, or None if there is none.

    An explicitly provided `sdk_argument` is returned unchanged. Otherwise the
    `N64_SDK` environment variable is consulted; when it is set, the
    `ultra/usr/include` sub-folder of that directory is returned.
    """
    # NOTE(review): the explicit argument is used verbatim while the
    # environment variable gets "ultra/usr/include" appended - confirm that
    # this asymmetry is intended.
    if sdk_argument:
        return sdk_argument

    # No sdk path provided. Try to use the default from the environment.
    # `.get` is used so that an unset variable means "no SDK" rather than
    # raising a KeyError.
    sdk_root = os.environ.get('N64_SDK')
    if not sdk_root:
        return None

    # Since we don't want the user to have to type the full path, all they need
    # is to provide the top-level folder for the SDK
    return os.path.join(sdk_root, "ultra/usr/include")
#endregion

#region Shared Stripping Helpers
def _strip_parenthesized(text_to_strip: str, keyword_pattern, open_depth: int) -> str:
    """Remove every `keyword( ... )` occurrence from `text_to_strip`.

    `open_depth` is the parenthesis nesting level that has to be reached
    before the construct counts as opened (2 for `__attribute__((...))`,
    1 for `__declspec(...)`). The construct ends where the nesting level
    drops back to zero afterwards. If it never does, everything up to the end
    of the text is removed.
    """
    # Walk the matches back to front so that removing one occurrence does not
    # shift the indices of the occurrences that still have to be processed
    for keyword_match in reversed(list(keyword_pattern.finditer(text_to_strip))):
        start_index, end_index = keyword_match.span(0)
        paren_count = 0
        opened = False
        while end_index < len(text_to_strip):
            character = text_to_strip[end_index]
            if character == "(":
                paren_count += 1
                if paren_count == open_depth:
                    opened = True
            if character == ")":
                paren_count -= 1
                if opened and paren_count == 0:
                    # Include the closing parenthesis in the removed range
                    end_index += 1
                    break
            end_index += 1

        text_to_strip = text_to_strip[:start_index] + text_to_strip[end_index:]

    return text_to_strip

def _remove_captured_group(text_to_strip: str, pattern, group: int = 1) -> str:
    """Delete the text captured by `group` from every match of `pattern`.

    Matches in which the group did not participate are left untouched.
    """
    # Back to front, so earlier match indices stay valid while editing
    for match in reversed(list(pattern.finditer(text_to_strip))):
        start_index, end_index = match.span(group)
        if start_index < 0:
            # The optional group did not take part in this match
            continue
        text_to_strip = text_to_strip[:start_index] + text_to_strip[end_index:]

    return text_to_strip
#endregion

#region Attribute Stripping
def strip_attributes(text_to_strip: str)->str:
    """Return `text_to_strip` with all `__attribute__((...))` uses removed."""
    if not text_to_strip:
        return text_to_strip

    # Attributes are wrapped in a double parenthesis
    return _strip_parenthesized(text_to_strip, attribute_pattern, 2)
#endregion

#region declspec Stripping
def strip_declspec(text_to_strip: str)->str:
    """Return `text_to_strip` with all `__declspec(...)` uses removed."""
    if not text_to_strip:
        return text_to_strip

    # declspec only uses a single parenthesis
    return _strip_parenthesized(text_to_strip, declspec_pattern, 1)
#endregion

#region At Address Stripping
def strip_at_address(text_to_strip: str) -> str:
    """Return `text_to_strip` without `AT_ADDRESS(...)` / `: address` suffixes.

    Only the part captured by group 1 of `at_address_pattern` is removed; the
    declaration itself and its terminating semicolon are kept.
    """
    if not text_to_strip:
        return text_to_strip

    return _remove_captured_group(text_to_strip, at_address_pattern)
#endregion

#region Binary Literal Conversion
def convert_binary_literals(text_to_strip: str) -> str:
    """Return `text_to_strip` with binary literals (0bxxxx) written in decimal."""
    if not text_to_strip:
        return text_to_strip

    # int() with base 2 accepts the "0b" prefix, so the literal can be
    # converted as-is
    return binary_literal_pattern.sub(
        lambda literal_match: str(int(literal_match[1], 2)),
        text_to_strip)
#endregion

#region Strip Trailing Commas
def strip_initializer_trailing_commas(text_to_strip: str) -> str:
    """Return `text_to_strip` without trailing commas in `= { ... };` initializers."""
    if not text_to_strip:
        return text_to_strip

    # Group 1 of the pattern is the (optional) trailing comma
    return _remove_captured_group(text_to_strip, trailing_initializer_pattern)
#endregion

#region Enums
def _evaluate_enum_expression(expression: str, known_values: typing.Dict[str, int], preprocessor):
    """Evaluate the explicit value expression of an enum member using pcpp.

    Casts matched by `cast_patterns` are removed and names of already known
    enum members are replaced by their numeric value before evaluation.
    Returns the first element of what `preprocessor.evalexpr` produces.
    """
    # Remove casts
    expression = cast_patterns.sub("", expression)

    # Replace enum names with numerical values; unknown words are kept as-is
    expression = word_pattern.sub(
        lambda word_match: str(known_values.get(word_match[1], word_match[1])),
        expression)

    # Try to parse it out
    tokens = preprocessor.tokenize(expression)
    return preprocessor.evalexpr(tokens)[0]

def _collect_enum_values(enum_declarations) -> typing.Dict[str, int]:
    """Build a map of enum member name to numeric value.

    `enum_declarations` are matches of `enum_declaration_pattern`. Members
    without an explicit value get the previous member's value plus one,
    starting at zero for each declaration.
    """
    preprocessor = Preprocessor()
    enum_to_numeric_dict: typing.Dict[str, int] = {}

    for enum_declaration in enum_declarations:
        enum_numeric_value = 0
        for raw_member in enum_declaration[1].split(","):
            member = white_space_pattern.sub("", raw_member)
            if not member:
                # E.g. the empty entry produced by a trailing comma
                continue

            member_match = enum_value_pattern.match(member)
            if member_match is None:
                # Not something that starts with an identifier, so there is
                # no member name to record
                continue

            member_name = member_match[1]
            explicit_value = member_match[2]

            # Does the enum have an explicit value assigned?
            if explicit_value:
                assigned_value = explicit_value
                try:
                    assigned_value = _evaluate_enum_expression(
                        explicit_value, enum_to_numeric_dict, preprocessor)
                except Exception as e:
                    # Can't parse. Report it and fall through to the plain
                    # integer conversion of the raw text below.
                    print(f"Unable to evaluate value of enum member '{member_name}': {e}")

                try:
                    enum_numeric_value = int(assigned_value)
                except (TypeError, ValueError):
                    # Best effort: keep the implicit running value instead of
                    # aborting the whole context generation
                    print(f"Unable to resolve value '{explicit_value}' of enum member '{member_name}', assuming {enum_numeric_value}")

            # Record the value
            enum_to_numeric_dict[member_name] = enum_numeric_value

            # By default the enum increases by 1
            enum_numeric_value += 1

    return enum_to_numeric_dict

def replace_enums_with_numeric_values(text_to_strip: str)->str:
    """Replace enum members used as array sizes (`[NAME];`) by their value.

    The enum values are gathered from the enum declarations found in
    `text_to_strip` itself. Array sizes that do not name a known enum member
    are left untouched.
    """
    if not text_to_strip:
        return text_to_strip

    # Check if there are any uses of enums to initialize arrays
    array_size_matches = list(enum_array_size_initializer_pattern.finditer(text_to_strip))
    if not array_size_matches:
        # None found, so no need to evaluate the enums
        return text_to_strip

    # We need to replace enums. But to do so we need to gather all of the enum
    # values from the context thus far
    enum_declarations = list(enum_declaration_pattern.finditer(text_to_strip))
    if not enum_declarations:
        return text_to_strip

    enum_to_numeric_dict = _collect_enum_values(enum_declarations)

    # With the enum map built we can now replace the usages with the numeric
    # values. Go back to front so the remaining match indices stay valid.
    for array_size_match in reversed(array_size_matches):
        # Does this use a known enum?
        enum_name = array_size_match[1]
        if enum_name not in enum_to_numeric_dict:
            continue

        start_index, end_index = array_size_match.span(1)
        text_to_strip = (
            text_to_strip[:start_index]
            + str(enum_to_numeric_dict[enum_name])
            + text_to_strip[end_index:])

    return text_to_strip
#endregion

#region Preprocessing
def generate_context(preprocessor_arguments: typing.List[str], context_options: ContextGenerationOptions) -> typing.Optional[str]:
    """Run pcpp with `preprocessor_arguments` and post-process its output.

    Whatever pcpp prints to stdout is captured and then sanitized according
    to `context_options`. Returns the resulting text, or None if nothing was
    captured.
    """
    # Single-line cleanups, applied to every line in this order
    line_filters = [
        line_filter
        for is_enabled, line_filter in (
            (context_options.should_strip_declspec, strip_declspec),
            (context_options.should_strip_attributes, strip_attributes),
            (context_options.should_strip_at_address, strip_at_address),
            (context_options.should_convert_binary_literals, convert_binary_literals),
        )
        if is_enabled
    ]

    # Multi-line cleanups, applied to the whole text in this order
    text_filters = [
        text_filter
        for is_enabled, text_filter in (
            (context_options.should_strip_initializer_trailing_commas, strip_initializer_trailing_commas),
            (context_options.should_replace_enums_in_initializers, replace_enums_with_numeric_values),
        )
        if is_enabled
    ]

    # Create the temp string writer to pass to the preprocessor since we still
    # want to modify the contents for project-specific conditions
    with StringIO() as preprocessor_string_writer:
        with redirect_stdout(preprocessor_string_writer):
            # Parse the target file:
            CmdPreprocessor(preprocessor_arguments)

        # Check if empty
        if preprocessor_string_writer.tell() == 0:
            return None

        # Do we need to sanitize this further? Every option has to be taken
        # into account here, including the enum replacement.
        if not line_filters and not text_filters:
            # No sanitation needed, so write the entire file out
            return preprocessor_string_writer.getvalue()

        # Sanitize line-by-line for easier parsing
        preprocessor_string_writer.seek(0)
        sanitized_lines = []
        for line_to_write in preprocessor_string_writer:
            for line_filter in line_filters:
                line_to_write = line_filter(line_to_write)
            sanitized_lines.append(line_to_write)

    # Single line cleanup completed
    generated_context = "".join(sanitized_lines)

    # Multi-line cleanup
    for text_filter in text_filters:
        generated_context = text_filter(generated_context)

    return generated_context
#endregion

#region Main
def main():
    """Parse the command line, run the preprocessor and write the context file."""
    # Write initial parser
    parser = argparse.ArgumentParser(prog="Decomp Context", description="Wrapper around pcpp that can create a context file which can be used for decompilation", add_help=False)
    parser.add_argument("c_file", nargs="?", help="File from which to create context")
    parser.add_argument("-h", "-help", "--help", dest="help", action="store_true")
    parser.add_argument("-n64", "--n64-sdk", dest="n64_sdk", help="Path to the N64 SDK top level directory", action="store")
    parser.add_argument('-D', dest = 'defines', metavar = 'macro[=val]', nargs = 1, action = 'append', help = 'Predefine name as a macro [with value]')
    parser.add_argument("--strip-declspec", dest="strip_declspec", help="If __declspec() string should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) string should be stripped", action="store_true", default=False)
    parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or : formatted string should be stripped", action="store_true", default=False)
    # The original spelling mixes dashes and underscores; the all-dash
    # spelling is accepted as an alias
    parser.add_argument("--strip-initializer_trailing_commas", "--strip-initializer-trailing-commas", dest="strip_initializer_trailing_commas", help="If trailing commas in initializers should be stripped", action="store_true", default=False)
    parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=False)
    parser.add_argument("--replace-enums-in-initializers", dest="replace_enums_in_initializers", help="If enums should be replaced by its numeric value in initializers", action="store_true", default=False)
    parser.add_argument("--clipboard", dest="copy_to_clipboard", help="If the context should be copied to the clipboard", action="store_true", default=False)

    # For the output path, we either want to be explicit or relative, but not both
    output_target_group = parser.add_mutually_exclusive_group()
    output_target_group.add_argument("-o", dest="output_path", help="Explicit path to output the context file to", action="store")
    output_target_group.add_argument("-r", "--relative", dest="relative", help="Generate context relative to the source file", action="store_true")

    # When targeting a specific platform we want to only do one thing or another
    platform_target_group = parser.add_mutually_exclusive_group()
    platform_target_group.add_argument("--m2c", dest="m2c", help="Generates an m2c-friendly file", action="store_true")
    platform_target_group.add_argument("--ghidra", dest="ghidra", help="Generates an Ghidra-friendly file", action="store_true")

    # Parse the known arguments; everything unknown is passed through to pcpp
    known_args, pass_through_args = parser.parse_known_args()

    preprocessor_arguments = ['pcpp']
    if known_args.help:
        # Since this script acts as a wrapper for the main pcpp script
        # we want to manually display the help and pass it through to the
        # pcpp preprocessor to show its full list of arguments
        parser.print_help()
        preprocessor_arguments.append("--help")
        CmdPreprocessor(preprocessor_arguments)
        return

    c_file = known_args.c_file
    if known_args.relative and not c_file:
        # The relative output name is derived from the source file
        parser.error("-r/--relative requires a source file to be specified")

    # Append in the default include directories
    include_directories: typing.List[str] = list(default_include_directories)
    n64_sdk = get_n64_sdk(known_args.n64_sdk)
    if n64_sdk:
        include_directories.append(n64_sdk)

    for include_directory in include_directories:
        preprocessor_arguments.extend(("-I", include_directory))

    # Check if we have any passed in defines. `-D` uses nargs=1 together with
    # action="append", so every entry is a one-element list.
    include_defines: typing.List[str] = []
    known_defines: typing.List[str] = []
    for define_argument in known_args.defines or []:
        define = define_argument[0]
        include_defines.append(define)
        known_defines.append(define.split("=")[0])

    if not c_file:
        # If no file is specified it is assumed we want to create a mega
        # context file that is the aggregate of all include files
        include_files: typing.Set[str] = set()
        for include_directory in default_include_directories:
            for directory_path, _, _ in os.walk(include_directory):
                include_files.update(glob(os.path.join(directory_path, '*.h')))

        # Add each file as an input so that pcpp can parse them into a single
        # output file. Sort the files for some consistency.
        preprocessor_arguments.extend(sorted(include_files))
    else:
        # Add the file we want to read
        preprocessor_arguments.append(c_file)

    # Add in the default defines unless explicitly passed in as arguments
    for default_define, default_define_value in default_defines.items():
        if default_define in known_defines:
            continue
        include_defines.append(default_define + "=" + default_define_value)

    # Add the defines to the arguments
    for define in include_defines:
        preprocessor_arguments.extend(("-D", define))

    # If not targeting Ghidra or m2c we can include more in
    if not known_args.ghidra and not known_args.m2c:
        preprocessor_arguments.append("--passthru-defines")
    else:
        # Don't include the line directives if targeting Ghidra/m2c
        preprocessor_arguments.append("--line-directive")

    # For debugging purposes, include unfound includes in output to mark errors
    preprocessor_arguments.append("--passthru-unfound-includes")

    # Compress to minimize whitespace
    preprocessor_arguments.append("--compress")

    # Add unknown arguments and pass them to pcpp
    preprocessor_arguments.extend(pass_through_args)

    # Check if we need to do further conversions after the file is preprocessed
    context_options = ContextGenerationOptions()
    context_options.should_strip_declspec = known_args.strip_declspec or known_args.ghidra or known_args.m2c
    context_options.should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c
    context_options.should_strip_attributes = known_args.strip_attributes or known_args.m2c
    context_options.should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra
    context_options.should_strip_initializer_trailing_commas = known_args.strip_initializer_trailing_commas or known_args.ghidra
    context_options.should_replace_enums_in_initializers = known_args.replace_enums_in_initializers or known_args.ghidra

    # Generate the context
    generated_context = generate_context(preprocessor_arguments, context_options)
    if generated_context is None:
        # Nothing was produced; bail out before the target file gets truncated
        parser.exit(1, "The preprocessor did not produce any output, no context was written\n")

    # Determine the file to write to
    if known_args.output_path:
        target_file_name = known_args.output_path
    elif known_args.relative:
        target_file_name = f"{c_file}.ctx"
    else:
        target_file_name = os.path.join(os.getcwd(), default_output_filename)

    # Write the generated context to the file
    with open(target_file_name, "w", encoding="utf-8", newline="\n") as file_writer:
        file_writer.write(generated_context)

    # Check if we also want to copy to the clipboard
    if known_args.copy_to_clipboard:
        # Imported here because the package is only needed for this option
        try:
            import pyperclip
        except ImportError:
            parser.exit(1, "Copying to the clipboard requires pyperclip, install it via `pip install pyperclip`\n")
        pyperclip.copy(generated_context)
#endregion

if __name__ == "__main__":
    main()