From 55da004d1bf528a4e791e80749b23851f5972e57 Mon Sep 17 00:00:00 2001
From: Hexalotl <15166449+Hexalotl@users.noreply.github.com>
Date: Fri, 19 Jan 2024 03:57:10 -0800
Subject: [PATCH] Decomp Ghidra Improvements (#220)

* Remove trailing commas from initializers for Ghidra

* Substituting enums for their numeric values when initializing arrays

* Adding option to copy context to clipboard when built
---
 requirements.txt   |   1 +
 tools/decompctx.py | 292 +++++++++++++++++++++++++++++++++++----------
 2 files changed, 232 insertions(+), 61 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cef1205a..e2e083ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,3 +7,4 @@ python-Levenshtein
 watchdog
 pyjkernel
 pcpp
+pyperclip
\ No newline at end of file
diff --git a/tools/decompctx.py b/tools/decompctx.py
index 56499aa4..7f8ee4ec 100644
--- a/tools/decompctx.py
+++ b/tools/decompctx.py
@@ -1,17 +1,40 @@
 # This script makes leaves most of the heavy lifting to pcpp which does preprocessing and expansion of files:
 # https://github.com/ned14/pcpp
 # To use it make sure you run 'pip install pcpp'
+#
+# This script also optionally uses pyperclip to conveniently copy the context to the clipboard:
+# https://github.com/asweigart/pyperclip
+# Install via `pip install pyperclip`
+
 import os
 import re
 import argparse
+import pyperclip
 from io import StringIO
+from pcpp import Preprocessor
 from pcpp import CmdPreprocessor
 from contextlib import redirect_stdout
 
+#region Context Options
+class ContextGenerationOptions:
+    should_strip_attributes = False
+    should_strip_at_address = False
+    should_convert_binary_literals = False
+    should_replace_enums_in_initializers = False
+    should_strip_initializer_trailing_commas = False
+#endregion
+
 #region Regex Patterns
 at_address_pattern = re.compile(r"(?:.*?)(?:[a-zA-Z_$][\w$]*\s*\*?\s[a-zA-Z_$][\w$]*)\s*((?:AT_ADDRESS|:)(?:\s*\(?\s*)(0x[0-9a-fA-F]+|[a-zA-Z_$][\w$]*)\)?);")
 attribute_pattern = re.compile(r"(__attribute__)")
 binary_literal_pattern = re.compile(r"\b(0b[01]+)\b")
+trailing_initializer_pattern = re.compile(r"^.*?=\s*\{(?:.|\s)+?(,)\s*(?:\/\/.*?|\/\*.*?\*\/)*\s*?\}\s*;", re.MULTILINE)
+enum_array_size_initializer_pattern = re.compile(r"\[\s*([a-zA-Z_$][\w$]*)\s*\]\s*;")
+enum_declaration_pattern = re.compile(r"^.*(?:typedef\s+)*enum\s(?:[a-zA-Z_$][\w$]*)*\s*\{\s*((?:.|\s)*?)\}\s*(?:[a-zA-Z_$][\w$]*)*\s*;", re.MULTILINE)
+enum_value_pattern = re.compile(r"([a-zA-Z_$][\w$]*)\s*(?:=\s*(.*))*")
+word_pattern = re.compile(r"\b([a-zA-Z_][\w]*)\b")
+white_space_pattern = re.compile(r"\s+")
+cast_patterns = re.compile(r"\(int\)")
 #endregion
 
 #region Defaults
@@ -34,6 +57,22 @@ default_include_directories: list[str] = [
 default_output_filename = "ctx.h"
 #endregion
 
+#region N64 SDK
+def get_n64_sdk(sdk_argument: str)->str:
+    if sdk_argument:
+        return sdk_argument
+    
+    # No sdk path provided. Try to use default
+    sdk_argument = os.environ['N64_SDK']
+    if not sdk_argument:
+        return None
+    
+    # Since we don't want the user to have to type the full path, all they need
+    # is to provide the top-level folder for the SDK
+    sdk_argument = os.path.join(sdk_argument, "ultra/usr/include")
+    return sdk_argument
+#endregion
+
 #region Attribute Stripping
 def strip_attributes(text_to_strip: str)->str:
     if not text_to_strip:
@@ -112,20 +151,164 @@ def convert_binary_literals(text_to_strip: str) -> str:
     return text_to_strip
 #endregion
 
-#region N64 SDK
-def get_n64_sdk(sdk_argument: str)->str:
-    if sdk_argument:
-        return sdk_argument
+#region Strip Trailing Commas
+def strip_initializer_trailing_commas(text_to_strip: str) -> str:
+    if not text_to_strip:
+        return text_to_strip
     
-    # No sdk path provided. Try to use default
-    sdk_argument = os.environ['N64_SDK']
-    if not sdk_argument:
-        return None
+    trailing_comma_matches = reversed(list(re.finditer(trailing_initializer_pattern, text_to_strip)))
+    for attribute_match in trailing_comma_matches:
+        # Create the substring
+        match_span = attribute_match.span(1)
+        start_index = match_span[0]
+        end_index = match_span[1]
+        prefix = text_to_strip[0:start_index]
+        postfix = text_to_strip[end_index:len(text_to_strip)]
+        text_to_strip = prefix + postfix
     
-    # Since we don't want the user to have to type the full path, all they need
-    # is to provide the top-level folder for the SDK
-    sdk_argument = os.path.join(sdk_argument, "ultra/usr/include")
-    return sdk_argument
+    return text_to_strip
+#endregion
+
+#region Enums
+def replace_enums_with_numeric_values(text_to_strip: str)->str:
+    if not text_to_strip:
+        return text_to_strip
+    
+    # Check if there are any uses of enums to initialize arrays
+    enum_array_size_initializer_matches = list(re.finditer(enum_array_size_initializer_pattern, text_to_strip))
+    if len(enum_array_size_initializer_matches) == 0:
+        # None found, so no need to evaluate the enums
+        return text_to_strip
+    
+    # We need to replace enums. But to do so we need to gather all of the enum values from the context thus far
+    enum_declarations = list(re.finditer(enum_declaration_pattern, text_to_strip))
+    if len(enum_declarations) == 0:
+        return text_to_strip
+    
+    preprocessor = Preprocessor()
+    enum_to_numeric_dict : dict[str, int] = {}
+    for enum_declaration in enum_declarations:
+        enum_members = enum_declaration[1]
+        split_enum_members = enum_members.split(",")
+
+        enum_numeric_value = 0
+        for split_member in split_enum_members:
+            split_member = re.sub(white_space_pattern, "", split_member)
+            if not split_member or split_member.isspace():
+                continue
+
+            enum_value_match = re.match(enum_value_pattern, split_member)
+            enum_member_name = enum_value_match[1]
+
+            # Does the enum have an explicit value assigned?
+            if enum_value_match[2]:
+                assigned_value = enum_value_match[2]
+                try:
+                    # Replace usages of enum with numeric value
+                    numeric_expression = enum_value_match[2]
+
+                    # Remove casts
+                    numeric_expression = re.sub(cast_patterns, "", numeric_expression)
+
+                    # Replace enum names with numerical values
+                    for word_match in reversed(list(re.finditer(word_pattern, numeric_expression))):
+                        word = word_match[1]
+                        if word not in enum_to_numeric_dict:
+                            continue
+
+                        word_span = word_match.span(1)
+                        numeric_expression = numeric_expression[0:word_span[0]] + str(enum_to_numeric_dict[word]) + numeric_expression[word_span[1]:len(numeric_expression)]
+
+                    # Try to parse it out
+                    tokens = preprocessor.tokenize(numeric_expression)
+                    evaluation = preprocessor.evalexpr(tokens)
+                    assigned_value = evaluation[0]
+                except Exception as e:
+                    # Can't parse. Might be another enum
+                    print(e)
+
+                # Convert to int
+                enum_numeric_value = int(assigned_value)
+
+            # Record the value
+            enum_to_numeric_dict[enum_member_name] = enum_numeric_value
+
+            # By default the enum increases by 1
+            enum_numeric_value += 1
+
+    # With the enum map built we can now replace the usages with the numeric values
+    enum_array_size_initializer_matches_reversed = reversed(enum_array_size_initializer_matches)
+    for array_size_initializer_match in enum_array_size_initializer_matches_reversed:
+        # Does this use a known enum?
+        enum_name = array_size_initializer_match[1]
+        if enum_name not in enum_to_numeric_dict:
+            continue
+        
+        enum_numeric_value = enum_to_numeric_dict[enum_name]
+
+        # Create the substring
+        match_span = array_size_initializer_match.span(1)
+        start_index = match_span[0]
+        end_index = match_span[1]
+
+        prefix = text_to_strip[0:start_index]
+        postfix = text_to_strip[end_index:len(text_to_strip)]
+        text_to_strip = prefix + str(enum_numeric_value) + postfix
+    
+    return text_to_strip
+#endregion
+
+#region Preprocessing
+def generate_context(preprocessor_arguments: list[str], context_options: ContextGenerationOptions)->str:
+    # Create the temp string writer to pass to the preprocessor since we still want to modify
+    # the contents for project-specific conditions
+    with StringIO() as preprocessor_string_writer:
+        with redirect_stdout(preprocessor_string_writer):
+            # Parse the target file:
+            CmdPreprocessor(preprocessor_arguments)
+          
+            # Check if empty
+            string_writer_position = preprocessor_string_writer.tell()
+            if string_writer_position == 0:
+                return None
+            
+            # Do we need to sanitize this further?
+            if not context_options.should_strip_attributes and not context_options.should_strip_at_address and not context_options.should_strip_initializer_trailing_commas and not context_options.should_convert_binary_literals:
+                # No sanitation needed, so write the entire file out
+                return preprocessor_string_writer.getvalue()
+            
+            # Sanitize/change the file depending on the context options
+            with StringIO() as context_string_writer:
+                # Sanitize line-by line for easier parsing
+                preprocessor_string_writer.seek(0)
+                while True:
+                    line_to_write = preprocessor_string_writer.readline()
+                    if not line_to_write:
+                        break
+
+                    if context_options.should_strip_attributes:
+                        line_to_write = strip_attributes(line_to_write)
+
+                    if context_options.should_strip_at_address:
+                        line_to_write = strip_at_address(line_to_write)
+
+                    if context_options.should_convert_binary_literals:
+                        line_to_write = convert_binary_literals(line_to_write)
+                    
+                    context_string_writer.writelines(line_to_write)
+                
+                # SIngle line cleanup completed
+                generated_context = context_string_writer.getvalue()
+
+                # Search for multi-line cleanup
+                if context_options.should_strip_initializer_trailing_commas or context_options.should_replace_enums_in_initializers:
+                    if context_options.should_strip_initializer_trailing_commas:
+                        generated_context = strip_initializer_trailing_commas(generated_context)
+
+                    if context_options.should_replace_enums_in_initializers:
+                        generated_context = replace_enums_with_numeric_values(generated_context)
+                
+                return generated_context
 #endregion
 
 #region Main
@@ -136,9 +319,12 @@ def main():
     parser.add_argument("-h", "-help", "--help", dest="help", action="store_true")
     parser.add_argument("-n64", "--n64-sdk", dest="n64_sdk", help="Path to the N64 SDK top level directory", action="store")
     parser.add_argument('-D', dest = 'defines', metavar = 'macro[=val]', nargs = 1, action = 'append', help = 'Predefine name as a macro [with value]')
-    parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) string should be stripped", action="store_true", default=True)
-    parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or : formatted string should be stripped", action="store_true", default=True)
-    parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=True)
+    parser.add_argument("--strip-attributes", dest="strip_attributes", help="If __attribute__(()) string should be stripped", action="store_true", default=False)
+    parser.add_argument("--strip-at-address", dest="strip_at_address", help="If AT_ADDRESS or : formatted string should be stripped", action="store_true", default=False)
+    parser.add_argument("--strip-initializer_trailing_commas", dest="strip_initializer_trailing_commas", help="If trailing commas in initializers should be stripped", action="store_true", default=False)
+    parser.add_argument("--convert-binary-literals", dest="convert_binary_literals", help="If binary literals (0bxxxx) should be converted to decimal", action="store_true", default=False)
+    parser.add_argument("--replace-enums-in-initializers", dest="replace_enums_in_initializers", help="If enums should be replaced by its numeric value in initializers", action="store_true", default=False)
+    parser.add_argument("--clipboard", dest="copy_to_clipboard", help="If the context should be copied to the clipboard", action="store_true", default=False)
 
     # For the output path, we either want to be explicit or relative, but not both
     output_target_group = parser.add_mutually_exclusive_group()
@@ -161,7 +347,7 @@ def main():
         # pcpp preprocessor to show its full list of arguments
         parser.print_help()
         preprocessor_arguments.append("--help")
-        CmdPreprocessor(preprocessor_arguments)
+        CmdPreprocessor(preprocessor_arguments).tokenize
         return
 
     # Append in the default include directories
@@ -197,6 +383,12 @@ def main():
     # If not targeting Ghidra or m2c we can include more in
     if not known_args.ghidra and not known_args.m2c:
         preprocessor_arguments.append("--passthru-defines")
+    else:
+        # Don't include the line directives if targeting Ghidra/m2c
+        preprocessor_arguments.append("--line-directive")
+
+    # For debugging purposes, include unfound includes in output to mark errors
+    preprocessor_arguments.append("--passthru-unfound-includes")
 
     # Compress to minimize whitespace
     preprocessor_arguments.append("--compress")
@@ -210,54 +402,32 @@ def main():
     preprocessor_arguments.append(known_args.c_file)
 
     # Check if we need to do further conversions after the file is preprocessed
-    should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c
-    should_strip_attributes = known_args.strip_attributes or known_args.ghidra or known_args.m2c
-    should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra
+    context_options = ContextGenerationOptions()
+    context_options.should_strip_at_address = known_args.strip_at_address or known_args.ghidra or known_args.m2c
+    context_options.should_strip_attributes = known_args.strip_attributes or known_args.ghidra or known_args.m2c
+    context_options.should_convert_binary_literals = known_args.convert_binary_literals or known_args.ghidra
+    context_options.should_strip_initializer_trailing_commas = known_args.strip_initializer_trailing_commas or known_args.ghidra
+    context_options.should_replace_enums_in_initializers = known_args.replace_enums_in_initializers or known_args.ghidra
 
-    # Create the temp string writer to pass to the preprocessor since we still want to modify
-    # the contents for project-specific conditions
-    with StringIO() as file_string_writer:
-        with redirect_stdout(file_string_writer):
-            # Parse the target file:
-            CmdPreprocessor(preprocessor_arguments)
-          
-            # Check if empty
-            string_writer_position = file_string_writer.tell()
-            if string_writer_position == 0:
-                return
-            
-            # Write to file
-            target_file_name = None
-            if known_args.output_path:
-                target_file_name = known_args.output_path
-            elif known_args.relative:
-                target_file_name = f"{c_file}.ctx"
-            else:
-                target_file_name = os.path.join(os.getcwd(), default_output_filename)
+    # Generate the context
+    generated_context = generate_context(preprocessor_arguments, context_options)
 
-            with open(target_file_name, "w", encoding="utf-8", newline="\n") as f:
-                # Do we need to sanitize this further?
-                if not should_strip_attributes and not should_strip_at_address and not should_convert_binary_literals:
-                    f.write(file_string_writer.getvalue())
-                    return
-                
-                # Sanitize line-by line for easier parsing
-                file_string_writer.seek(0)
-                while True:
-                    line_to_write = file_string_writer.readline()
-                    if not line_to_write:
-                        break
+    # Determine the file to write to
+    target_file_name = None
+    if known_args.output_path:
+        target_file_name = known_args.output_path
+    elif known_args.relative:
+        target_file_name = f"{c_file}.ctx"
+    else:
+        target_file_name = os.path.join(os.getcwd(), default_output_filename)
+    
+    # Write the generated context to the file
+    with open(target_file_name, "w", encoding="utf-8", newline="\n") as file_writer:
+        file_writer.write(generated_context)
 
-                    if should_strip_attributes:
-                        line_to_write = strip_attributes(line_to_write)
-
-                    if should_strip_at_address:
-                        line_to_write = strip_at_address(line_to_write)
-
-                    if should_convert_binary_literals:
-                        line_to_write = convert_binary_literals(line_to_write)
-                    
-                    f.writelines(line_to_write)
+    # Check if we also want to copy to the clipboard
+    if known_args.copy_to_clipboard:
+        pyperclip.copy(generated_context)
 #endregion
 
 if __name__ == "__main__":