mgs_reversing/build/ghidra_scripts/update_data.py

283 lines
12 KiB
Python

#@category _MGS
"""
Instructions:
1. The Ghidra parser can't deal with __LINE__, so in linker.h, #define out STATIC_ASSERT_SIZE() to nothing so that you have
#define STATIC_ASSERT_SIZE(struct, size)
2. In Ghidra, click on
File > Parse C Source...
then click on
Save profile to new name
to create a new profile that you can call psx.prf.
Remove all the source files and options, then individually add each header file in source and its subfolders to the
list of source files to parse and use the following parse options, replacing Path/to/ as appropriate and adding any
folders that might be missing from the list. Note that on Linux, you might be required to make all the filenames in
the PsyQ INCLUDE and INCLUDE/SYS folders lowercase, as well as the INCLUDE/SYS folder name itself.
-IPath/to/mgs_reversing/source
-IPath/to/mgs_reversing/source/equip
-IPath/to/mgs_reversing/source/font
-IPath/to/mgs_reversing/source/game
-IPath/to/mgs_reversing/source/libdg
-IPath/to/mgs_reversing/source/libgcl
-IPath/to/mgs_reversing/source/libgv
-IPath/to/mgs_reversing/source/libhzd
-IPath/to/mgs_reversing/source/memcard
-IPath/to/mgs_reversing/source/menu
-IPath/to/mgs_reversing/source/mts
-IPath/to/mgs_reversing/source/okajima
-IPath/to/mgs_reversing/source/takabe
-IPath/to/mgs_reversing/source/thing
-IPath/to/mgs_reversing/source/weapon
-IPath/to/psyq_sdk/psyq_4.3/include
-Dmips1
-D__GNUC__
-D_GNU_SOURCE
-D__WORDSIZE=32
-D__builtin_va_list=void *
-D__DO_NOT_DEFINE_COMPILE
-D_Complex
-D_WCHAR_T
3. Now restore the static assert definition to linker.h.
4. Adjust root_dir below if needed (by default it is the directory two levels above this script), then run this script from Ghidra's Script Manager.
"""
import os
import sys
import string
import re
from ghidra.program.model.data import ArrayDataType, DataUtilities
# Data type manager of the program currently open in Ghidra; used to look up
# the types named in the C sources.
dtm = currentProgram.getDataTypeManager()
# Directory two levels above this script.
root_dir = os.path.realpath(os.path.join(os.path.dirname(__file__), '../../'))

# Maps each symbol name found in obj/asm.map to its address string.
# Only lines consisting of exactly two whitespace-separated fields
# (address, then symbol) are used, and symbols starting with '_' are ignored.
symbol_address_map = { }
# The handle is deliberately not called `map`, which would shadow the builtin
# for the rest of the script.
with open(root_dir + '/obj/asm.map') as map_file:
    for line in map_file:
        pieces = line.split()
        if len(pieces) != 2:
            continue
        address, symbol = pieces
        if symbol.startswith('_'):
            continue
        symbol_address_map[symbol] = address
def ishex(s):
    """Return True when every character of `s` is a hexadecimal digit.

    An empty string yields True, since it contains no offending character;
    callers that need a minimum length must check it themselves.
    """
    return all(c in string.hexdigits for c in s)


# Maps each `#define` name seen while walking the sources to its replacement
# text (the remaining tokens of the line joined by single spaces).
defines = { }
# Removes text from in between braces, even if they are nested.
# https://stackoverflow.com/a/14598135
def strip_braces(string):
    """Return `string` with every brace-enclosed span removed.

    Nested braces are handled by tracking the nesting depth. The braces that
    delimit a removed span are dropped as well. A closing brace that has no
    matching opening brace is kept in the output, and everything after an
    opening brace that is never closed is dropped.
    """
    kept = []
    depth = 0
    for char in string:
        if char == '{':
            depth += 1
        elif char == '}' and depth > 0:
            depth -= 1
        elif depth == 0:
            # Outside all braces (this also covers an unmatched '}').
            kept.append(char)
    # Joining once avoids rebuilding the result string for every character.
    return ''.join(kept)
# Inclusive (start, end) integer address ranges of the data created so far,
# used to warn about overlapping definitions.
defined_data_ranges = []
# Symbol names that have already been given a data type during this run.
updated_keys = set()


def _trace(enabled, message):
    """Print `message` only when per-variable debug tracing is enabled."""
    if enabled:
        print(message)


def _count_initializer_elements(lines, start_index):
    """Count the elements of an array declared without an explicit size.

    Counting starts on the declaration line `lines[start_index]` and continues
    until a line containing ';' is reached. Commas on the declaration line are
    counted as-is; on every following line the brace-enclosed text and any
    '//' comment are removed first, so only top-level commas count.

    Returns the element count (number of commas + 1, assuming the initializer
    does not end with a comma), or None when the file ends before a ';' is
    found.
    """
    count = 1  # For the missing comma, assuming we're not going to end with a comma.
    index = start_index
    while index < len(lines):
        text = lines[index]
        if index != start_index:
            text = strip_braces(text).partition('//')[0]
        count += text.count(',')
        if ';' in text:
            return count
        index += 1
    # Ran off the end of the file without finding the terminating ';'.
    return None


def _resolve_declared_length(var_name, defines, debug):
    """Return the size written in the first `[...]` of `var_name`, or None.

    The size may be a decimal literal or the name of a `#define` whose value,
    as recorded in `defines`, is a decimal literal. Anything else (an unknown
    name, a non-decimal value, an expression) cannot be resolved and yields
    None so that the caller skips the declaration instead of guessing a size.
    """
    matches = re.findall(r'\[([A-Za-z0-9_]+)\]', var_name)
    if len(matches) == 0:
        return None
    _trace(debug, 'regex found a possible number')
    possible_number = matches[0]
    if possible_number.isdigit():
        length = int(possible_number, 10)
        _trace(debug, 'found literal with value == ' + str(length))
        return length
    value = defines.get(possible_number)
    if value is not None and value.isdigit():
        length = int(value, 10)
        _trace(debug, 'found use of #define with value == ' + str(length))
        return length
    if value is None:
        _trace(debug, 'found use of #define not yet put in dictionary')
    # @todo: fix headers that contain relevant #defines possibly being read after the files that need them.
    return None


def _define_symbol(key, label, data_type, num_elements, debug):
    """Create data for symbol `key` in the current program, if possible.

    key          -- symbol name to look up in `symbol_address_map`.
    label        -- text used for the symbol in overlap warnings.
    data_type    -- Ghidra data type of the variable (or of one array element).
    num_elements -- array length, or None for a non-array variable.
    debug        -- whether to print trace messages.

    Nothing happens when the symbol was already handled, is missing from the
    map, or its address is not an 8-digit hex string starting with '80'.
    Otherwise a warning is printed for every previously created range the new
    data overlaps, the listing is cleared over the new range, the data is
    created and the symbol is recorded as handled.
    """
    is_array = num_elements is not None
    if key in updated_keys or key not in symbol_address_map:
        return
    address = symbol_address_map[key]
    if is_array:
        _trace(debug, 'key found in symbol-address map with address == ' + address)
    else:
        _trace(debug, 'key found in symbol-address map with base_address == ' + address)
    if not (ishex(address) and len(address) == 8 and address.startswith('80')):
        return
    if is_array:
        _trace(debug, 'valid address, adding array')
        new_type = ArrayDataType(data_type, num_elements, data_type.getLength())
    else:
        _trace(debug, 'valid address, adding element')
        new_type = data_type
    address_as_int = int(address, 16)
    end_address_as_int = address_as_int + new_type.getLength() - 1
    for defined_data_start, defined_data_end in defined_data_ranges:
        if address_as_int <= defined_data_end and end_address_as_int >= defined_data_start:
            # The earlier range may no longer carry a symbol; do not let the
            # warning itself abort the script.
            other_symbol = getSymbolAt(toAddr(defined_data_start))
            other_name = other_symbol.getName() if other_symbol is not None else '<no symbol>'
            print('WARNING: OVERLAPPING DATA')
            print(label + ' in range ' + address + ' - ' + hex(end_address_as_int))
            print('overlaps ' + other_name + ' in range ' + hex(defined_data_start) + ' - ' + hex(defined_data_end))
            print('*****************************')
    defined_data_ranges.append((address_as_int, end_address_as_int))
    start_address = toAddr(address_as_int)
    end_address = toAddr(end_address_as_int)
    clearListing(start_address, end_address)
    DataUtilities.createData(currentProgram, start_address, new_type, new_type.getLength(), True, DataUtilities.ClearDataMode.CLEAR_SINGLE_DATA)
    updated_keys.add(key)


# Walk every file under source/, record `#define`s, and for each line of the
# form `[struct] TYPE SECTION(...) NAME...` apply TYPE (or an array of TYPE)
# at the address that obj/asm.map gives for NAME.
for root, subdirs, files in os.walk(root_dir + '/source'):
    for filename in files:
        file_path = os.path.join(root, filename)
        if file_path.endswith('.swp'):
            continue
        with open(file_path, 'r') as f:
            lines = f.readlines()
        for index, line in enumerate(lines):
            pieces = line.split()
            if len(pieces) < 3:
                continue
            if pieces[0] == '#define':
                defines[pieces[1]] = ' '.join(pieces[2:])
                continue
            pieces_base_index = 1 if pieces[0] == 'struct' else 0
            if len(pieces) < pieces_base_index + 3:
                # Too short to hold type, section and name.
                continue
            var_type = pieces[pieces_base_index]
            if '/' in var_type or var_type == 'char':
                continue
            # Cheap textual check first, so the data type manager is only
            # queried for lines that can actually be declarations.
            var_section = pieces[pieces_base_index + 1]
            if not var_section.startswith('SECTION'):
                continue
            data_types = []
            dtm.findDataTypes(var_type, data_types)
            if len(data_types) == 0:
                continue
            data_type = data_types[0]
            var_name = pieces[pieces_base_index + 2].replace(';', '')
            # Developer toggle: replace False with a condition such as
            # var_name.__contains__('800B05A8') to trace specific variables.
            print_debug_info = False
            _trace(print_debug_info, 'line == ' + line)
            # Do we have an array?
            if ']' in var_name:
                _trace(print_debug_info, 'detected array')
                penultimate_char = var_name[len(var_name) - 2]
                if penultimate_char == '[':
                    _trace(print_debug_info, 'number of elements not specified, counting')
                    # Count elements ourselves.
                    num_elements = _count_initializer_elements(lines, index)
                else:
                    _trace(print_debug_info, 'number of elements specified, retrieving')
                    num_elements = _resolve_declared_length(var_name, defines, print_debug_info)
                if num_elements is None:
                    # Size unknown: skip rather than create a wrongly sized array.
                    continue
                key = var_name.partition('[')[0]
                _define_symbol(key, var_name, data_type, num_elements, print_debug_info)
            else:
                _trace(print_debug_info, 'detected single element')
                _define_symbol(var_name, var_name, data_type, None, print_debug_info)
            _trace(print_debug_info, '*****************************')