mgs_reversing/build/uninitializer.py

189 lines
6.7 KiB
Python

#!/usr/bin/env python3
# Uninitializer has two modes of operation:
#
# INJECT
# ======
# In this mode, uninitializer is given 3 inputs:
# - LHS overlay: overlay without uninitialized memory
# (BSS filled with 0s, gaps in data/rdata filled with 0s)
# - RHS overlay: overlay with uninitialized memory
# (BSS filled with 0x9e, gaps in data/rdata filled with 0x9e)
# - Uninitialized memory template: memory to be injected into
# the resulting file. This memory was extracted from original
# overlays.
#
# The program iterates over those three files simultaneously.
# If it detects an uninitialized byte (byte in LHS = 0, byte in RHS = 0x9e),
# it injects a byte from the uninitialized memory template.
#
# EXTRACT
# =======
# This mode can be used to generate the uninitialized memory
# template file. Uninitializer is given 3 inputs:
# - LHS overlay
# - RHS overlay
# - Original overlay file (from MGS)
#
# The program iterates over those three files, detects uninitialized
# bytes and creates the uninitialized memory template file based on
# the given original overlay file.
#
# EXTRACT - example usage
# =======================
#
# BSS
# ---
#
# At the end of overlay.c file you might have such data:
# const char s01a_dword_800E4CC8[] = {0x6, 0x11, 'd', 'r'};
# const char s01a_dword_800E4CCC[] = {'a', 'w', '_', 'l'};
# const char s01a_dword_800E4CD0[] = {'o', 'n', 'g', '_'};
#
# Once you have determined that this is indeed BSS and not rdata,
# you should replace it with:
# int SECTION("overlay.bss") s01a_dword_800E4CC8;
# int SECTION("overlay.bss") s01a_dword_800E4CCC;
# int SECTION("overlay.bss") s01a_dword_800E4CD0;
#
# Next, see the "Extracting" section below.
#
# rdata
# -----
#
# Once you have identified all rdata in overlay.c, you might
# want to inline some strings into source files. Please remember
# that memleaks only happen at file boundaries, so there are
# no memleaks between strings in a single file. Apart from that,
# perform the inlining as usual.
#
# Extracting
# ----------
#
# Rebuild the project. If there was any uninitialized memory
# in variables you modified (as described in sections above),
# the build will fail - the uninitialized memory files don't
# have the new uninitialized memory. You have to generate it again!
# You can do it by executing the following command (replacing s03er):
#
# python3 uninitializer.py extract ../obj/s03er_lhs.bin ../obj/s03er_rhs.bin ~/stage/s03er/00e05a.bin ../um/s03er.bin
#
# If the command succeeds, after rebuilding the project the overlay
# should match. If the command fails at any of the assertions,
# this means that the built overlay differs in more than memleaks
# compared to the original overlay - you have to find the difference
# manually (comparing the lhs file with the target file).
import sys
from create_dummy_file import DUMMY_FILE_SIZE
import os
import string
from collections import defaultdict
# In a couple of overlays we observed accesses to BSS variables outside
# the overlay bounds. It looks as if the overlay was truncated.
# We don't have any explanation for this, so for now let's just artificially
# trim the last few bytes in overlays that exhibit this behavior.
#
# Keys are LHS overlay file names, values are the number of trailing bytes
# to trim. Any file name that is not listed maps to 0.
BSS_HACK = defaultdict(int)
# In s11c, the function at 0x800ce014 accesses 0x800d32dc, which is just
# outside the overlay.
BSS_HACK['s11c_lhs.bin'] = 4
# Issues with gasdamge.c BSS.
BSS_HACK.update(dict.fromkeys(
    ('s02c_lhs.bin', 's02d_lhs.bin', 's02e_lhs.bin'),
    4,
))
def get_bss_adjustment(lhs):
    """Return how many trailing bytes should be trimmed from an LHS overlay.

    Only the file name component of *lhs* is used for the BSS_HACK lookup,
    so the directory the overlay lives in does not matter.
    """
    _directory, overlay_name = os.path.split(lhs)
    return BSS_HACK[overlay_name]
def inject(lhs, rhs, uninitialized, out):
    """Write the LHS overlay to *out* with uninitialized bytes filled in.

    Every offset where the LHS byte is 0 and the RHS byte is 0x9e is treated
    as uninitialized memory and receives the next unused byte of the
    template. All other bytes are copied from the LHS overlay unchanged.

    Args:
        lhs: Path of the overlay built with uninitialized memory zero-filled.
            The trailing bytes reported by get_bss_adjustment() are dropped.
        rhs: Path of the overlay built with uninitialized memory filled with
            0x9e. Must be exactly DUMMY_FILE_SIZE bytes long.
        uninitialized: Path of the uninitialized memory template. When it is
            not an existing file, an all-zero template is used.
        out: Path the resulting overlay is written to.

    Raises:
        AssertionError: If the overlay sizes are wrong, or if LHS and RHS
            differ anywhere other than in uninitialized memory.
    """
    bss_adjustment = get_bss_adjustment(lhs)

    with open(lhs, 'rb') as lhs_file:
        lhs = lhs_file.read()
    with open(rhs, 'rb') as rhs_file:
        rhs = rhs_file.read()

    if bss_adjustment > 0:
        lhs = lhs[:-bss_adjustment]

    if os.path.isfile(uninitialized):
        with open(uninitialized, 'rb') as template_file:
            uninitialized = template_file.read()
    else:
        uninitialized = b""

    # Pad the template with zeros so that it can never run out: the LHS
    # overlay is shorter than DUMMY_FILE_SIZE, so at most that many bytes
    # are ever consumed.
    uninitialized += b"\x00" * DUMMY_FILE_SIZE
    uninitialized_cur_idx = 0

    # The checks below raise explicitly instead of using `assert` so that
    # they are still enforced when Python runs with -O.
    if len(rhs) != DUMMY_FILE_SIZE:
        raise AssertionError(f"RHS overlay should be {DUMMY_FILE_SIZE} bytes large, but it's {len(rhs)} bytes large")
    # Since len(rhs) == DUMMY_FILE_SIZE here, this also guarantees
    # len(lhs) < len(rhs).
    if len(lhs) >= DUMMY_FILE_SIZE:
        raise AssertionError(f"LHS overlay is too large ({len(lhs)} bytes large)")

    out_arr = bytearray(lhs)

    # zip() stops at the end of the (shorter) LHS overlay.
    for i, (lb, rb) in enumerate(zip(lhs, rhs)):
        if lb == rb:
            continue

        if lb != 0 or rb != 0x9e:
            raise AssertionError(f"LHS and RHS differ in more than uninitialized memory at offset {i}, {lb:X} != {rb:X}")

        # Uninitialized memory!
        out_arr[i] = uninitialized[uninitialized_cur_idx]
        uninitialized_cur_idx += 1

    # The output is only created once every check has passed.
    with open(out, 'wb') as out_file:
        out_file.write(out_arr)
def extract(lhs, rhs, target, uninitialized_out):
    """Generate the uninitialized memory template from an original overlay.

    Every offset where the LHS byte is 0 and the RHS byte is 0x9e is treated
    as uninitialized memory, and the byte of *target* at that offset is
    collected. Trailing zero bytes of the collected data are dropped before
    it is written out. A summary and the printable ASCII characters found in
    the collected data are printed to stdout.

    Args:
        lhs: Path of the overlay built with uninitialized memory zero-filled.
            The trailing bytes reported by get_bss_adjustment() are dropped.
        rhs: Path of the overlay built with uninitialized memory filled with
            0x9e. Must be exactly DUMMY_FILE_SIZE bytes long.
        target: Path of the original overlay. Must be the same size as the
            (trimmed) LHS overlay.
        uninitialized_out: Path the template is written to, or None to only
            print the summary without writing a file.

    Raises:
        AssertionError: If the file sizes are wrong, or if the three files
            differ anywhere other than in uninitialized memory.
    """
    bss_adjustment = get_bss_adjustment(lhs)

    with open(lhs, 'rb') as lhs_file:
        lhs = lhs_file.read()
    with open(rhs, 'rb') as rhs_file:
        rhs = rhs_file.read()
    with open(target, 'rb') as target_file:
        target = target_file.read()

    if bss_adjustment > 0:
        lhs = lhs[:-bss_adjustment]

    # The checks below raise explicitly instead of using `assert` so that
    # they are still enforced when Python runs with -O.
    if len(rhs) != DUMMY_FILE_SIZE:
        raise AssertionError(f"RHS overlay should be {DUMMY_FILE_SIZE} bytes large, but it's {len(rhs)} bytes large")
    # Since len(rhs) == DUMMY_FILE_SIZE here, this also guarantees
    # len(lhs) < len(rhs).
    if len(lhs) >= DUMMY_FILE_SIZE:
        raise AssertionError(f"LHS overlay is too large ({len(lhs)} bytes large)")
    if len(lhs) != len(target):
        raise AssertionError(f"LHS overlay should the same size as target. len(lhs) = {len(lhs)}, len(target) = {len(target)}")

    uninitialized_out_arr = bytearray()
    nonzero_uninitialized = 0

    for i, (lb, rb, tb) in enumerate(zip(lhs, rhs, target)):
        if lb == rb == tb:
            continue

        # This also catches lb == rb with a different target byte, i.e. a
        # real mismatch between the built overlay and the original one.
        if lb != 0 or rb != 0x9e:
            raise AssertionError(f"LHS, RHS, target differ in more than uninitialized memory at offset {i}, {lb:X} != {rb:X} != {tb:X}")

        # Uninitialized memory!
        uninitialized_out_arr.append(tb)
        if tb != 0:
            nonzero_uninitialized += 1

    # Trim trailing zeros
    uninitialized_out_arr = uninitialized_out_arr.rstrip(b"\x00")

    if uninitialized_out is not None:
        with open(uninitialized_out, 'wb') as out_file:
            out_file.write(uninitialized_out_arr)

    print("Extracted", len(uninitialized_out_arr), "bytes of uninitialized memory")
    print(f"({nonzero_uninitialized} non-zero bytes)")
    print()

    # Build the set of printable byte values once instead of once per byte.
    printable = frozenset(string.printable.encode('ascii'))
    printable_bytes = bytes(c for c in uninitialized_out_arr if c in printable)
    print(printable_bytes.decode('ascii'))
def main(argv=None):
    """Command-line entry point: dispatch to inject() or extract().

    Args:
        argv: Full argument vector including the program name. Defaults to
            sys.argv. Arguments beyond those a mode needs are ignored.

    Returns:
        The process exit status: 0 on success, 1 for an unknown mode and
        2 when required arguments are missing.
    """
    if argv is None:
        argv = sys.argv

    usage = (
        "usage: uninitializer.py inject LHS RHS UNINITIALIZED OUT\n"
        "       uninitializer.py extract LHS RHS TARGET [UNINITIALIZED_OUT]"
    )

    if len(argv) < 2:
        print(usage, file=sys.stderr)
        return 2

    mode = argv[1]
    args = argv[2:]

    if mode == 'inject':
        if len(args) < 4:
            print(usage, file=sys.stderr)
            return 2
        lhs, rhs, uninitialized, out = args[:4]
        inject(lhs, rhs, uninitialized, out)
        return 0

    if mode == 'extract':
        if len(args) < 3:
            print(usage, file=sys.stderr)
            return 2
        lhs, rhs, target = args[:3]
        # extract() accepts None as the output path, in which case it only
        # prints its summary and writes no file.
        uninitialized_out = args[3] if len(args) > 3 else None
        extract(lhs, rhs, target, uninitialized_out)
        return 0

    print("Unknown mode:", mode, file=sys.stderr)
    print(usage, file=sys.stderr)
    return 1


if __name__ == '__main__':
    sys.exit(main())