# jak-project/scripts/analyze_korean.py
# (source listing metadata: 1400 lines, 44 KiB, Python)

# The idea is to parse through all of the game's korean strings and
# determine the glyph combinations that are used to construct the syllable blocks
# In terms of font setting, korean syllable blocks have 6 main configurations
# not all jamos are relevant for all these configurations, which reduces the complexity
# For example, the first 3 configurations involve no third jamo.
# 1 (2) - left -> right. For example 가
# 2 (2) - top -> middle. For example 고
# 3 (2) - left + right + middle. For example 과
#
# 4 (3) - left + right + bottom (no combined middle). For example 갈
# 5 (3) - top down. For example 골
# 6 (3) - left + right + bottom with middle. For example 괄
# Thinking about the characters this way eliminates a ton of permutations
# and allows us to use existing glyphs to fill in the gaps
# The hope is that by analyzing all of the korean text, we can at least find where every glyph is used
# and then any glyphs left over we can do manually.
# read in the `game_text.txt` file and extract all the korean strings
from itertools import product
from pprint import pprint
import json
with open(
    "../decompiler_out/jak2/assets/game_text.txt", mode="r", encoding="utf-8"
) as f:
    game_text_lines = f.readlines()

# Maps a text id (whatever follows "(#x" on the header line) to a comma
# separated string of "0x.." byte tokens.
korean_lines = {}
for i, raw_line in enumerate(game_text_lines):
    curr_line = raw_line.strip()
    if not curr_line.startswith("(#x"):
        continue
    # The korean string is read from the 7th line after the "(#x" header
    if i + 7 >= len(game_text_lines):
        print(f"entry {curr_line} is truncated, no korean line found, skipping")
        continue
    text_id = curr_line.split("(#x")[1]
    # "\c" escapes become ",0x" separators; [2:-1] then drops the first two
    # characters and the last one (presumably the opening quote plus the first
    # separator, and the closing quote -- verify against game_text.txt)
    korean_lines[text_id] = (
        game_text_lines[i + 7].strip().replace("\\c", ",0x")[2:-1]
    )
# also parse subtitles
with open(
    "../decompiler_out/jak2/assets/subtitles_raw.txt", mode="r", encoding="utf-8"
) as f:
    subtitle_text_lines = f.readlines()

for line in subtitle_text_lines:
    parts = line.split("::")
    if len(parts) < 3:
        # blank or malformed line: nothing to index, and parts[2] would raise
        continue
    text = parts[2].replace('"', "").replace(" ", ",")
    # pad with 0s for single hex digits ("0x0," is left untouched, exactly as
    # the original digit list did)
    for c in "123456789abcdef":
        text = text.replace(f"0x{c},", f"0x0{c},")
    key = f"{parts[0]}_{parts[1]}"
    # drop the final character (presumably the line terminator kept by
    # readlines() -- confirm against subtitles_raw.txt)
    korean_lines[key] = text[:-1]
print(f"Analyzing {len(korean_lines)} lines of korean text")
# we will fill up this structure, which will allow us to recreate / fill in the gaps in their korean encoding
jamo_combinations = {
# jamo : 6 orientations
# if the orientation is irrelevant, None instead of a list
# the reason we use lists is because multiple glyphs may be used to represent the same jamo
# even in the same orientation, depending on the surrounding characters
#
# we will store these surrounding "context" characters to deduce a pattern at the end
# i.e. 0x06 is always used for ᄀ unless it comes before ᅦ, in which case 0x33 is used
#
# Choseong (initial)
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
"": [[], [], [], [], [], []],
# Jungseong (middle)
# - verticals
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
"": [[], None, None, [], None, None],
# - horizontals
"": [None, [], None, None, [], None],
"": [None, [], None, None, [], None],
"": [None, [], None, None, [], None],
"": [None, [], None, None, [], None],
"": [None, [], None, None, [], None],
# - combinations
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
"": [None, None, [], None, None, []],
# Jongseong (final)
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
"": [None, None, None, [], [], []],
}
# we will be reading byte strings from the game, these correspond to the font glyphs
# we can deduce all of these
jamo_glyph_mappings = {
"0x06": "",
"0x07": "",
"0x08": "",
"0x09": "",
"0x0a": "",
"0x0b": "",
"0x0c": "",
"0x0d": "",
"0x0e": "",
"0x0f": "",
"0x10": "",
"0x11": "",
"0x12": "",
"0x13": "",
"0x14": "",
"0x15": "",
"0x16": "",
"0x17": "",
"0x18": "",
"0x19": "",
"0x1a": "",
"0x1b": "",
"0x1c": "",
"0x1d": "",
"0x1e": "",
"0x1f": "",
"0x20": "",
"0x21": "",
"0x22": "",
"0x23": "",
"0x24": "",
"0x25": "",
"0x26": "",
"0x27": "",
"0x28": "",
"0x29": "",
"0x2a": "",
"0x2b": "",
"0x2c": "",
"0x2d": "",
"0x2e": "",
"0x2f": "",
"0x30": "",
"0x31": "",
"0x32": "",
"0x33": "",
"0x34": "",
"0x35": "",
"0x36": "",
"0x37": "",
"0x38": "",
"0x39": "",
"0x3a": "",
"0x3b": "",
"0x3c": "",
"0x3d": "",
"0x3e": "",
"0x3f": "",
"0x40": "",
"0x41": "",
"0x42": "",
"0x43": "",
"0x44": "",
"0x45": "",
"0x46": "",
"0x47": "",
"0x48": "",
"0x49": "",
"0x4a": "",
"0x4b": "",
"0x4c": "",
"0x4d": "",
"0x4e": "",
"0x4f": "",
"0x50": "",
"0x51": "",
"0x52": "",
"0x53": "",
"0x54": "",
"0x55": "",
"0x56": "",
"0x57": "",
"0x58": "",
"0x59": "",
"0x5a": "",
"0x5b": "",
"0x5c": "",
"0x5d": "",
"0x5e": "",
"0x5f": "",
"0x60": "",
"0x61": "",
"0x62": "",
"0x63": "",
"0x64": ["", ""],
"0x65": ["", ""],
"0x66": ["", ""],
"0x67": ["", ""],
"0x68": "",
"0x69": "",
"0x6a": "",
"0x6b": "",
"0x6c": "",
"0x6d": "",
"0x6e": "",
"0x6f": "",
"0x70": "",
"0x71": "",
"0x72": "",
"0x73": "",
"0x74": "",
"0x75": "",
"0x76": "",
"0x77": "",
"0x78": "",
"0x79": "",
"0x7a": "",
"0x7b": "",
"0x7c": "",
"0x7d": "",
"0x7e": "",
"0x7f": "",
"0x80": "",
"0x81": "",
"0x82": "",
"0x83": "",
"0x84": "",
"0x85": "",
"0x86": "",
"0x87": "",
"0x88": "",
"0x89": ["", ""],
"0x8a": ["", ""],
"0x8b": "",
"0x8c": "",
"0x8d": "",
"0x8e": "",
"0x8f": ["", ""],
"0x90": ["", ""],
"0x91": "",
"0x92": "",
"0x93": "",
"0x94": "",
"0x95": "",
"0x96": "",
"0x97": "",
"0x98": "",
"0x99": "",
"0x9a": "",
"0x9b": "",
"0x9c": "",
"0x9d": "",
"0x9e": "",
"0x9f": "",
"0xa0": "",
"0xa1": "",
"0xa2": "",
"0xa3": "",
"0xa4": "",
"0xa5": "",
"0xa6": "",
"0xa7": "",
"0xa8": "",
"0xa9": "",
"0xaa": "",
"0xab": "",
"0xac": "",
"0xad": "",
"0xae": "",
"0xaf": "",
"0xb0": "",
"0xb1": "",
"0xb2": "",
"0xb3": "",
"0xb4": "",
"0xb5": "",
"0xb6": "",
"0xb7": "",
"0xb8": "",
"0xb9": "",
"0xba": "",
"0xbb": "",
"0xbc": "",
"0xbd": "",
"0xbe": "",
"0xbf": "",
"0xc0": "",
"0xc1": "",
"0xc2": "",
"0xc3": "",
"0xc4": "",
"0xc5": "",
"0xc6": "",
"0xc7": "",
"0xc8": "",
"0xc9": "",
"0xca": "",
"0xcb": "",
"0xcc": "",
"0xcd": "",
"0xce": "",
"0xcf": "",
"0xd0": "",
"0xd1": "",
"0xd2": "",
"0xd3": "",
"0xd4": "",
"0xd5": "",
"0xd6": "",
"0xd7": "",
"0xd8": "",
"0xd9": "",
"0xda": "",
"0xdb": "",
"0xdc": "",
"0xdd": "",
"0xde": "",
"0xdf": "",
"0xe0": "",
"0xe1": "",
"0xe2": "",
"0xe3": "",
"0xe4": "",
"0xe5": "",
"0xe6": "",
"0xe7": "",
"0xe8": "",
"0xe9": "",
"0xea": "",
"0xeb": "",
"0xec": "",
"0xed": "",
"0xee": "",
"0xef": "",
"0xf0": "",
"0xf1": "",
"0xf2": "",
"0xf3": "",
"0xf4": "",
"0xf5": "",
"0xf6": "",
"0xf7": "",
"0xf8": "",
"0xf9": "",
"0xfa": "",
"0xfb": "",
"0xfc": "",
"0xfd": "",
"0xfe": "",
"0xff": "",
"extra_0x86": "",
"extra_0x87": "",
"extra_0x88": "",
"extra_0x89": "",
"extra_0x8a": "",
"extra_0x8b": "",
}
jamo_groupings = {
"initial": [
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
"median": [
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
"final": [
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
],
}
# Classifies each median (vowel) jamo by where it is drawn in the syllable block.
# derive_syllable_block_info() turns "right" / "bottom" / "combined" into writing
# orientations 0 / 1 / 2 for two-jamo blocks and 3 / 4 / 5 for three-jamo blocks.
# NOTE(review): every jamo literal below shows up as an empty string in this copy
# of the file -- presumably the characters were lost in extraction; confirm
# against the repository before relying on these tables.
median_jamo_groupings = {
"right": ["", "", "", "", "", "", "", "", ""],
"bottom": ["", "", "", "", ""],
"combined": ["", "", "", "", "", "", ""],
}
# Maps a combined median vowel to the two median jamo it is assembled from.
# derive_syllable_block_info() merges two medians into the first key whose
# parts list contains both of them.
median_combos = {
"": ["", ""],
"": ["", ""],
"": ["", ""],
"": ["", ""],
"": ["", ""],
"": ["", ""],
"": ["", ""],
}
def derive_syllable_block_info(
    glyph_list,
    *,
    glyph_mappings=None,
    groupings=None,
    median_groupings=None,
    median_combinations=None,
):
    """Translate the raw glyphs of one syllable block into jamo and an orientation.

    Args:
        glyph_list: glyph identifiers (e.g. "0x06") making up one syllable block.
        glyph_mappings: glyph -> jamo (or list of jamo) table. Defaults to the
            module-level ``jamo_glyph_mappings``.
        groupings: grouping name -> list of jamo. Defaults to the module-level
            ``jamo_groupings``.
        median_groupings: "right" / "bottom" / "combined" -> list of median
            jamo. Defaults to the module-level ``median_jamo_groupings``.
        median_combinations: combined median -> its two parts. Defaults to the
            module-level ``median_combos``.

    Returns:
        A dict with "writingOrientation" (0-5) and "jamos", a list of
        ``{"jamo", "grouping", "glyph"}`` dicts. For a combined median the
        "glyph" value is a two-element list of glyph identifiers.

    Raises:
        SystemExit: with status 1, after printing a diagnostic, whenever a
            glyph or jamo cannot be classified or the block has an unexpected
            shape.
    """
    if glyph_mappings is None:
        glyph_mappings = jamo_glyph_mappings
    if groupings is None:
        groupings = jamo_groupings
    if median_groupings is None:
        median_groupings = median_jamo_groupings
    if median_combinations is None:
        median_combinations = median_combos

    # iterate the glyphs, convert them into their mappings
    jamos = []
    for glyph in glyph_list:
        if glyph not in glyph_mappings:
            print(f"{glyph} not in mapping dictionary, fix it")
            raise SystemExit(1)
        mapping = glyph_mappings[glyph]
        if isinstance(mapping, list):
            # there are a few select glyphs that are multiple jamos
            jamos.extend([jamo, glyph] for jamo in mapping)
        else:
            jamos.append([mapping, glyph])

    # Associate each jamo with its initial/median/final grouping
    jamo_info = []
    found_medians = []
    for jamo, glyph in jamos:
        jamo_grouping = next(
            (name for name, members in groupings.items() if jamo in members),
            None,
        )
        if jamo_grouping is None:
            # Previously the grouping of the preceding jamo was silently reused
            # (or the name was unbound for the very first jamo)
            print(f"{jamo} (glyph {glyph}) is not in any jamo grouping, fix it")
            raise SystemExit(1)
        if jamo_grouping == "median":
            found_medians.append([jamo, glyph])
        jamo_info.append({"jamo": jamo, "grouping": jamo_grouping, "glyph": glyph})

    if len(found_medians) > 2:
        print(f"found more than 2 median vowels in {jamo_info}")
        raise SystemExit(1)

    # Consolidate median vowels, as jak typically draws them as a combination
    # of two glyphs
    if len(found_medians) > 1:
        first_median, second_median = found_medians
        combined_median = next(
            (
                vowel
                for vowel, vowel_parts in median_combinations.items()
                if first_median[0] in vowel_parts and second_median[0] in vowel_parts
            ),
            None,
        )
        if combined_median is None:
            print(f"unable to combine median in {jamo_info}")
            raise SystemExit(1)
        combined_entry = {
            "jamo": combined_median,
            "grouping": "median",
            "glyph": [first_median[1], second_median[1]],
        }
        # The first median is replaced by the combined entry, later ones dropped
        new_jamo_info = []
        combined_emitted = False
        for info in jamo_info:
            if info["grouping"] != "median":
                new_jamo_info.append(info)
            elif not combined_emitted:
                new_jamo_info.append(combined_entry)
                combined_emitted = True
        jamo_info = new_jamo_info

    # Two jamo use orientations 0-2, three jamo (with a final) use 3-5
    base_orientation = {2: 0, 3: 3}.get(len(jamo_info))
    if base_orientation is None:
        print(f"unhandled jamo configuration {jamo_info}")
        raise SystemExit(1)

    # The second entry is taken to be the median, as in the original logic
    median_jamo = jamo_info[1]["jamo"]
    median_group = next(
        (name for name, members in median_groupings.items() if median_jamo in members),
        None,
    )
    orientation_offsets = {"right": 0, "bottom": 1, "combined": 2}
    if median_group not in orientation_offsets:
        print(f"couldnt figure out median group for {jamo_info}")
        raise SystemExit(1)

    writing_orientation = base_orientation + orientation_offsets[median_group]
    return {"writingOrientation": writing_orientation, "jamos": jamo_info}
# finally start going through the real text to figure out the mappings
total_syllable_blocks = 0
for game_text_line in korean_lines.values():
    # split the bytes into characters, folding a `0x05` prefix into the next byte
    # NOTE - hopefully this is not a hack (seems like the font textures dont start until 0x6...how convenient!)
    game_text_line = game_text_line.replace("0x05,", "extra_")
    text_bytes = game_text_line.split(",")

    # A syllable block is encoded as: 0x04, <glyph count>, <glyph>...
    syllable_blocks = []
    i = 0
    while i < len(text_bytes):
        if text_bytes[i] != "0x04":
            i = i + 1
            continue
        if i + 1 >= len(text_bytes):
            print(f"syllable block header without a glyph count in {game_text_line}")
            raise SystemExit(1)
        total_syllable_blocks = total_syllable_blocks + 1
        expected_num_glyphs = int(text_bytes[i + 1], 16)
        syllable_blocks.append(
            {
                "numGlyphs": expected_num_glyphs,
                "rawGlyphs": text_bytes[i + 2 : i + 2 + expected_num_glyphs],
            }
        )
        i = i + 2 + expected_num_glyphs

    for block in syllable_blocks:
        # inspect the choice of glyphs (which are individual jamo or jamo
        # combinations) to determine the jamo and the writing orientation
        jamo_info = derive_syllable_block_info(block["rawGlyphs"])
        block["jamos"] = jamo_info["jamos"]
        block["writingOrientation"] = jamo_info["writingOrientation"]

        # The (almost) final step, store this information in our big jamo
        # combination "database"
        #
        # We now effectively have an encoding, and we can process that to
        # further refine it and see what we have to do manually
        writing_orientation = block["writingOrientation"]
        for jamo in block["jamos"]:
            jamo_entry = jamo_combinations[jamo["jamo"]]
            if jamo_entry[writing_orientation] is None:
                # this jamo is not supposed to occur in this orientation
                print(f"something is very wrong with {block}")
                raise SystemExit(1)
            new_entry = {"glyph": jamo["glyph"], "context": block["jamos"]}
            # only unique (glyph, context) pairs are recorded
            if new_entry not in jamo_entry[writing_orientation]:
                jamo_entry[writing_orientation].append(new_entry)
# Print some stats before finalizing the result
empty_cells = 0
# Starts as every known glyph; whatever is left at the end was never observed
glyph_list = set(jamo_glyph_mappings)
for orientations in jamo_combinations.values():
    for orientation in orientations:
        if orientation is None:
            # orientation does not apply to this jamo
            continue
        if len(orientation) == 0:
            empty_cells = empty_cells + 1
        for entry in orientation:
            # combined medians carry a list of glyphs, everything else a single one
            if isinstance(entry["glyph"], list):
                glyph_list.difference_update(entry["glyph"])
            else:
                glyph_list.discard(entry["glyph"])
print()
print(f"Analyzed {total_syllable_blocks} syllable blocks")
print(f"{empty_cells} empty jamo cells\n")
print(f"Did not see {len(glyph_list)} out of {len(jamo_glyph_mappings.keys())} glyphs:")
# with open("./jamo-db-before.json", mode="w", encoding="utf-8") as f:
# f.write(json.dumps(jamo_combinations, indent=2))
def format_alternative(curr_jamo, curr_glyph, full_glyph_context):
    """Build the ``[context_key, glyph]`` pair describing one glyph alternative.

    Args:
        curr_jamo: the jamo whose glyph choice is being described.
        curr_glyph: the glyph used to draw it, either a single identifier or a
            list of identifiers when several glyphs are needed.
        full_glyph_context: the jamo entries (dicts with a "jamo" key) of the
            whole syllable block.

    Returns:
        A two-element list. The first item is the comma-joined jamo of the
        block with ``<G>`` standing in for every entry equal to ``curr_jamo``;
        the second is the glyph, comma-joined when it was a list.
    """
    key_parts = [
        "<G>" if entry["jamo"] == curr_jamo else entry["jamo"]
        for entry in full_glyph_context
    ]
    if isinstance(curr_glyph, list):
        formatted_curr_glyph = ",".join(curr_glyph)
    else:
        formatted_curr_glyph = curr_glyph
    return [",".join(key_parts), formatted_curr_glyph]
# Enumerate through the db, and consolidate duplicates / find the most common
# glyph for each jamo position. Every observed cell (a non-empty list) becomes
# {"defaultGlyph": <most common glyph>, "alternatives": {<context key>: <glyph>}}
for jamo, orientations in jamo_combinations.items():
    for index, orientation in enumerate(orientations):
        # None -> orientation not applicable, [] -> never observed: leave both as-is
        if orientation is None or len(orientation) == 0:
            continue

        glyph_frequencies = {}
        contexts_by_glyph = {}
        for entry in orientation:
            glyph = entry["glyph"]
            # multi-glyph entries are keyed by their comma-joined identifiers
            glyph_key = ",".join(glyph) if isinstance(glyph, list) else glyph
            glyph_frequencies[glyph_key] = glyph_frequencies.get(glyph_key, 0) + 1
            contexts_by_glyph.setdefault(glyph_key, []).append(
                format_alternative(jamo, glyph, entry["context"])
            )

        # max() keeps the first glyph seen among equally frequent ones, the same
        # tie-break as a strict "greater than" scan
        most_common_glyph = max(glyph_frequencies, key=glyph_frequencies.get)
        result = {"defaultGlyph": most_common_glyph, "alternatives": {}}

        # TODO - handle if this is multiple glyphs
        # Flatten the contexts of every non-default glyph into the alternatives map
        for glyph_key, formatted_contexts in contexts_by_glyph.items():
            if glyph_key == most_common_glyph:
                continue
            for context_key, formatted_glyph in formatted_contexts:
                result["alternatives"][context_key] = formatted_glyph

        # Overwrite the db value
        jamo_combinations[jamo][index] = result
# These are found MANUALLY by iterating through all combinations and finding alternatives
# for jamo combinations to be legible
manual_encoding_additions = {
# Choseong (initial)
"": [
[],
[],
["0xb3:<G>,ᅫ"],
["0x33:<G>,ᅤ,*", "0x33:<G>,ᅦ,*", "0x33:<G>,ᅨ,*"],
[],
["0xae:<G>,ᅴ,*"],
],
"": [
["!0x34", "0x07:<G>,ᅵ"],
["0x67:<G>,ᅭ"],
[],
["!0x34"],
["!0x65", "0x68:<G>,ᅭ,*", "0x68:<G>,ᅮ,*", "0x68:<G>,ᅲ,*", "0x68:<G>,ᅳ,*"],
[
"!0x91",
"0x8f:<G>,ᅪ,*",
"0x64:<G>,ᅫ,*",
"0xb3:<G>,ᅰ,*",
"0x8f:<G>,ᅬ,*",
],
],
"": [
["0x1f:<G>,ᅡ", "0x1f:<G>,ᅣ", "0x4a:<G>,ᅨ", "0x35:<G>,ᅤ"],
[],
["0xb4:<G>,ᅫ", "0xb4:<G>,ᅰ"],
[
"0x35:<G>,ᅢ,*",
"0x35:<G>,ᅤ,*",
"0x35:<G>,ᅦ,*",
"0x1f:<G>,ᅧ,*",
"0x1f:<G>,ᅥ,*",
"0x4a:<G>,ᅨ,*",
],
[],
["0xb4:<G>,ᅪ,*", "0xb4:<G>,ᅫ,*", "0xb4:<G>,ᅰ,*"],
],
"": [
["0x26:<G>,ᅡ", "0x36:<G>,ᅤ", "0x26:<G>,ᅣ", "0x36:<G>,ᅧ", "0x51:<G>,ᅨ"],
[],
["0xb5:<G>,ᅪ", "0xb5:<G>,ᅰ"],
[
"0x36:<G>,ᅢ,*",
"0x36:<G>,ᅤ,*",
"0x51:<G>,ᅦ,*",
"0x26:<G>,ᅧ,*",
"0x51:<G>,ᅨ,*",
],
[],
["0xb5:<G>,ᅪ,*", "0xb5:<G>,ᅫ,*", "0xb5:<G>,ᅰ,*"],
],
"": [
[
"0x27:<G>,ᅡ",
"0x52:<G>,ᅢ",
"0x52:<G>,ᅤ",
"0x27:<G>,ᅣ",
"0x27:<G>,ᅧ",
"0x52:<G>,ᅨ",
"0x27:<G>,ᅵ",
],
[],
["0xb6:<G>,ᅪ", "0xb6:<G>,ᅫ", "0xb6:<G>,ᅰ"],
[
"!0x0a",
"0x52:<G>,ᅢ,*",
"0x52:<G>,ᅣ,*",
"0x52:<G>,ᅤ,*",
"0x52:<G>,ᅦ,*",
"0x52:<G>,ᅧ,*",
"0x52:<G>,ᅨ,*",
],
[],
["0xb6:<G>,ᅪ,*", "0xb6:<G>,ᅫ,*", "0xb6:<G>,ᅰ,*"],
],
"": [
["0x0b:<G>,ᅣ", "0x38:<G>,ᅤ"],
[],
["0xb7:<G>,ᅪ", "0xb7:<G>,ᅫ", "0xb7:<G>,ᅰ"],
[
"0x0b:<G>,ᅵ,*",
"0x0b:<G>,ᅡ,*",
"0x38:<G>,ᅢ,*",
"0x38:<G>,ᅤ,*",
"0x53:<G>,ᅦ,*",
"0x53:<G>,ᅨ,*",
],
[],
["0x95", "0xb7:<G>,ᅪ,*", "0xb7:<G>,ᅫ,*", "0xb7:<G>,ᅰ,*"],
],
"": [
["0x39:<G>,ᅤ", "0x39:<G>,ᅨ"],
[],
["0xb2:<G>,ᅪ", "0xb2:<G>,ᅫ", "0xb2:<G>,ᅰ"],
["0x39:<G>,ᅢ,*", "0x39:<G>,ᅤ,*", "0x39:<G>,ᅦ,*", "0x39:<G>,ᅨ,*"],
[],
["0xb2:<G>,ᅪ,*", "0xb2:<G>,ᅫ,*", "0xb2:<G>,ᅰ,*"],
],
"": [
["0x3a:<G>,ᅤ", "0x3a:<G>,ᅨ"],
[],
["0xb8:<G>,ᅪ", "0xb8:<G>,ᅫ", "0xb8:<G>,ᅰ"],
["0x3a:<G>,ᅢ,*", "0x3a:<G>,ᅤ,*", "0x3a:<G>,ᅦ,*", "0x3a:<G>,ᅨ,*"],
[],
["0xb8:<G>,ᅪ,*", "0xb8:<G>,ᅫ,*", "0xb8:<G>,ᅰ,*"],
],
"": [
["!0x3b", "0x0e:<G>,ᅥ", "0x0e:<G>,ᅵ"],
[],
["0x98"],
[
"0x3b:<G>,ᅢ,*",
"0x3b:<G>,ᅣ,*",
"0x3b:<G>,ᅤ,*",
"0x3b:<G>,ᅦ,*",
"0x3b:<G>,ᅧ,*",
"0x3b:<G>,ᅨ,*",
],
[],
["0x98"],
],
"": [
["0x4b:<G>,ᅤ", "0x4b:<G>,ᅨ"],
[],
["0xb9:<G>,ᅪ", "0xb9:<G>,ᅰ"],
["0x4b:<G>,ᅢ,*", "0x4b:<G>,ᅤ,*", "0x4b:<G>,ᅦ,*", "0x4b:<G>,ᅨ,*"],
[],
["0xb9:<G>,ᅪ,*", "0xb9:<G>,ᅫ,*", "0xb9:<G>,ᅰ,*"],
],
"": [
[
"0x4c:<G>,ᅢ",
"0x4c:<G>,ᅤ",
"0x4c:<G>,ᅥ",
"0x4c:<G>,ᅧ",
"0x4c:<G>,ᅨ",
],
[],
["0xba:<G>,ᅪ", "0xba:<G>,ᅫ", "0xba:<G>,ᅰ"],
[
"0x4c:<G>,ᅢ,*",
"0x4c:<G>,ᅣ,*",
"0x4c:<G>,ᅤ,*",
"0x4c:<G>,ᅦ,*",
"0x4c:<G>,ᅧ,*",
"0x4c:<G>,ᅨ,*",
],
[],
["0x9a", "0xba:<G>,ᅪ,*", "0xba:<G>,ᅫ,*", "0xba:<G>,ᅰ,*"],
],
"": [
[],
[],
["0xbb:<G>,ᅪ"],
["0x3c:<G>,ᅢ,*", "0x3c:<G>,ᅤ,*", "0x3c:<G>,ᅦ,*", "0x3c:<G>,ᅨ,*"],
[],
["0xbb:<G>,ᅪ,*", "0xbb:<G>,ᅫ,*", "0xbb:<G>,ᅰ,*"],
],
"": [
["0x3d:<G>,ᅨ"],
[],
["0xbc:<G>,ᅪ", "0xbc:<G>,ᅫ", "0xbc:<G>,ᅰ"],
["0x3d:<G>,ᅢ,*", "0x3d:<G>,ᅤ,*", "0x3d:<G>,ᅦ,*", "0x3d:<G>,ᅨ,*"],
[],
["0xbc:<G>,ᅪ,*", "0xbc:<G>,ᅫ,*", "0xbc:<G>,ᅰ,*"],
],
"": [
["!0x3e", "0x11:<G>,ᅵ"],
[],
["0xbd:<G>,ᅪ", "0xbd:<G>,ᅫ", "0xbd:<G>,ᅰ"],
[
"0x3e:<G>,ᅢ,*",
"0x3e:<G>,ᅣ,*",
"0x3e:<G>,ᅤ,*",
"0x3e:<G>,ᅦ,*",
"0x3e:<G>,ᅧ,*",
"0x3e:<G>,ᅨ,*",
"0x3e:<G>,ᅵ,*",
],
[],
["0xbd:<G>,ᅪ,*", "0xbd:<G>,ᅫ,*", "0xbd:<G>,ᅰ,*"],
],
"": [
["0x58:<G>,ᅤ", "0x58:<G>,ᅨ"],
[],
["0xbe:<G>,ᅪ", "0xbe:<G>,ᅫ", "0xbe:<G>,ᅰ"],
["0x58:<G>,ᅢ,*", "0x58:<G>,ᅤ,*", "0x58:<G>,ᅦ,*", "0x58:<G>,ᅨ,*"],
[],
["0x9e", "0xbe:<G>,ᅪ,*", "0xbe:<G>,ᅫ,*", "0xbe:<G>,ᅰ,*"],
],
"": [
["0x3f:<G>,ᅤ", "0x3f:<G>,ᅨ"],
[],
["0xaf:<G>,ᅪ", "0xaf:<G>,ᅫ", "0xaf:<G>,ᅰ"],
["0x3f:<G>,ᅢ,*", "0x3f:<G>,ᅤ,*", "0x3f:<G>,ᅦ,*", "0x3f:<G>,ᅨ,*"],
[],
["0xaf:<G>,ᅪ,*", "0xaf:<G>,ᅫ,*", "0xaf:<G>,ᅰ,*"],
],
"": [
["0x13:<G>,ᅣ", "0x40:<G>,ᅤ", "0x54:<G>,ᅨ"],
[],
["0xbf:<G>,ᅪ", "0xbf:<G>,ᅫ", "0xbf:<G>,ᅰ"],
[
"0x40:<G>,ᅢ,*",
"0x40:<G>,ᅤ,*",
"0x54:<G>,ᅦ,*",
"0x40:<G>,ᅧ,*",
"0x54:<G>,ᅨ,*",
],
[],
["0x9f", "0xbf:<G>,ᅪ,*", "0xbf:<G>,ᅫ,*", "0xbf:<G>,ᅰ,*"],
],
"": [
[
"0x2a:<G>,ᅡ",
"0x41:<G>,ᅤ",
"0x2a:<G>,ᅣ",
"0x2a:<G>,ᅧ",
],
[],
["0xa0"],
[
"0x41:<G>,ᅢ,*",
"0x41:<G>,ᅣ,*",
"0x41:<G>,ᅤ,*",
"0x2a:<G>,ᅥ,*",
"0x55:<G>,ᅦ,*",
"0x41:<G>,ᅧ,*",
"0x55:<G>,ᅨ,*",
],
[],
["0xa0"],
],
"": [
["0x59:<G>,ᅤ"],
[],
["0xc0:<G>,ᅪ", "0xc0:<G>,ᅫ", "0xc0:<G>,ᅰ"],
[
"0x59:<G>,ᅡ,*",
"0x59:<G>,ᅢ,*",
"0x59:<G>,ᅣ,*",
"0x59:<G>,ᅤ,*",
"0x59:<G>,ᅦ,*",
"0x59:<G>,ᅧ,*",
"0x59:<G>,ᅨ,*",
],
[],
["0xc0:<G>,ᅪ,*", "0xc0:<G>,ᅫ,*", "0xc0:<G>,ᅰ,*"],
],
# Jungseong (middle)
"": [[], None, None, ["0x1a:*,<G>,ᆫ"], None, None],
"": [[], None, None, ["0x46:*,<G>,ᆫ"], None, None],
"": [
[],
None,
None,
[],
None,
None,
],
"": [[], None, None, ["0x43", "0x47:*,<G>,ᆫ"], None, None],
"": [
[],
None,
None,
["0x2f:ᄎ,<G>,*", "0x2f:ᄒ,<G>,*", "0x1d:*[^ᄎ;ᄐ;ᄒ],<G>,ᆫ"],
None,
None,
],
"": [[], None, None, ["0x5a:ᄒ,<G>,*", "0x48:*,<G>,ᆫ"], None, None],
"": [
[],
None,
None,
["0x25:ᄂ,<G>,*", "0x1e:ᄎ,<G>,*", "0x30:ᄒ,<G>,*", "0x1e:*,<G>,ᆫ"],
None,
None,
],
"": [
[
"0x50:ᄂ,<G>",
"0x50:ᄃ,<G>",
"0x5d:ᄄ,<G>",
"0x50:ᄅ,<G>",
"0x50:ᄆ,<G>",
"0x50:ᄈ,<G>",
],
None,
None,
["0x49:*,<G>,ᆫ"],
None,
None,
],
"": [
None,
[
"0x62:ᄂ,<G>",
"0x62:ᄄ,<G>",
"0x62:ᄅ,<G>",
"0x62:ᄋ,<G>",
"0x62:ᄐ,<G>",
"0x62:ᄒ,<G>",
],
None,
None,
["!0x82", "0x82:*,<G>,*"],
None,
],
"": [
None,
None,
["0x8e,0xa8:ᄁ,<G>"],
None,
None,
[
"0x8e,0xa6:ᄀ,<G>,*",
"0x8e,0xa8:ᄀ,<G>,ᆫ",
"0x8f,0xa6:ᄁ,<G>,*",
"0x8f,0xa8:ᄁ,<G>,ᆫ",
],
],
"": [
None,
None,
[],
None,
None,
[
"!0xc1",
"0xc2:*,<G>,ᆫ",
"0x8e,0x42:ᄀ,<G>,*",
"0x8e,0x46:ᄀ,<G>,ᆫ",
"0x64:ᄁ,<G>,*",
"0x64:ᄁ,<G>,ᆫ",
],
],
"": [
None,
None,
["0x8e,0xa9:ᄁ,<G>"],
None,
None,
[
"0xa2,0xa9:*,<G>,ᆫ",
"0x8e,0xa7:ᄀ,<G>,*",
"0x8e,0xa9:ᄀ,<G>,ᆫ",
"0x8f,0xa7:ᄁ,<G>,*",
"0x8f,0xa9:ᄁ,<G>,ᆫ",
],
],
"": [
None,
[
"0x61:ᄀ,<G>",
"0x67:ᄁ,<G>",
"0x63:ᄂ,<G>",
"0x63:ᄃ,<G>",
"0x63:ᄄ,<G>",
"0x63:ᄅ,<G>",
"0x63:ᄈ,<G>",
"0x63:ᄊ,<G>",
"0x63:ᄋ,<G>",
"0x63:ᄏ,<G>",
"0x63:ᄐ,<G>",
"0x63:ᄒ,<G>",
],
None,
None,
["!0x83"],
None,
],
"": [None, ["0x6e:ᄒ,<G>"], None, None, ["0x85:*,<G>,*", "0x89:*,<G>,ᆫ"], None],
"": [None, None, [], None, None, ["0xaa:*,<G>,*", "0xac:*,<G>,ᆫ"]],
"": [None, None, [], None, None, ["0xc4:*,<G>,ᆫ"]],
"": [None, None, [], None, None, ["0xad:*,<G>,ᆫ"]],
"": [
None,
[
"0x6f:ᄂ,<G>",
"0x6f:ᄄ,<G>",
"0x6f:ᄅ,<G>",
"0x6f:ᄆ,<G>",
"0x6f:ᄇ,<G>",
"0x6f:ᄈ,<G>",
"0x6f:ᄉ,<G>",
"0x6f:ᄊ,<G>",
"0x6f:ᄋ,<G>",
"0x6f:ᄌ,<G>",
"0x6f:ᄍ,<G>",
"0x6f:ᄎ,<G>",
"0x6f:ᄏ,<G>",
"0x6f:ᄐ,<G>",
"0x6f:ᄑ,<G>",
"0x6f:ᄒ,<G>",
],
None,
None,
["0x8a:*,<G>,ᆫ"],
None,
],
"": [None, ["0x6d:ᄐ,<G>", "0x6d:ᄒ,<G>"], None, None, ["0x84:*,<G>,*"], None],
"": [None, None, [], None, None, ["!0xa3,0xa7", "0xa3,0xa9:*,<G>,ᆫ"]],
"": [[], None, None, ["0x1c:*,<G>,ᆫ"], None, None],
# Jongseong (final)
"": [None, None, None, [], [], ["!0xfe"]],
"": [None, None, None, [], [], ["0xdf"]],
"": [None, None, None, ["0xc7"], [], ["0xe0"]],
"": [None, None, None, [], ["0xe1:*,*,<G>"], ["!0xff"]],
"": [None, None, None, [], ["0xc9"], ["0xc9"]],
"": [None, None, None, [], ["0xe2"], ["0xe2"]],
"": [None, None, None, [], [], ["0xe3"]],
"": [None, None, None, [], [], ["extra_0x86:*,*,<G>"]],
"": [None, None, None, [], [], ["0xe5"]],
"": [None, None, None, [], [], ["0xf8"]],
"": [None, None, None, [], ["0xcf"], ["0xcf"]],
"": [None, None, None, ["0xe7"], ["0xe7"], ["0xe7"]],
"": [None, None, None, ["0xd0"], ["0xe8"], ["0xe8"]],
"": [None, None, None, ["0xe9"], ["0xe9"], ["0xe9"]],
"": [None, None, None, [], [], ["0xea"]],
"": [None, None, None, [], [], ["extra_0x87", "extra_0x87:*,*,<G>"]],
"": [None, None, None, [], [], ["extra_0x88:*,*,<G>"]],
"": [None, None, None, [], ["0xd4"], ["0xd4"]],
"": [None, None, None, [], [], ["extra_0x89:*,*,<G>"]],
"": [None, None, None, [], ["0xfc"], ["extra_0x8a:*,*,<G>"]],
"": [None, None, None, [], [], ["extra_0x8b:*,*,<G>"]],
"": [None, None, None, [], [], ["0xef"]],
"": [None, None, None, [], ["0xf0"], ["0xf0"]],
"": [None, None, None, ["0xda"], ["0xf1"], ["0xf1"]],
"": [None, None, None, [], [], ["0xf2"]],
"": [None, None, None, [], [], ["0xf3"]],
"": [None, None, None, [], [], ["0xf4"]],
}
# Write an intermediate snapshot of the consolidated db; the same path is
# written again once the manual additions have been applied
with open("./jamo-db.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(jamo_combinations, indent=2))
# Fill in the rest of the encoding table with manually identified alternatives / additions
# Most of these are additions, but some override the original encoding (because it looked terrible)
#
# These are provided in specific formats:
# - 0x01:<G>,ᅤ -- means use the 0x01 glyph for drawing the jamo (in the position shown, in this case, only before ᅤ)
# - 0x01:<G>,* -- means use 0x01 no matter what comes after
# - 0x01:<G>,*[^ᄎ;ᄐ] -- means use 0x01 no matter what comes after (except ᄐ and ᄎ)
# - 0x01,0x02:<G>,* -- sometimes a jamo requires multiple glyphs
# - !0x01 -- means to override the default glyph, this is used very rarely
# - 0x01 -- means to set the default -- some jamos never were used at all. If a default is already set, this should throw an error
#
# Note that all of these changes will override any existing alternatives, so order matters even in these lists
# - 0x01:<G>,* and
# - 0x02:<G>,ᄎ
# will replace the ᄎ involving entry that uses 0x01 because of ordering!
# Position of a token inside a pattern -> the jamo a "*" at that position stands for
jamo_replacements = {
    0: jamo_groupings["initial"],
    1: jamo_groupings["median"],
    2: jamo_groupings["final"],
}
added_defaults = 0
replaced_defaults = 0


def _expand_alternative_pattern(glyph, pattern, replacements):
    """Expand one ``pattern`` into concrete ``{"glyph", "jamo_combination"}`` entries.

    A pattern without "*" is returned unchanged as a single entry. Otherwise
    every wildcard token ("*" or "*[^a;b]") is replaced by each jamo from
    ``replacements[<token position>]``, minus the ";"-separated exclusions, and
    all combinations are generated in left-to-right order.
    """
    if "*" not in pattern:
        return [{"glyph": glyph, "jamo_combination": pattern}]

    tokens = pattern.split(",")
    wildcard_indices = []
    candidate_lists = []
    for position, token in enumerate(tokens):
        if token == "*":
            excluded = set()
        elif token.startswith("*[^"):
            # exclusions are ";"-separated because "," already separates tokens
            excluded = set(token[3:-1].split(";"))
        else:
            continue
        wildcard_indices.append(position)
        candidate_lists.append(
            [candidate for candidate in replacements[position] if candidate not in excluded]
        )

    expanded = []
    for combo in product(*candidate_lists):
        generated = list(tokens)
        for position, value in zip(wildcard_indices, combo):
            generated[position] = value
        expanded.append({"glyph": glyph, "jamo_combination": ",".join(generated)})
    return expanded


for jamo, orientations in jamo_combinations.items():
    for index, orientation in enumerate(orientations):
        if orientation is None:
            continue
        new_alternative_list = manual_encoding_additions[jamo][index]
        if new_alternative_list is None or len(new_alternative_list) == 0:
            continue

        # enumerate new list and collect info / generate list of new alternatives
        new_default = None
        force_default = False
        alternative_list = []
        for item in new_alternative_list:
            # - !0x01 -- means to override the default glyph, this is used very rarely
            # - 0x01 -- means to set the default -- some jamos never were used at all
            if ":" not in item:
                if item.startswith("!"):
                    force_default = True
                    new_default = item[1:]
                else:
                    new_default = item
                continue
            parts = item.split(":")
            # order matters, so the expanded entries are appended to a list for now
            alternative_list.extend(
                _expand_alternative_pattern(parts[0], parts[1], jamo_replacements)
            )

        # Ok, now we have our big list of new alternatives
        # we go through them one by one, adding them to the existing map;
        # duplicates are not a concern, the last one wins
        #
        # We also must set the new default if applicable
        new_orientation = orientation
        if new_default is not None:
            if isinstance(new_orientation, list) and len(new_orientation) == 0:
                new_orientation = {"defaultGlyph": new_default, "alternatives": {}}
                added_defaults = added_defaults + 1
            elif force_default:
                new_orientation["defaultGlyph"] = new_default
                replaced_defaults = replaced_defaults + 1
            else:
                current_default = new_orientation["defaultGlyph"]
                print(
                    f"Trying to replace the default {current_default} with {new_default} improperly"
                )
                raise SystemExit(1)

        # Alternatives
        if len(alternative_list) > 0:
            if not isinstance(new_orientation, dict):
                # a never-observed cell has no default glyph to hang alternatives on
                print(
                    f"{jamo} orientation {index} has alternatives but no default glyph, add one"
                )
                raise SystemExit(1)
            for new_alt in alternative_list:
                new_orientation["alternatives"][new_alt["jamo_combination"]] = new_alt[
                    "glyph"
                ]

        # Finally, update the DB
        jamo_combinations[jamo][index] = new_orientation
# Print some Stats again!
empty_cells = 0
# Starts as every known glyph; whatever is left at the end is still unused
new_glyph_list = set(jamo_glyph_mappings)
for jamo, orientations in jamo_combinations.items():
    for orientation in orientations:
        if isinstance(orientation, dict):
            # A value may name several glyphs joined by "," (e.g. "0x8e,0xa6"),
            # each of them counts as used
            new_glyph_list.difference_update(orientation["defaultGlyph"].split(","))
            for glyph in orientation["alternatives"].values():
                new_glyph_list.difference_update(glyph.split(","))
        elif orientation is not None:
            # still an empty list: nothing observed and nothing added manually
            print(f"{jamo} - {orientation}")
            empty_cells = empty_cells + 1
print()
print(f"Added {added_defaults} defaults")
print(f"Replaced {replaced_defaults} defaults")
print(f"{empty_cells} empty jamo cells\n")
print(
    f"Still did not see {len(new_glyph_list)} out of {len(jamo_glyph_mappings.keys())} glyphs:"
)
# glyphs that were unused before the manual additions but are used now
print(
    f"Used an additional {len(glyph_list.difference(new_glyph_list))} glyphs, never seen in the original game!"
)
print("Never Used Glyphs:")
print(new_glyph_list)
# Print the results
with open("./jamo-db.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(jamo_combinations, indent=None))
# pprint(jamo_combinations)
# Export some CSV results so that we can fill in the rest of the encoding using excel (easier to keep track of
# what's missing)
# This CSV table will only include the most common for each as:
# - we already have the alternatives, we aren't going to check those
# - we will add a new alternative, only if the common glyphs don't match (and we don't already have one, which i can manually check)
# Use the lists so we have a consistent ordering
def _format_csv_cell(orientation):
    """Render one orientation cell of the jamo db as a CSV field.

    Returns "N/A" for a non-applicable orientation (None), an empty string for
    a cell that is still an empty list, and otherwise a double-quoted field
    holding the default glyph followed by one "- <glyph> for <context>" line
    per alternative.
    """
    if orientation is None:
        return "N/A"
    if isinstance(orientation, list) and len(orientation) == 0:
        return ""
    alternative_entries = [
        f"- {alternative_glyph} for {context.replace('<GLYPH>', '<G>')}"
        for context, alternative_glyph in orientation["alternatives"].items()
    ]
    alternatives = "\n".join(alternative_entries)
    # commas would split the CSV field, so they are swapped out
    default_glyph = orientation["defaultGlyph"].replace(",", " ")
    if len(alternatives) > 0:
        alternatives = alternatives.replace(",", ";")
        return f'"{default_glyph}\n{alternatives}"'
    return f'"{default_glyph}"'


# One CSV row per jamo, one column per writing orientation; the grouping lists
# give a consistent ordering (initial, then median, then final)
csv_lines = []
for group_name in ("initial", "median", "final"):
    for jamo in jamo_groupings[group_name]:
        cells_in_line = [
            _format_csv_cell(orientation) for orientation in jamo_combinations[jamo]
        ]
        csv_lines.append(",".join(cells_in_line) + "\n")
# with open("./jamo-db.csv", mode="w", encoding="utf-8") as f:
# f.writelines(csv_lines)
# game -> UTF-8
# - convert glyphs into individual jamo (and sometimes ascii) (lookup table)
# - compose jamo into syllable blocks (python lib)
# UTF-8 -> game
# - decompose syllable blocks into jamo (python lib)
# - convert jamo into glyphs using our lookup DB