jak-project/scripts/ci/lint-characters.py

258 lines
15 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import glob
import json
import re
import argparse
# Command-line interface. The only option is --fix; the rest of the script
# reads it as `args.fix` to decide whether to rewrite the checked JSON files
# or only report problems.
parser = argparse.ArgumentParser(
    description="Check translation and subtitle JSON files for characters outside the allowed set."
)
# `store_true` already defaults to False, so no separate set_defaults() call
# is needed.
parser.add_argument(
    "--fix",
    action="store_true",
    help="attempt to replace disallowed characters and write the files back",
)
args = parser.parse_args()
# fmt: off
JAK1_ALLOWED_CHARACTERS = [
"_", # NOTE - not an actual underscore, adds a long space!
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"'", "!", "(", ")", "+", "-", ",", ".", "/", ":", "=", "<", ">", "*", "%", "?", "\"",
"`", "ˇ", "¨", "º", "¡", "¿", "Æ", "Ç", "ß", "", "", " ", "Å", "Ø", "Ą", "Ę", "Ł", "Ż","Ů", "Ý", "Č", "Ň", "Ř", "Š", "Ť", "Ž",
"Ñ", "Ã", "Õ", "Á", "É", "Í", "Ó", "Ú", "Ć", "Ń", "Ś", "Ź", "Ő", "Ű", "Â", "Đ", "Ê", "Î", "Ô", "Û", "À", "È", "Ì", "Ò", "Ù", "Ä", "Ë", "Ï", "Ö", "ö", "Ü", "Ė","Č","Š","Ž","Ų","Ū","Į","Ǎ","Ě","Ǧ","Ǐ","Ǒ","Ǔ","",
"Ď",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "",
"~", "Œ"
]
# Multi-character "<...>" tokens that are accepted as a unit in jak1 strings,
# even if some of their individual characters are not in the allowed list.
JAK1_ALLOWED_CODES = [
    "<TIL>",
    # PAD_* codes
    "<PAD_X>",
    "<PAD_TRIANGLE>",
    "<PAD_CIRCLE>",
    "<PAD_SQUARE>",
]
JAK1_AUTO_REPLACEMENTS = {
"ª": "º",
"\n": "",
"": "'",
"·": "-",
"": "-",
"": "",
"": ",,",
"": "\"",
" ": " ",
"": "!",
"": "(",
"": ")",
"": ".",
"×": "x",
"": "?"
}
JAK2_ALLOWED_CHARACTERS = [
"_", # NOTE - not an actual underscore, adds a long space!
"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"'", "!", "(", ")", "+", "-", ",", ".", "/", ":", "=", "<", ">", "*", "%", "?", "\"",
"`", "ˇ", "¨", "º", "¡", "¿", "Æ", "Ç", "ß", "", "", " ", "Å", "Ø", "Ą", "Ę", "Ł", "Ż",
"Ů", "ů", "Ý", "ý", "Č", "č", "Ň", "ň", "Ř", "ř", "Š", "š", "Ť", "ť", "Ž", "ž", "Đ", "đ",
"æ", "ø", "œ",
"Ñ", "Ã", "Õ", "Á", "É", "Í", "Ó", "Ú", "Ć", "Ń", "Ś", "Ź", "ź", "Ő", "Ű", "Â", "Ê", "Î", "Ô", "Û", "À", "È", "Ì", "Ò", "Ù", "Ä", "Ë", "Ï", "ï", "Ö", "ö", "Ü", "Ė","Č","Š","Ž","Ų","Ū","Į",
"ñ", "á", "é", "í", "ó", "ú", "â", "ê", "î", "ô", "û", "à", "è", "ì", "ò", "ù", "ë", "ä", "ö", "ü", "ś", "å", "õ", "ã", "ę", "ż", "ć", "ą", "ł", "ń", "ű", "ő", "ė","č","š","ž","ų","ū","į",
"Ǎ","Ě","Ǧ","Ǐ","Ǒ","Ǔ","","ǎ","ě","ǧ","ǐ","ǒ","ǔ","",
"Ď",
"", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "使", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "退", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "",
"~", "Œ", "°", "ç"
]
# Multi-character "<...>" tokens that are accepted as a unit in jak2 strings,
# even if some of their individual characters are not in the allowed list.
JAK2_ALLOWED_CODES = [
    "<TIL>",
    "<SUPERSCRIPT_QUOTE>",
    # PAD_* codes
    "<PAD_X>",
    "<PAD_TRIANGLE>",
    "<PAD_CIRCLE>",
    "<PAD_SQUARE>",
    "<PAD_DPAD_UP>",
    "<PAD_DPAD_DOWN>",
    "<PAD_DPAD_ANY>",
    "<PAD_L1>",
    "<PAD_R1>",
    "<PAD_R2>",
    "<PAD_L2>",
    "<PAD_ANALOG_ANY>",
    "<PAD_ANALOG_LEFT_RIGHT>",
    "<PAD_ANALOG_UP_DOWN>",
    # ICON_* codes
    "<ICON_MISSION_COMPLETE>",
    "<ICON_MISSION_TODO>",
    # FLAG_* codes
    "<FLAG_ITALIAN>",
    "<FLAG_SPAIN>",
    "<FLAG_GERMAN>",
    "<FLAG_FRANCE>",
    "<FLAG_UK>",
    "<FLAG_USA>",
    "<FLAG_KOREA>",
    "<FLAG_JAPAN>",
    "<FLAG_FINLAND>",
    "<FLAG_SWEDEN>",
    "<FLAG_DENMARK>",
    "<FLAG_NORWAY>",
    "<FLAG_ICELAND>",
]
JAK2_AUTO_REPLACEMENTS = {
"ª": "º",
"\n": "",
"": "'",
"·": "-",
"": "-",
"": "",
"": ",,",
"": "\"",
" ": " ",
"": "!",
"": "(",
"": ")",
"": "~",
"": ".",
"×": "x",
"": "?",
"": "-",
";": ",",
"": ": ",
"": "...",
"«": "<",
"»": ">",
"": " ",
"": "\"",
"'̂'": "",
"": "ų",
"": "'"
}
# fmt: on
# Script-wide failure flag: fix_games_translations sets it to True when a
# checked string still contains a disallowed character, and the end of the
# script reads it to choose between exiting with status 1 and reporting success.
return_error = False
def is_korean_syllable(char):
    """Return True when ``char`` falls within U+AC00..U+D7A3, inclusive.

    The check is an ordinary string comparison against the two boundary
    characters, so callers are expected to pass a single character.
    """
    first_syllable = "\uAC00"
    last_syllable = "\uD7A3"
    if char < first_syllable:
        return False
    return char <= last_syllable
def is_char_allowed(game_name, char, allowed_characters):
    """Report whether ``char`` may appear in a string for ``game_name``.

    A character is accepted when it is a member of ``allowed_characters``.
    For every game other than "jak1", anything accepted by
    ``is_korean_syllable`` is allowed as well.
    """
    if char in allowed_characters:
        return True
    # Only games other than jak1 get the extra Korean-syllable allowance.
    return game_name != "jak1" and is_korean_syllable(char)
def is_allowed_code(pos, text, allowed_codes):
    """Return the end index of the allowed code covering ``pos``, or -1.

    Every occurrence of every entry of ``allowed_codes`` in ``text`` is
    located; if ``pos`` lies inside one of them, the index just past that
    occurrence is returned so the caller can skip over the whole code.

    Args:
        pos: Index into ``text`` of the character being examined.
        text: The string being linted.
        allowed_codes: Iterable of literal code tokens (e.g. "<TIL>").

    Returns:
        The exclusive end index of the covering occurrence (always greater
        than ``pos``), or -1 when ``pos`` is not inside any allowed code.
    """
    for code in allowed_codes:
        # Codes are literal tokens, not patterns, so escape them before
        # handing them to the regex engine.
        for match in re.finditer(re.escape(code), text):
            # match.end() is exclusive: the character at that index is the
            # first one AFTER the code and must not be treated as part of it.
            if match.start() <= pos < match.end():
                return match.end()
    return -1
def fix_character(game_name, char, allowed_characters, auto_replacements):
    """Suggest a replacement for a disallowed ``char``.

    The upper-cased form of ``char`` is preferred when ``is_char_allowed``
    accepts it. Otherwise the ``auto_replacements`` mapping is consulted, and
    ``char`` itself is returned when it has no entry there.
    """
    capitalised = char.upper()
    if not is_char_allowed(game_name, capitalised, allowed_characters):
        # Upper-casing did not help: fall back to the replacement table,
        # handing the character back unchanged when no replacement is known.
        return auto_replacements.get(char, char)
    return capitalised
def replace_character(string, position, new_character):
    """Return a copy of ``string`` with the item at ``position`` swapped out.

    ``new_character`` may be any string (including an empty or multi-character
    one); it takes the place of exactly one character of ``string``.
    """
    # Strings are immutable, so edit a list of the individual characters.
    characters = [*string]
    characters[position] = new_character
    return "".join(characters)
def lint_characters(game_name, text, allowed_characters, allowed_codes, auto_replacements):
    """Scan ``text`` for disallowed characters, optionally fixing them.

    Each character is checked with ``is_char_allowed``. A disallowed character
    that sits inside one of ``allowed_codes`` causes the whole code to be
    skipped. Any other disallowed character is replaced via ``fix_character``
    when the module-level ``args.fix`` flag is set and a replacement exists;
    otherwise it is reported on stdout.

    Args:
        game_name: Game identifier forwarded to the character checks.
        text: The string to lint.
        allowed_characters: Collection of characters accepted as-is.
        allowed_codes: Literal multi-character codes accepted as a unit.
        auto_replacements: Mapping of disallowed character -> replacement.

    Returns:
        A ``(invalid_characters_found, text)`` tuple: whether any character
        was reported as invalid, and the (possibly modified) text.
    """
    invalid_characters_found = False
    pos = 0
    while pos < len(text):
        character = text[pos]
        if is_char_allowed(game_name, character, allowed_characters):
            pos += 1
            continue

        # Check to see if it's part of an allowed code. Only accept an end
        # index that actually moves the scan forward; anything else (-1, or an
        # end equal to `pos`) is treated as "not in a code" so the loop can
        # never stall on the same position.
        code_end_pos = is_allowed_code(pos, text, allowed_codes)
        if code_end_pos > pos:
            # advance to the end of the code and continue checking
            pos = code_end_pos
            continue

        # If we are fixing instances, attempt to do so
        if args.fix:
            new_char = fix_character(game_name, character, allowed_characters, auto_replacements)
            if new_char != character:
                text = replace_character(text, pos, new_char)
                # An empty replacement deletes the character, which shifts the
                # next one into `pos`; stay put so that character is checked
                # too instead of being skipped.
                if new_char:
                    pos += 1
                continue

        print(
            "Character '{}' not allowed - Found in {} in string {}".format(
                character, text, text
            )
        )
        invalid_characters_found = True
        pos += 1
    return invalid_characters_found, text
def _lint_string_map(game_name, strings, allowed_characters, allowed_codes, auto_replacements):
    """Lint every value of ``strings`` (key -> text); return True if any is invalid.

    When ``args.fix`` is set, changed texts are written back into ``strings``.
    """
    found_invalid = False
    for key, text in strings.items():
        invalid_chars_exist, new_text = lint_characters(game_name, text, allowed_characters, allowed_codes, auto_replacements)
        # Re-assigning an existing key does not change the dict's size, so it
        # is safe while iterating.
        if args.fix and new_text != text:
            strings[key] = new_text
        if invalid_chars_exist:
            found_invalid = True
    return found_invalid


def _lint_line_groups(game_name, groups, allowed_characters, allowed_codes, auto_replacements):
    """Lint every line of every list in ``groups`` (key -> list of lines).

    Returns True if any line is invalid. When ``args.fix`` is set, changed
    lines are replaced in place inside their list.
    """
    found_invalid = False
    for lines in groups.values():
        for i, line in enumerate(lines):
            invalid_chars_exist, new_text = lint_characters(game_name, line, allowed_characters, allowed_codes, auto_replacements)
            if args.fix and new_text != line:
                lines[i] = new_text
            if invalid_chars_exist:
                found_invalid = True
    return found_invalid


def _save_json(path, data):
    """Write ``data`` to ``path`` as indented UTF-8 JSON with a trailing newline."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
        f.write("\n")


def fix_games_translations(game_name, allowed_characters, allowed_codes, auto_replacements):
    """Lint (and optionally fix) all text and subtitle JSON files of one game.

    Text files are read from ``./game/assets/<game_name>/text/*.json`` and are
    treated as a flat key -> string mapping. Subtitle files are read from
    ``./game/assets/<game_name>/subtitle/*lines*.json``; their "speakers"
    mapping, their "cutscenes" line lists, and either "hints" (for "jak1") or
    "other" (for any other game) are checked.

    Side effects:
        Prints progress and findings to stdout, sets the module-level
        ``return_error`` flag to True when an invalid character is reported,
        and, when ``args.fix`` is set, rewrites every checked file.

    Args:
        game_name: Game identifier used to build the asset paths.
        allowed_characters: Collection of characters accepted as-is.
        allowed_codes: Literal multi-character codes accepted as a unit.
        auto_replacements: Mapping of disallowed character -> replacement.
    """
    global return_error
    print(f"Checking {game_name} translations")

    # Iterate through the translations making sure there are no characters that are not allowed
    for text_file in glob.glob(f"./game/assets/{game_name}/text/*.json"):
        print("Checking {}".format(text_file))
        with open(text_file, encoding="utf-8") as f:
            file_data = json.load(f)
        if _lint_string_map(game_name, file_data, allowed_characters, allowed_codes, auto_replacements):
            return_error = True
        if args.fix:
            # save the modified file back out
            _save_json(text_file, file_data)

    # jak1 subtitle files carry a "hints" section; other games use "other".
    extra_section = "hints" if game_name == "jak1" else "other"
    for subtitle_file in glob.glob(f"./game/assets/{game_name}/subtitle/*lines*.json"):
        print("Checking {}...".format(subtitle_file))
        with open(subtitle_file, encoding="utf-8") as f:
            file_data = json.load(f)
        # Check Speakers
        if _lint_string_map(game_name, file_data["speakers"], allowed_characters, allowed_codes, auto_replacements):
            return_error = True
        # Check Lines
        for section in ("cutscenes", extra_section):
            if _lint_line_groups(game_name, file_data[section], allowed_characters, allowed_codes, auto_replacements):
                return_error = True
        if args.fix:
            # save the modified file back out
            _save_json(subtitle_file, file_data)
# Script entry point: lint both games. fix_games_translations records any
# finding in the module-level `return_error` flag.
fix_games_translations("jak1", JAK1_ALLOWED_CHARACTERS, JAK1_ALLOWED_CODES, JAK1_AUTO_REPLACEMENTS)
fix_games_translations("jak2", JAK2_ALLOWED_CHARACTERS, JAK2_ALLOWED_CODES, JAK2_AUTO_REPLACEMENTS)

if return_error:
    print("Invalid characters were found, see above")
    # Exit with status 1. SystemExit is raised directly instead of calling
    # exit(), which is a helper injected by the `site` module and is not
    # guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(1)
print("No invalid characters found!")