adding final updates

This commit is contained in:
Angel Valentin 2025-09-02 16:41:13 -04:00
parent 353a576da5
commit f44424e02f
6 changed files with 984 additions and 607 deletions

View File

@ -18,17 +18,13 @@ Made a symlink -> ln -s ~/dev/balatro-rl/RLBridge /mnt/gamerlinuxssd/SteamLibrar
### File-based Communication
- [x] JSON file communication system
- [x] Lua file writer in mod
- [x] Python file watcher with watchdog (see the sketch after this list)
- [x] Game state transmission (hand cards, chips, available actions)
- [x] Action reception and execution
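
A minimal sketch of the Python file-watcher side of this communication loop, assuming the mod writes game state to a JSON file; the handler class and file name below are illustrative, not taken from the repo:

```python
import json
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class GameStateHandler(FileSystemEventHandler):
    def on_modified(self, event):
        # "game_state.json" is a hypothetical file name for the mod's output.
        if event.src_path.endswith("game_state.json"):
            with open(event.src_path) as f:
                state = json.load(f)
            # Hand cards, chips, and available actions would be read from `state` here.
            print(state.get("chips"), state.get("available_actions"))

observer = Observer()
observer.schedule(GameStateHandler(), path=".", recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
finally:
    observer.stop()
    observer.join()
```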
### RL Training
- [x] Python RL environment setup
- [x] AI model architecture
- [x] Training loop integration
- [x] Python RL custom environment setup
### Game Features
- [x] Always have restart_run as an action option assuming the game is ongoing
- [x] Make it so that if we lose, we can restart, or if we win a round and see the "cash out" page, we also restart. But reaching the "cash out" state should give a large reward to incentivize the AI
@ -47,6 +43,10 @@ chips. Perhaps we just want to get wins of rounds; just scoring chips is not enough
- We would probably store the raw requests and raw responses; if we win, we save them, otherwise we reset the list
- The idea is that I'll have the seed, so I can look at the actions (the requests and responses), plug the seed into the game manually, and play it out myself
- Add something where we only keep the top 5. I don't want a long log of a bunch of wins
- [x] Should I reward higher for beating the game in the fewest hands possible? Note that playing more hands currently earns more total reward than playing one hand, even if that one hand is really good
- [x] On that note, should we give more reward for MONSTER hands? Rewards are currently based on blind size, but what if a single hand surpasses it by a lot, scoring the blind requirement or more in one hand? Maybe that solves the above problem
- [ ] Speed up training somehow. Is parallelization possible? Maybe through Docker and buying Balatro on multiple Steam accounts (see the sketch below)
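
A hedged sketch of what parallelization could look like if multiple game instances were available (one per Docker container / Steam account). `BalatroEnv` and the per-instance wiring are hypothetical, and a Stable-Baselines3-style setup is assumed rather than confirmed by this diff:

```python
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from balatro_env import BalatroEnv  # hypothetical module/class name

def make_env(instance_id: int):
    def _init():
        # Each worker would talk to its own game instance via its own state/action files.
        return BalatroEnv(instance_id=instance_id)  # hypothetical constructor
    return _init

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_env(i) for i in range(4)])
    model = PPO("MlpPolicy", vec_env, verbose=1)
    model.learn(total_timesteps=250_000)
```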
### DEBUGGING
- [ ] I think it's counting the reset as an episode? Review how it calculates episodes for reward logging; I think something MIGHT be wrong. Also check it in general because AI wrote it and I might need to update it

View File

@ -29,10 +29,11 @@ class BalatroRewardCalculator:
self.winning_chips = 0 # Store chips when blind is defeated
# Percentage-based reward thresholds (% of blind requirement)
# Updated to encourage bigger single hands
self.REWARD_THRESHOLDS = {
"excellent": 80.0, # 80%+ of blind requirement
"good": 50.0, # 50-79% of blind requirement
"decent": 25.0 # 25-49% of blind requirement
"excellent": 75.0, # 75%+ of blind requirement (lowered from 80%)
"good": 40.0, # 40-74% of blind requirement (lowered from 50%)
"decent": 20.0 # 20-39% of blind requirement (lowered from 25%)
}
def calculate_reward(self, current_state: Dict[str, Any],
@ -81,7 +82,11 @@ class BalatroRewardCalculator:
# Calculate percentage of blind requirement this hand achieved
chip_percentage = (chip_gain / blind_chips) * 100
if chip_percentage >= self.REWARD_THRESHOLDS["excellent"]:
# Monster hand bonus - overkill reward for beating blind in one shot
if chip_percentage >= 100:
reward += 20.0 # Huge bonus for one-shot blind completion
reward_breakdown.append(f"MONSTER HAND - One-shot blind kill (+{chip_gain} chips, {chip_percentage:.1f}%): +20.0")
elif chip_percentage >= self.REWARD_THRESHOLDS["excellent"]:
reward += 10.0
reward_breakdown.append(f"Excellent hand (+{chip_gain} chips, {chip_percentage:.1f}% of blind): +10.0")
elif chip_percentage >= self.REWARD_THRESHOLDS["good"]:
@ -101,8 +106,34 @@ class BalatroRewardCalculator:
# === BLIND COMPLETION ===
# Main goal - beat the blind (only reward once per episode)
if blind_defeated and not self.blind_already_defeated and game_over == 0:
reward += 50.0 # SUCCESS! Normalized from +500 to +50
reward_breakdown.append(f"BLIND DEFEATED: +50.0")
base_reward = 50.0
# Calculate hands used (starting hands - hands_left)
round_info = inner_game_state.get('round', {})
hands_left = round_info.get('hands_left', 0)
# Assume we start with 4 hands in ante 1
hands_used = 4 - hands_left
# Efficiency bonus - reward fewer hands used
if hands_used == 1:
efficiency_bonus = 25.0
reward_breakdown.append(f"ONE-HAND VICTORY BONUS: +{efficiency_bonus}")
elif hands_used == 2:
efficiency_bonus = 15.0
reward_breakdown.append(f"Two-hand efficiency bonus: +{efficiency_bonus}")
elif hands_used == 3:
efficiency_bonus = 8.0
reward_breakdown.append(f"Three-hand efficiency bonus: +{efficiency_bonus}")
else:
efficiency_bonus = max(0, 5.0 - hands_used) # Diminishing returns
if efficiency_bonus > 0:
reward_breakdown.append(f"Efficiency bonus: +{efficiency_bonus:.1f}")
else:
efficiency_bonus = 0
total_blind_reward = base_reward + efficiency_bonus
reward += total_blind_reward
reward_breakdown.append(f"BLIND DEFEATED: +{base_reward} (base)")
self.blind_already_defeated = True
self.winning_chips = current_chips # Store winning chip count
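
Illustrative arithmetic only (values assumed, not from the diff): with the monster-hand bonus above, a 350-chip hand against a 300-chip blind would now stack three rewards.

```python
chip_gain, blind_chips = 350, 300           # assumed example values
reward = 0.0
if (chip_gain / blind_chips) * 100 >= 100:  # monster hand: blind beaten in one shot
    reward += 20.0
reward += 50.0                              # base BLIND DEFEATED reward
reward += 25.0                              # ONE-HAND VICTORY efficiency bonus
print(reward)                               # 95.0
```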

View File

@ -243,7 +243,7 @@ if __name__ == "__main__":
print(f"📂 Found checkpoint: {latest_checkpoint}")
model = train_agent(
total_timesteps=100000,
total_timesteps=250000,
save_path="./models/balatro_trained",
resume_from=str(latest_checkpoint) if latest_checkpoint else None
)
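
latest_checkpoint is computed elsewhere in the script and not shown in this hunk; a hedged sketch of one way it could be found, assuming checkpoints are saved as .zip files under ./models/:

```python
from pathlib import Path

# Assumption: checkpoints live in ./models/ and the newest file is the one to resume.
checkpoints = sorted(Path("./models").glob("*.zip"), key=lambda p: p.stat().st_mtime)
latest_checkpoint = checkpoints[-1] if checkpoints else None
```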

View File

@ -271,7 +271,11 @@ class BalatroStateMapper:
hand_name = current_hand.get('handname', 'None')
if not hand_name:
hand_name = "None"
hand_index = hand_types.index(hand_name)
try:
hand_index = hand_types.index(hand_name)
except ValueError:
hand_index = 0 # Default to "None" if hand type not found
features.extend(make_onehot(hand_index, len(hand_types)))
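
make_onehot is referenced here but not shown in this diff; a minimal sketch of what it is assumed to do (one-hot encode an index into a fixed-length feature vector):

```python
from typing import List

def make_onehot(index: int, size: int) -> List[float]:
    # Assumed behavior: 1.0 at `index`, 0.0 everywhere else.
    vec = [0.0] * size
    vec[index] = 1.0
    return vec
```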

View File

@ -8,7 +8,7 @@ class ReplaySystem:
self.REPLAY_FILE_PATH = "replays.json"
def try_save_replay(self, file_path: str, seed: str, actions: List[Dict[str, Any]], score: float, chips: int):
"""Save the current replay to a file if the score is among the top MAX_REPLAYS."""
"""Save the current replay to a file if the chips is among the top MAX_REPLAYS."""
timestamp = datetime.now().isoformat()
replay_data = {
@ -26,17 +26,17 @@ class ReplaySystem:
if len(replays) < self.MAX_REPLAYS:
replays.append(replay_data)
else:
# Check if this score is higher than the lowest score
replays.sort(key=lambda x: x['score'], reverse=True)
if score > replays[-1]['score']:
# Replace the lowest scoring replay
# Check if this chip count is higher than the lowest chip count
replays.sort(key=lambda x: x['chips'], reverse=True)
if chips > replays[-1]['chips']:
# Replace the lowest chip count replay
replays[-1] = replay_data
else:
# Score is not high enough, don't add it
# Chip count is not high enough, don't add it
return len(replays)
# Sort by score (highest first) and keep only top MAX_REPLAYS
replays.sort(key=lambda x: x['score'], reverse=True)
# Sort by chips (highest first) and keep only top MAX_REPLAYS
replays.sort(key=lambda x: x['chips'], reverse=True)
replays = replays[:self.MAX_REPLAYS]
# Save back to file
@ -65,9 +65,9 @@ class ReplaySystem:
json.dump(replays, f, indent=4)
def sort_replays(self, file_path: str) -> List[Dict[str, Any]]:
"""Sort replays by score and return the top MAX_REPLAYS."""
"""Sort replays by chips and return the top MAX_REPLAYS."""
replays = self.load_replays(file_path)
replays.sort(key=lambda x: x['score'], reverse=True)
replays.sort(key=lambda x: x['chips'], reverse=True)
return replays[:self.MAX_REPLAYS]
def get_top_replays(self, file_path: str, count: int = None) -> List[Dict[str, Any]]:
@ -76,7 +76,7 @@ class ReplaySystem:
count = self.MAX_REPLAYS
replays = self.load_replays(file_path)
replays.sort(key=lambda x: x['score'], reverse=True)
replays.sort(key=lambda x: x['chips'], reverse=True)
return replays[:count]
def clear_replays(self, file_path: str):
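
A hedged usage sketch of the replay API after this change; the module name and all argument values below are illustrative, not taken from the repo:

```python
from replay_system import ReplaySystem  # assumed module name

rs = ReplaySystem()
rs.try_save_replay(
    file_path="replays.json",
    seed="EXAMPLESEED",                                              # hypothetical run seed
    actions=[{"request": "<raw json>", "response": "<raw json>"}],   # raw action log
    score=312.5,
    chips=900,
)
best = rs.get_top_replays("replays.json")  # sorted by chips, highest first
```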

File diff suppressed because it is too large