adding final updates

This commit is contained in:
Angel Valentin 2025-09-02 16:41:13 -04:00
parent 353a576da5
commit f44424e02f
6 changed files with 984 additions and 607 deletions

View File

@ -18,17 +18,13 @@ Made a symlink -> ln -s ~/dev/balatro-rl/RLBridge /mnt/gamerlinuxssd/SteamLibrar
### File-based Communication
- [x] JSON file communication system
- [x] Lua file writer in mod
- [x] Python file watcher with watchdog (see the sketch after this list)
- [x] Game state transmission (hand cards, chips, available actions)
- [x] Action reception and execution
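
A minimal sketch of the Python file-watcher side of this communication loop, assuming the mod writes game state to a JSON file; the handler class and file name below are illustrative, not taken from the repo:

```python
import json
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class GameStateHandler(FileSystemEventHandler):
    def on_modified(self, event):
        # "game_state.json" is a hypothetical file name for the mod's output.
        if event.src_path.endswith("game_state.json"):
            with open(event.src_path) as f:
                state = json.load(f)
            # Hand cards, chips, and available actions would be read from `state` here.
            print(state.get("chips"), state.get("available_actions"))

observer = Observer()
observer.schedule(GameStateHandler(), path=".", recursive=False)
observer.start()
try:
    while True:
        time.sleep(1)
finally:
    observer.stop()
    observer.join()
```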
### RL Training
- [x] Python RL environment setup
- [x] AI model architecture
- [x] Training loop integration
- [x] Python RL custom environment setup
### Game Features
- [x] Always have restart_run as an action option assuming the game is ongoing
- [x] Make it so that if we lose, we can restart, or if we win a round and see the "cash out" page, we also restart. But reaching the "cash out" state should give a large reward to incentivize the AI
@ -47,6 +43,10 @@ chips. Perhaps we just want to get wins of rounds; just scoring chips is not enough
- We would probably store the raw requests and raw responses; if we win, we save them, otherwise we reset the list
- The idea is that I'll have the seed, so I can look at the actions (the requests and responses), plug the seed into the game manually, and play it out myself
- Add something where we only keep the top 5. I don't want a long log of a bunch of wins
- [x] Should I reward higher for beating the game in the fewest hands possible? Note that playing more hands currently earns more total reward than playing one hand, even if that one hand is really good
- [x] On that note, should we give more reward for MONSTER hands? Rewards are currently based on blind size, but what if a single hand surpasses it by a lot, scoring the blind requirement or more in one hand? Maybe that solves the above problem
- [ ] Speed up training somehow. Is parallelization possible? Maybe through Docker and buying Balatro on multiple Steam accounts (see the sketch below)
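
A hedged sketch of what parallelization could look like if multiple game instances were available (one per Docker container / Steam account). `BalatroEnv` and the per-instance wiring are hypothetical, and a Stable-Baselines3-style setup is assumed rather than confirmed by this diff:

```python
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from balatro_env import BalatroEnv  # hypothetical module/class name

def make_env(instance_id: int):
    def _init():
        # Each worker would talk to its own game instance via its own state/action files.
        return BalatroEnv(instance_id=instance_id)  # hypothetical constructor
    return _init

if __name__ == "__main__":
    vec_env = SubprocVecEnv([make_env(i) for i in range(4)])
    model = PPO("MlpPolicy", vec_env, verbose=1)
    model.learn(total_timesteps=250_000)
```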
### DEBUGGING
- [ ] I think it's counting the reset as an episode? Review how it calculates episodes for reward logging; I think something MIGHT be wrong. Also check it in general because AI wrote it and I might need to update it

View File

@ -29,10 +29,11 @@ class BalatroRewardCalculator:
self.winning_chips = 0 # Store chips when blind is defeated
# Percentage-based reward thresholds (% of blind requirement)
# Updated to encourage bigger single hands
self.REWARD_THRESHOLDS = {
"excellent": 80.0, # 80%+ of blind requirement
"good": 50.0, # 50-79% of blind requirement
"decent": 25.0 # 25-49% of blind requirement
"excellent": 75.0, # 75%+ of blind requirement (lowered from 80%)
"good": 40.0, # 40-74% of blind requirement (lowered from 50%)
"decent": 20.0 # 20-39% of blind requirement (lowered from 25%)
}
def calculate_reward(self, current_state: Dict[str, Any],
@ -81,7 +82,11 @@ class BalatroRewardCalculator:
# Calculate percentage of blind requirement this hand achieved
chip_percentage = (chip_gain / blind_chips) * 100
if chip_percentage >= self.REWARD_THRESHOLDS["excellent"]:
# Monster hand bonus - overkill reward for beating blind in one shot
if chip_percentage >= 100:
reward += 20.0 # Huge bonus for one-shot blind completion
reward_breakdown.append(f"MONSTER HAND - One-shot blind kill (+{chip_gain} chips, {chip_percentage:.1f}%): +20.0")
elif chip_percentage >= self.REWARD_THRESHOLDS["excellent"]:
reward += 10.0
reward_breakdown.append(f"Excellent hand (+{chip_gain} chips, {chip_percentage:.1f}% of blind): +10.0")
elif chip_percentage >= self.REWARD_THRESHOLDS["good"]:
@ -101,8 +106,34 @@ class BalatroRewardCalculator:
# === BLIND COMPLETION ===
# Main goal - beat the blind (only reward once per episode)
if blind_defeated and not self.blind_already_defeated and game_over == 0:
reward += 50.0 # SUCCESS! Normalized from +500 to +50
reward_breakdown.append(f"BLIND DEFEATED: +50.0")
base_reward = 50.0
# Calculate hands used (starting hands - hands_left)
round_info = inner_game_state.get('round', {})
hands_left = round_info.get('hands_left', 0)
# Assume we start with 4 hands in ante 1
hands_used = 4 - hands_left
# Efficiency bonus - reward fewer hands used
if hands_used == 1:
efficiency_bonus = 25.0
reward_breakdown.append(f"ONE-HAND VICTORY BONUS: +{efficiency_bonus}")
elif hands_used == 2:
efficiency_bonus = 15.0
reward_breakdown.append(f"Two-hand efficiency bonus: +{efficiency_bonus}")
elif hands_used == 3:
efficiency_bonus = 8.0
reward_breakdown.append(f"Three-hand efficiency bonus: +{efficiency_bonus}")
else:
efficiency_bonus = max(0, 5.0 - hands_used) # Diminishing returns
if efficiency_bonus > 0:
reward_breakdown.append(f"Efficiency bonus: +{efficiency_bonus:.1f}")
else:
efficiency_bonus = 0
total_blind_reward = base_reward + efficiency_bonus
reward += total_blind_reward
reward_breakdown.append(f"BLIND DEFEATED: +{base_reward} (base)")
self.blind_already_defeated = True
self.winning_chips = current_chips # Store winning chip count
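
Illustrative arithmetic only (values assumed, not from the diff): with the monster-hand bonus above, a 350-chip hand against a 300-chip blind would now stack three rewards.

```python
chip_gain, blind_chips = 350, 300           # assumed example values
reward = 0.0
if (chip_gain / blind_chips) * 100 >= 100:  # monster hand: blind beaten in one shot
    reward += 20.0
reward += 50.0                              # base BLIND DEFEATED reward
reward += 25.0                              # ONE-HAND VICTORY efficiency bonus
print(reward)                               # 95.0
```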

View File

@ -243,7 +243,7 @@ if __name__ == "__main__":
print(f"📂 Found checkpoint: {latest_checkpoint}")
model = train_agent(
total_timesteps=100000,
total_timesteps=250000,
save_path="./models/balatro_trained",
resume_from=str(latest_checkpoint) if latest_checkpoint else None
)
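
latest_checkpoint is computed elsewhere in the script and not shown in this hunk; a hedged sketch of one way it could be found, assuming checkpoints are saved as .zip files under ./models/:

```python
from pathlib import Path

# Assumption: checkpoints live in ./models/ and the newest file is the one to resume.
checkpoints = sorted(Path("./models").glob("*.zip"), key=lambda p: p.stat().st_mtime)
latest_checkpoint = checkpoints[-1] if checkpoints else None
```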

View File

@ -271,7 +271,11 @@ class BalatroStateMapper:
hand_name = current_hand.get('handname', 'None')
if not hand_name:
hand_name = "None"
hand_index = hand_types.index(hand_name)
try:
hand_index = hand_types.index(hand_name)
except ValueError:
hand_index = 0 # Default to "None" if hand type not found
features.extend(make_onehot(hand_index, len(hand_types)))
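
make_onehot is referenced here but not shown in this diff; a minimal sketch of what it is assumed to do (one-hot encode an index into a fixed-length feature vector):

```python
from typing import List

def make_onehot(index: int, size: int) -> List[float]:
    # Assumed behavior: 1.0 at `index`, 0.0 everywhere else.
    vec = [0.0] * size
    vec[index] = 1.0
    return vec
```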

View File

@ -8,7 +8,7 @@ class ReplaySystem:
self.REPLAY_FILE_PATH = "replays.json"
def try_save_replay(self, file_path: str, seed: str, actions: List[Dict[str, Any]], score: float, chips: int):
"""Save the current replay to a file if the score is among the top MAX_REPLAYS."""
"""Save the current replay to a file if the chips is among the top MAX_REPLAYS."""
timestamp = datetime.now().isoformat()
replay_data = {
@ -26,17 +26,17 @@ class ReplaySystem:
if len(replays) < self.MAX_REPLAYS:
replays.append(replay_data)
else:
# Check if this score is higher than the lowest score
replays.sort(key=lambda x: x['score'], reverse=True)
if score > replays[-1]['score']:
# Replace the lowest scoring replay
# Check if this chip count is higher than the lowest chip count
replays.sort(key=lambda x: x['chips'], reverse=True)
if chips > replays[-1]['chips']:
# Replace the lowest chip count replay
replays[-1] = replay_data
else:
# Score is not high enough, don't add it
# Chip count is not high enough, don't add it
return len(replays)
# Sort by score (highest first) and keep only top MAX_REPLAYS
replays.sort(key=lambda x: x['score'], reverse=True)
# Sort by chips (highest first) and keep only top MAX_REPLAYS
replays.sort(key=lambda x: x['chips'], reverse=True)
replays = replays[:self.MAX_REPLAYS]
# Save back to file
@ -65,9 +65,9 @@ class ReplaySystem:
json.dump(replays, f, indent=4)
def sort_replays(self, file_path: str) -> List[Dict[str, Any]]:
"""Sort replays by score and return the top MAX_REPLAYS."""
"""Sort replays by chips and return the top MAX_REPLAYS."""
replays = self.load_replays(file_path)
replays.sort(key=lambda x: x['score'], reverse=True)
replays.sort(key=lambda x: x['chips'], reverse=True)
return replays[:self.MAX_REPLAYS]
def get_top_replays(self, file_path: str, count: int = None) -> List[Dict[str, Any]]:
@ -76,7 +76,7 @@ class ReplaySystem:
count = self.MAX_REPLAYS
replays = self.load_replays(file_path)
replays.sort(key=lambda x: x['score'], reverse=True)
replays.sort(key=lambda x: x['chips'], reverse=True)
return replays[:count]
def clear_replays(self, file_path: str):
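
A hedged usage sketch of the replay API after this change; the module name and all argument values below are illustrative, not taken from the repo:

```python
from replay_system import ReplaySystem  # assumed module name

rs = ReplaySystem()
rs.try_save_replay(
    file_path="replays.json",
    seed="EXAMPLESEED",                                              # hypothetical run seed
    actions=[{"request": "<raw json>", "response": "<raw json>"}],   # raw action log
    score=312.5,
    chips=900,
)
best = rs.get_top_replays("replays.json")  # sorted by chips, highest first
```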

File diff suppressed because it is too large