fix(pu): fix sampling strategy for board games with short game length like TicTacToe

puyuan1996 · puyuan1996 · commit 49d214d836c4 · 2025-12-05T22:58:19.000+08:00
diff --git a/lzero/mcts/buffer/game_buffer.py b/lzero/mcts/buffer/game_buffer.py
@@ -153,30 +153,58 @@ def _sample_orig_data(self, batch_size: int) -> Tuple:
             # [0, game_segment_length - num_unroll_steps] to avoid padded data.
             
             if self._cfg.action_type == 'varied_action_space':
-                # For some environments (e.g., Jericho), the action space size may be different.
-                # To ensure we can always unroll `num_unroll_steps` steps starting from the sampled position (without exceeding segment length),
-                # we avoid sampling from the last `num_unroll_steps` steps of the game segment. 
-                if pos_in_game_segment >= self._cfg.game_segment_length - self._cfg.num_unroll_steps - self._cfg.td_steps:
-                    pos_in_game_segment = np.random.choice(self._cfg.game_segment_length - self._cfg.num_unroll_steps - self._cfg.td_steps, 1).item()
-                
+                # For varied action space environments (e.g., board games with short game length like TicTacToe)
+                # We need to handle cases where game_segment_length might be smaller than num_unroll_steps + td_steps
+                # Strategy: progressively relax sampling constraints to accommodate short games
+
+                # Step 1: Calculate ideal sampling upper bound
+                # Ideally, reserve space for both num_unroll_steps and td_steps to ensure complete trajectories
+                ideal_bound = self._cfg.game_segment_length - self._cfg.num_unroll_steps - self._cfg.td_steps
+
+                # Step 2: Handle different game length scenarios with graceful degradation
+                if ideal_bound > 0:
+                    # Case A: Normal/long games - enough space for full unroll + td steps
+                    # This is the standard case for most Atari games
+                    sampling_upper_bound = ideal_bound
+                else:
+                    # Case B: Short games - need to relax constraints
+                    # Try to at least reserve space for unroll steps (most critical for training)
+                    fallback_bound = self._cfg.game_segment_length - self._cfg.num_unroll_steps
+
+                    if fallback_bound > 0:
+                        # Can still accommodate unroll steps, though td_steps might need padding
+                        sampling_upper_bound = fallback_bound
+                    else:
+                        # Case C: Very short games (e.g., TicTacToe with 5-9 moves)
+                        # Allow sampling from entire segment length, padding will be applied during unrolling
+                        # This allows sampling from position 0 (beginning of game) when necessary
+                        sampling_upper_bound = self._cfg.game_segment_length
+
+                        # Ensure at least 1 to avoid np.random.choice errors
+                        if sampling_upper_bound <= 0:
+                            sampling_upper_bound = 1
+
+                # Step 3: Resample position if it exceeds calculated bound
+                if pos_in_game_segment >= sampling_upper_bound:
+                    pos_in_game_segment = np.random.choice(sampling_upper_bound, 1).item()
+
+                # Step 4: Further adjust based on actual segment length (runtime check)
                 segment_len = len(game_segment.action_segment)
-                if pos_in_game_segment >= segment_len - 1:
-                    # If the segment is very short (length 0 or 1), we can't randomly sample a position
-                    # before the last one. The only safe position is 0.
+                if pos_in_game_segment >= segment_len:
+                    # Position exceeds actual segment, resample within valid range
                     if segment_len > 1:
-                        # If the segment has at least 2 actions, we can safely sample from [0, len-2].
-                        # The upper bound for np.random.choice is exclusive, so (segment_len - 1) is correct.
+                        # Sample from [0, segment_len-1] to allow at least 1 step forward
                         pos_in_game_segment = np.random.choice(segment_len - 1, 1).item()
                     else:
-                        # If segment length is 0 or 1, the only valid/safe position is 0.
+                        # Segment has 0 or 1 actions, can only use position 0
                         pos_in_game_segment = 0
 
             else:
                 # For environments with a fixed action space (e.g., Atari),
                 # we can safely sample from the entire game segment range.
                 if pos_in_game_segment >= self._cfg.game_segment_length:
                     pos_in_game_segment = np.random.choice(self._cfg.game_segment_length, 1).item()
-                
+
                 segment_len = len(game_segment.action_segment)
                 if pos_in_game_segment >= segment_len - 1:
                     # If the segment is very short (length 0 or 1), we can't randomly sample a position