Skip to content

Commit 84ca42c

Browse files
committed
fix(pu): fix pad bug in buffer
1 parent bed92f5 commit 84ca42c

File tree

5 files changed

+199
-15
lines changed

5 files changed

+199
-15
lines changed

lzero/mcts/buffer/game_buffer_priorzero.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,12 +214,65 @@ def _make_batch(self, batch_size: int, reanalyze_ratio: float, fetch_latest: boo
214214
else:
215215
policy_non_re_context = None
216216

217+
# [LAYER 4 MONITORING] Track padding statistics
218+
if not fetch_latest and hasattr(self, '_padding_warning_count') and self._padding_warning_count > 0:
219+
print(
220+
f"[Padding Monitor] {self._padding_warning_count} segments required padding adjustment in this batch. "
221+
f"Consider increasing game_segment_length or reducing num_unroll_steps."
222+
)
223+
self._padding_warning_count = 0 # Reset counter
224+
217225
return reward_value_context, policy_re_context, policy_non_re_context, current_batch
218226

219227
def _clear(self):
220228
self.game_pos_priorities = []
221229
self.game_segment_buffer = []
222230
self.game_segment_game_pos_look_up = []
231+
232+
def _adjust_pos_to_avoid_padding(self, game_segment, pos_in_game_segment):
233+
"""
234+
[CRITICAL FIX] Adjust position to ensure no padding is needed.
235+
236+
This prevents misalignment between raw_obs and cot_prefix caused by padding.
237+
238+
Args:
239+
game_segment: The game segment to sample from
240+
pos_in_game_segment: The initial sampled position
241+
242+
Returns:
243+
Adjusted position that guarantees sufficient data length
244+
"""
245+
# Calculate required length for unrolling
246+
required_len = self._cfg.model.frame_stack_num + self._cfg.num_unroll_steps
247+
248+
# Check actual available data length
249+
# CRITICAL: Must check raw_obs_segment, not action_segment!
250+
# Because cot_prefix aligns with raw_obs in structure
251+
actual_obs_len = len(game_segment.raw_obs_segment)
252+
actual_cot_len = len(game_segment.cot_prefix_segment)
253+
254+
# Use the minimum of both to be safe
255+
actual_len = min(actual_obs_len, actual_cot_len)
256+
257+
# If segment is too short, we can't avoid padding entirely
258+
if actual_len < required_len:
259+
# Log warning for monitoring
260+
if not hasattr(self, '_padding_warning_count'):
261+
self._padding_warning_count = 0
262+
self._padding_warning_count += 1
263+
264+
# Return position 0 and accept minimal padding
265+
# This is better than random position with more padding
266+
return 0
267+
268+
# Ensure position doesn't exceed safe range
269+
max_safe_pos = actual_len - required_len
270+
271+
if pos_in_game_segment > max_safe_pos:
272+
# Clamp to safe range
273+
pos_in_game_segment = np.random.randint(0, max_safe_pos + 1)
274+
275+
return pos_in_game_segment
223276

224277

225278
def _fetch_latest_orig_data(self, batch_size: int) -> Tuple:
@@ -271,7 +324,10 @@ def _fetch_latest_orig_data(self, batch_size: int) -> Tuple:
271324
# Indices exceeding `game_segment_length` are padded with the next segment and are not updated
272325
# in the current implementation. Therefore, we need to sample `pos_in_game_segment` within
273326
# [0, game_segment_length - num_unroll_steps] to avoid padded data.
274-
327+
328+
# [LAYER 1 FIX] Adjust position to avoid padding-induced misalignment
329+
pos_in_game_segment = self._adjust_pos_to_avoid_padding(game_segment, pos_in_game_segment)
330+
275331
if self._cfg.action_type == 'varied_action_space':
276332
# For some environments (e.g., Jericho), the action space size may be different.
277333
# To ensure we can always unroll `num_unroll_steps` steps starting from the sampled position (without exceeding segment length),

lzero/worker/muzero_evaluator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ def __init__(
100100
self._tb_logger = tb_logger
101101
else:
102102
self._tb_logger = None
103+
self._logger, _ = build_logger(
104+
f'./{self._exp_name}/log/{self._instance_name}', self._instance_name, need_tb=False
105+
)
103106

104107
self._rank = get_rank()
105108
print(f'rank {self._rank}, self.task_id: {self.task_id}')

zoo/jericho/priorzero/priorzero_config.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,11 @@ class PriorZeroLLMConfig:
157157
# 需要注意的是,buffer中取一条经验是 10个样本,因为包含10次交互; num_unroll_steps = 10
158158
train_batch_size: int = 640 # 总的train_size, 结果= micro_batch_size * GPUS * gradient_accumulation_steps
159159
# micro_train_batch_size: int = 16 # 一次micro_train_batch_size 用来计算梯度;只有一次 train_batch_size 才会更新参数
160-
micro_train_batch_size: int = 4 # 一次micro_train_batch_size 用来计算梯度;只有一次 train_batch_size 才会更新参数
160+
# micro_train_batch_size: int = 4 # 一次micro_train_batch_size 用来计算梯度;只有一次 train_batch_size 才会更新参数
161+
162+
# 2卡 1.5b mbs=2
163+
micro_train_batch_size: int = 2 # 一次micro_train_batch_size 用来计算梯度;只有一次 train_batch_size 才会更新参数
164+
161165
broadcast_every: int = 1 # 每次训练多少次 train_batch_size 才同步 vllm 参数;也就是说 vllm 中的模型 off 多少次参数更新
162166

163167
learning_rate: float = 1e-6
@@ -188,6 +192,7 @@ class PriorZeroLLMConfig:
188192
# entropy_loss_coef: Optional[float] = None # None = disabled, typical values: 0.001-0.01
189193

190194
# LLM Prior Mixing Configuration
195+
191196
# ===== baseline root policy-head-logits =====
192197
# prior_mixing_cfg: Optional[EasyDict] = field(default_factory=lambda: EasyDict({
193198
# 'enable_soft_mixing': True, # Enable soft mixing instead of hard override
@@ -207,7 +212,7 @@ class PriorZeroLLMConfig:
207212
# 'enable_soft_mixing': True, # Enable soft mixing instead of hard override
208213
'enable_soft_mixing': False, # Enable soft mixing instead of hard override
209214
# 'mixing_alpha': 0.5, # Weight for LLM prior (0=network only, 1=LLM only)
210-
'mixing_alpha': 0., # Weight for LLM prior (0=network only, 1=LLM only)
215+
'mixing_alpha': 1., # Weight for LLM prior (0=network only, 1=LLM only)
211216
'alpha_schedule': None, # 'linear', 'cosine', 'exponential', or None (fixed)
212217
# 'alpha_schedule': 'cosine', # Smooth decay
213218
'alpha_init': 0.8, # Initial alpha (high LLM influence)

zoo/jericho/priorzero/priorzero_datafactory.py

Lines changed: 123 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,16 +297,65 @@ def build_chat_context(self, user_prompt: str) -> str:
297297
add_generation_prompt=True,
298298
)
299299

300+
def _detect_padding_sample(self, raw_obs_list, cot_prefix_list, action_logprob_list, b, t):
301+
"""
302+
[LAYER 3 DEFENSE] Detect if a sample contains padding data.
303+
304+
Padding detection heuristics:
305+
1. Check for consecutive duplicate observations (unlikely in real gameplay)
306+
2. Check for None values in cot_prefix (initial state marker)
307+
3. Check for duplicate cot_prefix with different obs (clear misalignment)
308+
309+
Args:
310+
raw_obs_list: List of raw observations
311+
cot_prefix_list: List of CoT prefixes
312+
action_logprob_list: List of action logprobs
313+
b: Batch index
314+
t: Time index
315+
316+
Returns:
317+
True if padding is detected, False otherwise
318+
"""
319+
T = len(raw_obs_list[b])
320+
321+
# Heuristic 1: Check for consecutive duplicates (strong padding indicator)
322+
# If both obs and cot_prefix are duplicated at the same time, very likely padding
323+
if t + 1 < T and raw_obs_list[b][t] == raw_obs_list[b][t + 1]:
324+
# Check if cot_prefix is also duplicated
325+
if cot_prefix_list is not None and t + 2 < len(cot_prefix_list[b]):
326+
if cot_prefix_list[b][t + 1] == cot_prefix_list[b][t + 2]:
327+
# Double duplication is a strong signal of padding
328+
return True
329+
330+
# Heuristic 2: Check for None cot_prefix (should only be at t=0)
331+
if t > 0 and cot_prefix_list is not None and t + 1 < len(cot_prefix_list[b]):
332+
if cot_prefix_list[b][t + 1] is None:
333+
return True
334+
335+
# Heuristic 3: Check if action_logprob is empty or None
336+
if action_logprob_list is not None and t + 1 < len(action_logprob_list[b]):
337+
logprob = action_logprob_list[b][t + 1]
338+
if logprob is None or (isinstance(logprob, dict) and len(logprob) == 0):
339+
return True
340+
341+
# Heuristic 4: Check for triple+ consecutive duplicates (very strong signal)
342+
if t + 2 < T:
343+
if (raw_obs_list[b][t] == raw_obs_list[b][t + 1] == raw_obs_list[b][t + 2]):
344+
return True
345+
346+
return False
347+
348+
300349
def build_llm_samples(self,
301350
raw_obs_list: List[List[str]],
302351
history_obs_list: List[List[List[Tuple[str, str, float]]]],
303352
action_logprob_list: Optional[List[List[Any]]] = None,
304-
pred_values: Optional[torch.Tensor] = None, # [B, T-1]
305-
target_values: Optional[torch.Tensor] = None, # [B, T-1]
353+
pred_values: Optional[torch.Tensor] = None, # [B, T-1]
354+
target_values: Optional[torch.Tensor] = None, # [B, T-1]
306355
cot_prefix_list: Optional[List[List[str]]] = None, # CoT reuse optimization
307356
) -> List[Dict[str, Any]]:
308357
"""
309-
Build training samples from collected data.
358+
[ENHANCED] Build training samples with padding detection and filtering.
310359
311360
Args:
312361
raw_obs_list: Raw observations
@@ -324,14 +373,48 @@ def build_llm_samples(self,
324373
return samples
325374
T = len(raw_obs_list[0])
326375

376+
# [LAYER 3] Statistics for monitoring
377+
total_samples = 0
378+
filtered_samples = 0
379+
filtered_reasons = {
380+
'padding': 0,
381+
'empty_action': 0,
382+
'extreme_logprob': 0,
383+
'nan_value': 0,
384+
}
385+
327386
for b in range(B):
328387
for t in range(T - 1):
388+
total_samples += 1
389+
390+
# [LAYER 3 DEFENSE] Detect and skip padding samples
391+
if self._detect_padding_sample(raw_obs_list, cot_prefix_list, action_logprob_list, b, t):
392+
filtered_samples += 1
393+
filtered_reasons['padding'] += 1
394+
continue
395+
329396
current_obs = raw_obs_list[b][t]
330397
current_hist = history_obs_list[b][t]
331398
next_hist = history_obs_list[b][t + 1]
332399

333-
_, true_action, reward_value = next_hist[-1]
400+
# Validate history structure
401+
if not next_hist or len(next_hist) == 0:
402+
filtered_samples += 1
403+
filtered_reasons['empty_action'] += 1
404+
continue
405+
406+
try:
407+
_, true_action, reward_value = next_hist[-1]
408+
except (ValueError, IndexError) as e:
409+
if self.rank == 0:
410+
self._logger.warning(f"Unexpected history structure at b={b}, t={t}: {next_hist[-1]}")
411+
filtered_samples += 1
412+
filtered_reasons['empty_action'] += 1
413+
continue
414+
334415
if not true_action:
416+
filtered_samples += 1
417+
filtered_reasons['empty_action'] += 1
335418
continue
336419

337420
instruction = self.build_llm_prompt(
@@ -341,18 +424,28 @@ def build_llm_samples(self,
341424
prompt = self.build_chat_context(instruction)
342425
old_logprob = None
343426
if action_logprob_list is not None:
344-
old_logprob = action_logprob_list[b][t + 1][true_action]
427+
logprob_dict = action_logprob_list[b][t + 1]
428+
if isinstance(logprob_dict, dict):
429+
old_logprob = logprob_dict.get(true_action, None)
430+
else:
431+
old_logprob = None
345432

346433
# FIX: Filter samples with extreme logprobs to prevent ratio explosion
347434
if old_logprob is not None:
348435
# Skip if empty
349436
if len(old_logprob) == 0:
437+
filtered_samples += 1
438+
filtered_reasons['extreme_logprob'] += 1
350439
continue
351440
# Skip if contains extreme values (< -50 indicates very low probability)
352441
if min(old_logprob) < -50.0:
442+
filtered_samples += 1
443+
filtered_reasons['extreme_logprob'] += 1
353444
continue
354445
# Skip if contains NaN/Inf
355446
if any(math.isnan(x) or math.isinf(x) for x in old_logprob):
447+
filtered_samples += 1
448+
filtered_reasons['nan_value'] += 1
356449
continue
357450

358451
target_value = None
@@ -365,8 +458,12 @@ def build_llm_samples(self,
365458

366459
# FIX: Skip samples with NaN/Inf values
367460
if target_value is not None and (math.isnan(target_value) or math.isinf(target_value)):
461+
filtered_samples += 1
462+
filtered_reasons['nan_value'] += 1
368463
continue
369464
if pred_value is not None and (math.isnan(pred_value) or math.isinf(pred_value)):
465+
filtered_samples += 1
466+
filtered_reasons['nan_value'] += 1
370467
continue
371468

372469
# CoT reuse optimization: get CoT prefix from stored data
@@ -375,6 +472,9 @@ def build_llm_samples(self,
375472
prefix_cot = None
376473
if self.use_cot and cot_prefix_list is not None:
377474
prefix_cot = cot_prefix_list[b][t+1]
475+
# [FIX] Handle None prefix (initial state or padding)
476+
if prefix_cot is None:
477+
prefix_cot = ""
378478

379479
samples.append(
380480
{
@@ -388,6 +488,24 @@ def build_llm_samples(self,
388488
"prefix_cot": prefix_cot, # CoT reuse optimization
389489
}
390490
)
491+
492+
# [LAYER 3 MONITORING] Log filtering statistics
493+
if self.rank == 0 and total_samples > 0:
494+
filter_rate = (filtered_samples / total_samples) * 100
495+
self._logger.info(
496+
f"[Sample Filtering] Total: {total_samples} | Filtered: {filtered_samples} ({filter_rate:.2f}%) | "
497+
f"padding={filtered_reasons['padding']}, empty={filtered_reasons['empty_action']}, "
498+
f"extreme_logprob={filtered_reasons['extreme_logprob']}, nan={filtered_reasons['nan_value']}"
499+
)
500+
501+
# WARNING: If too many samples are filtered due to padding, something is wrong
502+
if filtered_reasons['padding'] > total_samples * 0.1: # >10% padding
503+
self._logger.warning(
504+
f"⚠️ High padding rate detected ({filtered_reasons['padding']}/{total_samples} = "
505+
f"{filtered_reasons['padding']/total_samples*100:.1f}%)! "
506+
f"Check sampling strategy in game buffer."
507+
)
508+
391509
return samples
392510

393511
def make_llm_train_samples(self, priorzero_batch, ddp: bool = False) -> List[Dict[str, Any]]:

zoo/jericho/priorzero/priorzero_entry_sync_ddp.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,13 +205,15 @@ def train_priorzero(
205205
while True:
206206
cmd = 0 # 0 表示当前循环continue, 1 表示继续,2 表示break
207207
priorzero_batch = None
208-
if learner.train_iter > 0 and evaluator.should_eval(learner.train_iter):
209-
logger.info(f"\n[Evaluation] Rank {rank} | Iter {learner.train_iter}")
210-
stop, reward = evaluator.eval(
211-
save_ckpt_fn=learner.save_checkpoint,
212-
train_iter=learner.train_iter,
213-
envstep=collector.envstep
214-
)
208+
209+
# TODO: priorzero evaluator
210+
# if learner.train_iter > 0 and evaluator.should_eval(learner.train_iter):
211+
# logger.info(f"\n[Evaluation] Rank {rank} | Iter {learner.train_iter}")
212+
# stop, reward = evaluator.eval(
213+
# save_ckpt_fn=learner.save_checkpoint,
214+
# train_iter=learner.train_iter,
215+
# envstep=collector.envstep
216+
# )
215217

216218
if llm_cfg.vllm_enable_sleep and vllm_engine is not None:
217219
vllm_engine.wake_up()

0 commit comments

Comments
 (0)