opendilab
diff --git a/zoo/jericho/priorzero/models/actor.py b/zoo/jericho/priorzero/models/actor.py
Lines changed: 34 additions & 1 deletion
diff --git a/zoo/jericho/priorzero/priorzero_collector.py b/zoo/jericho/priorzero/priorzero_collector.py
Lines changed: 38 additions & 10 deletions
diff --git a/zoo/jericho/priorzero/priorzero_config.py b/zoo/jericho/priorzero/priorzero_config.py
Lines changed: 3 additions & 2 deletions
@@ -263,6 +263,23 @@ def train_batch(self, batch_data: Dict[str, torch.Tensor], kl_ctl: float, step_i
 
             self.strategy.optimizer_step(self.actor_optim, self.actor, self.actor_scheduler, name="actor")
 
+            # Calculate response length statistics
+            response_lengths = micro_batch['action_mask'].sum(dim=1).float()
+            avg_response_length = response_lengths.mean().item()
+            max_response_length = response_lengths.max().item()
+            min_response_length = response_lengths.min().item()
+
+            # Calculate log_probs statistics
+            valid_log_probs = action_log_probs[micro_batch['action_mask'] > 0]
+            avg_log_prob = valid_log_probs.mean().item() if valid_log_probs.numel() > 0 else 0.0
+
+            # Calculate ratio statistics
+            log_ratio = action_log_probs - micro_batch['old_action_logprob']
+            ratio = log_ratio.exp()
+            valid_ratio = ratio[micro_batch['action_mask'] > 0]
+            avg_ratio = valid_ratio.mean().item() if valid_ratio.numel() > 0 else 1.0
+            max_ratio = valid_ratio.max().item() if valid_ratio.numel() > 0 else 1.0
+
             status = {
                 "policy_loss": actor_loss.detach().float().mean().item(),
                 "lr": self.actor_scheduler.get_last_lr()[0],
@@ -271,6 +288,14 @@ def train_batch(self, batch_data: Dict[str, torch.Tensor], kl_ctl: float, step_i
                 # "approx_kl": approx_kl.detach().float().mean().item(),
                 "cur_old_kl": approx_kl.detach().float().mean().item(),
                 "iter": self.train_iter,
+                # Response length statistics
+                "response_length_avg": avg_response_length,
+                "response_length_max": max_response_length,
+                "response_length_min": min_response_length,
+                # Log prob and ratio statistics
+                "log_prob_avg": avg_log_prob,
+                "ratio_avg": avg_ratio,
+                "ratio_max": max_ratio,
             }
             log_status = micro_batch["log_status"]
             other_status = {k: [item[k] for item in log_status] for k in log_status[0].keys()}
@@ -298,6 +323,10 @@ def train_batch(self, batch_data: Dict[str, torch.Tensor], kl_ctl: float, step_i
         return status_list
 
     def _deepspeed_broadcast(self):
+        # FIX: Add barrier before vLLM weight update to prevent NCCL deadlock with tp>1
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
         use_prefix_cache = getattr(self.strategy.args, "enable_prefix_caching", False)
         if use_prefix_cache:
             self.vllm_engine.reset_prefix_cache()
@@ -310,7 +339,11 @@ def _deepspeed_broadcast(self):
             # For ZeRO-3, allgather sharded parameter and broadcast to all vllm engines by rank 0
             with deepspeed.zero.GatheredParameters([param], enabled=self.strategy.args.zero_stage == 3):
                 shape = param.shape if self.strategy.args.zero_stage != 3 else param.ds_shape
-                self.vllm_engine.update_weight(name, dtype=param.dtype, shape=shape, weight=param.data, empty_cache=(count == num_params)) 
+                self.vllm_engine.update_weight(name, dtype=param.dtype, shape=shape, weight=param.data, empty_cache=(count == num_params))
+
+        # FIX: Add barrier after vLLM weight update to ensure all ranks complete
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier() 
 
     def _broadcast_to_vllm(self):
         use_prefix_cache = getattr(self.strategy.args, "enable_prefix_caching", False)
 
@@ -481,14 +481,22 @@ def collect(
                 # Episode Done
                 # ==============================================================
                 if episode_timestep.done:
-                    self._logger.info(f'======== Env {env_id} episode finished! ========')
                     self._total_episode_count += 1
                     # Logging
                     info_log = {
                         'reward': episode_timestep.info['eval_episode_return'],
                         'time': self._env_info[env_id]['time'],
                         'step': self._env_info[env_id]['step'],
                         'llm_prior_entropy': sum(llm_prior_entropy[env_id])/len(llm_prior_entropy[env_id])}
+
+                    # Structured episode completion log
+                    self._logger.info(
+                        f"[Episode Complete] Env={env_id} | "
+                        f"Reward={info_log['reward']:.2f} | "
+                        f"Steps={info_log['step']} | "
+                        f"Time={info_log['time']:.2f}s | "
+                        f"LLM_Entropy={info_log['llm_prior_entropy']:.3f}"
+                    )
                     if not collect_with_pure_policy:
                         info_log['visit_entropy'] = (
                             visit_entropies_lst[env_id] / eps_steps_lst[env_id]
@@ -540,8 +548,7 @@ def collect(
             # ==================================================================
             if len(self.game_segment_pool) >= self._default_num_segments:
                 self._logger.info(
-                    f'✓ Collected {len(self.game_segment_pool)} segments '
-                    f'(target: {self._default_num_segments})'
+                    f"[Collection Complete] Segments={len(self.game_segment_pool)}/{self._default_num_segments}"
                 )
 
                 # Format return data
@@ -565,13 +572,17 @@ def collect(
         collected_duration = sum([d['time'] for d in self._episode_info])
 
         if self._world_size > 1:
-            # Before allreduce
-            self._logger.info(f"Rank {self._rank} before allreduce: collected_step={collected_step}, collected_episode={collected_episode}")
+            # Aggregate data across ranks
+            local_step, local_episode = collected_step, collected_episode
             collected_step = allreduce_data(collected_step, 'sum')
             collected_episode = allreduce_data(collected_episode, 'sum')
             collected_duration = allreduce_data(collected_duration, 'sum')
-            # After allreduce
-            self._logger.info(f"Rank {self._rank} after allreduce: collected_step={collected_step}, collected_episode={collected_episode}")
+
+            self._logger.info(
+                f"[Rank {self._rank} Aggregation] "
+                f"Local: steps={local_step}, episodes={local_episode} | "
+                f"Global: steps={collected_step}, episodes={collected_episode}"
+            )
 
 
         self._total_envstep_count += collected_step
@@ -625,9 +636,26 @@ def _output_log(self, train_iter: int) -> None:
                 info['completed_value_mean'] = np.mean(completed_value)
 
             self._episode_info.clear()
-            
-            # Log to console
-            self._logger.info("Collector Training Summary:\n{}".format('\n'.join([f'  {k}: {v}' for k, v in info.items()])))
+
+            # Structured summary log
+            self._logger.info(
+                f"\n{'='*80}\n"
+                f"[Collector Summary] Train Iter: {train_iter}\n"
+                f"{'-'*80}\n"
+                f"Episodes:     {info['episode_count']} (Total: {info['total_episode_count']})\n"
+                f"Steps:        {info['envstep_count']} (Total: {info['total_envstep_count']})\n"
+                f"Avg Steps/Ep: {info['avg_envstep_per_episode']:.1f}\n"
+                f"Throughput:   {info['avg_envstep_per_sec']:.2f} steps/s, {info['avg_episode_per_sec']:.3f} eps/s\n"
+                f"Duration:     {info['collect_time']:.2f}s (Total: {info['total_duration']:.2f}s)\n"
+                f"{'-'*80}\n"
+                f"Reward:       mean={info['reward_mean']:.2f}, std={info['reward_std']:.2f}, "
+                f"min={info['reward_min']:.2f}, max={info['reward_max']:.2f}\n"
+                f"LLM Entropy:  mean={info['llm_prior_entropy_mean']:.3f}, "
+                f"min={info['llm_prior_entropy_min']:.3f}, max={info['llm_prior_entropy_max']:.3f}\n"
+                + (f"Visit Entropy: {info.get('visit_entropy_mean', 0):.3f}\n" if not self.collect_with_pure_policy else "")
+                + (f"Completed Val: {info.get('completed_value_mean', 0):.3f}\n" if self.policy_config.gumbel_algo else "")
+                + f"{'='*80}"
+            )
 
             # Log to TensorBoard and WandB
             for k, v in info.items():
 
@@ -29,7 +29,7 @@
     "qwen2.5-7b": {
         "model_name_or_path": "/mnt/shared-storage-user/puyuan/xiongjyu/models/Qwen2.5-7B-Instruct",
         "vllm_tensor_parallel_size": 1,
-        # "vllm_tensor_parallel_size": 2,
+        # "vllm_tensor_parallel_size": 2, # TODO
         "gpu_memory_utilization": 0.35,
         "description": "Qwen2.5-7B-Instruct (high quality, needs 2+ GPUs)",
     },
@@ -115,7 +115,8 @@ class PriorZeroLLMConfig:
 
     # 需要注意的是，buffer中取一条经验是 10个样本，因为包含10次交互； num_unroll_steps = 10
     train_batch_size: int = 640 # 总的train_size, 结果= micro_batch_size *  GPUS * gradient_accumulation_steps
-    micro_train_batch_size: int = 16 # 一次micro_train_batch_size 用来计算梯度；只有一次 train_batch_size 才会更新参数
+    # micro_train_batch_size: int = 16 # 一次micro_train_batch_size 用来计算梯度；只有一次 train_batch_size 才会更新参数
+    micro_train_batch_size: int = 4 # 一次micro_train_batch_size 用来计算梯度；只有一次 train_batch_size 才会更新参数
     broadcast_every: int = 1 # 每次训练多少次 train_batch_size 才同步 vllm 参数；也就是说 vllm 中的模型 off 多少次参数更新
 
     learning_rate: float = 1e-6