opendilab
diff --git a/lzero/mcts/buffer/game_buffer_bkp20250818.py b/lzero/mcts/buffer/game_buffer_bkp20250818.py
Lines changed: 668 additions & 0 deletions
diff --git a/lzero/model/unizero_world_models/utils.py b/lzero/model/unizero_world_models/utils.py
Lines changed: 15 additions & 1 deletion
diff --git a/lzero/model/unizero_world_models/world_model.py b/lzero/model/unizero_world_models/world_model.py
Lines changed: 18 additions & 7 deletions
diff --git a/lzero/policy/sampled_unizero.py b/lzero/policy/sampled_unizero.py
Lines changed: 1 addition & 1 deletion
diff --git a/lzero/policy/unizero.py b/lzero/policy/unizero.py
Lines changed: 96 additions & 36 deletions
@@ -268,14 +268,28 @@ def __init__(self, latent_recon_loss_weight=0, perceptual_loss_weight=0, continu
             self.reward_loss_weight = 1.
             self.policy_loss_weight = 1.
             self.ends_loss_weight = 0.
+
+            # muzero loss weight
+            # self.obs_loss_weight = 2
+            # self.value_loss_weight = 0.25
+            # self.reward_loss_weight = 1
+            # self.policy_loss_weight = 1
+            # self.ends_loss_weight = 0.
+
+            # like TD-MPC2 for DMC
+            # self.obs_loss_weight = 10
+            # self.value_loss_weight = 0.1
+            # self.reward_loss_weight = 0.1
+            # self.policy_loss_weight = 0.1
+            # self.ends_loss_weight = 0.
         else:
             # like TD-MPC2 for DMC
             self.obs_loss_weight = 10
             self.value_loss_weight = 0.1
             self.reward_loss_weight = 0.1
             self.policy_loss_weight = 0.1
             self.ends_loss_weight = 0.
-
+        
         self.latent_recon_loss_weight = latent_recon_loss_weight
         self.perceptual_loss_weight = perceptual_loss_weight
 
 
@@ -62,6 +62,8 @@ def __init__(self, config: TransformerConfig, tokenizer) -> None:
         # Position embedding
         if not self.config.rotary_emb:
             self.pos_emb = nn.Embedding(config.max_tokens, config.embed_dim, device=self.device)
+            # TODO(pu)
+            # self.pos_emb = nn.Embedding(config.max_tokens, config.embed_dim, device=self.device, max_norm=1.0)
             self.precompute_pos_emb_diff_kv()
             print(f"self.pos_emb.weight.device: {self.pos_emb.weight.device}")
 
@@ -76,6 +78,9 @@ def __init__(self, config: TransformerConfig, tokenizer) -> None:
         else:
             # for discrete action space
             self.act_embedding_table = nn.Embedding(config.action_space_size, config.embed_dim, device=self.device)
+            # TODO(pu)
+            # self.act_embedding_table = nn.Embedding(config.action_space_size, config.embed_dim, device=self.device, max_norm=1.0)
+
             logging.info(f"self.act_embedding_table.weight.device: {self.act_embedding_table.weight.device}")
 
         self.final_norm_option_in_obs_head = getattr(config, 'final_norm_option_in_obs_head', 'SimNorm')
@@ -1324,18 +1329,21 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
         else:
             dormant_ratio_encoder = torch.tensor(0.)
 
-        # Calculate the L2 norm of the latent state roots
-        latent_state_l2_norms = torch.norm(obs_embeddings, p=2, dim=2).mean()
-
         # Action tokens
         if self.continuous_action_space:
             act_tokens = batch['actions']
         else:
             act_tokens = rearrange(batch['actions'], 'b l -> b l 1')
 
+        with torch.no_grad():
+            # Calculate the L2 norm of the latent state roots
+            latent_state_l2_norms = torch.norm(obs_embeddings, p=2, dim=2).mean()
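+            # Calculate the L2 norm of the latent action embeddings. NOTE(review): discrete act_tokens are 'b l 1', so the embedding is 4-D and dim=2 reduces the singleton axis (yielding mean |x|, not an L2 norm) — confirm whether dim=-1 is intended.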
+            latent_action_l2_norms = torch.norm(self.act_embedding_table(act_tokens), p=2, dim=2).mean()
+
         # Forward pass to obtain predictions for observations, rewards, and policies
         outputs = self.forward({'obs_embeddings_and_act_tokens': (obs_embeddings, act_tokens)}, start_pos=start_pos)
-        
+
         if self.obs_type == 'image':
             # Reconstruct observations from latent state representations
             # reconstructed_images = self.tokenizer.decode_to_obs(obs_embeddings)
@@ -1481,7 +1489,7 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
         loss_obs = (loss_obs * mask_padding_expanded)
 
         # Compute labels for policy and value
-        labels_policy, labels_value = self.compute_labels_world_model_value_policy(batch['target_value'],
+        labels_value, labels_policy = self.compute_labels_world_model_value_policy(batch['target_value'],
                                                                                    batch['target_policy'],
                                                                                    batch['mask_padding'])
 
@@ -1582,6 +1590,7 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
                 dormant_ratio_encoder=dormant_ratio_encoder,
                 dormant_ratio_world_model=dormant_ratio_world_model,
                 latent_state_l2_norms=latent_state_l2_norms,
+                latent_action_l2_norms=latent_action_l2_norms,
                 policy_mu=mu,
                 policy_sigma=sigma,
                 target_sampled_actions=target_sampled_actions,
@@ -1605,6 +1614,8 @@ def compute_loss(self, batch, target_tokenizer: Tokenizer = None, inverse_scalar
                 dormant_ratio_encoder=dormant_ratio_encoder,
                 dormant_ratio_world_model=dormant_ratio_world_model,
                 latent_state_l2_norms=latent_state_l2_norms,
+                latent_action_l2_norms=latent_action_l2_norms,
+
             )
 
 
@@ -1821,9 +1832,9 @@ def compute_labels_world_model_value_policy(self, target_value: torch.Tensor, ta
         labels_value = target_value.masked_fill(mask_fill_value, -100)
 
         if self.continuous_action_space:
-            return None, labels_value.reshape(-1, self.support_size)
+            return labels_value.reshape(-1, self.support_size), None
         else:
-            return labels_policy.reshape(-1, self.action_space_size), labels_value.reshape(-1, self.support_size)
+            return labels_value.reshape(-1, self.support_size), labels_policy.reshape(-1, self.action_space_size)
 
     def clear_caches(self):
         """
 
@@ -410,7 +410,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
         # Prepare action batch and convert to torch tensor
         if self._cfg.model.continuous_action_space:
             action_batch = torch.from_numpy(action_batch).to(self._cfg.device).unsqueeze(
-                -1)  # For discrete action space
+                -1)  # For continuous action space
         else:
             action_batch = torch.from_numpy(action_batch).to(self._cfg.device).unsqueeze(
                 -1).long()  # For discrete action space
 
@@ -474,6 +474,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
         dormant_ratio_encoder = self.intermediate_losses['dormant_ratio_encoder']
         dormant_ratio_world_model = self.intermediate_losses['dormant_ratio_world_model']
         latent_state_l2_norms = self.intermediate_losses['latent_state_l2_norms']
+        latent_action_l2_norms = self.intermediate_losses['latent_action_l2_norms']
 
         assert not torch.isnan(losses.loss_total).any(), "Loss contains NaN values"
         assert not torch.isinf(losses.loss_total).any(), "Loss contains Inf values"
@@ -586,6 +587,7 @@ def _forward_learn(self, data: Tuple[torch.Tensor]) -> Dict[str, Union[float, in
             'analysis/dormant_ratio_encoder': dormant_ratio_encoder.item(),
             'analysis/dormant_ratio_world_model': dormant_ratio_world_model.item(),
             'analysis/latent_state_l2_norms': latent_state_l2_norms.item(),
+            'analysis/latent_action_l2_norms': latent_action_l2_norms.item(),
             'analysis/l2_norm_before': self.l2_norm_before,
             'analysis/l2_norm_after': self.l2_norm_after,
             'analysis/grad_norm_before': self.grad_norm_before,
@@ -620,6 +622,7 @@ def _init_collect(self) -> None:
             self._mcts_collect = MCTSCtree(mcts_collect_cfg)
         else:
             self._mcts_collect = MCTSPtree(mcts_collect_cfg)
+
         self._collect_mcts_temperature = 1.
         self._collect_epsilon = 0.0
         self.collector_env_num = self._cfg.collector_env_num
@@ -908,29 +911,59 @@ def _reset_collect(self, env_id: int = None, current_steps: int = None, reset_in
             )
             self.last_batch_action = [-1 for _ in range(self._cfg.collector_env_num)]
 
-        # Return immediately if env_id is None or a list
-        if env_id is None or isinstance(env_id, list):
-            return
+        # --- BEGIN ROBUST FIX ---
+        # This logic handles the crucial end-of-episode cache clearing.
+        # The collector calls `_policy.reset([env_id])` when an episode is done,
+        # which results in `current_steps` being None and `env_id` being a list.
+        
+        # We must handle both single int and list of ints for env_id.
+        if env_id is not None:
+            if isinstance(env_id, int):
+                env_ids_to_reset = [env_id]
+            else: # Assumes it's a list
+                env_ids_to_reset = env_id
+                
+            # The key condition: `current_steps` is None only on the end-of-episode reset call from the collector.
+            if current_steps is None:
+                world_model = self._collect_model.world_model
+                for eid in env_ids_to_reset:
+                    # Clear the specific environment's initial inference cache.
+                    if eid < len(world_model.past_kv_cache_init_infer_envs):
+                        world_model.past_kv_cache_init_infer_envs[eid].clear()
+                    
+                    print(f'>>> [Collector] Cleared KV cache for env_id: {eid} at episode end.')
+
+                # The recurrent cache is global, which is problematic.
+                # A full clear is heavy-handed but safer than leaving stale entries.
+                world_model.past_kv_cache_recurrent_infer.clear()
+                
+                if hasattr(world_model, 'keys_values_wm_list'):
+                    world_model.keys_values_wm_list.clear()
+
+                torch.cuda.empty_cache()
+
+        # --- END ROBUST FIX ---
+
+        # # Determine the clear interval based on the environment's sample type
+        # clear_interval = 2000 if getattr(self._cfg, 'sample_type', '') == 'episode' else 200
 
-        # Determine the clear interval based on the environment's sample type
-        clear_interval = 2000 if getattr(self._cfg, 'sample_type', '') == 'episode' else 200
+        # # Clear caches if the current steps are a multiple of the clear interval
+        # if current_steps % clear_interval == 0:
+        #     print(f'clear_interval: {clear_interval}')
 
-        # Clear caches if the current steps are a multiple of the clear interval
-        if current_steps % clear_interval == 0:
-            print(f'clear_interval: {clear_interval}')
+        #     # Clear various caches in the collect model's world model
+        #     world_model = self._collect_model.world_model
+        #     for kv_cache_dict_env in world_model.past_kv_cache_init_infer_envs:
+        #         kv_cache_dict_env.clear()
+        #     world_model.past_kv_cache_recurrent_infer.clear()
+        #     world_model.keys_values_wm_list.clear()
 
-            # Clear various caches in the collect model's world model
-            world_model = self._collect_model.world_model
-            for kv_cache_dict_env in world_model.past_kv_cache_init_infer_envs:
-                kv_cache_dict_env.clear()
-            world_model.past_kv_cache_recurrent_infer.clear()
-            world_model.keys_values_wm_list.clear()
+        #     # Free up GPU memory
+        #     torch.cuda.empty_cache()
 
-            # Free up GPU memory
-            torch.cuda.empty_cache()
+        #     print('collector: collect_model clear()')
+        #     print(f'eps_steps_lst[{env_id}]: {current_steps}')
 
-            print('collector: collect_model clear()')
-            print(f'eps_steps_lst[{env_id}]: {current_steps}')
 
     def _reset_eval(self, env_id: int = None, current_steps: int = None, reset_init_data: bool = True) -> None:
         """
@@ -952,29 +985,54 @@ def _reset_eval(self, env_id: int = None, current_steps: int = None, reset_init_
             )
             self.last_batch_action = [-1 for _ in range(self._cfg.evaluator_env_num)]
 
-        # Return immediately if env_id is None or a list
-        if env_id is None or isinstance(env_id, list):
-            return
+        # --- BEGIN ROBUST FIX ---
+        # This logic handles the crucial end-of-episode cache clearing for evaluation.
+        # The evaluator calls `_policy.reset([env_id])` when an episode is done.
+        if env_id is not None:
+            if isinstance(env_id, int):
+                env_ids_to_reset = [env_id]
+            else: # Assumes it's a list
+                env_ids_to_reset = env_id
+
+            # The key condition: `current_steps` is None only on the end-of-episode reset call from the evaluator.
+            if current_steps is None:
+                world_model = self._eval_model.world_model
+                for eid in env_ids_to_reset:
+                    # Clear the specific environment's initial inference cache.
+                    if eid < len(world_model.past_kv_cache_init_infer_envs):
+                        world_model.past_kv_cache_init_infer_envs[eid].clear()
+                    
+                    print(f'>>> [Evaluator] Cleared KV cache for env_id: {eid} at episode end.')
+
+                # The recurrent cache is global.
+                world_model.past_kv_cache_recurrent_infer.clear()
+                
+                if hasattr(world_model, 'keys_values_wm_list'):
+                    world_model.keys_values_wm_list.clear()
+
+                torch.cuda.empty_cache()
+                return
+        # --- END ROBUST FIX ---
 
-        # Determine the clear interval based on the environment's sample type
-        clear_interval = 2000 if getattr(self._cfg, 'sample_type', '') == 'episode' else 200
+        # # # Determine the clear interval based on the environment's sample type
+        # clear_interval = 2000 if getattr(self._cfg, 'sample_type', '') == 'episode' else 200
 
-        # Clear caches if the current steps are a multiple of the clear interval
-        if current_steps % clear_interval == 0:
-            print(f'clear_interval: {clear_interval}')
+        # # # Clear caches if the current steps are a multiple of the clear interval
+        # if current_steps % clear_interval == 0:
+        #     print(f'clear_interval: {clear_interval}')
 
-            # Clear various caches in the eval model's world model
-            world_model = self._eval_model.world_model
-            for kv_cache_dict_env in world_model.past_kv_cache_init_infer_envs:
-                kv_cache_dict_env.clear()
-            world_model.past_kv_cache_recurrent_infer.clear()
-            world_model.keys_values_wm_list.clear()
+        #     # Clear various caches in the eval model's world model
+        #     world_model = self._eval_model.world_model
+        #     for kv_cache_dict_env in world_model.past_kv_cache_init_infer_envs:
+        #         kv_cache_dict_env.clear()
+        #     world_model.past_kv_cache_recurrent_infer.clear()
+        #     world_model.keys_values_wm_list.clear()
 
-            # Free up GPU memory
-            torch.cuda.empty_cache()
+        #     # Free up GPU memory
+        #     torch.cuda.empty_cache()
 
-            print('evaluator: eval_model clear()')
-            print(f'eps_steps_lst[{env_id}]: {current_steps}')
+        #     print('evaluator: eval_model clear()')
+        #     print(f'eps_steps_lst[{env_id}]: {current_steps}')
 
     def _monitor_vars_learn(self) -> List[str]:
         """
@@ -986,6 +1044,8 @@ def _monitor_vars_learn(self) -> List[str]:
             'analysis/dormant_ratio_encoder',
             'analysis/dormant_ratio_world_model',
             'analysis/latent_state_l2_norms',
+            'analysis/latent_action_l2_norms',
+
             'analysis/l2_norm_before',
             'analysis/l2_norm_after',
             'analysis/grad_norm_before',