opendilab
diff --git a/lzero/policy/scaling_transform.py b/lzero/policy/scaling_transform.py
Lines changed: 23 additions & 19 deletions
diff --git a/lzero/policy/unizero.py b/lzero/policy/unizero.py
Lines changed: 29 additions & 2 deletions
diff --git a/zoo/atari/config/atari_env_action_space_map_v4.py b/zoo/atari/config/atari_env_action_space_map_v4.py
Lines changed: 0 additions & 33 deletions
diff --git a/zoo/atari/config/atari_env_action_space_map_v5.py b/zoo/atari/config/atari_env_action_space_map_v5.py
Lines changed: 0 additions & 33 deletions
diff --git a/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py b/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py
Lines changed: 0 additions & 3 deletions
diff --git a/zoo/dmc2gym/config/dmc2gym_pixels_sampled_unizero_config.py b/zoo/dmc2gym/config/dmc2gym_pixels_sampled_unizero_config.py
Lines changed: 1 addition & 3 deletions
diff --git a/zoo/dmc2gym/config/dmc2gym_state_smz_config.py b/zoo/dmc2gym/config/dmc2gym_state_smz_config.py
Lines changed: 1 addition & 1 deletion
diff --git a/zoo/dmc2gym/config/dmc2gym_state_suz_config.py b/zoo/dmc2gym/config/dmc2gym_state_suz_config.py
Lines changed: 0 additions & 29 deletions
diff --git a/zoo/dmc2gym/config/dmc2gym_state_suz_multitask_ddp_balance_config.py b/zoo/dmc2gym/config/dmc2gym_state_suz_multitask_ddp_balance_config.py
Lines changed: 9 additions & 47 deletions
@@ -1,6 +1,5 @@
 from typing import Union
 import torch
-import numpy as np
 
 class DiscreteSupport(object):
 
@@ -106,29 +105,34 @@ def visit_count_temperature(
         return fixed_temperature_value
 
 
+
 def phi_transform(
     discrete_support: DiscreteSupport,
     x: torch.Tensor,
-    label_smoothing_eps: float = 0. # <--- 新增平滑参数
+    label_smoothing_eps: float = 0.0  # <--- Added smoothing parameter
 ) -> torch.Tensor:
     """
     Overview:
-        Map a real-valued scalar to a categorical distribution over a discrete support using linear interpolation (a.k.a. “soft” one-hot).
+        Map a real-valued scalar to a categorical distribution over a discrete support 
+        using linear interpolation (a.k.a. “soft” one-hot).
 
-        For each scalar value the probability mass is split between the two
+        For each scalar value, the probability mass is split between the two
         nearest support atoms so that their weighted sum equals the original
-        value (MuZero, Appendix F).
+        value (see MuZero, Appendix F).
 
     Arguments:
         - discrete_support : DiscreteSupport
             Container with the support values (must be evenly spaced).
         - x : torch.Tensor
             Input tensor of arbitrary shape ``(...,)`` containing real numbers.
+        - label_smoothing_eps : float
+            Epsilon value for label smoothing. If greater than 0, the resulting 
+            distribution is mixed with a uniform distribution. Defaults to 0.
 
     Returns:
         - torch.Tensor
             Tensor of shape ``(*x.shape, N)`` where ``N = discrete_support.size``.
-            The last dimension is a probability distribution (sums to 1).
+            The last dimension represents a probability distribution (sums to 1).
 
     Notes
     -----
@@ -141,20 +145,21 @@ def phi_transform(
     step      = discrete_support.step
     size      = discrete_support.size
 
-    # --- 1. clip to the valid range ----------------------------------------
+    # --- 1. Clip to the valid range ----------------------------------------
     x = x.clamp(min_bound, max_bound)
 
-    # --- 2. locate neighbouring indices ------------------------------------
-    pos             = (x - min_bound) / step    # continuous position
-    low_idx_float   = torch.floor(pos)          # lower index
-    low_idx_long    = low_idx_float.long()      # lower index
-    high_idx        = low_idx_long + 1          # upper index (may overflow)
+    # --- 2. Locate neighbouring indices ------------------------------------
+    pos             = (x - min_bound) / step    # Continuous position relative to support
+    low_idx_float   = torch.floor(pos)          # Lower index (float)
+    low_idx_long    = low_idx_float.long()      # Lower index (long)
+    high_idx        = low_idx_long + 1          # Upper index (may temporarily overflow)
 
-    # --- 3. linear interpolation weights -----------------------------------
-    p_high = pos - low_idx_float                # distance to lower atom
-    p_low  = 1.0 - p_high                       # complementary mass
+    # --- 3. Linear interpolation weights -----------------------------------
+    p_high = pos - low_idx_float                # Distance to the lower atom (weight for upper)
+    p_low  = 1.0 - p_high                       # Complementary mass (weight for lower)
 
-    # --- 4. stack indices / probs and scatter ------------------------------
+    # --- 4. Stack indices / probs and scatter ------------------------------
+    # Clamp high_idx to handle the edge case where x is exactly max_bound
     idx   = torch.stack([low_idx_long,
                          torch.clamp(high_idx, max=size - 1)], dim=-1)  # (*x, 2)
     prob  = torch.stack([p_low, p_high], dim=-1)                        # (*x, 2)
@@ -163,11 +168,10 @@ def phi_transform(
                          dtype=x.dtype, device=x.device)
 
     target.scatter_add_(-1, idx, prob)
-    # return target
 
-    # --- 5. 应用标签平滑 ---
+    # --- 5. Apply label smoothing ------------------------------------------
     if label_smoothing_eps > 0:
-        # 将原始的 two-hot 目标与一个均匀分布混合
+        # Mix the original "two-hot" target with a uniform distribution
         smooth_target = (1.0 - label_smoothing_eps) * target + (label_smoothing_eps / size)
         return smooth_target
     else:
 
@@ -209,11 +209,38 @@ class UniZeroPolicy(MuZeroPolicy):
                 rope_theta=10000,
                 # (int) The maximum sequence length for position encoding.
                 max_seq_len=8192,
-                lora_r= 0,
+                # (int) The rank parameter for LoRA (Low-Rank Adaptation). Set to 0 to disable LoRA.
+                lora_r=0,
+                # (float) The alpha parameter for LoRA scaling.
+                lora_alpha=1,
+                # (float) The dropout probability for LoRA layers.
+                lora_dropout=0.0,
                 # Controls where to compute reconstruction loss: 'after_backbone', 'before_backbone', or None.
                 #   - after_backbone: The reconstruction loss is computed after the encoded representation passes through the backbone.
-	        #   - before_backbone: The reconstruction loss is computed directly on the encoded representation, without the backbone.
+	            #   - before_backbone: The reconstruction loss is computed directly on the encoded representation, without the backbone.
                 decode_loss_mode=None,
+                # (str/None) Task embedding option. Set to None to disable task-specific embeddings.
+                task_embed_option=None,
+                # (bool) Whether to use task embeddings.
+                use_task_embed=False,
+                # (bool) Whether to use normal head (standard prediction heads).
+                use_normal_head=True,
+                # (bool) Whether to use Soft Mixture-of-Experts (MoE) head.
+                use_softmoe_head=False,
+                # (bool) Whether to use Mixture-of-Experts (MoE) head.
+                use_moe_head=False,
+                # (int) Number of experts in the MoE head.
+                num_experts_in_moe_head=4,
+                # (bool) Whether to use MoE in the transformer layers.
+                moe_in_transformer=False,
+                # (bool) Whether to use multiplicative MoE in the transformer layers.
+                multiplication_moe_in_transformer=False,
+                # (int) Number of shared experts in MoE.
+                n_shared_experts=1,
+                # (int) Number of experts to use per token in MoE.
+                num_experts_per_tok=1,
+                # (int) Total number of experts in the transformer MoE.
+                num_experts_of_moe_in_transformer=8,
             ),
         ),
         # ****** common ******
 
@@ -315,9 +315,6 @@ def create_env_manager() -> EasyDict:
         cd /path/to/your/project/
 
         torchrun --nproc_per_node=4 /mnt/shared-storage-user/puyuan/code/LightZero/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py  2>&1 | tee /mnt/shared-storage-user/puyuan/code/LightZero/logs/202512/atari8_uz_mt.log
-
-        /mnt/shared-storage-user/puyuan/lz/bin/python -m torch.distributed.launch --nproc_per_node=4 --master_port=29502 /mnt/shared-storage-user/puyuan/code/LightZero/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py  2>&1 | tee /mnt/shared-storage-user/puyuan/code/LightZero/logs/202512/atari8_uz_mt.log
-
     """
     from lzero.entry import train_unizero_multitask_segment_ddp
     from ding.utils import DDPContext
 
@@ -40,7 +40,7 @@
 # ==============================================================
 
 dmc2gym_pixels_cont_sampled_unizero_config = dict(
-    exp_name=f'data_sampled_unizero_0901/dmc2gym_{env_id}_image_cont_sampled_unizero_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_rer{reanalyze_ratio}_H{num_unroll_steps}_bs{batch_size}_{norm_type}_seed{seed}',
+    exp_name=f'data_sampled_unizero/dmc2gym_{env_id}_image_cont_sampled_unizero_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_rer{reanalyze_ratio}_H{num_unroll_steps}_bs{batch_size}_{norm_type}_seed{seed}',
     env=dict(
         env_id='dmc2gym-v0',
         continuous=True,
@@ -75,7 +75,6 @@
                 max_blocks=num_unroll_steps,
                 max_tokens=2 * num_unroll_steps,  # NOTE: each timestep has 2 tokens: obs and action
                 context_length=2 * infer_context_length,
-                # device='cpu',
                 device='cuda',
                 action_space_size=action_space_size,
                 num_layers=2,
@@ -116,7 +115,6 @@
         type='dmc2gym_lightzero',
         import_names=['zoo.dmc2gym.envs.dmc2gym_lightzero_env'],
     ),
-    # env_manager=dict(type='subprocess'),
     env_manager=dict(type='base'),
     policy=dict(
         type='sampled_unizero',
 
@@ -30,7 +30,7 @@
 # ==============================================================
 
 dmc2gym_state_cont_sampled_muzero_config = dict(
-    exp_name=f'/oss/niuyazhe/puyuan/data/data_lz_202505/data_smz/dmc2gym_{env_id}_state_cont_sampled_muzero_k{K}_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_rer{reanalyze_ratio}_{norm_type}_seed{seed}',
+    exp_name=f'data_smz/dmc2gym_{env_id}_state_cont_sampled_muzero_k{K}_ns{num_simulations}_upc{update_per_collect}-rr{replay_ratio}_rer{reanalyze_ratio}_{norm_type}_seed{seed}',
     env=dict(
         env_id='dmc2gym-v0',
         domain_name=domain_name,
 
@@ -99,35 +99,6 @@ def main(env_id, seed):
                     embed_dim=768,
                     env_num=max(collector_env_num, evaluator_env_num),
                     rotary_emb=False,
-                    
-                    
-                    # --- MOE Settings ---
-                    moe_in_transformer=False,
-                    # multiplication_moe_in_transformer=True,
-                    multiplication_moe_in_transformer=False,
-                    num_experts_of_moe_in_transformer=8,
-                    n_shared_experts=1,
-                    num_experts_per_tok=1,
-                    use_normal_head=True,
-                    use_softmoe_head=False,
-                    use_moe_head=False,
-                    num_experts_in_moe_head=4,
-        
-                    # --- LoRA Parameters ---
-                    moe_use_lora=False,  # TODO
-                    curriculum_stage_num=3,
-                    lora_target_modules=["attn", "feed_forward"],
-                    lora_r=0,
-                    lora_alpha=1,
-                    lora_dropout=0.0,
-
-                    # --- Multi-task Settings ---
-                    task_embed_option=None,  # TODO: 'concat_task_embed' or None
-                    use_task_embed=False,  # TODO
-
-                    # --- Analysis ---
-                    analysis_dormant_ratio_weight_rank=False,  # TODO
-                    analysis_dormant_ratio_interval=5000,
                 ),
             ),
             # (str) The path of the pretrained model. If None, the model will be initialized by the default model.
 
@@ -7,22 +7,9 @@
 """
 from __future__ import annotations
 
-import logging
 from typing import Any, Dict, List
-
 from easydict import EasyDict
 import copy
-# ==============================================================
-# Global setup: Logging
-# ==============================================================
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(message)s',
-    handlers=[
-        logging.FileHandler("output.log", encoding="utf-8"),  # Log to file
-        logging.StreamHandler()  # Log to console
-    ]
-)
 
 
 def get_base_config(env_id_list: list[str], collector_env_num: int, evaluator_env_num: int,
@@ -58,8 +45,8 @@ def get_base_config(env_id_list: list[str], collector_env_num: int, evaluator_en
         ),
         # Policy-specific settings
         policy=dict(
-            multi_gpu=True,  # TODO(user): Enable multi-GPU for DDP.
-            # TODO(user): Configure MoCo settings.
+            multi_gpu=True,
+            # TODO: Configure MoCo settings.
             only_use_moco_stats=False,
             use_moco=False,
             learn=dict(learner=dict(hook=dict(save_ckpt_after_iter=1000000))),
@@ -115,7 +102,6 @@ def get_base_config(env_id_list: list[str], collector_env_num: int, evaluator_en
                     # TODO(user): For debugging only. Use a smaller model.
                     # num_layers=1,
                     num_layers=4,
-                    # num_layers=8,
 
                     num_heads=24,
                     embed_dim=768,
@@ -319,24 +305,9 @@ def generate_experiment_name(num_tasks: int, curriculum_stage_num: int, buffer_r
     Returns:
         - (:obj:`str`): The generated experiment name prefix.
     """
-    # NOTE: This is a template for the experiment name.
-    # Users should customize it to reflect their specific experiment settings.
-    #
-    # IMPORTANT: To avoid filesystem path length issues, consider using the simplified version below.
-    # Uncomment the simplified version and comment out the detailed version if you encounter
-    # "File name too long" errors.
-    #
-    # ===== Simplified Version (RECOMMENDED to avoid path length issues) =====
-    # return f'data_20251120/dmc_{num_tasks}t_s{curriculum_stage_num}_brf{buffer_reanalyze_freq:.0e}_s{seed}/'
-    #
-    # ===== Detailed Version (Current) =====
-    # return (
-    #     f'data_suz_dmc_mt_balance_20251120/dmc_{num_tasks}tasks_frameskip4-pen-fs8_balance-stage-total-{curriculum_stage_num}'
-    #     f'_stage0-10k-5k_fix-lora-update-stablescale_moe8-uselora_nlayer4_not-share-head'
-    #     f'_brf{buffer_reanalyze_freq}_seed{seed}/'
-    # )
+
     return (
-        f'data_suz_dmc_mt_balance_20251120/dmc_{num_tasks}tasks_frameskip4-pen-fs8_balance-stage-total-{curriculum_stage_num}'
+        f'data_suz_dmc_mt_balance/dmc_{num_tasks}tasks_frameskip4-pen-fs8_balance-stage-total-{curriculum_stage_num}'
         f'_stage0-10k-5k_moe8_nlayer4'
         f'_brf{buffer_reanalyze_freq}_seed{seed}/'
     )
@@ -398,7 +369,6 @@ def generate_all_task_configs(
 
     for task_id, env_id in enumerate(env_id_list):
         task_specific_config = create_task_config(
-            # base_config=base_config.clone(),  # Use a clone to avoid modifying the base config
             base_config=copy.deepcopy(base_config),
             env_id=env_id,
             action_space_size_list=action_space_size_list,
@@ -435,15 +405,10 @@ def main():
         This script should be executed with <nproc_per_node> GPUs.
 
         Example launch commands:
-        1. Using `torch.distributed.launch`:
-           cd <PATH_TO_YOUR_PROJECT>/LightZero/
-           python -m torch.distributed.launch --nproc_per_node=8 --master_port=29501 \\
-               ./zoo/dmc2gym/config/dmc2gym_state_suz_multitask_ddp_balance_config.py 2>&1 | tee \\
-               ./logs/uz_mt_dmc18_balance_moe8_seed0.log
-
-        2. Using `torchrun`:
-           cd <PATH_TO_YOUR_PROJECT>/LightZero/
-           torchrun --nproc_per_node=4 ./zoo/dmc2gym/config/dmc2gym_state_suz_multitask_ddp_balance_config.py
+
+        cd <PATH_TO_YOUR_PROJECT>/LightZero/
+        torchrun --nproc_per_node=4 ./zoo/dmc2gym/config/dmc2gym_state_suz_multitask_ddp_balance_config.py 2>&1 | tee \\
+            ./logs/uz_mt_dmc18_balance_moe8_seed0.log
     """
     from lzero.entry import train_unizero_multitask_balance_segment_ddp
     from ding.utils import DDPContext
@@ -492,10 +457,7 @@ def main():
     # batch_size = [3] * len(env_id_list)
     # max_env_step = int(1e3)
 
-    # Production settings
-    # curriculum_stage_num = 5
-    curriculum_stage_num = 3
-    
+    curriculum_stage_num = 5
     collector_env_num = 8
     num_segments = 8
     n_episode = 8