fix(pu): fix tests for support-range configs, device defaults, imports

puyuan1996 · puyuan1996 · commit 1bf1b0cb2915 · 2026-01-08T19:25:03.000+08:00
diff --git a/lzero/entry/train_unizero_multitask_segment_ddp.py b/lzero/entry/train_unizero_multitask_segment_ddp.py
@@ -231,7 +231,7 @@ def train_unizero_multitask_segment_ddp(
         # Process each task assigned to the current rank.
         for local_task_id, (task_id, [cfg, create_cfg]) in enumerate(tasks_for_this_rank):
             # Set a unique random seed for each task.
-            cfg.policy.device = 'cuda' if cfg.policy.cuda and torch.cuda.is_available() else 'cpu'
+            cfg.policy.device = 'cuda' if cfg.policy.device == 'cuda' and torch.cuda.is_available() else 'cpu'
             cfg = compile_config(cfg, seed=seed + task_id, env=None, auto=True, create_cfg=create_cfg, save_cfg=True)
             policy_config = cfg.policy
             policy.collect_mode.get_attribute('cfg').n_episode = policy_config.n_episode
diff --git a/lzero/mcts/tests/test_game_buffer.py b/lzero/mcts/tests/test_game_buffer.py
@@ -16,6 +16,11 @@
         use_priority=True,
         action_type='fixed_action_space',
         game_segment_length=20,
+        model=dict(
+            action_space_size=6,
+            value_support_range=(-10, 10, 1),
+            reward_support_range=(-10, 10, 1),
+        ),
     )
 )
 
diff --git a/lzero/model/common.py b/lzero/model/common.py
@@ -734,7 +734,7 @@ def __init__(
             self.downsample_net = DownSample(observation_shape, num_channels, activation, norm_type)
         else:
             self.conv = nn.Conv2d(observation_shape[0], num_channels, kernel_size=3, stride=1, padding=1, bias=False)
-            self.norm = build_normalization(norm_type, dim=3)(num_channels, *observation_shape[1:])
+            self.norm = build_normalization(norm_type, dim=2)(num_channels)
 
         self.resblocks = nn.ModuleList([
             ResBlock(in_channels=num_channels, activation=activation, norm_type=norm_type, res_type='basic', bias=False)
diff --git a/lzero/policy/tests/config/atari_muzero_config_for_test.py b/lzero/policy/tests/config/atari_muzero_config_for_test.py
@@ -50,6 +50,8 @@
             self_supervised_learning_loss=True,  # default is False
             discrete_action_encoding_type='one_hot',
             norm_type='BN',
+            value_support_range=(-300., 301., 1.),
+            reward_support_range=(-300., 301., 1.),
         ),
         cuda=True,
         env_type='not_board_games',
diff --git a/lzero/policy/tests/config/cartpole_muzero_config_for_test.py b/lzero/policy/tests/config/cartpole_muzero_config_for_test.py
@@ -30,12 +30,14 @@
         model=dict(
             observation_shape=4,
             action_space_size=2,
-            model_type='mlp', 
+            model_type='mlp',
             lstm_hidden_size=128,
             latent_state_dim=128,
             self_supervised_learning_loss=True,  # NOTE: default is False.
             discrete_action_encoding_type='one_hot',
-            norm_type='BN', 
+            norm_type='BN',
+            value_support_range=(-300., 301., 1.),
+            reward_support_range=(-300., 301., 1.),
         ),
         cuda=True,
         env_type='not_board_games',
diff --git a/lzero/policy/unizero.py b/lzero/policy/unizero.py
@@ -3,24 +3,29 @@
 from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
+import numpy as np
+import torch
 import torch.nn.functional as F
 import wandb
 from ding.model import model_wrap
 from ding.utils import POLICY_REGISTRY
+from lzero.mcts import UniZeroMCTSCtree as MCTSCtree
 from lzero.model import ImageTransforms
-from lzero.policy import (DiscreteSupport, InverseScalarTransform, from,
-                          import, lzero.policy, mz_network_output_unpack,
-                          phi_transform, prepare_obs,
+from lzero.policy import (DiscreteSupport, InverseScalarTransform,
+                          mz_network_output_unpack, phi_transform, prepare_obs,
                           prepare_obs_stack_for_unizero, scalar_transform,
                           select_action, to_torch_float_tensor)
 from lzero.policy.head_clip_manager import (HeadClipConfig, HeadClipManager,
                                             create_head_clip_manager_from_dict)
+from lzero.policy.muzero import MuZeroPolicy
 from lzero.policy.utils import initialize_pad_batch
 from torch.nn.utils.convert_parameters import (parameters_to_vector,
                                                vector_to_parameters)
 
 from .utils import configure_optimizers_nanogpt
 
+
+def scale_module_weights_vectorized(module: torch.nn.Module, scale_factor: float):
     """
     Efficiently scale all weights of a module using vectorized operations.
     """
@@ -129,6 +134,8 @@ class UniZeroPolicy(MuZeroPolicy):
             # (int) The save interval of the model.
             learn=dict(learner=dict(hook=dict(save_ckpt_after_iter=10000, ), ), ),
             world_model_cfg=dict(
+                # (str) The encoder type, e.g., 'resnet' or 'vit'.
+                encoder_type='resnet',
                 # (bool) If True, the action space of the environment is continuous, otherwise discrete.
                 continuous_action_space=False,
                 # (int) The number of tokens per block.
@@ -142,7 +149,7 @@ class UniZeroPolicy(MuZeroPolicy):
                 # (bool) Whether to use GRU gating mechanism.
                 gru_gating=False,
                 # (str) The device to be used for computation, e.g., 'cpu' or 'cuda'.
-                device='cuda',
+                device='cpu',
                 # (bool) Whether to analyze simulation normalization.
                 analysis_sim_norm=False,
                 # (bool) Whether to analyze dormant ratio, average_weight_magnitude of net, effective_rank of latent.
@@ -235,6 +242,9 @@ class UniZeroPolicy(MuZeroPolicy):
                 num_experts_per_tok=1,
                 # (int) Total number of experts in the transformer MoE.
                 num_experts_of_moe_in_transformer=8,
+                # ****** Priority ******
+                # (bool) Whether to use priority when sampling training data from the buffer.
+                use_priority=False,
             ),
         ),
         # ****** common ******
@@ -298,6 +308,9 @@ class UniZeroPolicy(MuZeroPolicy):
         policy_ls_eps_end=0.01,
         # (int) Number of training steps to decay label smoothing epsilon from start to end
         policy_ls_eps_decay_steps=50000,
+        
+        label_smoothing_eps=0.1,  # TODO: For value
+
         # (bool) Whether to use continuous (fixed) label smoothing throughout training
         use_continuous_label_smoothing=False,
         # (float) Fixed epsilon value for continuous label smoothing (only used when use_continuous_label_smoothing=True)
diff --git a/lzero/policy/unizero_multitask.py b/lzero/policy/unizero_multitask.py
@@ -7,7 +7,6 @@
 import torch
 from ding.model import model_wrap
 from ding.utils import POLICY_REGISTRY
-from lzero.entry.utils import initialize_zeros_batch
 from lzero.mcts import UniZeroMCTSCtree as MCTSCtree
 from lzero.model import ImageTransforms
 from lzero.policy import (DiscreteSupport, InverseScalarTransform,
@@ -16,7 +15,7 @@
                           select_action, to_torch_float_tensor)
 from lzero.policy.unizero import UniZeroPolicy, scale_module_weights_vectorized
 
-from .utils import configure_optimizers_nanogpt
+from .utils import configure_optimizers_nanogpt, initialize_zeros_batch
 
 # Please replace the path with the actual location of your LibMTL library.
 sys.path.append('/path/to/your/LibMTL')
@@ -254,7 +253,7 @@ class UniZeroMTPolicy(UniZeroPolicy):
                 analysis_dormant_ratio_weight_rank=False,
                 # (float) The threshold for a dormant neuron.
                 dormant_threshold=0.01,
-
+                share_head=False,
             ),
         ),
         # ****** common ******
diff --git a/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py b/zoo/atari/config/atari_unizero_multitask_segment_ddp_config.py
@@ -130,8 +130,12 @@ def create_config(
                     max_tokens=2 * num_unroll_steps,
                     context_length=2 * infer_context_length,
                     encoder_type='vit',
+                    device='cuda',
+                    game_segment_length=20,
                 ),
             ),
+            device='cuda',
+            game_segment_length=20,
             learning_rate=0.0001,
             weight_decay=1e-2,
             batch_size=batch_size,
diff --git a/zoo/atari/config/atari_unizero_segment_config.py b/zoo/atari/config/atari_unizero_segment_config.py
@@ -71,6 +71,8 @@ def main(env_id, seed):
                     env_num=max(collector_env_num, evaluator_env_num),
                     num_simulations=num_simulations,
                     game_segment_length=game_segment_length,
+                    device='cuda',
+                    use_priority=True,
                 ),
             ),
             # Learning settings
diff --git a/zoo/atari/envs/atari_lightzero_env.py b/zoo/atari/envs/atari_lightzero_env.py
@@ -139,8 +139,11 @@ def reset(self) -> dict:
                 ),
             })
 
+            # self._reward_space = gym.spaces.Box(
+            #     low=self._env.env.reward_range[0], high=self._env.env.reward_range[1], shape=(1,), dtype=np.float32
+            # )
             self._reward_space = gym.spaces.Box(
-                low=self._env.env.reward_range[0], high=self._env.env.reward_range[1], shape=(1,), dtype=np.float32
+                low=-9999, high=9999, shape=(1,), dtype=np.float32
             )
 
             self._init_flag = True

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -16,6 +16,11 @@` |
| `16` | `16` | `use_priority=True,` |
| `17` | `17` | `action_type='fixed_action_space',` |
| `18` | `18` | `game_segment_length=20,` |
|  | `19` | `+ model=dict(` |
|  | `20` | `+ action_space_size=6,` |
|  | `21` | `+ value_support_range=(-10, 10, 1),` |
|  | `22` | `+ reward_support_range=(-10, 10, 1),` |
|  | `23` | `+ ),` |
| `19` | `24` | `)` |
| `20` | `25` | `)` |
| `21` | `26` |  |