Skip to content

Commit e17493e

Browse files
authored
fix(pu): fix incompatibility between final_norm_option_in_encoder and predict_latent_loss_type in sampled unizero
1 parent 20933c1 commit e17493e

File tree

3 files changed

+23
-5
lines changed

3 files changed

+23
-5
lines changed

lzero/model/sampled_unizero_model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def __init__(
7777
activation=self.activation,
7878
norm_type=norm_type,
7979
group_size=world_model_cfg.group_size,
80+
final_norm_option_in_encoder=world_model_cfg.final_norm_option_in_encoder
8081
)
8182
# TODO: only for MemoryEnv now
8283
self.decoder_network = VectorDecoderForMemoryEnv(embedding_dim=world_model_cfg.embed_dim, output_shape=25, norm_type=norm_type)
@@ -98,6 +99,7 @@ def __init__(
9899
norm_type=norm_type,
99100
embedding_dim=world_model_cfg.embed_dim,
100101
group_size=world_model_cfg.group_size,
102+
final_norm_option_in_encoder=world_model_cfg.final_norm_option_in_encoder
101103
)
102104
# TODO: we should change the output_shape to the real observation shape
103105
self.decoder_network = LatentDecoder(embedding_dim=world_model_cfg.embed_dim, output_shape=(3, 64, 64))
@@ -127,6 +129,7 @@ def __init__(
127129
strides=[1, 1, 1],
128130
activation=self.activation,
129131
group_size=world_model_cfg.group_size,
132+
final_norm_option_in_encoder=world_model_cfg.final_norm_option_in_encoder
130133
)
131134
self.decoder_network = LatentDecoderForMemoryEnv(
132135
image_shape=(3, 5, 5),

lzero/policy/sampled_unizero.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,17 @@ class SampledUniZeroPolicy(UniZeroPolicy):
121121
perceptual_loss_weight=0.,
122122
# (float) The weight of the policy entropy loss.
123123
policy_entropy_weight=5e-3,
124-
# (str) The type of loss for predicting latent variables. Options could be ['group_kl', 'mse'].
125-
predict_latent_loss_type='group_kl',
124+
# (str) The normalization type for the final layer in both the head and the encoder.
125+
# This option must be the same for both 'final_norm_option_in_head' and 'final_norm_option_in_encoder'.
126+
# Valid options are 'LayerNorm' and 'SimNorm'.
127+
# When set to 'LayerNorm', the 'predict_latent_loss_type' should be 'mse'.
128+
# When set to 'SimNorm', the 'predict_latent_loss_type' should be 'group_kl'.
129+
final_norm_option_in_head="LayerNorm",
130+
final_norm_option_in_encoder="LayerNorm",
131+
# (str) The type of loss function for predicting latent variables.
132+
# Options are 'mse' (Mean Squared Error) or 'group_kl' (Group Kullback-Leibler divergence).
133+
# This choice is dependent on the normalization method selected above.
134+
predict_latent_loss_type='mse',
126135
# (str) The type of observation. Options are ['image', 'vector'].
127136
obs_type='image',
128137
# (float) The discount factor for future rewards.

zoo/classic_control/pendulum/config/pendulum_cont_sampled_unizero_config.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
world_model_cfg=dict(
5050
obs_type='vector',
5151
num_unroll_steps=num_unroll_steps,
52-
policy_entropy_weight=1e-4,
52+
policy_entropy_weight=5e-2,
5353
continuous_action_space=continuous_action_space,
5454
num_of_sampled_actions=K,
5555
sigma_type='conditioned',
@@ -80,11 +80,17 @@
8080
batch_size=batch_size,
8181
optim_type='AdamW',
8282
piecewise_decay_lr_scheduler=False,
83-
learning_rate=0.0001,
83+
discount_factor=0.99,
84+
td_steps=5,
85+
learning_rate=1e-4,
86+
grad_clip_value=5,
87+
manual_temperature_decay=True,
88+
threshold_training_steps_for_final_temperature=int(2.5e4),
89+
cos_lr_scheduler=True,
8490
num_simulations=num_simulations,
8591
reanalyze_ratio=reanalyze_ratio,
8692
n_episode=n_episode,
87-
eval_freq=int(1e3),
93+
eval_freq=int(2e3),
8894
replay_buffer_size=int(1e6),
8995
collector_env_num=collector_env_num,
9096
evaluator_env_num=evaluator_env_num,

0 commit comments

Comments (0)