Skip to content

Commit 0fa36cd

Browse files
committed
v2: Better handling of max_action in actor
1 parent ba32173 commit 0fa36cd

File tree

4 files changed

+19
-16
lines changed

4 files changed

+19
-16
lines changed

test/continuous/test_redq.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ def test_redq(args: argparse.Namespace = get_args(), enable_assertions: bool = T
6464
space_info = SpaceInfo.from_env(env)
6565
args.state_shape = space_info.observation_info.obs_shape
6666
args.action_shape = space_info.action_info.action_shape
67-
args.max_action = space_info.action_info.max_action
6867
if args.reward_threshold is None:
6968
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}
7069
args.reward_threshold = default_reward_threshold.get(

test/offline/gather_pendulum_data.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ def gather_data() -> VectorReplayBuffer:
7373
space_info = SpaceInfo.from_env(env)
7474
args.state_shape = space_info.observation_info.obs_shape
7575
args.action_shape = space_info.action_info.action_shape
76-
args.max_action = space_info.action_info.max_action
7776

7877
if args.reward_threshold is None:
7978
default_reward_threshold = {"Pendulum-v0": -250, "Pendulum-v1": -250}

test/offline/test_bcq.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def test_bcq(args: argparse.Namespace = get_args(), enable_assertions: bool = Tr
104104
output_dim=args.action_dim,
105105
hidden_sizes=args.hidden_sizes,
106106
)
107-
actor = Perturbation(preprocess_net=net_a, max_action=args.max_action, phi=args.phi).to(
107+
actor_perturbation = Perturbation(preprocess_net=net_a, max_action=args.max_action, phi=args.phi).to(
108108
args.device,
109109
)
110110
actor_optim = AdamOptimizerFactory(lr=args.actor_lr)
@@ -141,7 +141,7 @@ def test_bcq(args: argparse.Namespace = get_args(), enable_assertions: bool = Tr
141141
vae_optim = AdamOptimizerFactory()
142142

143143
policy = BCQPolicy(
144-
actor_perturbation=actor,
144+
actor_perturbation=actor_perturbation,
145145
critic=critic,
146146
vae=vae,
147147
action_space=env.action_space,

tianshou/algorithm/modelfree/reinforce.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,20 @@ def __init__(
148148
action_scaling=action_scaling,
149149
action_bound_method=action_bound_method,
150150
)
151-
if action_scaling and not np.isclose(actor.max_action, 1.0):
152-
warnings.warn(
153-
"action_scaling and action_bound_method are only intended "
154-
"to deal with unbounded model action space, but find actor model "
155-
f"bound action space with max_action={actor.max_action}. "
156-
"Consider using unbounded=True option of the actor model, "
157-
"or set action_scaling to False and action_bound_method to None.",
158-
)
151+
if action_scaling:
152+
try:
153+
max_action = float(actor.max_action) # type: ignore
154+
if not np.isclose(max_action, 1.0):
155+
warnings.warn(
156+
"action_scaling and action_bound_method are only intended "
157+
"to deal with unbounded model action space, but find actor model "
158+
f"bound action space with max_action={actor.max_action}. "
159+
"Consider using unbounded=True option of the actor model, "
160+
"or set action_scaling to False and action_bound_method to None.",
161+
)
162+
except (AttributeError, TypeError, ValueError):
163+
pass
164+
159165
self.actor = actor
160166
self.dist_fn = dist_fn
161167
self._eps = 1e-8
@@ -286,7 +292,7 @@ def add_discounted_returns(
286292
should be marked by done flag, unfinished (or collecting) episodes will be
287293
recognized by buffer.unfinished_index().
288294
:param buffer: the corresponding replay buffer.
289-
:param numpy.ndarray indices: tell batch's location in buffer, batch is equal
295+
:param indices: tell batch's location in buffer, batch is equal
290296
to buffer[indices].
291297
"""
292298
v_s_ = np.full(indices.shape, self.ret_rms.mean)
@@ -306,8 +312,7 @@ def add_discounted_returns(
306312
self.ret_rms.update(unnormalized_returns)
307313
else:
308314
batch.returns = unnormalized_returns
309-
batch: BatchWithReturnsProtocol
310-
return batch
315+
return cast(BatchWithReturnsProtocol, batch)
311316

312317

313318
class Reinforce(OnPolicyAlgorithm[ActorPolicyProbabilistic]):
@@ -316,7 +321,7 @@ class Reinforce(OnPolicyAlgorithm[ActorPolicyProbabilistic]):
316321
def __init__(
317322
self,
318323
*,
319-
policy: TActorPolicy,
324+
policy: ActorPolicyProbabilistic,
320325
gamma: float = 0.99,
321326
return_standardization: bool = False,
322327
optim: OptimizerFactory,

0 commit comments

Comments (0)