Skip to content

Commit 0a7e4ea

Browse files
committed
Merge remote-tracking branch 'thuml/dev-v2' into dev-v2
# Conflicts: # tianshou/utils/net/common.py
2 parents 2535c8c + 6684f0f commit 0a7e4ea

File tree

3 files changed

+15
-15
lines changed

3 files changed

+15
-15
lines changed

tianshou/utils/net/common.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -674,17 +674,17 @@ def get_preprocess_net(self) -> ModuleWithVectorOutput:
674674
def forward(
675675
self,
676676
obs: np.ndarray | torch.Tensor,
677-
state: T | None = None,
677+
rnn_hidden_state: T | None = None,
678678
info: dict[str, Any] | None = None,
679679
) -> tuple[np.ndarray | torch.Tensor, T | None]:
680680
"""
681681
The main method for tianshou to compute actions from env observations.
682682
Implementations will always make use of the preprocess_net as the first processing step.
683-
684-
:param obs: the observation to be passed to the actor.
685-
:param rnn_hidden_state: the hidden state of the RNN, if applicable.
686-
:param info: the info object from the env step
687-
:return: a tuple (action_repr, hidden_state), where action_repr is either an actual action for the environment or
683+
684+
:param obs: the observation from the environment
685+
:param rnn_hidden_state: the hidden state of the RNN, if applicable
686+
:param info: the info object from the environment step
687+
:return: a tuple (action_repr, hidden_state), where action_repr is either an actual action for the environment or
688688
a representation from which it can be retrieved/sampled (e.g., mean and std for a Gaussian distribution),
689689
and hidden_state is the new hidden state of the RNN, if applicable.
690690
"""
@@ -729,7 +729,7 @@ def is_discrete(self) -> bool:
729729
def forward(
730730
self,
731731
obs: np.ndarray | torch.Tensor | BatchProtocol,
732-
state: Any | None = None,
732+
rnn_hidden_state: Any | None = None,
733733
info: dict[str, Any] | None = None,
734734
) -> tuple[np.ndarray, Any | None]:
735735
batch_size = len(obs)
@@ -738,7 +738,7 @@ def forward(
738738
else:
739739
# Discrete Actors currently return an n-dimensional array of probabilities for each action
740740
action = 1 / self.action_space.n * np.ones((batch_size, self.action_space.n))
741-
return action, state
741+
return action, rnn_hidden_state
742742

743743
def compute_action_batch(self, obs: np.ndarray | torch.Tensor | BatchProtocol) -> np.ndarray:
744744
if self.is_discrete:

tianshou/utils/net/continuous.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ def get_output_dim(self) -> int:
6666
def forward(
6767
self,
6868
obs: np.ndarray | torch.Tensor,
69-
state: Any = None,
69+
rnn_hidden_state: Any = None,
7070
info: dict[str, Any] | None = None,
7171
) -> tuple[torch.Tensor, Any]:
7272
"""Mapping: s_B -> action_values_BA, hidden_state_BH | None.
@@ -76,7 +76,7 @@ def forward(
7676
The hidden state is only not None if a recurrent net is used as part of the
7777
learning algorithm (support for RNNs is currently experimental).
7878
"""
79-
action_BA, hidden_BH = self.preprocess(obs, state)
79+
action_BA, hidden_BH = self.preprocess(obs, rnn_hidden_state)
8080
action_BA = self.max_action * torch.tanh(self.last(action_BA))
8181
return action_BA, hidden_BH
8282

@@ -222,13 +222,13 @@ def get_preprocess_net(self) -> ModuleWithVectorOutput:
222222
def forward(
223223
self,
224224
obs: np.ndarray | torch.Tensor,
225-
state: Any = None,
225+
rnn_hidden_state: Any = None,
226226
info: dict[str, Any] | None = None,
227227
) -> tuple[tuple[torch.Tensor, torch.Tensor], Any]:
228228
"""Mapping: obs -> logits -> (mu, sigma)."""
229229
if info is None:
230230
info = {}
231-
logits, hidden = self.preprocess(obs, state)
231+
logits, hidden = self.preprocess(obs, rnn_hidden_state)
232232
mu = self.mu(logits)
233233
if not self._unbounded:
234234
mu = self.max_action * torch.tanh(mu)
@@ -238,7 +238,7 @@ def forward(
238238
shape = [1] * len(mu.shape)
239239
shape[1] = -1
240240
sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp()
241-
return (mu, sigma), state
241+
return (mu, sigma), rnn_hidden_state
242242

243243

244244
class RecurrentActorProb(nn.Module):

tianshou/utils/net/discrete.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@ def get_preprocess_net(self) -> ModuleWithVectorOutput:
5959
def forward(
6060
self,
6161
obs: np.ndarray | torch.Tensor,
62-
state: Any = None,
62+
rnn_hidden_state: Any = None,
6363
info: dict[str, Any] | None = None,
6464
) -> tuple[torch.Tensor, torch.Tensor | None]:
6565
r"""Mapping: s_B -> action_values_BA, hidden_state_BH | None.
@@ -71,7 +71,7 @@ def forward(
7171
The hidden state is only
7272
not None if a recurrent net is used as part of the learning algorithm.
7373
"""
74-
x, hidden_BH = self.preprocess(obs, state)
74+
x, hidden_BH = self.preprocess(obs, rnn_hidden_state)
7575
x = self.last(x)
7676
if self.softmax_output:
7777
x = F.softmax(x, dim=-1)

0 commit comments

Comments (0)