
Commit 5be19f3

Adds Recurrent Student-Teacher Distillation

Author: Clemens Schwarke
Approved-by: Mayank Mittal
Parent: 73de8a3

10 files changed, +227 -62 lines

Diff for: config/dummy_config.yaml

+1 -1

@@ -71,7 +71,7 @@ policy:

   # only needed for `ActorCriticRecurrent`
   # rnn_type: 'lstm'
-  # rnn_hidden_size: 512
+  # rnn_hidden_dim: 512
   # rnn_num_layers: 1

 runner:
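
The only functional change here is the rename of the commented-out rnn_hidden_size key to rnn_hidden_dim, matching the renamed constructor argument below. Existing experiment configs that still use the old key would need the same rename; a minimal migration sketch (the helper name and the flat dict layout are illustrative assumptions, not part of this commit):

def migrate_policy_config(policy_cfg: dict) -> dict:
    # rename the pre-commit key to the one the recurrent modules now expect
    if "rnn_hidden_size" in policy_cfg:
        policy_cfg["rnn_hidden_dim"] = policy_cfg.pop("rnn_hidden_size")
    return policy_cfg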

Diff for: rsl_rl/algorithms/distillation.py

+28 -14

@@ -8,14 +8,14 @@
 import torch.optim as optim

 # rsl-rl
-from rsl_rl.modules import StudentTeacher
+from rsl_rl.modules import StudentTeacher, StudentTeacherRecurrent
 from rsl_rl.storage import RolloutStorage


 class Distillation:
     """Distillation algorithm for training a student model to mimic a teacher model."""

-    policy: StudentTeacher
+    policy: StudentTeacher | StudentTeacherRecurrent
     """The student teacher model."""

     def __init__(
@@ -24,6 +24,7 @@ def __init__(
         num_learning_epochs=1,
         gradient_length=15,
         learning_rate=1e-3,
+        loss_type="mse",
         device="cpu",
     ):
         self.device = device
@@ -37,11 +38,20 @@ def __init__(
         self.storage = None  # initialized later
         self.optimizer = optim.Adam(self.policy.student.parameters(), lr=self.learning_rate)
         self.transition = RolloutStorage.Transition()
+        self.last_hidden_states = None

         # distillation parameters
         self.num_learning_epochs = num_learning_epochs
         self.gradient_length = gradient_length

+        # initialize the loss function
+        if loss_type == "mse":
+            self.loss_fn = nn.functional.mse_loss
+        elif loss_type == "huber":
+            self.loss_fn = nn.functional.huber_loss
+        else:
+            raise ValueError(f"Unknown loss type: {loss_type}. Supported types are: mse, huber")
+
         self.num_updates = 0

     def init_storage(
@@ -79,25 +89,24 @@ def process_env_step(self, rewards, dones, infos):

     def update(self):
         self.num_updates += 1
-        mean_behaviour_loss = 0
+        mean_behavior_loss = 0
         loss = 0
         cnt = 0

-        for epoch in range(self.num_learning_epochs):  # TODO unify num_steps_per_env and gradient_length
-            self.policy.reset()
+        for epoch in range(self.num_learning_epochs):
+            self.policy.reset(hidden_states=self.last_hidden_states)
             self.policy.detach_hidden_states()
-            for obs, _, _, privileged_actions in self.storage.generator():
+            for obs, _, _, privileged_actions, dones in self.storage.generator():

                 # inference the student for gradient computation
                 actions = self.policy.act_inference(obs)

-                # behaviour cloning loss
-                behaviour_loss = nn.functional.mse_loss(actions, privileged_actions)
+                # behavior cloning loss
+                behavior_loss = self.loss_fn(actions, privileged_actions)

                 # total loss
-                loss = loss + behaviour_loss
-
-                mean_behaviour_loss += behaviour_loss.item()
+                loss = loss + behavior_loss
+                mean_behavior_loss += behavior_loss.item()
                 cnt += 1

                 # gradient step
@@ -108,11 +117,16 @@ def update(self):
                     self.policy.detach_hidden_states()
                     loss = 0

-        mean_behaviour_loss /= cnt
+                # reset dones
+                self.policy.reset(dones.view(-1))
+                self.policy.detach_hidden_states(dones.view(-1))
+
+        mean_behavior_loss /= cnt
         self.storage.clear()
-        self.policy.reset()  # TODO needed?
+        self.last_hidden_states = self.policy.get_hidden_states()
+        self.policy.detach_hidden_states()

         # construct the loss dictionary
-        loss_dict = {"behaviour": mean_behaviour_loss}
+        loss_dict = {"behavior": mean_behavior_loss}

         return loss_dict
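
The update loop now carries the student's recurrent state across learning iterations: last_hidden_states is saved after each update and restored via reset(hidden_states=...) at the start of the next one, hidden states are reset and detached per environment whenever dones come out of the storage generator, and the behavior-cloning loss is selectable at construction time. A minimal sketch of wiring up the new loss_type argument (the observation/action sizes are placeholders, and passing the policy as the first constructor argument is an assumption based on the attribute accesses above):

from rsl_rl.algorithms import Distillation
from rsl_rl.modules import StudentTeacherRecurrent

policy = StudentTeacherRecurrent(
    num_student_obs=48,   # placeholder sizes, not from this commit
    num_teacher_obs=96,
    num_actions=12,
    rnn_type="lstm",
    rnn_hidden_dim=256,
)
algorithm = Distillation(policy, gradient_length=15, loss_type="huber", device="cpu")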

Diff for: rsl_rl/modules/__init__.py

+2 -0

@@ -10,11 +10,13 @@
 from .normalizer import EmpiricalNormalization
 from .rnd import RandomNetworkDistillation
 from .student_teacher import StudentTeacher
+from .student_teacher_recurrent import StudentTeacherRecurrent

 __all__ = [
     "ActorCritic",
     "ActorCriticRecurrent",
     "EmpiricalNormalization",
     "RandomNetworkDistillation",
     "StudentTeacher",
+    "StudentTeacherRecurrent",
 ]

Diff for: rsl_rl/modules/actor_critic_recurrent.py

+8 -39

@@ -5,11 +5,9 @@

 from __future__ import annotations

-import torch
-import torch.nn as nn
-
-from rsl_rl.modules.actor_critic import ActorCritic
-from rsl_rl.utils import resolve_nn_activation, unpad_trajectories
+from rsl_rl.modules import ActorCritic
+from rsl_rl.networks import Memory
+from rsl_rl.utils import resolve_nn_activation


 class ActorCriticRecurrent(ActorCritic):
@@ -24,7 +22,7 @@ def __init__(
         critic_hidden_dims=[256, 256, 256],
         activation="elu",
         rnn_type="lstm",
-        rnn_hidden_size=256,
+        rnn_hidden_dim=256,
         rnn_num_layers=1,
         init_noise_std=1.0,
         **kwargs,
@@ -35,8 +33,8 @@ def __init__(
         )

         super().__init__(
-            num_actor_obs=rnn_hidden_size,
-            num_critic_obs=rnn_hidden_size,
+            num_actor_obs=rnn_hidden_dim,
+            num_critic_obs=rnn_hidden_dim,
             num_actions=num_actions,
             actor_hidden_dims=actor_hidden_dims,
             critic_hidden_dims=critic_hidden_dims,
@@ -46,8 +44,8 @@ def __init__(

         activation = resolve_nn_activation(activation)

-        self.memory_a = Memory(num_actor_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_size)
-        self.memory_c = Memory(num_critic_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_size)
+        self.memory_a = Memory(num_actor_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_dim)
+        self.memory_c = Memory(num_critic_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_dim)

         print(f"Actor RNN: {self.memory_a}")
         print(f"Critic RNN: {self.memory_c}")
@@ -70,32 +68,3 @@ def evaluate(self, critic_observations, masks=None, hidden_states=None):

     def get_hidden_states(self):
         return self.memory_a.hidden_states, self.memory_c.hidden_states
-
-
-class Memory(torch.nn.Module):
-    def __init__(self, input_size, type="lstm", num_layers=1, hidden_size=256):
-        super().__init__()
-        # RNN
-        rnn_cls = nn.GRU if type.lower() == "gru" else nn.LSTM
-        self.rnn = rnn_cls(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
-        self.hidden_states = None
-
-    def forward(self, input, masks=None, hidden_states=None):
-        batch_mode = masks is not None
-        if batch_mode:
-            # batch mode (policy update): need saved hidden states
-            if hidden_states is None:
-                raise ValueError("Hidden states not passed to memory module during policy update")
-            out, _ = self.rnn(input, hidden_states)
-            out = unpad_trajectories(out, masks)
-        else:
-            # inference mode (collection): use hidden states of last step
-            out, self.hidden_states = self.rnn(input.unsqueeze(0), self.hidden_states)
-        return out
-
-    def reset(self, dones=None):
-        # When the RNN is an LSTM, self.hidden_states_a is a list with hidden_state and cell_state
-        if self.hidden_states is None:
-            return
-        for hidden_state in self.hidden_states:
-            hidden_state[..., dones == 1, :] = 0.0
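
Nothing is lost here: the Memory class removed at the bottom of this file reappears, extended, as rsl_rl/networks/memory.py further down, and ActorCriticRecurrent now imports it from the new rsl_rl.networks package. The only interface change is the rename of rnn_hidden_size to rnn_hidden_dim. Code that imported Memory from its old location would switch imports roughly like this (a sketch; the placeholder sizes are assumptions):

# before this commit
# from rsl_rl.modules.actor_critic_recurrent import Memory
# after this commit
from rsl_rl.networks import Memory

memory = Memory(input_size=48, type="gru", num_layers=1, hidden_size=256)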

Diff for: rsl_rl/modules/student_teacher.py

+8 -2

@@ -72,7 +72,7 @@ def __init__(
         # disable args validation for speedup
         Normal.set_default_validate_args = False

-    def reset(self, dones=None):
+    def reset(self, dones=None, hidden_states=None):
         pass

     def forward(self):
@@ -128,6 +128,9 @@ def load_state_dict(self, state_dict, strict=True):
                 if "actor." in key:
                     teacher_state_dict[key.replace("actor.", "")] = value
             self.teacher.load_state_dict(teacher_state_dict, strict=strict)
+            # also load recurrent memory if teacher is recurrent
+            if self.is_recurrent and self.teacher_recurrent:
+                raise NotImplementedError("Loading recurrent memory for the teacher is not implemented yet")  # TODO
             # set flag for successfully loading the parameters
             self.loaded_teacher = True
             self.teacher.eval()
@@ -141,5 +144,8 @@ def load_state_dict(self, state_dict, strict=True):
         else:
             raise ValueError("state_dict does not contain student or teacher parameters")

-    def detach_hidden_states(self):
+    def get_hidden_states(self):
+        return None
+
+    def detach_hidden_states(self, dones=None):
         pass
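
The widened signatures give the feed-forward StudentTeacher the same hidden-state interface as the new recurrent variant, so Distillation.update() can drive either policy without type checks. A condensed sketch of the duck-typed call pattern (paraphrased from the update loop above, not verbatim):

# works for both StudentTeacher and StudentTeacherRecurrent
last_hidden_states = policy.get_hidden_states()  # None for the feed-forward student
policy.reset(hidden_states=last_hidden_states)   # no-op for the feed-forward student
policy.detach_hidden_states(dones)               # no-op for the feed-forward student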

Diff for: rsl_rl/modules/student_teacher_recurrent.py

+90 -0

@@ -0,0 +1,90 @@
+# Copyright (c) 2021-2025, ETH Zurich and NVIDIA CORPORATION
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+from rsl_rl.modules import StudentTeacher
+from rsl_rl.networks import Memory
+from rsl_rl.utils import resolve_nn_activation
+
+
+class StudentTeacherRecurrent(StudentTeacher):
+    is_recurrent = True
+
+    def __init__(
+        self,
+        num_student_obs,
+        num_teacher_obs,
+        num_actions,
+        student_hidden_dims=[256, 256, 256],
+        teacher_hidden_dims=[256, 256, 256],
+        activation="elu",
+        rnn_type="lstm",
+        rnn_hidden_dim=256,
+        rnn_num_layers=1,
+        init_noise_std=0.1,
+        teacher_recurrent=False,
+        **kwargs,
+    ):
+        if kwargs:
+            print(
+                "StudentTeacherRecurrent.__init__ got unexpected arguments, which will be ignored: "
+                + str(kwargs.keys()),
+            )
+
+        self.teacher_recurrent = teacher_recurrent
+
+        super().__init__(
+            num_student_obs=rnn_hidden_dim,
+            num_teacher_obs=rnn_hidden_dim if teacher_recurrent else num_teacher_obs,
+            num_actions=num_actions,
+            student_hidden_dims=student_hidden_dims,
+            teacher_hidden_dims=teacher_hidden_dims,
+            activation=activation,
+            init_noise_std=init_noise_std,
+        )
+
+        activation = resolve_nn_activation(activation)
+
+        self.memory_s = Memory(num_student_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_dim)
+        if self.teacher_recurrent:
+            self.memory_t = Memory(
+                num_teacher_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_dim
+            )
+
+        print(f"Student RNN: {self.memory_s}")
+        if self.teacher_recurrent:
+            print(f"Teacher RNN: {self.memory_t}")
+
+    def reset(self, dones=None, hidden_states=None):
+        if hidden_states is None:
+            hidden_states = (None, None)
+        self.memory_s.reset(dones, hidden_states[0])
+        if self.teacher_recurrent:
+            self.memory_t.reset(dones, hidden_states[1])
+
+    def act(self, observations):
+        input_s = self.memory_s(observations)
+        return super().act(input_s.squeeze(0))
+
+    def act_inference(self, observations):
+        input_s = self.memory_s(observations)
+        return super().act_inference(input_s.squeeze(0))
+
+    def evaluate(self, teacher_observations):
+        if self.teacher_recurrent:
+            teacher_observations = self.memory_t(teacher_observations)
+        return super().evaluate(teacher_observations.squeeze(0))
+
+    def get_hidden_states(self):
+        if self.teacher_recurrent:
+            return self.memory_s.hidden_states, self.memory_t.hidden_states
+        else:
+            return self.memory_s.hidden_states, None
+
+    def detach_hidden_states(self, dones=None):
+        self.memory_s.detach_hidden_states(dones)
+        if self.teacher_recurrent:
+            self.memory_t.detach_hidden_states(dones)
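
When teacher_recurrent=True, the teacher gets its own Memory and its MLP input size becomes rnn_hidden_dim, mirroring the student path; otherwise the teacher consumes raw observations. The hidden-state round trip that Distillation relies on looks roughly like this (a sketch; tensor shapes and sizes are illustrative assumptions):

import torch
from rsl_rl.modules import StudentTeacherRecurrent

policy = StudentTeacherRecurrent(num_student_obs=48, num_teacher_obs=96, num_actions=12)

obs = torch.zeros(4, 48)            # 4 environments, placeholder observation size
actions = policy.act(obs)           # advances memory_s.hidden_states

saved = policy.get_hidden_states()  # (student_states, None) without a recurrent teacher
policy.reset(hidden_states=saved)   # restore the saved states at the next update
policy.detach_hidden_states()       # cut the graph before the next backward pass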

Diff for: rsl_rl/networks/__init__.py

+10 -0

@@ -0,0 +1,10 @@
+# Copyright (c) 2021-2025, ETH Zurich and NVIDIA CORPORATION
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Definitions for neural networks."""
+
+from .memory import Memory
+
+__all__ = ["Memory"]

Diff for: rsl_rl/networks/memory.py

+65 -0

@@ -0,0 +1,65 @@
+# Copyright (c) 2021-2025, ETH Zurich and NVIDIA CORPORATION
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+
+from rsl_rl.utils import unpad_trajectories
+
+
+class Memory(torch.nn.Module):
+    def __init__(self, input_size, type="lstm", num_layers=1, hidden_size=256):
+        super().__init__()
+        # RNN
+        rnn_cls = nn.GRU if type.lower() == "gru" else nn.LSTM
+        self.rnn = rnn_cls(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
+        self.hidden_states = None
+
+    def forward(self, input, masks=None, hidden_states=None):
+        batch_mode = masks is not None
+        if batch_mode:
+            # batch mode: needs saved hidden states
+            if hidden_states is None:
+                raise ValueError("Hidden states not passed to memory module during policy update")
+            out, _ = self.rnn(input, hidden_states)
+            out = unpad_trajectories(out, masks)
+        else:
+            # inference/distillation mode: uses hidden states of last step
+            out, self.hidden_states = self.rnn(input.unsqueeze(0), self.hidden_states)
+        return out
+
+    def reset(self, dones=None, hidden_states=None):
+        if dones is None:  # reset all hidden states
+            if hidden_states is None:
+                self.hidden_states = None
+            else:
+                self.hidden_states = hidden_states
+        elif self.hidden_states is not None:  # reset hidden states of done environments
+            if hidden_states is None:
+                if isinstance(self.hidden_states, tuple):  # tuple in case of LSTM
+                    for hidden_state in self.hidden_states:
+                        hidden_state[..., dones == 1, :] = 0.0
+                else:
+                    self.hidden_states[..., dones == 1, :] = 0.0
+            else:
+                NotImplementedError(
+                    "Resetting hidden states of done environments with custom hidden states is not implemented"
+                )
+
+    def detach_hidden_states(self, dones=None):
+        if self.hidden_states is not None:
+            if dones is None:  # detach all hidden states
+                if isinstance(self.hidden_states, tuple):  # tuple in case of LSTM
+                    self.hidden_states = tuple(hidden_state.detach() for hidden_state in self.hidden_states)
+                else:
+                    self.hidden_states = self.hidden_states.detach()
+            else:  # detach hidden states of done environments
+                if isinstance(self.hidden_states, tuple):  # tuple in case of LSTM
+                    for hidden_state in self.hidden_states:
+                        hidden_state[..., dones == 1, :] = hidden_state[..., dones == 1, :].detach()
+                else:
+                    self.hidden_states[..., dones == 1, :] = self.hidden_states[..., dones == 1, :].detach()
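
Memory accumulates hidden states across forward calls in inference mode and supports per-environment reset and detach through a dones mask, which is what lets the distillation update truncate backpropagation at episode boundaries. Note that in reset(), the NotImplementedError in the final branch is constructed but not raised as written, so that path currently does nothing. A small sketch of the per-environment semantics (shapes and sizes are illustrative assumptions):

import torch
from rsl_rl.networks import Memory

memory = Memory(input_size=8, type="lstm", num_layers=1, hidden_size=16)

obs = torch.zeros(4, 8)             # 4 environments, placeholder observation size
out = memory(obs)                   # inference mode: hidden states are kept on the module

dones = torch.tensor([0, 1, 0, 0])
memory.reset(dones)                 # zero the hidden and cell state of environment 1 only
memory.detach_hidden_states(dones)  # cut the graph for environment 1, keep it elsewhere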
