@@ -160,6 +160,7 @@ def forward(
         hiddens: tuple[Tensor, ...] | None = None,
         *,
         labels = None,
+        compute_loss_across_reasoning_steps = False,
         detach_hiddens = True,
         one_step_grad = True,
         max_reasoning_steps = None,
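The new flag lands after the bare `*`, so like the neighboring arguments it is keyword-only. A minimal illustration of that calling convention (the stripped-down `forward` here is a stand-in, not the model's actual method):

```python
def forward(x, *, labels = None, compute_loss_across_reasoning_steps = False):
    # everything after the bare `*` must be passed by keyword
    return x, labels, compute_loss_across_reasoning_steps

forward('tokens', labels = 'labels', compute_loss_across_reasoning_steps = True)  # ok
# forward('tokens', 'labels', True)  # TypeError: takes 1 positional argument but 3 were given
```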
@@ -265,33 +266,46 @@ def forward(
 
         pred_q_halt_continues.append(q_halt_continue)
 
-        # to output prediction, using the hiddens from the highest hierarchy
+        # if labels passed in, cross entropy loss
 
-        highest_hidden = hiddens[self.num_networks - 1]
+        hiddens = list(hiddens.values())
 
-        logits = self.to_logits(highest_hidden)
+        if not return_loss:
+            # to output prediction, using the hiddens from the highest hierarchy
 
-        # if labels passed in, cross entropy loss
+            highest_hidden = hiddens[self.num_networks - 1]
 
-        hiddens = hiddens.values()
+            logits = self.to_logits(highest_hidden)
 
-        if not return_loss:
             return logits, hiddens
 
         # get main loss
 
-        main_pred_loss = F.cross_entropy(
-            rearrange(logits, 'b n c -> b c n'),
-            labels,
-            ignore_index = self.ignore_index
-        )
+        highest_hiddens = stack(highest_hiddens) # (l b n d)
+
+        if not compute_loss_across_reasoning_steps:
+            logits = self.to_logits(highest_hiddens[-1])
+
+            main_pred_loss = F.cross_entropy(
+                rearrange(logits, 'b n c -> b c n'),
+                labels,
+                ignore_index = self.ignore_index
+            )
+
+        else:
+            all_logits = self.to_logits(highest_hiddens)
+            num_layers = all_logits.shape[0]
+
+            main_pred_loss = F.cross_entropy(
+                rearrange(all_logits, 'l b n c -> b c n l'),
+                repeat(labels, 'b n -> b n l', l = num_layers),
+                ignore_index = self.ignore_index
+            )
 
         # compute the act loss
 
         q_halts, q_continues = rearrange(pred_q_halt_continues, 'l halt_continue b -> halt_continue l b')
 
-        highest_hiddens = stack(highest_hiddens) # (l b n d)
-
         # q halt loss is simply on whether the prediction is correct or not
 
         with torch.no_grad():
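To make the new `compute_loss_across_reasoning_steps = True` branch concrete, here is a minimal self-contained sketch of the objective it computes: the shared output head is applied to the stacked per-step hiddens, and the labels are repeated so every reasoning step is supervised against the same targets. The shapes, the `Linear` head, and the `ignore_index` value below are stand-in assumptions, not the model's own:

```python
import torch
import torch.nn.functional as F
from einops import rearrange, repeat

l, b, n, d, c = 4, 2, 16, 32, 10            # reasoning steps, batch, seq len, dim, classes

to_logits = torch.nn.Linear(d, c)           # stand-in for the model's shared output head
highest_hiddens = torch.randn(l, b, n, d)   # stacked highest-hierarchy hiddens, one per step
labels = torch.randint(0, c, (b, n))

all_logits = to_logits(highest_hiddens)     # (l, b, n, c)

# move classes to dim 1 and fold the step axis into the extra dims that
# cross entropy averages over, repeating labels once per reasoning step
main_pred_loss = F.cross_entropy(
    rearrange(all_logits, 'l b n c -> b c n l'),
    repeat(labels, 'b n -> b n l', l = l),
    ignore_index = -1                       # stand-in for self.ignore_index
)
```

One smaller change worth noting in the hunk above: `hiddens.values()` returns a `dict_values` view, which is not subscriptable, so now that the conversion happens before the highest-hierarchy lookup, it is wrapped in `list(...)` so that `hiddens[self.num_networks - 1]` still works.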