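add `causal` hparam to HRM - when set, networks passed in as dict configs are built as a Decoder instead of an Encoder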

lucidrains · lucidrains · commit d1d075b5c6b8 · 2025-08-04T15:06:55.000-07:00
diff --git a/HRM/hrm.py b/HRM/hrm.py
@@ -9,7 +9,7 @@
 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange, Reduce
 
-from x_transformers import Encoder, RMSNorm
+from x_transformers import Encoder, Decoder, RMSNorm
 
 # helper functions
 
@@ -66,9 +66,11 @@ def __init__(
         num_tokens,
         reasoning_steps = 2,                          # N in the paper - the number of forward evals for the last network (highest hierarchy) above
         relative_period: int | tuple[int, ...] = 2,   # the relative period for each network evaluation call to the one just previous - in the paper, they do 2 networks with a period of 2
+        causal = False,
         ignore_index = -1,
     ):
         super().__init__()
+        attn_layers_klass = Encoder if not causal else Decoder
 
         # input
 
@@ -82,7 +84,7 @@ def __init__(
 
         for network in networks:
             if isinstance(network, dict):
-                network = Encoder(**network)
+                network = attn_layers_klass(**network)
 
             self.networks.append(network)
 
diff --git a/HRM/hrm_with_act.py b/HRM/hrm_with_act.py
@@ -13,7 +13,7 @@
 from einops import rearrange, repeat
 from einops.layers.torch import Rearrange, Reduce
 
-from x_transformers import Encoder, RMSNorm
+from x_transformers import Encoder, Decoder, RMSNorm
 
 # constants
 
@@ -80,13 +80,15 @@ def __init__(
         num_tokens,
         reasoning_steps = 2,                          # N in the paper - the number of forward evals for the last network (highest hierarchy) above
         relative_period: int | tuple[int, ...] = 2,   # the relative period for each network evaluation call to the one just previous - in the paper, they do 2 networks with a period of 2
+        causal = False,
         min_reasoning_steps_epsilon_prob = 0.5,            # they stochastically choose the minimum segment from 2 .. max with this probability, and 1 step the rest of the time
         max_reasoning_steps = 10,
         act_loss_weight = 1.,
         discount_factor = 1.,
         ignore_index = -1,
     ):
         super().__init__()
+        attn_layers_klass = Encoder if not causal else Decoder
 
         # input
 
@@ -100,7 +102,7 @@ def __init__(
 
         for network in networks:
             if isinstance(network, dict):
-                network = Encoder(**network)
+                network = attn_layers_klass(**network)
 
             self.networks.append(network)
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "HRM-pytorch"
-version = "0.1.2"
+version = "0.1.4"
 description = "The proposal from a Singaporean AGI company"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
diff --git a/tests/test_hrm.py b/tests/test_hrm.py
@@ -1,7 +1,10 @@
 import pytest
+param = pytest.mark.parametrize
+
 import torch
 
-def test_hrm():
+@param('causal', (False, True))
+def test_hrm(causal):
     from HRM.hrm import HRM
     from x_transformers import Encoder
 
@@ -35,6 +38,7 @@ def test_hrm():
                 pre_norm = False
             )
         ],
+        causal = causal,
         num_tokens = 256,
         dim = 32,
         reasoning_steps = 10
@@ -53,9 +57,11 @@ def test_hrm():
 
     pred = hrm(seq, reasoning_steps = 5)
 
-@pytest.mark.parametrize('compute_loss_across_reasoning_steps', (False, True))
+@param('compute_loss_across_reasoning_steps', (False, True))
+@param('causal', (False, True))
 def test_hrm_with_act(
-    compute_loss_across_reasoning_steps
+    compute_loss_across_reasoning_steps,
+    causal
 ):
     from HRM.hrm_with_act import HRM
 
@@ -73,7 +79,8 @@ def test_hrm_with_act(
         ],
         num_tokens = 256,
         dim = 32,
-        max_reasoning_steps = 10
+        max_reasoning_steps = 10,
+        causal = causal
     )
 
     seq = torch.randint(0, 256, (3, 1024))