Skip to content

[Library] Add RoPE and modulate_fused operators to the nn.py library #332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions allo/library/nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,44 @@ def scaled_dot_product_attention[
Z[i, h * (D // H) + j] = C_h[i, j]

return Z


def RoPE[
Ty, H, L, D
](X: "Ty[L, D]", cos: "Ty[L, D // H // 2]", sin: "Ty[L, D // H // 2]") -> "Ty[L, D]":
# Rotary Position Embedding
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add the reference paper here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks so much for the suggestion! I've added a comment of ref to the original paper. You can also find more details here:
Su et al., “RoFormer: Enhanced Transformer with Rotary Position Embedding” — arXiv:2104.09864

# Reference: https://arxiv.org/abs/2104.09864
X_rotary: Ty[L, D]
for h in range(H):
X_1_h: Ty[L, D // H // 2]
X_2_h: Ty[L, D // H // 2]
for i, j in dsl.grid(L, D // H // 2, name="rope_split_1"):
X_1_h[i, j] = X[i, h * (D // H) + j]
for i, j in dsl.grid(L, D // H // 2, name="rope_split_2"):
X_2_h[i, j] = X[i, h * (D // H) + D // H // 2 + j]
X_1_rotary: Ty[L, D // H // 2] = 0
X_2_rotary: Ty[L, D // H // 2] = 0
for i, j in dsl.grid(L, D // H // 2, name="rotary_1"):
X_1_rotary[i, j] = cos[i, j] * X_1_h[i, j] - sin[i, j] * X_2_h[i, j]
for i, j in dsl.grid(L, D // H // 2, name="rotary_2"):
X_2_rotary[i, j] = sin[i, j] * X_1_h[i, j] + cos[i, j] * X_2_h[i, j]
for i, j in dsl.grid(L, D // H // 2, name="rotary_merge_1"):
X_rotary[i, h * (D // H) + j] = X_1_rotary[i, j]
for i, j in dsl.grid(L, D // H // 2, name="rotary_merge_2"):
X_rotary[i, h * (D // H) + D // H // 2 + j] = X_2_rotary[i, j]
return X_rotary
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose the RoPE kernel also needs the corresponding scheduling function?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I originally planned to add a `schedule_RoPE` function, but it seems Allo raises an error when trying to access the loop labels. This is the schedule function I tried, which pipelines the inner loop `j`:

def schedule_RoPE(s):
    lj1 = s.get_loops(s.top_func_name)["rope_split_1"]["j"]
    s.pipeline(lj1)
    lj2 = s.get_loops(s.top_func_name)["rope_split_2"]["j"]
    s.pipeline(lj2)
    lj3 = s.get_loops(s.top_func_name)["rotary_1"]["j"]
    s.pipeline(lj3)
    lj4 = s.get_loops(s.top_func_name)["rotary_2"]["j"]
    s.pipeline(lj4)
    lj5 = s.get_loops(s.top_func_name)["rotary_merge_1"]["j"]
    s.pipeline(lj5)
    lj6 = s.get_loops(s.top_func_name)["rotary_merge_2"]["j"]
    s.pipeline(lj6)
    return s

However, I get the following error message. A potential cause is that the main body of the RoPE operator is nested inside a standard Python for loop, so the nested dsl.grid() loops are not visible to s.get_loops() at the top level. This seems similar to the structure of scaled_dot_product_attention, which I suppose also doesn't have a schedule_* function for the same reason.

(allo) root@2a2f74889c68:~/allo/allo/library# python3 tests.py 
Traceback (most recent call last):
  File "/root/allo/allo/library/tests.py", line 66, in <module>
    test_RoPE()
  File "/root/allo/allo/library/tests.py", line 26, in test_RoPE
    schedule_RoPE(s)
  File "/root/allo/allo/library/nn.py", line 231, in schedule_RoPE
    lj1 = s.get_loops(s.top_func_name)["rope_split_1"]["j"]
          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "/root/allo/allo/ir/transform.py", line 66, in __getitem__
    raise AttributeError(f"No such loop {name}")
AttributeError: No such loop rope_split_1

Could you please kindly tell me how to add a schedule in this case, or it's fine to leave this function unscheduled for now?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Can you add a name for the `h` loop so that you can access `h` first? Also, you can print out the loops with `print(s.get_loops(s.top_func_name))`, so you can see which loops are accessible.



def modulate_fused[
    Ty, L, D
](X: "Ty[L,D]", scale: "Ty[D]", shift: "Ty[D]") -> "Ty[L, D]":
    """Fused elementwise modulation: Z[i, j] = X[i, j] * (1 + scale[j]) + shift[j].

    X:     input of shape [L, D].
    scale: per-feature scale of shape [D] (applied as 1 + scale).
    shift: per-feature shift of shape [D].
    Returns the modulated tensor of shape [L, D].
    """
    Z: Ty[L, D]
    # The loop label "m_fused" is referenced by schedule_modulate_fused
    # to pipeline the inner j loop — keep it in sync if renamed.
    for i, j in dsl.grid(L, D, name="m_fused"):
        Z[i, j] = X[i, j] * (1 + scale[j]) + shift[j]
    return Z


def schedule_modulate_fused(s):
    """Pipeline the inner `j` loop of the `m_fused` nest.

    s: an Allo schedule for `modulate_fused`; mutated in place and
       returned to allow chaining.
    """
    loops = s.get_loops(s.top_func_name)
    s.pipeline(loops["m_fused"]["j"])
    return s
59 changes: 59 additions & 0 deletions tests/test_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,5 +303,64 @@ def bert_layer(X, Wq, Wk, Wv, Wp, W1, W2, gamma1, beta1, gamma2, beta2):
print(s.build(target="vhls"))


def np_rope(X, cos, sin, num_heads=8):
    """NumPy reference implementation of rotary position embedding.

    X:        array of shape [L, num_heads, head_dim].
    cos, sin: arrays of shape [L, head_dim // 2], shared across heads.
    num_heads: number of attention heads (second axis of X).
    Returns an array of X's shape with each head rotated pairwise.
    """
    # Derive the half-head width from the input instead of hard-coding 32,
    # so the reference works for any head dimension.
    half = X.shape[-1] // 2
    X1 = X[..., :half]
    X2 = X[..., half:]

    X_rotated = np.zeros_like(X)
    for i in range(num_heads):
        X_1_i = X1[:, i, :]
        X_2_i = X2[:, i, :]
        # Standard 2D rotation of each (x1, x2) feature pair.
        X_rotated[:, i, :] = np.concatenate(
            (X_1_i * cos - X_2_i * sin, X_1_i * sin + X_2_i * cos), axis=-1
        )
    return X_rotated


def test_RoPE():
    """Check the Allo RoPE kernel against the NumPy reference."""
    from allo.library.nn import RoPE

    L, D = 1024, 512
    H = 8
    # Derive the per-head rotary half-width instead of hard-coding 32.
    half = D // H // 2
    s = allo.customize(RoPE, instantiate=[float32, H, L, D])
    mod = s.build()
    Q = np.random.randn(L, D).astype(np.float32)
    cos = np.random.randn(L, half).astype(np.float32)
    sin = np.random.randn(L, half).astype(np.float32)
    allo_out = mod(Q, cos, sin)
    # The kernel uses a flat [L, D] layout; reshape to [L, H, D // H] for
    # the per-head NumPy reference, then flatten back for comparison.
    Q_np = Q.reshape(L, H, D // H)
    np_out = np_rope(Q_np, cos, sin, num_heads=H)
    np_out = np_out.reshape(L, D)
    np.testing.assert_allclose(allo_out, np_out, atol=1e-3)
    print("Passed!")


def np_modulate_fused(x, shift, scale):
    """NumPy reference for modulate_fused: x * (1 + scale) + shift."""
    return x * (1 + scale) + shift


def test_modulate_fused():
    """Check the scheduled Allo modulate_fused kernel against NumPy."""
    from allo.library.nn import modulate_fused, schedule_modulate_fused

    L, D = 1024, 512
    X = np.random.randn(L, D).astype(np.float32)
    s = allo.customize(modulate_fused, instantiate=[float32, L, D])
    schedule_modulate_fused(s)
    print(s.module)
    mod = s.build(target="llvm")
    scale = np.random.randn(D).astype(np.float32)
    shift = np.random.randn(D).astype(np.float32)
    allo_out = mod(X, scale, shift)
    # The kernel does not modify X in place, so comparing against the
    # same array is safe (the redundant X_norm alias was removed).
    np_out = np_modulate_fused(X, shift=shift, scale=scale)
    np.testing.assert_allclose(allo_out, np_out, atol=1e-3)
    print("Passed!")


# Allow running this test file directly as a script via pytest.
if __name__ == "__main__":
    pytest.main([__file__])
Loading