Add attention

PineapplePulp · PineapplePulp · commit b15a558be6d6 · 2025-04-06T23:07:49.000-07:00
diff --git a/lib/Autograd.chpl b/lib/Autograd.chpl
@@ -1101,6 +1101,24 @@ record batchNormOp : serializable {
     proc spec : GradOpSpec do return new dict(("operation","BatchNorm"));
 }
 
+// Autograd graph node for attention over `features` with Q/K/V projection weights.
+record multiheadAttentionOp : serializable {
+    type eltType = real;
+    var features: shared BaseTensorResource(?);
+    var q_weight: shared BaseTensorResource(eltType, ?);
+    var k_weight: shared BaseTensorResource(eltType, ?);
+    var v_weight: shared BaseTensorResource(eltType, ?);
+    var num_heads: int;
+    var embed_dim: int;
+
+    // Inputs of this node: the features first, then the Q, K and V weights.
+    proc children do return (features, q_weight, k_weight, v_weight);
+    // Forward pass only; this record defines no backward (gradient) procedure.
+    proc forward() do return ndarray.multiheadAttention(features.array, q_weight.array, k_weight.array, v_weight.array, num_heads, embed_dim);
+    // Operation metadata; only the operation name is recorded.
+    proc spec : GradOpSpec do return new dict(("operation", "MultiHeadAttention"));
+}
+
 record dropoutOp : serializable {
     param rank: int;
     type eltType;
diff --git a/lib/DynamicTensor.chpl b/lib/DynamicTensor.chpl
@@ -634,6 +634,28 @@ proc type dynamicTensor.batchnorm(
     return new dynamicTensor(eltType);
 }
 
+// Rank-erased entry point for attention: dispatches to the static-rank
+// implementation when `features` holds a rank-3 tensor and halts otherwise.
+proc type dynamicTensor.multiheadAttention(
+    features: dynamicTensor(?eltType),
+    q_weight: dynamicTensor(eltType),
+    k_weight: dynamicTensor(eltType),
+    v_weight: dynamicTensor(eltType),
+    num_heads: int,
+    embed_dim: int
+): dynamicTensor(eltType) {
+    // Only rank-3 features are supported; anything else is a fatal error.
+    if !features.checkRank(3) then
+        halt("Could not determine rank in dynamicTensor.multiheadAttention");
+
+    // The projection weights are forced to rank 2 before dispatching.
+    return staticTensor.multiheadAttention(
+        features.forceRank(3), q_weight.forceRank(2),
+        k_weight.forceRank(2), v_weight.forceRank(2),
+        num_heads, embed_dim
+    ).eraseRank();
+}
+
 proc dynamicTensor.softmax(): dynamicTensor(eltType) {
     for param rank in 1..maxRank {
         if this.checkRank(rank) then
diff --git a/lib/NDArray.chpl b/lib/NDArray.chpl
@@ -1982,6 +1982,29 @@ proc type ndarray.matmul(a: ndarray(?aRank,?eltType),b: ndarray(?bRank,eltType))
     return prod;
 }
 
+// Single-head scaled dot-product self-attention: softmax(Q K^T / sqrt(head_dim)) V,
+// where Q, K and V are `features` multiplied by the respective weight matrix.
+// NOTE: `num_heads` is accepted but not used yet; only one head is computed.
+proc type ndarray.multiheadAttention(
+    features: ndarray(3, ?eltType),
+    q_weight: ndarray(2, eltType),
+    k_weight: ndarray(2, eltType),
+    v_weight: ndarray(2, eltType),
+    num_heads: int,
+    embed_dim: int
+): ndarray(3, eltType) {
+    // With a single head, the per-head width is the full embedding width.
+    const head_dim = embed_dim;
+
+    var q = ndarray.matmul(features,q_weight);
+    var k = ndarray.matmul(features,k_weight);
+    var v = ndarray.matmul(features,v_weight);
+    // scores = q * k^T (permute presumably swaps k's last two axes), scaled, softmax on axis 2
+    var z = (ndarray.matmul(q,k.permute(0,2,1))/Math.sqrt(head_dim))._softmax(axis=2);
+    var a = ndarray.matmul(z,v);
+    return a;
+}
+
 proc type ndarray.batchNormTrain(
     features: ndarray(?rank,?eltType),
     weight: ndarray(1,eltType),
diff --git a/lib/Network.chpl b/lib/Network.chpl
@@ -1080,6 +1080,32 @@ class BatchNorm : Module(?) {
     }
 }
 
+// Attention layer owning query/key/value weights (embed_dim x embed_dim, initialized to ones).
+class MultiheadAttention : Module(?) {
+    var q_weight: owned Parameter(eltType);
+    var k_weight: owned Parameter(eltType);
+    var v_weight: owned Parameter(eltType);
+    var num_heads, embed_dim: int;
+
+    proc init(type eltType = real, embed_dim: int, num_heads: int) {
+        super.init(eltType); // pass eltType to Module explicitly; the implicit super.init() would ignore it
+        this.q_weight = new Parameter(Tensor.ones(embed_dim, embed_dim));
+        this.k_weight = new Parameter(Tensor.ones(embed_dim, embed_dim));
+        this.v_weight = new Parameter(Tensor.ones(embed_dim, embed_dim));
+        this.num_heads = num_heads;
+        this.embed_dim = embed_dim;
+    }
+
+    override proc forward(input: Tensor(eltType)): Tensor(eltType) do
+        return Tensor.multiheadAttention(input, q_weight.data, k_weight.data, v_weight.data, num_heads, embed_dim);
+
+    override proc setup() { // registers the weights under "query", "key" and "value"
+        addModule("query", q_weight);
+        addModule("key", k_weight);
+        addModule("value", v_weight);
+    }
+}
+
 class AdaptiveAvgPool2D : Module(?) {
   // only handles square pooling
   var outputSize: int;
diff --git a/lib/StaticTensor.chpl b/lib/StaticTensor.chpl
@@ -453,6 +453,18 @@ proc type staticTensor.batchNorm(
     return tensorFromCtx(featureRank, eltType, ctx);
 }
 
+// Creates a multiheadAttentionOp over the operands' `meta` handles and wraps it as a rank-3 tensor.
+proc type staticTensor.multiheadAttention(
+    features: staticTensor(3, ?eltType),
+    q_weight: staticTensor(2, eltType),
+    k_weight: staticTensor(2, eltType),
+    v_weight: staticTensor(2, eltType),
+    num_heads: int, embed_dim: int
+): staticTensor(3, eltType) {
+    var attnOp = new multiheadAttentionOp(eltType, features.meta, q_weight.meta, k_weight.meta, v_weight.meta, num_heads, embed_dim);
+    return tensorFromCtx(3, eltType, attnOp);
+}
+
 // proc matvec(mat: staticTensor(2,?eltType),vec: staticTensor(1,eltType)): staticTensor(1,eltType) {
 //     const (n,) = vec.array.domain.shape;
 //     const (m,_n) = mat.array.domain.shape;