Add layerNorm

PineapplePulp · PineapplePulp · commit 0684044b8fca · 2025-04-13T17:52:05.000-07:00
diff --git a/lib/Autograd.chpl b/lib/Autograd.chpl
@@ -1101,6 +1101,21 @@ record batchNormOp : serializable {
     proc spec : GradOpSpec do return new dict(("operation","BatchNorm"));
 }
 
+// Op node for layer normalization over the operands (features, weight, bias).
+record layerNormOp : serializable {
+    type eltType = real;
+    var features: shared BaseTensorResource(?);
+    var weight: shared BaseTensorResource(eltType, ?);
+    var bias: shared BaseTensorResource(eltType, ?);
+    proc children { return (features, weight, bias); }
+    // Forward pass: hands the operands' arrays to ndarray.layerNorm.
+    proc forward() do
+        return ndarray.layerNorm(features.array, weight.array, bias.array);
+    proc spec : GradOpSpec {
+        return new dict(("operation", "LayerNorm"));
+    }
+}
+
 record multiheadAttentionOp : serializable {
     type eltType = real;
     var features: shared BaseTensorResource(?);
diff --git a/lib/DynamicTensor.chpl b/lib/DynamicTensor.chpl
@@ -634,6 +634,29 @@ proc type dynamicTensor.batchnorm(
     return new dynamicTensor(eltType);
 }
 
+// Rank-erased layerNorm: resolves the static ranks of features (2..4) and of weight/bias, then
+// forwards to staticTensor.layerNorm. No normalizedShape is passed; the rank of weight implies it.
+proc type dynamicTensor.layerNorm(
+    features: dynamicTensor(?eltType),
+    weight: dynamicTensor(eltType),
+    bias: dynamicTensor(eltType)
+): dynamicTensor(eltType) {
+    for param rankF in 2..4 {
+        for param rankN in 1..rankF { // weight/bias cannot have more dims than features
+            if features.checkRank(rankF) && weight.checkRank(rankN) && bias.checkRank(rankN) {
+                return staticTensor.layerNorm(
+                    features.forceRank(rankF),
+                    weight.forceRank(rankN),
+                    bias.forceRank(rankN),
+                    rankN
+                ).eraseRank();
+            }
+        }
+    }
+    halt("Could not determine rank in dynamicTensor.layerNorm.");
+    return new dynamicTensor(eltType);
+}
+
 proc type dynamicTensor.multiheadAttention(
     features: dynamicTensor(?eltType),
     q_weight: dynamicTensor(eltType),
diff --git a/lib/NDArray.chpl b/lib/NDArray.chpl
@@ -532,6 +532,16 @@ proc ndarray.variance(axes: int...?axesCount): ndarray(rank,eltType) {
     return ((this - this.mean((...axes)).expand((...shape)))**2).sum((...axes)) / (denom - 1);
 }
 
+// Variance over `axes`: sum of squared deviations from the mean, divided by (N - correction).
+proc ndarray.variance(axes: int...?axesCount, correction: int): ndarray(rank,eltType) {
+    const shape = this.shape;
+    var denom: eltType = 1; // N = number of reduced elements = product of the reduced extents
+    for param i in 0..<axesCount {
+        denom *= shape(axes(i)) : eltType;
+    }
+    return ((this - this.mean((...axes)).expand((...shape)))**2).sum((...axes)) / (denom - correction);
+}
+
 proc ndarray.shrink(narg: 2*int ... rank,param exactBounds = false): ndarray(rank,eltType) {
     var newShape: rank * int;
     var sliceRanges: rank * range;
@@ -1387,6 +1397,16 @@ operator *(a: ndarray(?rank,?eltType),b: ndarray(rank,eltType)): ndarray(rank,el
     return c;
 }
 
+// Element-wise power: raises every element of `a` to the real exponent `b`, cast back to eltType.
+operator **(a: ndarray(?rank,?eltType),b: real): ndarray(rank,eltType) {
+    const dom = a.domain;
+    var powed = new ndarray(dom, eltType);
+    ref outData = powed.data;
+    const ref inData = a.data;
+    forall i in dom.every() do outData[i] = (inData[i]**b):eltType;
+    return powed;
+}
+
 operator -(a: ndarray(?rank, ?eltType)): ndarray(rank, eltType) {
     const dom = a.domain;
     var negged = new ndarray(dom, eltType);
@@ -2112,6 +2132,45 @@ proc type ndarray.batchNorm(
     return outFeatures;
 }
 
+// Layer normalization over the last n axes of features (n = rank of weight and bias):
+// out = (x - mean) / sqrt(var + eps) * weight + bias; eps is a small stabilizer (default 1e-5).
+proc type ndarray.layerNorm(
+    features: ndarray(?rank,?eltType),
+    weight: ndarray(?n,eltType),
+    bias: ndarray(n,eltType),
+    eps: real = 1e-5
+): ndarray(rank,eltType) {
+    const fshape = features.shape;
+    const axis = rank - n - 1; // last axis that is NOT normalized
+    var args: n*int;
+    for i in 0..<n {
+        args[i] = i + axis + 1;
+    }
+    var avgs = features.mean((...args));
+    var vars = features.variance((...args), correction = 0); // correction = 0: nothing subtracted from the divisor
+    ref f = features.data;
+    ref a = avgs.data;
+    ref v = vars.data;
+    ref w = weight.data;
+    ref b = bias.data;
+    var outDom = util.domainFromShape((...fshape));
+    var outFeatures = new ndarray(outDom,eltType);
+    ref dat = outFeatures.data;
+    forall idx in outDom.every() {
+        var c = idx; // index into avgs/vars: normalized axes pinned to 0
+        var d: n*int; // index into weight/bias: the trailing n coordinates of idx
+        for i in (axis + 1)..<rank {
+            c[i] = 0;
+        }
+        for i in 0..<n {
+            d[i] = idx[axis+1+i];
+        }
+        // Scale by the standard deviation (sqrt of variance + eps), not by the raw variance.
+        dat[idx] = ((f[idx] - a[c]) / sqrt(v[c] + (eps : eltType))) * w[d] + b[d];
+    }
+    return outFeatures;
+}
+
 
 inline proc type ndarray.fromRanges(type eltType = real, rngs: range...?rank): ndarray(rank,eltType) {
     const dom_ = {(...rngs)};
diff --git a/lib/Network.chpl b/lib/Network.chpl
@@ -1080,6 +1080,26 @@ class BatchNorm : Module(?) {
     }
 }
 
+class LayerNorm : Module(?) { // Module that applies Tensor.layerNorm using its own weight and bias.
+    var weight: owned Parameter(eltType); // built from Tensor.ones of normalizedShape in init
+    var bias: owned Parameter(eltType); // built from Tensor.zeros of normalizedShape in init
+    var nShape; // the normalizedShape tuple given to init; not read elsewhere in this class
+    proc init(type eltType = real, normalizedShape: ?nShapeRankP*int) { // NOTE(review): eltType is not forwarded to a super.init call or to Tensor.ones/zeros here — confirm non-default eltType works
+        this.weight = new Parameter(Tensor.ones((...normalizedShape)));
+        this.bias = new Parameter(Tensor.zeros((...normalizedShape)));
+        this.nShape = normalizedShape;
+    }
+    // Returns Tensor.layerNorm of input with this module's weight and bias data.
+    override proc forward(input: Tensor(eltType)): Tensor(eltType) {
+        return Tensor.layerNorm(input, weight.data, bias.data);
+    }
+    // Calls addModule for the two parameters under the names "weight" and "bias".
+    override proc setup() {
+        addModule("weight", weight);
+        addModule("bias", bias);
+    }
+}
+
 class MultiheadAttention : Module(?) {
     var q_weight: owned Parameter(eltType);
     var k_weight: owned Parameter(eltType);
diff --git a/lib/StaticTensor.chpl b/lib/StaticTensor.chpl
@@ -453,6 +453,16 @@ proc type staticTensor.batchNorm(
     return tensorFromCtx(featureRank, eltType, ctx);
 }
 
+// Builds a layerNormOp from the three tensors' meta resources and wraps it via tensorFromCtx.
+proc type staticTensor.layerNorm(
+    features: staticTensor(?featureRank,?eltType),
+    weight: staticTensor(?n,eltType), bias: staticTensor(n,eltType),
+    rankN: int // accepted but not used in this body
+): staticTensor(featureRank,eltType) {
+    var normOp = new layerNormOp(eltType, features.meta, weight.meta, bias.meta);
+    return tensorFromCtx(featureRank, eltType, normOp);
+}
+
 proc type staticTensor.multiheadAttention(
     features: staticTensor(3, ?eltType),
     q_weight: staticTensor(2, eltType),