Skip to content

Commit 1f2abc8

Browse files
rkayaith authored and claude committed
[Torch] Fix batch norm decomposition dtype for mixed-precision inputs
Use the running stats dtype instead of the input dtype when reshaping `running_mean`/`running_var` in `DecomposeAtenNativeBatchNormOp`. When the input is e.g. bf16 with f32 running stats, the reshape was producing `aten.view` ops with a bf16 result type from an f32 input, which is invalid. Fixes #4480 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c748d48 commit 1f2abc8

File tree

2 files changed

+36
-9
lines changed

2 files changed

+36
-9
lines changed

lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8592,14 +8592,17 @@ class DecomposeAtenNativeBatchNormOp
85928592
SmallVector<int64_t> runningStatsShapeInt(inputRank, 1);
85938593
runningStatsShapeInt[1] =
85948594
cast<BaseTensorType>(runningMean.getType()).getSizes()[0];
8595-
Type dtype = cast<ValueTensorType>(input.getType()).getOptionalDtype();
8596-
Type reshapeType = ValueTensorType::get(
8597-
context, llvm::ArrayRef(runningStatsShapeInt), dtype);
85988595

8599-
runningMean = AtenViewOp::create(rewriter, loc, reshapeType, runningMean,
8600-
runningStatsSizeList);
8601-
runningVar = AtenViewOp::create(rewriter, loc, reshapeType, runningVar,
8602-
runningStatsSizeList);
8596+
auto reshapeType = [&](Value v) {
8597+
auto dtype = cast<ValueTensorType>(v.getType()).getOptionalDtype();
8598+
return ValueTensorType::get(context, llvm::ArrayRef(runningStatsShapeInt),
8599+
dtype);
8600+
};
8601+
8602+
runningMean = AtenViewOp::create(rewriter, loc, reshapeType(runningMean),
8603+
runningMean, runningStatsSizeList);
8604+
runningVar = AtenViewOp::create(rewriter, loc, reshapeType(runningVar),
8605+
runningVar, runningStatsSizeList);
86038606

86048607
// normalizedInput = (input - runningMean) / (sqrt(runningVar + eps)).
86058608
Value inputSubMean = AtenSubTensorOp::create(
@@ -8621,7 +8624,7 @@ class DecomposeAtenNativeBatchNormOp
86218624
std::optional<unsigned> weightRank = getTensorRank(weight);
86228625
if (!weightRank || *weightRank != 1)
86238626
return rewriter.notifyMatchFailure(op, "expected weight to be rank 1");
8624-
weight = AtenViewOp::create(rewriter, loc, reshapeType, weight,
8627+
weight = AtenViewOp::create(rewriter, loc, reshapeType(weight), weight,
86258628
runningStatsSizeList);
86268629
batchNormOutput = AtenMulTensorOp::create(
86278630
rewriter, loc, batchNormOutput.getType(), batchNormOutput, weight);
@@ -8631,7 +8634,7 @@ class DecomposeAtenNativeBatchNormOp
86318634
std::optional<unsigned> biasRank = getTensorRank(bias);
86328635
if (!biasRank || *biasRank != 1)
86338636
return rewriter.notifyMatchFailure(op, "expected bias to be rank 1");
8634-
bias = AtenViewOp::create(rewriter, loc, reshapeType, bias,
8637+
bias = AtenViewOp::create(rewriter, loc, reshapeType(bias), bias,
86358638
runningStatsSizeList);
86368639
batchNormOutput =
86378640
AtenAddTensorOp::create(rewriter, loc, batchNormOutput.getType(),

test/Dialect/Torch/decompose-complex-ops.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,3 +1013,27 @@ func.func @channel_shuffle(%arg0: !torch.vtensor<[1,8,4,4],f32>) -> !torch.vtens
10131013
%0 = torch.aten.channel_shuffle %arg0, %int4 : !torch.vtensor<[1,8,4,4],f32>, !torch.int -> !torch.vtensor<[1,8,4,4],f32>
10141014
return %0 : !torch.vtensor<[1,8,4,4],f32>
10151015
}
1016+
1017+
// -----
1018+
1019+
// CHECK-LABEL: func.func @native_batch_norm_mixed_precision(
1020+
// CHECK-SAME: %{{.*}}: !torch.vtensor<[1,3,4,4],bf16>, %{{.*}}: !torch.vtensor<[3],f32>, %{{.*}}: !torch.vtensor<[3],f32>, %[[MEAN:.*]]: !torch.vtensor<[3],f32>, %[[VAR:.*]]: !torch.vtensor<[3],f32>)
1021+
// Verify that the running stats reshape uses f32 (matching running stats dtype),
1022+
// not bf16 (the input dtype).
1023+
// CHECK: torch.aten.view %[[MEAN]], %{{.*}} : !torch.vtensor<[3],f32>, !torch.list<int> -> !torch.vtensor<[1,3,1,1],f32>
1024+
// CHECK: torch.aten.view %[[VAR]], %{{.*}} : !torch.vtensor<[3],f32>, !torch.list<int> -> !torch.vtensor<[1,3,1,1],f32>
1025+
func.func @native_batch_norm_mixed_precision(
1026+
%input: !torch.vtensor<[1,3,4,4],bf16>,
1027+
%weight: !torch.vtensor<[3],f32>,
1028+
%bias: !torch.vtensor<[3],f32>,
1029+
%running_mean: !torch.vtensor<[3],f32>,
1030+
%running_var: !torch.vtensor<[3],f32>
1031+
) -> (!torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>) {
1032+
%false = torch.constant.bool false
1033+
%float1e-5 = torch.constant.float 1.000000e-05
1034+
%float0.1 = torch.constant.float 1.000000e-01
1035+
%out:3 = torch.aten.native_batch_norm %input, %weight, %bias, %running_mean, %running_var, %false, %float0.1, %float1e-5 :
1036+
!torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.bool, !torch.float, !torch.float
1037+
-> !torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>
1038+
return %out#0, %out#1, %out#2 : !torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>
1039+
}

0 commit comments

Comments (0)