Skip to content

Commit d444578

Browse files
rkayaith and claude committed
[Torch] Fix batch norm decomposition dtype for mixed-precision inputs
Use the running stats dtype instead of the input dtype when reshaping `running_mean`/`running_var` in `DecomposeAtenNativeBatchNormOp`. When the input is e.g. bf16 with f32 running stats, the reshape was producing `aten.view` ops with a bf16 result type from an f32 input, which is invalid. Fixes #4480 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c748d48 commit d444578

File tree

2 files changed

+26
-1
lines changed

2 files changed

+26
-1
lines changed

lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8592,7 +8592,8 @@ class DecomposeAtenNativeBatchNormOp
85928592
SmallVector<int64_t> runningStatsShapeInt(inputRank, 1);
85938593
runningStatsShapeInt[1] =
85948594
cast<BaseTensorType>(runningMean.getType()).getSizes()[0];
8595-
Type dtype = cast<ValueTensorType>(input.getType()).getOptionalDtype();
8595+
Type dtype =
8596+
cast<ValueTensorType>(runningMean.getType()).getOptionalDtype();
85968597
Type reshapeType = ValueTensorType::get(
85978598
context, llvm::ArrayRef(runningStatsShapeInt), dtype);
85988599

test/Dialect/Torch/decompose-complex-ops.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,3 +1013,27 @@ func.func @channel_shuffle(%arg0: !torch.vtensor<[1,8,4,4],f32>) -> !torch.vtens
10131013
%0 = torch.aten.channel_shuffle %arg0, %int4 : !torch.vtensor<[1,8,4,4],f32>, !torch.int -> !torch.vtensor<[1,8,4,4],f32>
10141014
return %0 : !torch.vtensor<[1,8,4,4],f32>
10151015
}
1016+
1017+
// -----
1018+
1019+
// CHECK-LABEL: func.func @native_batch_norm_mixed_precision(
1020+
// CHECK-SAME: %{{.*}}: !torch.vtensor<[1,3,4,4],bf16>, %{{.*}}: !torch.vtensor<[3],f32>, %{{.*}}: !torch.vtensor<[3],f32>, %[[MEAN:.*]]: !torch.vtensor<[3],f32>, %[[VAR:.*]]: !torch.vtensor<[3],f32>)
1021+
// Verify that the running stats reshape uses f32 (matching running stats dtype),
1022+
// not bf16 (the input dtype).
1023+
// CHECK: torch.aten.view %[[MEAN]], %{{.*}} : !torch.vtensor<[3],f32>, !torch.list<int> -> !torch.vtensor<[1,3,1,1],f32>
1024+
// CHECK: torch.aten.view %[[VAR]], %{{.*}} : !torch.vtensor<[3],f32>, !torch.list<int> -> !torch.vtensor<[1,3,1,1],f32>
1025+
func.func @native_batch_norm_mixed_precision(
1026+
%input: !torch.vtensor<[1,3,4,4],bf16>,
1027+
%weight: !torch.vtensor<[3],f32>,
1028+
%bias: !torch.vtensor<[3],f32>,
1029+
%running_mean: !torch.vtensor<[3],f32>,
1030+
%running_var: !torch.vtensor<[3],f32>
1031+
) -> (!torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>) {
1032+
%false = torch.constant.bool false
1033+
%float1e-5 = torch.constant.float 1.000000e-05
1034+
%float0.1 = torch.constant.float 1.000000e-01
1035+
%out:3 = torch.aten.native_batch_norm %input, %weight, %bias, %running_mean, %running_var, %false, %float0.1, %float1e-5 :
1036+
!torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.vtensor<[3],f32>, !torch.bool, !torch.float, !torch.float
1037+
-> !torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>
1038+
return %out#0, %out#1, %out#2 : !torch.vtensor<[1,3,4,4],bf16>, !torch.vtensor<[0],f32>, !torch.vtensor<[0],f32>
1039+
}

0 commit comments

Comments (0)