nits

efric · claude · efric · commit 97dd26b424ef · 2026-03-25T18:08:04.000-07:00
Applies the VDMFMA-1 changes: renames FP8/BF8 enum variants to
F8E4M3FNUZ/F8E5M2FNUZ, switches expand/collapse accumulator to use
vector.interleave/deinterleave, adds isVDMFMAIntrinsic helper and
header declarations, and fixes getDistributedTileTypes broadcastFactor
logic.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Eric Feng <Eric.Feng@amd.com>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -754,7 +754,7 @@ OpFoldResult MMAAttr::getDistributionWorkerCount(OpBuilder &, Location,
   return getAsIndexOpFoldResult(getContext(), getSubgroupSize());
 }
 
-// Get virtual intrinsics that is composed/based on queried op.
+// Returns virtual intrinsics that are composed from this concrete MMA op.
 SmallVector<VirtualMMAIntrinsic> MMAAttr::getVirtualIntrinsics() const {
   switch (getIntrinsic()) {
   case MMAIntrinsic::MFMA_F32_16x16x16_F16:
@@ -1269,15 +1269,15 @@ getMNKShape(VirtualMMAIntrinsic type) {
   case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
   case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
     return {32, 32, 16};
-  // Sparse trick VDMFMAs for skinny GEMMs.
+  // Sparse trick VDMFMAs for skinny GEMMs: semantically 8x16xK.
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_BF16:
     return {8, 16, 64};
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ:
     return {8, 16, 128};
   }
   assert(false && "unhandled virtual mma layout type.");
@@ -1312,13 +1312,13 @@ getABCElementTypes(MLIRContext *context, VirtualMMAIntrinsic type) {
     return {bf16, bf16, f32};
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
     return {i8, i8, i32};
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
     return {f8E5M2FNUZ, f8E5M2FNUZ, f32};
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
     return {f8E5M2FNUZ, f8E4M3FNUZ, f32};
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
     return {f8E4M3FNUZ, f8E5M2FNUZ, f32};
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ:
     return {f8E4M3FNUZ, f8E4M3FNUZ, f32};
   }
   assert(false && "unhandled virtual mma layout type.");
@@ -1374,15 +1374,29 @@ void VirtualMMAAttr::getDistributedTileTypes(
     SmallVectorImpl<VectorType> &result) const {
   MLIRContext *context = getContext();
   VirtualMMAIntrinsic intrinsic = getIntrinsic();
+  // VDMFMA layouts pair adjacent lanes to emulate a wider tile, so
+  // threadProduct < subgroupSize (broadcastFactor > 1). We need
+  // getPerLaneElements to divide out the broadcast and compute the correct
+  // per-lane element count.
+  auto lhsLayout = getSingleSubgroupLayout(intrinsic, kMMAOperandLhs);
   int64_t subgroupSize = getSubgroupSize();
-  OpaqueMmaLayout o = getOpaqueMMALayout(context, intrinsic);
-  auto lhs = getSingleSubgroupLayout(intrinsic, kMMAOperandLhs);
-  auto rhs = getSingleSubgroupLayout(intrinsic, kMMAOperandRhs);
-  auto acc = getSingleSubgroupLayout(intrinsic, kMMAOperandAcc);
-  result.assign(
-      {VectorType::get({getPerLaneElements(lhs, subgroupSize)}, o.aType),
-       VectorType::get({getPerLaneElements(rhs, subgroupSize)}, o.bType),
-       VectorType::get({getPerLaneElements(acc, subgroupSize)}, o.cType)});
+  int64_t broadcastFactor = subgroupSize / llvm::product_of(lhsLayout.thread);
+  if (isVDMFMAIntrinsic(intrinsic) && broadcastFactor > 1) {
+    OpaqueMmaLayout o = getOpaqueMMALayout(context, intrinsic);
+    auto rhsLayout = getSingleSubgroupLayout(intrinsic, kMMAOperandRhs);
+    auto accLayout = getSingleSubgroupLayout(intrinsic, kMMAOperandAcc);
+    result.assign(
+        {VectorType::get({getPerLaneElements(lhsLayout, subgroupSize)},
+                         o.aType),
+         VectorType::get({getPerLaneElements(rhsLayout, subgroupSize)},
+                         o.bType),
+         VectorType::get({getPerLaneElements(accLayout, subgroupSize)},
+                         o.cType)});
+  } else {
+    result.assign({getThreadVectorType(context, intrinsic, kMMAOperandLhs),
+                   getThreadVectorType(context, intrinsic, kMMAOperandRhs),
+                   getThreadVectorType(context, intrinsic, kMMAOperandAcc)});
+  }
 }
 
 int64_t VirtualMMAAttr::getSubgroupSize() const {
@@ -1394,10 +1408,10 @@ int64_t VirtualMMAAttr::getSubgroupSize() const {
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_BF16:
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8: {
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ: {
     return 64;
   }
   }
@@ -1485,10 +1499,10 @@ int64_t VirtualMMAAttr::getIntrinsicsK() const {
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_BF16:
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8: {
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ: {
     return 2;
   }
   case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F8E4M3FNUZ:
@@ -1500,26 +1514,23 @@ int64_t VirtualMMAAttr::getIntrinsicsK() const {
   return 0;
 }
 
-// Expand collapsed ACC [c0, c1] -> [c0, 0, c1, 0].
-static Value expandAccumulator(OpBuilder &builder, Location loc, Value acc) {
+// Expands a collapsed 2-element ACC into the 4-element native SMFMAC form
+// by interleaving with zeros: [c0, c1] -> [c0, 0, c1, 0].
+Value expandAccumulator(OpBuilder &builder, Location loc, Value acc) {
   auto accType = cast<VectorType>(acc.getType());
   Value zero =
       arith::ConstantOp::create(builder, loc, builder.getZeroAttr(accType));
-
-  return vector::ShuffleOp::create(builder, loc, acc, zero,
-                                   ArrayRef<int64_t>{0, 2, 1, 3});
+  return vector::InterleaveOp::create(builder, loc, acc, zero);
 }
 
-// Collapse expanded ACC [d0, d1, d2, d3] -> [d0+d1, d2+d3].
-static Value collapseAccumulator(OpBuilder &builder, Location loc, Value acc) {
-  auto accType = cast<VectorType>(acc.getType());
-  Type elementType = accType.getElementType();
-
-  Value evens = vector::ShuffleOp::create(builder, loc, acc, acc,
-                                          ArrayRef<int64_t>{0, 2});
-  Value odds = vector::ShuffleOp::create(builder, loc, acc, acc,
-                                         ArrayRef<int64_t>{1, 3});
-
+// Collapses a 4-element native SMFMAC ACC back to the 2-element semantic form.
+// Deinterleaves into evens [d0, d2] and odds [d1, d3], then sums pairwise:
+// [d0, d1, d2, d3] -> [d0+d1, d2+d3].
+Value collapseAccumulator(OpBuilder &builder, Location loc, Value acc) {
+  Type elementType = cast<VectorType>(acc.getType()).getElementType();
+  auto deinterleave = vector::DeinterleaveOp::create(builder, loc, acc);
+  Value evens = deinterleave.getRes1();
+  Value odds = deinterleave.getRes2();
   if (isa<FloatType>(elementType)) {
     return arith::AddFOp::create(builder, loc, evens, odds);
   }
@@ -1574,6 +1585,27 @@ struct VDMFMAConfig {
 // (e.g., vector<2xf32>). buildVDMFMAOps handles the translation: it expands
 // a collapsed accumulator into the 4-element physical form before the smfmac
 // chain, then collapses the result back afterward.
+
+bool isVDMFMAIntrinsic(VirtualMMAIntrinsic intrinsic) {
+  switch (intrinsic) {
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_BF16:
+  case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ:
+    return true;
+  case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F16:
+  case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
+    return false;
+  }
+  assert(false && "unhandled virtual mma intrinsic type");
+  return false;
+}
+
 static LogicalResult buildVDMFMAOps(OpBuilder &builder, Location loc,
                                     const VDMFMAConfig &config,
                                     ValueRange inputs, Value acc,
@@ -1695,10 +1727,10 @@ LogicalResult VirtualMMAAttr::buildUnderlyingOperations(
     return buildVDMFMAOps(builder, loc, config, inputs, outputs[0], results);
   }
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8: {
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ: {
     if (getColMajor()) {
       return failure();
     }
@@ -1730,10 +1762,10 @@ int64_t VirtualMMAAttr::getBlockSize() const {
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
   case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_BF16:
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8: {
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ: {
     return 1;
   }
   }
@@ -1813,10 +1845,10 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic,
               /*element=*/{2, 1}};
     }
   case VirtualMMAIntrinsic::VDMFMA_I32_8x16x128_I8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_BF8_FP8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8_BF8:
-  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_FP8:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ:
+  case VirtualMMAIntrinsic::VDMFMA_F32_8x16x128_F8E4M3FNUZ:
     switch (operandIndex) {
     case kMMAOperandLhs:
       return {/*outer=*/{1, 1}, /*thread=*/{8, 4}, /*tstrides=*/{2, 16},
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h
@@ -297,6 +297,23 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(ScaledMMAIntrinsic intrinsic,
 /// attribute.
 StringRef getTilingLevelName(GPU::TilingLevel level);
 
+//===----------------------------------------------------------------------===//
+// VDMFMA accumulator utilities
+//===----------------------------------------------------------------------===//
+
+/// Returns true if the given VirtualMMAIntrinsic is a VDMFMA (virtual dense
+/// MFMA via sparse trick) intrinsic.
+bool isVDMFMAIntrinsic(VirtualMMAIntrinsic intrinsic);
+
+/// Expands a collapsed 2-element ACC into the 4-element native SMFMAC form
+/// by interleaving with zeros: [c0, c1] -> [c0, 0, c1, 0].
+Value expandAccumulator(OpBuilder &builder, Location loc, Value acc);
+
+/// Collapses a 4-element native SMFMAC ACC back to the 2-element semantic
+/// form. Deinterleaves into evens [d0, d2] and odds [d1, d3], then sums
+/// pairwise: [d0, d1, d2, d3] -> [d0+d1, d2+d3].
+Value collapseAccumulator(OpBuilder &builder, Location loc, Value acc);
+
 //===----------------------------------------------------------------------===//
 // Implementations for operand promotion
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -376,10 +376,10 @@ def VMFMA_F32_32x32x16_F8E4M3FNUZ  : I32EnumAttrCase<"VMFMA_F32_32x32x16_F8E4M3F
 def VDMFMA_F32_8x16x64_F16  : I32EnumAttrCase<"VDMFMA_F32_8x16x64_F16", 4>;
 def VDMFMA_I32_8x16x128_I8  : I32EnumAttrCase<"VDMFMA_I32_8x16x128_I8", 5>;
 def VDMFMA_F32_8x16x64_BF16  : I32EnumAttrCase<"VDMFMA_F32_8x16x64_BF16", 6>;
-def VDMFMA_F32_8x16x128_BF8  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_BF8", 7>;
-def VDMFMA_F32_8x16x128_BF8_FP8  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_BF8_FP8", 8>;
-def VDMFMA_F32_8x16x128_FP8_BF8  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_FP8_BF8", 9>;
-def VDMFMA_F32_8x16x128_FP8  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_FP8", 10>;
+def VDMFMA_F32_8x16x128_F8E5M2FNUZ  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_F8E5M2FNUZ", 7>;
+def VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ", 8>;
+def VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ", 9>;
+def VDMFMA_F32_8x16x128_F8E4M3FNUZ  : I32EnumAttrCase<"VDMFMA_F32_8x16x128_F8E4M3FNUZ", 10>;
 
 def IREEGPU_VirtualMMAIntrinsic : IREEGPU_I32EnumAttr<"VirtualMMAIntrinsic",
     "Descriptor for different Virtual MMA intrinsics", [
@@ -392,10 +392,10 @@ def IREEGPU_VirtualMMAIntrinsic : IREEGPU_I32EnumAttr<"VirtualMMAIntrinsic",
       VDMFMA_F32_8x16x64_BF16,
       // 8-bit VDMFMA variants.
       VDMFMA_I32_8x16x128_I8,
-      VDMFMA_F32_8x16x128_BF8,
-      VDMFMA_F32_8x16x128_BF8_FP8,
-      VDMFMA_F32_8x16x128_FP8_BF8,
-      VDMFMA_F32_8x16x128_FP8,
+      VDMFMA_F32_8x16x128_F8E5M2FNUZ,
+      VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ,
+      VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ,
+      VDMFMA_F32_8x16x128_F8E4M3FNUZ,
     ]>;
 
 // Enum for scaled mma intrinsic, loosely matching the MMAIntrinsic enum above
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir
@@ -64,40 +64,40 @@ module {
 //  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x64_BF16>
 
 module {
-  func.func @test_vdmfma_bf8_8x16x128() attributes {
-      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_BF8>} {
+  func.func @test_vdmfma_f8E5M2FNUZ_8x16x128() attributes {
+      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E5M2FNUZ>} {
     return
   }
 }
-// CHECK-LABEL: func @test_vdmfma_bf8_8x16x128
-//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_BF8>
+// CHECK-LABEL: func @test_vdmfma_f8E5M2FNUZ_8x16x128
+//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E5M2FNUZ>
 
 module {
-  func.func @test_vdmfma_bf8_fp8_8x16x128() attributes {
-      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_BF8_FP8>} {
+  func.func @test_vdmfma_f8E5M2FNUZ_f8E4M3FNUZ_8x16x128() attributes {
+      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ>} {
     return
   }
 }
-// CHECK-LABEL: func @test_vdmfma_bf8_fp8_8x16x128
-//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_BF8_FP8>
+// CHECK-LABEL: func @test_vdmfma_f8E5M2FNUZ_f8E4M3FNUZ_8x16x128
+//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E5M2FNUZ_F8E4M3FNUZ>
 
 module {
-  func.func @test_vdmfma_fp8_bf8_8x16x128() attributes {
-      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_FP8_BF8>} {
+  func.func @test_vdmfma_f8E4M3FNUZ_f8E5M2FNUZ_8x16x128() attributes {
+      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ>} {
     return
   }
 }
-// CHECK-LABEL: func @test_vdmfma_fp8_bf8_8x16x128
-//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_FP8_BF8>
+// CHECK-LABEL: func @test_vdmfma_f8E4M3FNUZ_f8E5M2FNUZ_8x16x128
+//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E4M3FNUZ_F8E5M2FNUZ>
 
 module {
-  func.func @test_vdmfma_fp8_8x16x128() attributes {
-      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_FP8>} {
+  func.func @test_vdmfma_f8E4M3FNUZ_8x16x128() attributes {
+      mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E4M3FNUZ>} {
     return
   }
 }
-// CHECK-LABEL: func @test_vdmfma_fp8_8x16x128
-//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_FP8>
+// CHECK-LABEL: func @test_vdmfma_f8E4M3FNUZ_8x16x128
+//  CHECK-SAME:   mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x128_F8E4M3FNUZ>
 
 module {
   func.func @test_WMMAR3_f16_16x16x16_f32() attributes {
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_inner_tiled.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_inner_tiled.mlir