Skip to content

Commit 95b6efc

Browse files
committed
f16
Signed-off-by: Eric Feng <Eric.Feng@amd.com>
1 parent ea5cc11 commit 95b6efc

4 files changed

Lines changed: 292 additions & 10 deletions

File tree

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 233 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66

77
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
8+
#include <cstdint>
89

910
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
1011
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
@@ -34,6 +35,7 @@
3435
#include "mlir/Dialect/Utils/StaticValueUtils.h"
3536
#include "mlir/Dialect/Vector/IR/VectorOps.h"
3637
#include "mlir/IR/Attributes.h"
38+
#include "mlir/IR/Builders.h"
3739
#include "mlir/IR/BuiltinAttributes.h"
3840
#include "mlir/IR/BuiltinTypes.h"
3941
#include "mlir/IR/OpDefinition.h"
@@ -1269,6 +1271,10 @@ getMNKShape(VirtualMMAIntrinsic type) {
12691271
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
12701272
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
12711273
return {32, 32, 16};
1274+
// Sparse trick VSMFMAs for skinny GEMMs: semantically 8x16xK.
1275+
// TODO(#XXXX): Add I8 VDMFMA variant (VDMFMA_I32_8x16x128_I8).
1276+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
1277+
return {8, 16, 64};
12721278
}
12731279
assert(false && "unhandled virtual mma layout type.");
12741280
return {};
@@ -1285,12 +1291,13 @@ getABCElementTypes(MLIRContext *context, VirtualMMAIntrinsic type) {
12851291
return {f8E4M3FNUZ, f8E4M3FNUZ, f32};
12861292
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
12871293
return {f8E4M3FNUZ, f8E4M3FNUZ, f32};
1288-
// V(Virtual)MFMA instructions which have 2 mfma instructions interleaved
1289-
// along the k dimension.
12901294
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F16:
12911295
return {f16, f16, f32};
12921296
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
12931297
return {f16, f16, f32};
1298+
// Sparse trick VSMFMAs for skinny GEMMs.
1299+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
1300+
return {f16, f16, f32};
12941301
}
12951302
assert(false && "unhandled virtual mma layout type.");
12961303
return {};
@@ -1326,21 +1333,43 @@ void VirtualMMAAttr::getUndistributedTileTypes(
13261333
VectorType::get({o.mSize, o.nSize}, o.cType)});
13271334
}
13281335

1336+
// Returns the number of elements held per lane for a given operand layout,
1337+
// accounting for broadcastFactor when threadProduct < subgroupSize.
1338+
static int64_t getPerLaneElements(MMASingleSubgroupLayout layout,
1339+
int64_t subgroupSize) {
1340+
int64_t threadProduct = llvm::product_of(layout.thread);
1341+
assert(subgroupSize % threadProduct == 0 &&
1342+
"subgroup size must be a multiple of thread product");
1343+
int64_t broadcastFactor = subgroupSize / threadProduct;
1344+
int64_t totalElements =
1345+
llvm::product_of(layout.element) * llvm::product_of(layout.outer);
1346+
assert(totalElements % broadcastFactor == 0 &&
1347+
"total elements must be divisible by broadcast factor");
1348+
return totalElements / broadcastFactor;
1349+
}
1350+
13291351
void VirtualMMAAttr::getDistributedTileTypes(
13301352
SmallVectorImpl<VectorType> &result) const {
13311353
MLIRContext *context = getContext();
13321354
VirtualMMAIntrinsic intrinsic = getIntrinsic();
1333-
result.assign({getThreadVectorType(context, intrinsic, kMMAOperandLhs),
1334-
getThreadVectorType(context, intrinsic, kMMAOperandRhs),
1335-
getThreadVectorType(context, intrinsic, kMMAOperandAcc)});
1355+
int64_t subgroupSize = getSubgroupSize();
1356+
OpaqueMmaLayout o = getOpaqueMMALayout(context, intrinsic);
1357+
auto lhs = getSingleSubgroupLayout(intrinsic, kMMAOperandLhs);
1358+
auto rhs = getSingleSubgroupLayout(intrinsic, kMMAOperandRhs);
1359+
auto acc = getSingleSubgroupLayout(intrinsic, kMMAOperandAcc);
1360+
result.assign(
1361+
{VectorType::get({getPerLaneElements(lhs, subgroupSize)}, o.aType),
1362+
VectorType::get({getPerLaneElements(rhs, subgroupSize)}, o.bType),
1363+
VectorType::get({getPerLaneElements(acc, subgroupSize)}, o.cType)});
13361364
}
13371365

13381366
int64_t VirtualMMAAttr::getSubgroupSize() const {
13391367
switch (getIntrinsic()) {
13401368
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F8E4M3FNUZ:
13411369
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F16:
13421370
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
1343-
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16: {
1371+
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
1372+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16: {
13441373
return 64;
13451374
}
13461375
}
@@ -1366,23 +1395,67 @@ LogicalResult VirtualMMAAttr::populateOperandOffsetsSizesStrides(
13661395
MMASingleSubgroupLayout subgroupLayout =
13671396
getSingleSubgroupLayout(getIntrinsic(), operandIndex,
13681397
operandIndex == kMMAOperandAcc && getColMajor());
1398+
1399+
// Compute broadcast factor: when thread product < subgroup size, multiple
1400+
// physical lanes share a logical thread position. broadcastFactor tells
1401+
// populateCanonicalOffsetsSizesAndStrides to split the element dimension
1402+
// so each physical lane gets a unique slice.
1403+
int64_t threadProduct = llvm::product_of(subgroupLayout.thread);
1404+
assert(getSubgroupSize() % threadProduct == 0 &&
1405+
"subgroup size must be a multiple of thread product");
1406+
int64_t broadcastFactor = getSubgroupSize() / threadProduct;
1407+
13691408
SmallVector<OpFoldResult> canonicalOffsets;
13701409
SmallVector<OpFoldResult> canonicalSizes;
13711410
if (failed(populateCanonicalOffsetsSizesAndStrides(
13721411
builder, loc, laneId, permutation, subgroupLayout, canonicalOffsets,
1373-
canonicalSizes, strides))) {
1412+
canonicalSizes, strides, broadcastFactor))) {
13741413
return failure();
13751414
}
13761415
offsets.append(canonicalOffsets);
13771416
sizes.append(canonicalSizes);
1378-
13791417
return success();
13801418
}
13811419

1420+
// Returns true on odd lanes and false on even lanes.
1421+
static Value createLaneParityPredicate(OpBuilder &builder, Location loc) {
1422+
Value laneId = gpu::LaneIdOp::create(builder, loc, /*upper_bound=*/nullptr);
1423+
Value one = arith::ConstantIndexOp::create(builder, loc, 1);
1424+
Value zero = arith::ConstantIndexOp::create(builder, loc, 0);
1425+
Value lowBit = arith::AndIOp::create(builder, loc, laneId, one);
1426+
return arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::ne, lowBit,
1427+
zero);
1428+
}
1429+
1430+
// Creates a constant sparse index vector for SMFMAC operations.
1431+
//
1432+
// The sparse index encodes which 2 positions out of each group of 4
1433+
// K-elements are selected for 2:4 structured sparsity. Each 4-bit
1434+
// field within selectorBits selects positions for one K-group:
1435+
// 0x4 (0100b) -> positions {0,1}; 0xE (1110b) -> positions {2,3}.
1436+
//
1437+
// For 16-bit source data (f16/bf16): vector<4xi8>, 2 groups per i8.
1438+
// For 8-bit source data (i8): vector<2xi16>, 4 groups per i16.
1439+
//
1440+
// Only the first element carries active selector bits; remaining
1441+
// elements are padding zeros.
1442+
static Value createConstSparseIndex(OpBuilder &builder, Location loc,
1443+
VectorType sparseIndexVectorType,
1444+
int64_t selectorBits) {
1445+
Type elemTy = sparseIndexVectorType.getElementType();
1446+
Value zero = arith::ConstantOp::create(
1447+
builder, loc, builder.getZeroAttr(sparseIndexVectorType));
1448+
Value selector = arith::ConstantOp::create(
1449+
builder, loc, builder.getIntegerAttr(elemTy, selectorBits));
1450+
return vector::InsertOp::create(builder, loc, selector, zero, 0);
1451+
}
1452+
1453+
// Returns the K unroll factor: virtual_K / native_K.
13821454
int64_t VirtualMMAAttr::getIntrinsicsK() const {
13831455
switch (getIntrinsic()) {
13841456
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F16:
1385-
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16: {
1457+
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
1458+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16: {
13861459
return 2;
13871460
}
13881461
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F8E4M3FNUZ:
@@ -1394,6 +1467,125 @@ int64_t VirtualMMAAttr::getIntrinsicsK() const {
13941467
return 0;
13951468
}
13961469

1470+
// Expand collapsed ACC [c0, c1] -> [c0, 0, c1, 0].
1471+
static Value expandAccumulator(OpBuilder &builder, Location loc, Value acc) {
1472+
auto accType = cast<VectorType>(acc.getType());
1473+
Value zero =
1474+
arith::ConstantOp::create(builder, loc, builder.getZeroAttr(accType));
1475+
1476+
return vector::ShuffleOp::create(builder, loc, acc, zero,
1477+
ArrayRef<int64_t>{0, 2, 1, 3});
1478+
}
1479+
1480+
// Collapse expanded ACC [d0, d1, d2, d3] -> [d0+d1, d2+d3].
1481+
static Value collapseAccumulator(OpBuilder &builder, Location loc, Value acc) {
1482+
auto accType = cast<VectorType>(acc.getType());
1483+
Type elementType = accType.getElementType();
1484+
1485+
Value evens = vector::ShuffleOp::create(builder, loc, acc, acc,
1486+
ArrayRef<int64_t>{0, 2});
1487+
Value odds = vector::ShuffleOp::create(builder, loc, acc, acc,
1488+
ArrayRef<int64_t>{1, 3});
1489+
1490+
if (isa<FloatType>(elementType)) {
1491+
return arith::AddFOp::create(builder, loc, evens, odds);
1492+
}
1493+
return arith::AddIOp::create(builder, loc, evens, odds);
1494+
}
1495+
1496+
// Struct with consolidated info necessary for sparse trick invocation as a
1497+
// VDMFMA.
1498+
struct VDMFMAConfig {
1499+
int64_t m, n, nativeK;
1500+
int64_t unrollFactor;
1501+
VectorType sparseIndexVectorType;
1502+
int64_t evenSparseIndex;
1503+
int64_t oddSparseIndex;
1504+
int64_t aSliceWidth; // Elements per A slice per SMFMAC call.
1505+
SmallVector<SmallVector<int64_t, 8>, 2> bInterleaveIndices;
1506+
};
1507+
1508+
// Virtual Dense MFMA (VDMFMA) ops represent invocations of the sparse trick
1509+
// targeting skinny GEMMs (M=8).
1510+
//
1511+
// === The sparse trick ===
1512+
//
1513+
// Sparse MFMA (V_SMFMAC) instructions perform MMA on an imbalanced pair of
1514+
// operands: a 4:2 structured-sparse matrix A and a dense matrix B. The
1515+
// instruction also takes a sparsity index that encodes which 2 of every 4
1516+
// elements along K are non-zero within the sparse matrix A. The trick exploits
1517+
// this by pairing even/odd lanes to jointly describe a full dense row.
1518+
//
1519+
// The lane-pairing layout maps each of the 8 logical M-rows to a pair of
1520+
// adjacent physical rows (row 2i and 2i+1 for logical row i). Within each pair,
1521+
// the even lane supplies positions {0,1} from each K-group of 4 and the odd
1522+
// lane supplies positions {2,3}. The hardware interprets each physical row as
1523+
// having 2:4 structured sparsity and computes a partial dot product over only
1524+
// its non-zero elements. Summing the two physical rows' results reconstructs
1525+
// the full dense dot product for the logical row. This yields a semantic M=8
1526+
// matmul from a physical 16x16 instruction.
1527+
//
1528+
// Each lane loads unique A data via broadcastFactor distribution. Even lanes
1529+
// receive K[0:aSliceWidth*unrollFactor/2], odd lanes receive
1530+
// K[aSliceWidth*unrollFactor/2:aSliceWidth*unrollFactor]. A is sliced
1531+
// sequentially into per-SMFMAC chunks of aSliceWidth elements.
1532+
//
1533+
// === Accumulator expand/collapse ===
1534+
//
1535+
// Because the sparse trick maps two hardware rows to one logical row, adjacent
1536+
// register pairs in the output hold partial sums for the same dense row.
1537+
// Collapsing sums each pair (v0+v1, v2+v3) to produce the 2-element semantic
1538+
// result: one complete value per logical row.
1539+
//
1540+
// The layout and distribution infrastructure operate on the collapsed vector
1541+
// (e.g., vector<2xf32>). buildVDMFMAOps handles the translation: it expands
1542+
// a collapsed accumulator into the 4-element physical form before the smfmac
1543+
// chain, then collapses the result back afterward.
1544+
static LogicalResult buildVDMFMAOps(OpBuilder &builder, Location loc,
1545+
const VDMFMAConfig &config,
1546+
ValueRange inputs, Value acc,
1547+
SmallVectorImpl<Value> &results) {
1548+
Value smfmacAcc = expandAccumulator(builder, loc, acc);
1549+
VectorType expandedAccType = cast<VectorType>(smfmacAcc.getType());
1550+
1551+
Value isOddLane = createLaneParityPredicate(builder, loc);
1552+
1553+
Value sparseIndex = arith::SelectOp::create(
1554+
builder, loc, isOddLane,
1555+
createConstSparseIndex(builder, loc, config.sparseIndexVectorType,
1556+
config.oddSparseIndex),
1557+
createConstSparseIndex(builder, loc, config.sparseIndexVectorType,
1558+
config.evenSparseIndex));
1559+
1560+
Value lhs = inputs[0];
1561+
Value rhs = inputs[1];
1562+
1563+
assert(static_cast<int64_t>(config.bInterleaveIndices.size()) ==
1564+
config.unrollFactor &&
1565+
"must provide B interleave indices for each unroll iteration");
1566+
1567+
for (int64_t i = 0; i < config.unrollFactor; ++i) {
1568+
int64_t aOffset = config.aSliceWidth * i;
1569+
Value aSlice = vector::ExtractStridedSliceOp::create(
1570+
builder, loc, lhs, /*offsets=*/ArrayRef<int64_t>{aOffset},
1571+
/*sizes=*/ArrayRef<int64_t>{config.aSliceWidth},
1572+
/*strides=*/ArrayRef<int64_t>{1});
1573+
1574+
Value bSlice = vector::ShuffleOp::create(builder, loc, rhs, rhs,
1575+
config.bInterleaveIndices[i]);
1576+
1577+
smfmacAcc = amdgpu::SparseMFMAOp::create(
1578+
builder, loc, expandedAccType,
1579+
/*m=*/config.m, /*n=*/config.n, /*k=*/config.nativeK,
1580+
/*sourceA=*/aSlice, /*sourceB=*/bSlice, /*destC=*/smfmacAcc,
1581+
/*sparseIdx=*/sparseIndex, /*cbsz=*/0, /*abid=*/0);
1582+
}
1583+
1584+
Value result = collapseAccumulator(builder, loc, smfmacAcc);
1585+
results.push_back(result);
1586+
return success();
1587+
}
1588+
13971589
// Generates amdgpu.mfma/wmma operation on the given inputs for this attribute
13981590
// type.
13991591
LogicalResult VirtualMMAAttr::buildUnderlyingOperations(
@@ -1450,6 +1642,24 @@ LogicalResult VirtualMMAAttr::buildUnderlyingOperations(
14501642
results.push_back(acc);
14511643
return success();
14521644
}
1645+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16: {
1646+
if (getColMajor()) {
1647+
return failure();
1648+
}
1649+
VDMFMAConfig config{
1650+
/*m=*/16,
1651+
/*n=*/16,
1652+
/*nativeK=*/32,
1653+
/*unrollFactor=*/2,
1654+
/*sparseIndexVectorType=*/
1655+
VectorType::get({4}, builder.getIntegerType(8)),
1656+
/*evenSparseIndex=*/0x44,
1657+
/*oddSparseIndex=*/0xEE,
1658+
/*aSliceWidth=*/4,
1659+
/*bInterleaveIndices=*/
1660+
{{0, 1, 8, 9, 2, 3, 10, 11}, {4, 5, 12, 13, 6, 7, 14, 15}}};
1661+
return buildVDMFMAOps(builder, loc, config, inputs, outputs[0], results);
1662+
}
14531663
}
14541664
return failure();
14551665
}
@@ -1459,7 +1669,8 @@ int64_t VirtualMMAAttr::getBlockSize() const {
14591669
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F8E4M3FNUZ:
14601670
case VirtualMMAIntrinsic::VMFMA_F32_16x16x32_F16:
14611671
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F8E4M3FNUZ:
1462-
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16: {
1672+
case VirtualMMAIntrinsic::VMFMA_F32_32x32x16_F16:
1673+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16: {
14631674
return 1;
14641675
}
14651676
}
@@ -1525,6 +1736,18 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic,
15251736
return {/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
15261737
/*element=*/{4, 1}};
15271738
}
1739+
case VirtualMMAIntrinsic::VDMFMA_F32_8x16x64_F16:
1740+
switch (operandIndex) {
1741+
case kMMAOperandLhs:
1742+
return {/*outer=*/{1, 1}, /*thread=*/{8, 4}, /*tstrides=*/{2, 16},
1743+
/*element=*/{1, 16}};
1744+
case kMMAOperandRhs:
1745+
return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
1746+
/*element=*/{16, 1}};
1747+
case kMMAOperandAcc:
1748+
return {/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
1749+
/*element=*/{2, 1}};
1750+
}
15281751
}
15291752
assert(false && "unhandled virtual mma layout type.");
15301753
return {};

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,13 +373,16 @@ def VMFMA_F32_16x16x32_F16 : I32EnumAttrCase<"VMFMA_F32_16x16x32_F16", 0>;
373373
def VMFMA_F32_32x32x16_F16 : I32EnumAttrCase<"VMFMA_F32_32x32x16_F16", 1>;
374374
def VMFMA_F32_16x16x32_F8E4M3FNUZ : I32EnumAttrCase<"VMFMA_F32_16x16x32_F8E4M3FNUZ", 2>;
375375
def VMFMA_F32_32x32x16_F8E4M3FNUZ : I32EnumAttrCase<"VMFMA_F32_32x32x16_F8E4M3FNUZ", 3>;
376+
def VDMFMA_F32_8x16x64_F16 : I32EnumAttrCase<"VDMFMA_F32_8x16x64_F16", 4>;
377+
// TODO(#XXXX): Add I8 VDMFMA variant (VDMFMA_I32_8x16x128_I8).
376378

377379
def IREEGPU_VirtualMMAIntrinsic : IREEGPU_I32EnumAttr<"VirtualMMAIntrinsic",
378380
"Descriptor for different Virtual MMA intrinsics", [
379381
VMFMA_F32_16x16x32_F16,
380382
VMFMA_F32_32x32x16_F16,
381383
VMFMA_F32_16x16x32_F8E4M3FNUZ,
382384
VMFMA_F32_32x32x16_F8E4M3FNUZ,
385+
VDMFMA_F32_8x16x64_F16,
383386
]>;
384387

385388
// Enum for scaled mma intrinsic, loosely matching the MMAIntrinsic enum above

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,15 @@ module {
3636
// CHECK-LABEL: func @test_col_major_vmfma_f16_16x16x32_f32
3737
// CHECK-SAME: mma_types = #iree_gpu.virtual_mma_layout<VMFMA_F32_16x16x32_F16, col_major = true>
3838

39+
module {
40+
func.func @test_vdmfma_f16_8x16x64() attributes {
41+
mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x64_F16>} {
42+
return
43+
}
44+
}
45+
// CHECK-LABEL: func @test_vdmfma_f16_8x16x64
46+
// CHECK-SAME: mma_types = #iree_gpu.virtual_mma_layout<VDMFMA_F32_8x16x64_F16>
47+
3948
module {
4049
func.func @test_WMMAR3_f16_16x16x16_f32() attributes {
4150
mma_types = #iree_gpu.mma_layout<WMMAR3_F32_16x16x16_F16>} {

0 commit comments

Comments (0)