Commit b07e76a
[Bench][Blackwell] Fix warp specialization for fp8 x mxfp4 bench (#6537)
This PR chain brings the performance of the mixed fp8 x mxfp4 MoE kernel on par with the fp8 x fp8 kernel:

* About 10% slower in the dense benchmarks
* About 10% faster in the llama4 benchmarks

It applies a bug fix for padded scale loads in fp8 x mxfp4 mode, ensuring TMA load requirements are met when the unpacked (padded) fp4 layout is used. The bug only manifests once warp specialization is enabled.
1 parent a0e3e78 commit b07e76a
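For context, here is a minimal Python sketch of the index translation the fix restores. The helper name `translate_tma_indices` and the doubling rule are assumptions that mirror the `ttng::translateTMAIndices` call added in the C++ change below, not the kernel's actual code: in the unpacked (padded) fp4 layout each fp4 value occupies a full byte rather than a nibble, so the innermost TMA coordinate has to be rescaled before the copy is issued.

```python
# Illustrative sketch only -- assumed semantics of the index translation,
# not Triton's actual implementation. In the unpacked (padded) fp4 layout
# each fp4 element takes a full byte instead of half a byte, so a tile's
# innermost coordinate must be doubled to satisfy the TMA descriptor.
def translate_tma_indices(indices: list[int], fp4_padded: bool) -> list[int]:
    """Adjust TMA block coordinates for an fp4-padded destination layout."""
    if fp4_padded:
        # Only the innermost (fastest-varying) coordinate is rescaled.
        indices = indices[:-1] + [indices[-1] * 2]
    return indices

# A tile starting at packed column 64 starts at padded column 128.
assert translate_tma_indices([0, 64], fp4_padded=True) == [0, 128]
assert translate_tma_indices([0, 64], fp4_padded=False) == [0, 64]
```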

2 files changed: 6 additions & 2 deletions


bench/triton_bench/matmul_ogs_details/_ptma_matmul_ogs.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -293,7 +293,7 @@ def _ptma_matmul_ogs(
     # Enable warp specialization when all loads are TMA loads. Don't enable it
     # for mixed-precision yet.
     ENABLE_WS: tl.constexpr = True
-    WARP_SPECIALIZE: tl.constexpr = ((USE_GATHER_TMA or X_USE_LOAD_TMA) and not is_microscaled_format) and ENABLE_WS
+    WARP_SPECIALIZE: tl.constexpr = (USE_GATHER_TMA or X_USE_LOAD_TMA) and ENABLE_WS
 
     for tile_id in tl.range(tl.program_id(0), num_tiles, NUM_SMS, flatten=True, disallow_acc_multi_buffer=DISALLOW_ACC_MULTI_BUFFER, warp_specialize=WARP_SPECIALIZE):
         expt_id, start_z, start_m, eM, off_m, off_n, pid_k = _load_tile_attrs(
```
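With the padded scale loads fixed (see the compiler change below), warp specialization no longer needs to exclude microscaled formats, so the guard reduces to the TMA-load condition alone.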

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/LoadMMASpecialization.cpp

Lines changed: 5 additions & 1 deletion
```diff
@@ -12,6 +12,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
 
 using namespace mlir;
 using namespace triton;
@@ -139,8 +140,11 @@ static void lowerTMACopy(ImplicitLocOpBuilder &b, Partition &partition,
   if (auto load = dyn_cast<DescriptorLoadOp>(op)) {
     Value tmaPtr = createInPartition<ttng::TensorDescToTMAPtrOp>(
         b, partition, load.getDesc());
+    auto indices = ttng::translateTMAIndices(
+        b, load.getLoc(), load.getDesc().getType().getBlockType().getEncoding(),
+        load.getIndices());
     createInPartition<ttng::AsyncTMACopyGlobalToLocalOp>(
-        b, partition, tmaPtr, load.getIndices(), barrier, view, truePred);
+        b, partition, tmaPtr, indices, barrier, view, truePred);
   } else {
     auto gather = cast<DescriptorGatherOp>(op);
     Value tmaPtr = createInPartition<ttng::TensorDescToTMAPtrOp>(
```
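Routing the coordinates through `ttng::translateTMAIndices` makes the warp-specialized TMA copy honor the fp4-padded shared-memory encoding, which is consistent with the bug only surfacing once warp specialization was enabled: the non-specialized lowering path already performed this translation.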
