[BACKEND] Re-order WS lowering and NVGPU lowering (#9535)

ThomasRaoux · web-flow · commit 434aecbe933a · 2026-02-21T07:54:53.000Z
Better layering, as NVGPU is meant to be at the same level of abstraction
as LLVM. This also avoids bugs when lowering the prologue/epilogue of the
kernel.
diff --git a/test/Conversion/nvgpu_to_llvm.mlir b/test/Conversion/nvgpu_to_llvm.mlir
@@ -119,68 +119,6 @@ llvm.func @tensor_memory_base_warpgroup() attributes {nvvm.kernel = 1 : ui1, nvv
 
 }
 
-// -----
-
-module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
-
-// CHECK-LABEL: @warpid_warp_specialize
-llvm.func @warpid_warp_specialize() {
-  // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32)
-  // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
-  // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
-  // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-  %0 = ttg.warp_id
-  // CHECK: "use"([[UNIFORM]])
-  "use"(%0) : (i32) -> ()
-
-  // CHECK: ttg.warp_specialize
-  ttg.warp_specialize() attributes {warpGroupStartIds = array<i32: 6, 4>}
-  // CHECK: default
-  default {
-    // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
-    // CHECK: [[ID:%.*]] = llvm.udiv [[TIDX]], [[C32]]
-    // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-    %1 = ttg.warp_id
-    // CHECK: "use"([[UNIFORM]])
-    "use"(%1) : (i32) -> ()
-    ttg.warp_yield
-  }
-  // CHECK: partition0
-  partition0() num_warps(4) {
-    // 6*32 = 196
-
-    // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32)
-    // CHECK: [[C192:%.*]] = llvm.mlir.constant(192 : i32)
-    // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
-    // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C192]]
-    // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
-    // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-    %1 = ttg.warp_id
-    // CHECK: "use"([[UNIFORM]])
-    "use"(%1) : (i32) -> ()
-    ttg.warp_return
-  }
-  partition1() num_warps(2) {
-    // 4*32 = 128
-
-    // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32)
-    // CHECK: [[C128:%.*]] = llvm.mlir.constant(128 : i32)
-    // CHECK: [[TIDX:%.*]] = nvvm.read.ptx.sreg.tid.x
-    // CHECK: [[REL_TIDX:%.*]] = llvm.sub [[TIDX]], [[C128]]
-    // CHECK: [[ID:%.*]] = llvm.udiv [[REL_TIDX]], [[C32]]
-    // CHECK: [[UNIFORM:%.*]] = nvvm.shfl.sync idx {{%[0-9]+}}, [[ID]]
-    %1 = ttg.warp_id
-    // CHECK: "use"([[UNIFORM]])
-    "use"(%1) : (i32) -> ()
-    ttg.warp_return
-  } : () -> ()
-  llvm.return
-}
-
-}
-
-// -----
-
 module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
 
 // CHECK-LABEL: @one_warp
diff --git a/test/Conversion/warp_specialize_to_llvm.mlir b/test/Conversion/warp_specialize_to_llvm.mlir
@@ -540,6 +540,51 @@ llvm.func @partition_warpid_order() attributes {allocation.offset = 32 : i32} {
 
 // -----
 
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 18 : i32} {
+
+llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
+
+// CHECK-LABEL: @warpid_warp_specialize
+llvm.func @warpid_warp_specialize() attributes {allocation.offset = 32 : i32} {
+  // CHECK-DAG: [[C4:%.*]] = llvm.mlir.constant(4 : i32)
+  // CHECK-DAG: [[C6:%.*]] = llvm.mlir.constant(6 : i32)
+
+  // Partition warp IDs are rewritten to be relative in this pass, while
+  // keeping ttg.warp_id for NVGPUToLLVM to lower later.
+  // CHECK: %{{.*}} = ttg.warp_id
+  // CHECK-NEXT: [[REL0:%.*]] = llvm.sub %{{.*}}, [[C6]] : i32
+  // CHECK-NEXT: "use"([[REL0]]) : (i32) -> ()
+
+  // CHECK: %{{.*}} = ttg.warp_id
+  // CHECK-NEXT: [[REL1:%.*]] = llvm.sub %{{.*}}, [[C4]] : i32
+  // CHECK-NEXT: "use"([[REL1]]) : (i32) -> ()
+
+  %0 = ttg.warp_id
+  "use"(%0) : (i32) -> ()
+
+  ttg.warp_specialize() attributes {allocation.offset = 0 : i32, warpGroupStartIds = array<i32: 6, 4>}
+  default {
+    %1 = ttg.warp_id
+    "use"(%1) : (i32) -> ()
+    ttg.warp_yield
+  }
+  partition0() num_warps(4) {
+    %1 = ttg.warp_id
+    "use"(%1) : (i32) -> ()
+    ttg.warp_return
+  }
+  partition1() num_warps(2) {
+    %1 = ttg.warp_id
+    "use"(%1) : (i32) -> ()
+    ttg.warp_return
+  } : () -> ()
+  llvm.return
+}
+
+}
+
+// -----
+
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.total-num-warps" = 12 : i32} {
 
 llvm.mlir.global external @global_smem() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -374,8 +374,8 @@ def make_llir(self, src, metadata, options, capability):
         nvidia.passes.ttgpuir.add_to_llvmir(pm, capability, ptx_version)
         passes.ttgpuir.add_canonicalize_llvm_ir(pm)
         passes.common.add_cse(pm)
-        nvidia.passes.ttnvgpuir.add_nvgpu_to_llvm(pm)
         nvidia.passes.ttnvgpuir.add_warp_specialize_to_llvm(pm)
+        nvidia.passes.ttnvgpuir.add_nvgpu_to_llvm(pm)
         passes.common.add_canonicalizer(pm)
         passes.common.add_cse(pm)
         passes.common.add_symbol_dce(pm)
diff --git a/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp b/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp
@@ -210,13 +210,7 @@ class WarpIdOpPattern : public OpRewritePattern<mlir::triton::gpu::WarpIdOp> {
       return success();
     }
 
-    // If this is inside a warp specialize op, compute the relative thread ID
-    // within the warp group.
     Value tid = NVVM::ThreadIdXOp::create(rewriter, loc, i32_ty);
-    if (std::optional<int> startId =
-            getWarpGroupStartThreadId(rewriter.getInsertionBlock()))
-      tid = LLVM::SubOp::create(rewriter, loc, tid, b.i32_val(*startId));
-
     Value warpId = b.udiv(tid, b.i32_val(32));
     if (!op.getOmitUniformHint()) {
       // This indicates to PTXAS that the result and its derived values are
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp
@@ -7,7 +7,6 @@
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Pass/PassManager.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h"
 #include "triton/Conversion/TritonGPUToLLVM/TypeConverter.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -83,6 +82,30 @@ class NVIDIAWarpSpecializeBarrierHelper : public WarpSpecializeBarrierHelper {
   unsigned numThreadsPerWarp;
 };
 
+static void rewriteWarpSpecializeWarpIdsOnce(ModuleOp mod) {
+  SmallVector<mlir::triton::gpu::WarpIdOp> wsWarpIds;
+  mod.walk([&](mlir::triton::gpu::WarpIdOp op) {
+    if (getWarpGroupStartWarpId(op->getBlock()))
+      wsWarpIds.push_back(op);
+  });
+
+  for (mlir::triton::gpu::WarpIdOp op : wsWarpIds) {
+    std::optional<int> startWarpId = getWarpGroupStartWarpId(op->getBlock());
+    assert(startWarpId &&
+           "expected warp-specialize warp_id to have a start warp ID");
+
+    auto loc = op.getLoc();
+    TritonLLVMIRRewriter b(loc, op);
+
+    // Keep `ttg.warp_id` for NVGPUToLLVM and only make it relative here.
+    Value absWarpId =
+        mlir::triton::gpu::WarpIdOp::create(b, loc, op.getOmitUniformHint());
+    Value relWarpId =
+        LLVM::SubOp::create(b, loc, absWarpId, b.i32_val(*startWarpId));
+    b.replaceOp(op, relWarpId);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // lowerWarpSpecialize
 //===----------------------------------------------------------------------===//
@@ -249,6 +272,8 @@ struct ConvertWarpSpecializeToLLVM
     if (failed(runPipeline(pm, mod)))
       return signalPassFailure();
 
+    rewriteWarpSpecializeWarpIdsOnce(mod);
+
     unsigned threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
     NVIDIAWarpSpecializeBarrierHelper barrierHelper(threadsPerWarp);
     if (failed(lowerWarpSpecializeBarriers(mod, barrierHelper)))