Skip to content

Commit 97c02ff

Browse files
[Gluon] Add Cluster Launch Control (CLC) support for Blackwell GPUs (#9361)
This adds support for NVIDIA's Cluster Launch Control (CLC) feature on Blackwell (SM100+) GPUs, enabling dynamic work distribution for persistent kernels. CLC allows running workers to cancel not-yet-launched clusters and take over their work, improving load balancing when SM availability varies. New Gluon API (triton.experimental.gluon.language.nvidia.blackwell.clc): - try_cancel(result, mbar): Issue async CLC request to cancel a pending cluster - is_canceled(result): Check if cancellation succeeded (returns non-zero) - get_first_ctaid(result, dim): Get the canceled cluster's first CTA ID MLIR ops added: - ttng.clc_try_cancel: Lowers to clusterlaunchcontrol.try_cancel.async PTX - ttng.clc_is_canceled: Lowers to clusterlaunchcontrol.query_cancel.is_canceled - ttng.clc_get_first_ctaid: Lowers to clusterlaunchcontrol.query_cancel.get_first_ctaid All ops include SM100+ compute capability checks and emit errors on older GPUs. Tutorial included demonstrating CLC matmul achieving 92.5% of cuBLAS performance on 8192x8192x8192 FP16 matrices. <!--- The core Triton is a small number of people, and we receive many PRs (thank you!). To help us review your code more quickly, **if you are a new contributor (less than 3 PRs merged) we ask that you complete the following tasks and include the filled-out checklist in your PR description.** Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> # New contributor declaration - [x] I am not making a trivial change, such as fixing a typo in a comment. - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [x] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. 
- [ ] I have not added any `lit` tests. - [x] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) --------- Co-authored-by: Peter Bell <peterbell10@openai.com>
1 parent ed4ef36 commit 97c02ff

11 files changed

Lines changed: 885 additions & 1 deletion

File tree

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,99 @@ def TTNG_ClusterWaitOp : TTNG_Op<"cluster_wait", []> {
8787
let hasVerifier = 1;
8888
}
8989

90+
//
// Cluster Launch Control (CLC) Ops - Blackwell SM100+
//
def TTNG_CLCTryCancelOp : TTNG_Op<"clc_try_cancel", []> {
  let summary = "Issue CLC try_cancel to cancel a pending cluster";

  let description = [{
    Issues a clusterlaunchcontrol.try_cancel instruction to atomically cancel
    a pending cluster launch. The result is written asynchronously to the
    result buffer and the mbarrier is signaled on completion.

    This is used for dynamic persistent kernels on Blackwell (SM100+).

    The result buffer must be 16-byte aligned shared memory.
    The mbarrier must be 8-byte aligned shared memory.
  }];

  let arguments = (ins
    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$result,
    Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$mbarrier,
    I1Attr:$multicast
  );

  let assemblyFormat = [{
    $result `,` $mbarrier attr-dict `:` qualified(type($result)) `,` qualified(type($mbarrier))
  }];
  let hasVerifier = 1;
}

def TTNG_CLCLoadResultOp : TTNG_Op<"clc_load_result", []> {
  let summary = "Load CLC response from shared memory into registers";

  let description = [{
    Loads the 128-bit CLC response from shared memory into two i64 registers.
    This allows subsequent is_canceled and get_first_ctaid operations to
    operate on registers without re-reading shared memory.
  }];

  let arguments = (ins
    Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src
  );

  let results = (outs I128:$clcResult);

  let assemblyFormat = [{
    $src attr-dict `:` qualified(type($src)) `->` type($clcResult)
  }];
  let hasVerifier = 1;
}

def TTNG_CLCIsCanceledOp : TTNG_Op<"clc_is_canceled", [Pure]> {
  let summary = "Check if CLC response indicates successful cancellation";

  let description = [{
    Decodes the CLC response to check if a cluster was successfully
    canceled. Returns true if canceled, false otherwise.
  }];

  let arguments = (ins I128:$clcResult);

  let results = (outs I1:$is_canceled);

  let assemblyFormat = [{
    $clcResult attr-dict `:` type($clcResult) `->` type($is_canceled)
  }];
}

def TTNG_CLCGetProgramIdOp : TTNG_Op<"clc_get_program_id", [Pure]> {
  let summary = "Get CTA ID coordinate from CLC response";

  let description = [{
    Decodes the CLC response to get the first CTA ID coordinate of the
    canceled cluster. The dim attribute specifies which dimension (0=x, 1=y, 2=z).
  }];

  let arguments = (ins
    I128:$clcResult,
    TT_ProgramDim:$dim
  );

  let results = (outs I32:$result);

  let assemblyFormat = [{
    $clcResult `,` $dim attr-dict `:` type($clcResult) `->` type($result)
  }];

  let builders = [
    OpBuilder<(ins "Value":$clcResult, "int":$axis), [{
      build($_builder, $_state, clcResult, ProgramIDDim(axis));
    }]>
  ];
}
182+
90183
//
91184
// WarpGroupDot Op
92185
//

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include "triton/Dialect/TritonNvidiaGPU/IR/TensorMemoryUtils.h"
3535
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.cpp.inc"
3636
#include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
37+
#include "triton/Tools/StrUtil.h"
3738
#include "llvm/Support/Casting.h"
3839
#include "llvm/Support/ErrorHandling.h"
3940

@@ -1152,6 +1153,30 @@ LogicalResult TensormapCreateOp::verify() {
11521153
return success();
11531154
}
11541155

1156+
// -- CLCTryCancelOp --
1157+
static LogicalResult verifyCLCResultMemdesc(Location loc, MemDescType desc) {
1158+
auto int_ty = dyn_cast<IntegerType>(desc.getElementType());
1159+
if (!int_ty || int_ty.getWidth() != 64) {
1160+
return emitError(loc)
1161+
<< "Expected CLC result buffer to have type int64, but got"
1162+
<< desc.getElementType();
1163+
}
1164+
if (desc.getShape().size() != 1 || desc.getShape()[0] != 2) {
1165+
return emitError(loc)
1166+
<< "Expected CLC result buffer to have shape [2], but got ["
1167+
<< triton::join(desc.getShape(), ", ") << "]";
1168+
}
1169+
return success();
1170+
}
1171+
1172+
LogicalResult CLCTryCancelOp::verify() {
1173+
return verifyCLCResultMemdesc(getLoc(), getResult().getType());
1174+
}
1175+
1176+
LogicalResult CLCLoadResultOp::verify() {
1177+
return verifyCLCResultMemdesc(getLoc(), getSrc().getType());
1178+
}
1179+
11551180
} // namespace nvidia_gpu
11561181
} // namespace triton
11571182
} // namespace mlir

python/src/gluon_ir.cc

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,27 @@ void init_gluon_ir(py::module &&m) {
832832
})
833833
.def("create_cluster_wait",
834834
[](GluonOpBuilder &self) { self.create<ttng::ClusterWaitOp>(); })
835+
// CLC (Cluster Launch Control) ops - SM100+
836+
.def("create_clc_try_cancel",
837+
[](GluonOpBuilder &self, Value result, Value mbarrier,
838+
bool multicast) {
839+
self.create<ttng::CLCTryCancelOp>(result, mbarrier, multicast);
840+
})
841+
.def("create_clc_load_result",
842+
[](GluonOpBuilder &self, Value result) -> Value {
843+
auto i64Ty = self.getBuilder().getI64Type();
844+
return self.create<ttng::CLCLoadResultOp>(result);
845+
})
846+
.def("create_clc_is_canceled",
847+
[](GluonOpBuilder &self, Value clcResult) -> Value {
848+
auto i1Ty = self.getBuilder().getI1Type();
849+
return self.create<ttng::CLCIsCanceledOp>(clcResult);
850+
})
851+
.def("create_clc_get_program_id",
852+
[](GluonOpBuilder &self, Value clcResult, int dim) -> Value {
853+
auto i32Ty = self.getBuilder().getI32Type();
854+
return self.create<ttng::CLCGetProgramIdOp>(clcResult, dim);
855+
})
835856
.def("create_tcgen05_mma",
836857
[](GluonOpBuilder &self, Value a, Value b, Value acc, Value useAcc,
837858
Value pred, std::vector<Value> &mbarriers,

python/src/ir.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1008,6 +1008,10 @@ void init_triton_ir(py::module &&m) {
10081008
[](TritonOpBuilder &self) -> Type {
10091009
return self.getBuilder().getI64Type();
10101010
})
1011+
.def("get_int128_ty",
1012+
[](TritonOpBuilder &self) -> Type {
1013+
return self.getBuilder().getIntegerType(128);
1014+
})
10111015
.def("get_fp8e4nv_ty",
10121016
[](TritonOpBuilder &self) -> Type {
10131017
return self.getBuilder().getType<Float8E4M3FNType>();

python/test/gluon/test_core.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
tcgen05_commit,
4141
tcgen05_copy,
4242
float2,
43+
clc,
4344
)
4445
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
4546

@@ -3421,3 +3422,50 @@ def test_tmem_reduction(red_op, use_abs, propagate_nan, M, N, num_warps):
34213422
# Verify reduction output
34223423
# Use equal_nan=True when testing NaN propagation
34233424
torch.testing.assert_close(expected_red, red_output, atol=1e-5, rtol=1e-5, equal_nan=use_nan)
3425+
3426+
3427+
@pytest.mark.parametrize("num_ctas", [1, 2])
@pytest.mark.skipif(not is_blackwell(), reason="Requires Blackwell")
def test_clc_basic(num_ctas):
    """Launch an oversubscribed grid and check CLC cancellation accounting.

    Every program either actually launches (WasLaunched) or is canceled by a
    running worker (IsCancelled); the two sets must exactly partition the grid.
    """

    @gluon.jit
    def clc_kernel(WasLaunched, IsCancelled, ProgramId, smem_size: ttgl.constexpr):
        # Large shared memory allocation to force 1 block per SM
        cga_layout: ttgl.constexpr = [[0]] if ttgl.num_ctas() == 2 else []
        layout: ttgl.constexpr = ttgl.SwizzledSharedLayout(1, 1, 1, order=[0], cga_layout=cga_layout)
        dummy = ttgl.allocate_shared_memory(ttgl.int64, [smem_size // 8 - 32], layout)

        clc_result = ttgl.allocate_shared_memory(ttgl.int64, [2], layout)
        clc_mbar = mbarrier.allocate_mbarrier()
        mbarrier.init(clc_mbar, count=1)

        clc.try_cancel(clc_result, clc_mbar, multicast=True)
        # The CLC response is 16 bytes; wait for the async write to land.
        mbarrier.expect(clc_mbar, 16)
        mbarrier.wait(clc_mbar, 0)

        response = clc.load_result(clc_result)
        pid = ttgl.program_id(0)
        ttgl.store(WasLaunched + pid, True)
        ttgl.store(IsCancelled + pid, response.is_canceled())
        ttgl.store(ProgramId + pid, response.program_id(0))
        dummy._keep_alive()

    dev_props = torch.cuda.get_device_properties("cuda")
    num_sms = dev_props.multi_processor_count
    smem_size = dev_props.shared_memory_per_block_optin // num_ctas
    # 2x oversubscription so some clusters are guaranteed to be cancelable.
    grid = 2 * (num_sms // num_ctas)

    was_launched = torch.zeros([grid], dtype=torch.bool, device="cuda")
    is_cancelled = torch.zeros([grid], dtype=torch.bool, device="cuda")
    program_ids = torch.zeros([grid], dtype=torch.int32, device="cuda")
    clc_kernel[(grid, )](was_launched, is_cancelled, program_ids, smem_size, num_ctas=num_ctas)

    num_launched = torch.sum(was_launched).item()
    assert num_launched < grid

    num_cancelled = torch.sum(is_cancelled).item()
    assert num_launched + num_cancelled == grid

    # A canceled program's work must have been claimed by a worker that did
    # not itself launch under that program id.
    for pid in range(grid):
        if is_cancelled[pid]:
            assert not was_launched[program_ids[pid]]

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from triton.experimental.gluon.language._semantic import _check, _compute_tmem_reg_layout
99

1010
from . import tma
11+
from . import clc
1112
from ..hopper import fence_async_shared, mbarrier
1213
from ..ampere import async_copy, mma_v2
1314

@@ -20,6 +21,7 @@
2021
__all__ = [
2122
"allocate_tensor_memory",
2223
"async_copy",
24+
"clc",
2325
"fence_async_shared",
2426
"get_tmem_reg_layout",
2527
"mbarrier",
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""
2+
Cluster Launch Control (CLC) for Blackwell (SM100+) dynamic persistent kernels.
3+
4+
CLC enables hardware-based dynamic work scheduling where running workers can
5+
cancel not-yet-launched clusters and take over their work via the
6+
clusterlaunchcontrol.try_cancel instruction.
7+
"""
8+
from __future__ import annotations
9+
10+
import triton.experimental.gluon.language._core as gl
11+
from triton.experimental.gluon.language._core import builtin, tensor, shared_memory_descriptor, base_value, base_type
12+
from typing import TYPE_CHECKING, List, Tuple
13+
14+
if TYPE_CHECKING:
15+
from triton._C.libtriton.gluon_ir import GluonOpBuilder
16+
from triton._C.libtriton import ir
17+
18+
__all__ = [
19+
"try_cancel",
20+
"load_result",
21+
"clc_result",
22+
]
23+
24+
25+
@builtin
def try_cancel(result: shared_memory_descriptor, barrier, multicast=False, _semantic=None):
    """
    Issue a CLC try_cancel request to atomically cancel a pending cluster launch.

    The response is written asynchronously into ``result`` and the mbarrier is
    signaled on completion; callers must wait on the barrier before reading.

    Args:
        result (shared_memory_descriptor): 16-byte aligned shared memory for the response
        barrier (shared_memory_descriptor): 8-byte aligned mbarrier for completion signaling
        multicast (bool): If True, broadcast result to all CTAs in cluster

    Only supported on SM100+ (Blackwell).
    """
    _semantic.builder.create_clc_try_cancel(result.handle, barrier.handle, multicast)
38+
39+
40+
@builtin
def load_result(src, _semantic=None):
    """
    Load the CLC response from shared memory into registers.

    Args:
        src (shared_memory_descriptor): The CLC response buffer

    Returns:
        clc_result: Object with is_canceled() and program_id(dim) methods
    """
    handle = _semantic.builder.create_clc_load_result(src.handle)
    return clc_result(handle)
53+
54+
55+
class clc_result_type(base_type):
    """Frontend type for a register-resident CLC response (lowered to i128)."""

    def to_ir(self, builder: GluonOpBuilder) -> None:
        return builder.get_int128_ty()

    # Fixed annotation: this unflattens to a clc_result, not a
    # shared_memory_descriptor (copy-paste error in the original).
    def _unflatten_ir(self, handles: List[ir.Value], cursor: int) -> Tuple[clc_result, int]:
        value = clc_result(handles[cursor])
        return value, cursor + 1

    def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None:
        out.append(self.to_ir(builder))

    def __str__(self) -> str:
        return "clc_result"

    def __eq__(self, other) -> bool:
        return type(self) is type(other)

    def mangle(self) -> str:
        return "CLC"
75+
76+
77+
class clc_result(base_value):
    """CLC response loaded into registers. Query without re-reading memory."""

    def __init__(self, handle):
        self.handle = handle
        self.type = clc_result_type()

    # Annotation unified with clc_result_type._unflatten_ir (ir.Value, not
    # the lowercase ir.value used in the original).
    def _flatten_ir(self, handles: List[ir.Value]) -> None:
        handles.append(self.handle)

    def _set_name(self, builder: ir.builder, name: str) -> None:
        self.handle.set_loc(builder.create_name_loc(name, self.handle.get_loc()))

    @builtin
    def is_canceled(self, _semantic=None):
        """
        Check if the CLC response indicates a successful cancellation.

        Returns:
            tensor: True if a cluster was successfully canceled, False otherwise
        """
        handle = _semantic.builder.create_clc_is_canceled(self.handle)
        return tensor(handle, gl.int1)

    @builtin
    def program_id(self, dim, _semantic=None):
        """
        Get the Program ID of the canceled cluster.

        Args:
            dim (int): Dimension to get (0=x, 1=y, 2=z)

        Returns:
            tensor: The Program ID for the specified dimension
        """
        handle = _semantic.builder.create_clc_get_program_id(self.handle, dim)
        return tensor(handle, gl.int32)

python/tutorials/gluon/07-persistence.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,8 @@ def test_persistent_matmul_pipelined(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, num_buf
834834
# Hopper and Blackwell: we are not double-buffering the accumulator and
835835
# leaving 256 columns of TMEM unused.
836836
# - On Blackwell, we can use `clusterlaunchcontrol` to dynamically schedule
837-
# work in conjunction with the GPU, getting the best of both worlds.
837+
# work in conjunction with the GPU, getting the best of both worlds. This is
838+
# explored further in tutorial 12.
838839
#
839840
# Main takeaways:
840841
#

0 commit comments

Comments
 (0)