
Commit 4712323

Merge branch 'main' into codex/minimal-blackwell-act-scale
2 parents: ebcd86a + 088fbe1

9 files changed

Lines changed: 370 additions & 52 deletions


.github/workflows/llvm-build.yml

Lines changed: 0 additions & 7 deletions
@@ -103,9 +103,6 @@ jobs:
           sudo apt-get autoremove -y
           sudo apt-get clean
           df -h
-          echo "Removing large directories"
-          # deleting 15GB
-          df -h
 
       - name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
         if: matrix.config.arch == 'x64' && (matrix.config.target-os == 'ubuntu' || matrix.config.target-os == 'macos')
@@ -125,7 +122,6 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir;lld"
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ENABLE_ZSTD=OFF
             llvm-project/llvm
 
@@ -150,7 +146,6 @@ jobs:
             -DLLVM_ENABLE_DIA_SDK=OFF
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ENABLE_ZSTD=OFF
             llvm-project/llvm
 
@@ -217,7 +212,6 @@ jobs:
             -DCMAKE_SYSROOT=$SYSROOT \
             -DLLVM_INCLUDE_TESTS=OFF \
             -DMLIR_INCLUDE_TESTS=OFF \
-            -DLLVM_ENABLE_TERMINFO=OFF \
             llvm-project/llvm
           ninja -C llvm-project/build install
           tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
@@ -243,7 +237,6 @@ jobs:
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="AArch64;NVPTX;AMDGPU"
             -DLLVM_USE_HOST_TOOLS=ON
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
             llvm-project/llvm
 

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ RUN cmake -GNinja -Bbuild \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
     -DLLVM_ENABLE_PROJECTS="mlir;lld" \
-    -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_INSTALL_UTILS=ON \
     -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
     -DLLVM_ENABLE_ZSTD=OFF \

lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp

Lines changed: 35 additions & 0 deletions
@@ -2,6 +2,7 @@
 #include "triton/Analysis/Allocation.h"
 #include "triton/Analysis/Membar.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
@@ -59,6 +60,20 @@ static bool isPreAllocAliasSliceFilter(const AllocationSlice &lhsSlice,
          allocation->isExplicitBuffer(bufferId);
 }
 
+static bool hasUnresolvedCrossClusterDependency(const BlockInfo &blockInfo) {
+  auto hasDistributedDependency = [](const BlockInfo::SliceMapT &slices,
+                                     bool isRead) {
+    for (const auto &sliceAndOps : slices)
+      for (Operation *depOp : sliceAndOps.second)
+        if (isDistributedMultiCTAOp(depOp, isRead))
+          return true;
+    return false;
+  };
+
+  return hasDistributedDependency(blockInfo.syncReadSlices, /*isRead=*/true) ||
+         hasDistributedDependency(blockInfo.syncWriteSlices, /*isRead=*/false);
+}
+
 class ClusterBarrierAnalysis : public MembarOrFenceAnalysis {
 public:
   ClusterBarrierAnalysis() = default;
@@ -87,6 +102,26 @@ void ClusterBarrierAnalysis::update(Operation *op, BlockInfo *blockInfo,
     return;
   }
 
+  // Any path from distributed shared memory use to kernel exit must include a
+  // cluster arrive/wait pair
+  if (op->hasTrait<OpTrait::ReturnLike>() &&
+      isa<FunctionOpInterface>(op->getParentOp())) {
+    // In `freeTMAlloc` we emit a cluster sync during lowering for 2CTA kernels,
+    // as we need to sync before the TMA deallocation
+    // Note that 2CTA kernels must have a tcgen05_mma instruction and thus must
+    // use TensorMemory
+    // According to NVIDIA this is enough, so we don't need an extra
+    // end-of-kernel barrier
+    auto funcOp = dyn_cast<FunctionOpInterface>(op->getParentOp());
+    if (isKernel(funcOp) && hasUnresolvedCrossClusterDependency(*blockInfo) &&
+        !getModuleTwoCTAs(funcOp)) {
+      builder->setInsertionPoint(op);
+      insertClusterBarrier(op, builder);
+      blockInfo->sync();
+    }
+    return;
+  }
+
   BlockInfo curBlockInfo;
   auto scratchBufferId = Allocation::InvalidBufferId;
   if (isa<triton::CallOp>(op)) {
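The control flow added here reduces to a single predicate. Here is a minimal Python restatement, purely illustrative: the attribute names are hypothetical stand-ins for the C++ helpers (`isKernel`, `getModuleTwoCTAs`, `hasUnresolvedCrossClusterDependency`) shown in the diff above.

```
def needs_end_of_kernel_cluster_barrier(op, block_info, module_two_ctas):
    # Only return-like ops directly inside a kernel function are considered.
    if not (op.is_return_like and op.parent_is_kernel_function):
        return False
    # 2CTA kernels already get a cluster sync in freeTMAlloc before the TMA
    # deallocation, so no extra end-of-kernel barrier is needed for them.
    if module_two_ctas:
        return False
    # Otherwise a barrier is required iff some pending shared-memory read or
    # write still depends on a distributed multi-CTA op.
    return block_info.has_unresolved_cross_cluster_dependency
```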

python/test/gluon/test_lowerings.py

Lines changed: 0 additions & 3 deletions
@@ -199,9 +199,6 @@ def test_reduce_funky_layout(src_layout, axis, device):
     # TODO: Remove this once AMD supports num_ctas > 1
     if num_ctas > 1 and not is_hopper_or_newer():
         pytest.skip("num_ctas > 1 requires NVIDIA SM90+ (Hopper)")
-    # PTXAS BUGGGG
-    if shape == (16, 8) and axis == 0:
-        pytest.skip("PTXAS BUGGGG")
 
     torch.manual_seed(0)
     x = torch.randn(shape, dtype=torch.float32, device=device)

python/tutorials/08-grouped-gemm.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def group_gemm_fn(group_A, group_B):
 
 @triton.autotune(
     tma_configs,
-    key=['group_a_ptrs', 'group_b_ptrs', 'gropup_c_ptrs', 'group_size'],
+    key=['group_a_ptrs', 'group_b_ptrs', 'group_c_ptrs', 'group_size'],
 )
 @triton.jit
 def grouped_matmul_tma_kernel(
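The one-character fix matters because `key` entries are matched against the kernel's parameter names: the misspelled 'gropup_c_ptrs' can never match `group_c_ptrs`, so, depending on the Triton version, it is either silently ignored (changes to that pointer array never retrigger tuning) or rejected at launch. A minimal sketch of the mechanism, using a hypothetical kernel that is not part of this tutorial:

```
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE': 64}, num_warps=4),
        triton.Config({'BLOCK_SIZE': 128}, num_warps=8),
    ],
    key=['n_elements'],  # must spell a real parameter name of the kernel
)
@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program copies one BLOCK_SIZE-wide chunk; re-tuning is triggered
    # whenever the value of `n_elements` changes between launches.
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(dst_ptr + offsets, tl.load(src_ptr + offsets, mask=mask), mask=mask)
```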

python/tutorials/gluon/02-layouts.py

Lines changed: 4 additions & 4 deletions
@@ -93,13 +93,13 @@
 more registers to each thread:
 
 ```
-[[B0, B1, B2, B3],
- [B4, B5, B6, B7]]
+[[ B0,  B1,  B2,  B3,  B4,  B5,  B6,  B7],
+ [ B8,  B9, B10, B11, B12, B13, B14, B15]]
 ```
 
 In each block, each thread owns 8 registers. Thus over the whole tensor, each
-thread owns `8 * 8 = 64` registers. Knowing how many registers a tensor uses is
-important for managing register pressure and budget in the kernel.
+thread owns `8 * 16 = 128` registers. Knowing how many registers a tensor uses
+is important for managing register pressure and budget in the kernel.
 
 Consider a smaller tensor, say `32x8xf32`. The number of tiles at each level of
 the block does not change, thus even though the tensor has only `32 * 8 = 256`
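The corrected arithmetic is easy to verify: the fixed diagram has 2 rows of 8 blocks, and the text states each thread owns 8 registers per block. A quick sanity check:

```
# Register count per thread implied by the corrected tutorial text:
# 2 x 8 = 16 blocks, with 8 registers per thread in each block.
rows, cols = 2, 8                # block grid from the corrected diagram
regs_per_thread_per_block = 8    # stated in the tutorial text
assert rows * cols == 16
assert regs_per_thread_per_block * rows * cols == 128
```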

test/TritonNvidiaGPU/membar-cluster.mlir

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,57 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
2525

2626
// -----
2727

28+
#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
29+
#slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
30+
31+
module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
32+
// If there is a cross-CTA read dependency at kernel exit, we must end with a cluster barrier.
33+
// CHECK-LABEL: @end_cluster_barrier_after_cross_reduce
34+
// CHECK: "tt.reduce"{{.*}}axis = 1
35+
// CHECK: ttng.cluster_arrive {relaxed = false}
36+
// CHECK-NEXT: ttng.cluster_wait
37+
// CHECK-NEXT: tt.return
38+
tt.func @end_cluster_barrier_after_cross_reduce(%arg0: tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1> {
39+
%red = "tt.reduce"(%arg0) ({
40+
^bb0(%lhs: f16, %rhs: f16):
41+
%add = arith.addf %lhs, %rhs : f16
42+
tt.reduce.return %add : f16
43+
}) {axis = 1 : i32} : (tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1>
44+
tt.return %red : tensor<256xf16, #slice1>
45+
}
46+
}
47+
48+
// -----
49+
50+
#sharedA = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[1, 0]]}>
51+
#sharedB = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[0, 1]]}>
52+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1, CTASplitM = 2, twoCTAs = true>
53+
#smem = #ttg.shared_memory
54+
55+
module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32, "ttng.two-ctas" = true, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
56+
// Negative test: in 2CTA kernels with non-zero tensor memory size, TMEM
57+
// teardown sync at kernel exit means we should not add an extra cluster barrier.
58+
// CHECK-LABEL: @no_end_cluster_barrier_for_mma_with_tmem_teardown
59+
// CHECK: ttng.tmem_alloc
60+
// CHECK: ttng.tc_gen5_mma
61+
// CHECK-NOT: ttng.cluster_arrive {relaxed = false}
62+
// CHECK-NOT: ttng.cluster_wait
63+
// CHECK: tt.return
64+
tt.func @no_end_cluster_barrier_for_mma_with_tmem_teardown() {
65+
%true = arith.constant true
66+
%a = ttg.local_alloc : () -> !ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>
67+
%b = ttg.local_alloc : () -> !ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>
68+
%acc = ttng.tmem_alloc : () -> !ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
69+
ttng.tc_gen5_mma %a, %b, %acc, %true, %true {two_ctas} :
70+
!ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>,
71+
!ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>,
72+
!ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
73+
tt.return
74+
}
75+
}
76+
77+
// -----
78+
2879
#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
2980
#slice0 = #ttg.slice<{dim = 0, parent = #blocked}>
3081
#slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
@@ -388,16 +439,20 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 #smem = #ttg.shared_memory
 
 module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
-  // Wait included to model the end of the async lifetime. No extra cluster
-  // barriers should appear after the wait when reusing the same alloc.
+  // NB. Testing only. Note that in this program async_tma_copy_global
+  // and local_store are racing!
+  // Even though we have a wait_barrier, we should still emit a cluster
+  // barrier at the end of the kernel, since in that wait just one CTA
+  // waits on behalf of both CTAs. Otherwise CTA1 could exit the kernel
+  // before CTA0!
   // CHECK-LABEL: @no_cluster_when_same_allocation
   // CHECK: ttng.init_barrier
   // CHECK-NEXT: ttng.fence_mbarrier_init_release_cluster
   // CHECK-NEXT: ttng.cluster_arrive {relaxed = true}
   // CHECK-NEXT: ttng.cluster_wait
   // CHECK: ttng.wait_barrier
-  // CHECK-NOT: ttng.cluster_arrive
-  // CHECK-NOT: ttng.cluster_wait
+  // CHECK: ttng.cluster_arrive {relaxed = false}
+  // CHECK-NEXT: ttng.cluster_wait
   // CHECK: tt.return
   tt.func @no_cluster_when_same_allocation(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma>>) -> tensor<64x128xf16, #blocked> {
     %c0 = arith.constant 0 : i32
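The updated comment makes a subtle point: a `wait_barrier` on which only one CTA blocks is not a cluster-wide rendezvous, so the other CTA may exit first. A toy analogy in Python, with threads standing in for CTAs (purely illustrative; no Triton semantics are modeled):

```
import threading

mbarrier = threading.Event()

def cta0():
    mbarrier.wait()                        # one-sided wait: only CTA0 blocks
    print("cta0: consumed remote data, exiting")

def cta1():
    mbarrier.set()                           # CTA1 arrives...
    print("cta1: exited without waiting")    # ...but never waits for CTA0

threads = [threading.Thread(target=f) for f in (cta0, cta1)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```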
