
Commit 4712323

Merge branch 'main' into codex/minimal-blackwell-act-scale
2 parents: ebcd86a + 088fbe1

9 files changed

Lines changed: 370 additions & 52 deletions


.github/workflows/llvm-build.yml

Lines changed: 0 additions & 7 deletions
@@ -103,9 +103,6 @@ jobs:
           sudo apt-get autoremove -y
           sudo apt-get clean
           df -h
-          echo "Removing large directories"
-          # deleting 15GB
-          df -h
 
       - name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
         if: matrix.config.arch == 'x64' && (matrix.config.target-os == 'ubuntu' || matrix.config.target-os == 'macos')
@@ -125,7 +122,6 @@ jobs:
             -DLLVM_ENABLE_PROJECTS="mlir;lld"
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ENABLE_ZSTD=OFF
             llvm-project/llvm
 
@@ -150,7 +146,6 @@ jobs:
             -DLLVM_ENABLE_DIA_SDK=OFF
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ENABLE_ZSTD=OFF
             llvm-project/llvm
 
@@ -217,7 +212,6 @@ jobs:
             -DCMAKE_SYSROOT=$SYSROOT \
             -DLLVM_INCLUDE_TESTS=OFF \
             -DMLIR_INCLUDE_TESTS=OFF \
-            -DLLVM_ENABLE_TERMINFO=OFF \
             llvm-project/llvm
           ninja -C llvm-project/build install
           tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
@@ -243,7 +237,6 @@ jobs:
             -DLLVM_INSTALL_UTILS=ON
             -DLLVM_TARGETS_TO_BUILD="AArch64;NVPTX;AMDGPU"
             -DLLVM_USE_HOST_TOOLS=ON
-            -DLLVM_ENABLE_TERMINFO=OFF
             -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
             llvm-project/llvm
 

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ RUN cmake -GNinja -Bbuild \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
     -DLLVM_ENABLE_PROJECTS="mlir;lld" \
-    -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_INSTALL_UTILS=ON \
     -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
     -DLLVM_ENABLE_ZSTD=OFF \

lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp

Lines changed: 35 additions & 0 deletions
@@ -2,6 +2,7 @@
 #include "triton/Analysis/Allocation.h"
 #include "triton/Analysis/Membar.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
@@ -59,6 +60,20 @@ static bool isPreAllocAliasSliceFilter(const AllocationSlice &lhsSlice,
          allocation->isExplicitBuffer(bufferId);
 }
 
+static bool hasUnresolvedCrossClusterDependency(const BlockInfo &blockInfo) {
+  auto hasDistributedDependency = [](const BlockInfo::SliceMapT &slices,
+                                     bool isRead) {
+    for (const auto &sliceAndOps : slices)
+      for (Operation *depOp : sliceAndOps.second)
+        if (isDistributedMultiCTAOp(depOp, isRead))
+          return true;
+    return false;
+  };
+
+  return hasDistributedDependency(blockInfo.syncReadSlices, /*isRead=*/true) ||
+         hasDistributedDependency(blockInfo.syncWriteSlices, /*isRead=*/false);
+}
+
 class ClusterBarrierAnalysis : public MembarOrFenceAnalysis {
 public:
   ClusterBarrierAnalysis() = default;
@@ -87,6 +102,26 @@ void ClusterBarrierAnalysis::update(Operation *op, BlockInfo *blockInfo,
     return;
   }
 
+  // Any path from distributed shared memory use to kernel exit must include a
+  // cluster arrive/wait pair
+  if (op->hasTrait<OpTrait::ReturnLike>() &&
+      isa<FunctionOpInterface>(op->getParentOp())) {
+    // In `freeTMAlloc` we emit a cluster sync during lowering for 2CTA kernels,
+    // as we need to sync before the TMA deallocation
+    // Note that 2CTA kernels must have a tcgen05_mma instruction and thus must
+    // use TensorMemory
+    // According to NVIDIA this is enough, so we don't need an extra
+    // end-of-kernel barrier
+    auto funcOp = dyn_cast<FunctionOpInterface>(op->getParentOp());
+    if (isKernel(funcOp) && hasUnresolvedCrossClusterDependency(*blockInfo) &&
+        !getModuleTwoCTAs(funcOp)) {
+      builder->setInsertionPoint(op);
+      insertClusterBarrier(op, builder);
+      blockInfo->sync();
+    }
+    return;
+  }
+
   BlockInfo curBlockInfo;
   auto scratchBufferId = Allocation::InvalidBufferId;
   if (isa<triton::CallOp>(op)) {
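The control flow added here reduces to a single predicate. Here is a minimal Python restatement, purely illustrative: the attribute names are hypothetical stand-ins for the C++ helpers (`isKernel`, `getModuleTwoCTAs`, `hasUnresolvedCrossClusterDependency`) shown in the diff above.

```
def needs_end_of_kernel_cluster_barrier(op, block_info, module_two_ctas):
    # Only return-like ops directly inside a kernel function are considered.
    if not (op.is_return_like and op.parent_is_kernel_function):
        return False
    # 2CTA kernels already get a cluster sync in freeTMAlloc before the TMA
    # deallocation, so no extra end-of-kernel barrier is needed for them.
    if module_two_ctas:
        return False
    # Otherwise a barrier is required iff some pending shared-memory read or
    # write still depends on a distributed multi-CTA op.
    return block_info.has_unresolved_cross_cluster_dependency
```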

python/test/gluon/test_lowerings.py

Lines changed: 0 additions & 3 deletions
@@ -199,9 +199,6 @@ def test_reduce_funky_layout(src_layout, axis, device):
     # TODO: Remove this once AMD supports num_ctas > 1
     if num_ctas > 1 and not is_hopper_or_newer():
         pytest.skip("num_ctas > 1 requires NVIDIA SM90+ (Hopper)")
-    # PTXAS BUGGGG
-    if shape == (16, 8) and axis == 0:
-        pytest.skip("PTXAS BUGGGG")
 
     torch.manual_seed(0)
     x = torch.randn(shape, dtype=torch.float32, device=device)

python/tutorials/08-grouped-gemm.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def group_gemm_fn(group_A, group_B):
 
 @triton.autotune(
     tma_configs,
-    key=['group_a_ptrs', 'group_b_ptrs', 'gropup_c_ptrs', 'group_size'],
+    key=['group_a_ptrs', 'group_b_ptrs', 'group_c_ptrs', 'group_size'],
 )
 @triton.jit
 def grouped_matmul_tma_kernel(
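The one-character fix matters because `key` entries are matched against the kernel's parameter names: the misspelled 'gropup_c_ptrs' can never match `group_c_ptrs`, so, depending on the Triton version, it is either silently ignored (changes to that pointer array never retrigger tuning) or rejected at launch. A minimal sketch of the mechanism, using a hypothetical kernel that is not part of this tutorial:

```
import triton
import triton.language as tl

@triton.autotune(
    configs=[
        triton.Config({'BLOCK_SIZE': 64}, num_warps=4),
        triton.Config({'BLOCK_SIZE': 128}, num_warps=8),
    ],
    key=['n_elements'],  # must spell a real parameter name of the kernel
)
@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program copies one BLOCK_SIZE-wide chunk; re-tuning is triggered
    # whenever the value of `n_elements` changes between launches.
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(dst_ptr + offsets, tl.load(src_ptr + offsets, mask=mask), mask=mask)
```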

python/tutorials/gluon/02-layouts.py

Lines changed: 4 additions & 4 deletions
@@ -93,13 +93,13 @@
 more registers to each thread:
 
 ```
-[[B0, B1, B2, B3],
- [B4, B5, B6, B7]]
+[[ B0,  B1,  B2,  B3,  B4,  B5,  B6,  B7],
+ [ B8,  B9, B10, B11, B12, B13, B14, B15]]
 ```
 
 In each block, each thread owns 8 registers. Thus over the whole tensor, each
-thread owns `8 * 8 = 64` registers. Knowing how many registers a tensor uses is
-important for managing register pressure and budget in the kernel.
+thread owns `8 * 16 = 128` registers. Knowing how many registers a tensor uses
+is important for managing register pressure and budget in the kernel.
 
 Consider a smaller tensor, say `32x8xf32`. The number of tiles at each level of
 the block does not change, thus even though the tensor has only `32 * 8 = 256`
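The corrected arithmetic is easy to verify: the fixed diagram has 2 rows of 8 blocks, and the text states each thread owns 8 registers per block. A quick sanity check:

```
# Register count per thread implied by the corrected tutorial text:
# 2 x 8 = 16 blocks, with 8 registers per thread in each block.
rows, cols = 2, 8                # block grid from the corrected diagram
regs_per_thread_per_block = 8    # stated in the tutorial text
assert rows * cols == 16
assert regs_per_thread_per_block * rows * cols == 128
```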

test/TritonNvidiaGPU/membar-cluster.mlir

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,57 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
2525

2626
// -----
2727

28+
#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
29+
#slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
30+
31+
module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
32+
// If there is a cross-CTA read dependency at kernel exit, we must end with a cluster barrier.
33+
// CHECK-LABEL: @end_cluster_barrier_after_cross_reduce
34+
// CHECK: "tt.reduce"{{.*}}axis = 1
35+
// CHECK: ttng.cluster_arrive {relaxed = false}
36+
// CHECK-NEXT: ttng.cluster_wait
37+
// CHECK-NEXT: tt.return
38+
tt.func @end_cluster_barrier_after_cross_reduce(%arg0: tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1> {
39+
%red = "tt.reduce"(%arg0) ({
40+
^bb0(%lhs: f16, %rhs: f16):
41+
%add = arith.addf %lhs, %rhs : f16
42+
tt.reduce.return %add : f16
43+
}) {axis = 1 : i32} : (tensor<256x128xf16, #blocked>) -> tensor<256xf16, #slice1>
44+
tt.return %red : tensor<256xf16, #slice1>
45+
}
46+
}
47+
48+
// -----
49+
50+
#sharedA = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[1, 0]]}>
51+
#sharedB = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16, CGALayout = [[0, 1]]}>
52+
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1, CTASplitM = 2, twoCTAs = true>
53+
#smem = #ttg.shared_memory
54+
55+
module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 8 : i32, "ttng.two-ctas" = true, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
56+
// Negative test: in 2CTA kernels with non-zero tensor memory size, TMEM
57+
// teardown sync at kernel exit means we should not add an extra cluster barrier.
58+
// CHECK-LABEL: @no_end_cluster_barrier_for_mma_with_tmem_teardown
59+
// CHECK: ttng.tmem_alloc
60+
// CHECK: ttng.tc_gen5_mma
61+
// CHECK-NOT: ttng.cluster_arrive {relaxed = false}
62+
// CHECK-NOT: ttng.cluster_wait
63+
// CHECK: tt.return
64+
tt.func @no_end_cluster_barrier_for_mma_with_tmem_teardown() {
65+
%true = arith.constant true
66+
%a = ttg.local_alloc : () -> !ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>
67+
%b = ttg.local_alloc : () -> !ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>
68+
%acc = ttng.tmem_alloc : () -> !ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
69+
ttng.tc_gen5_mma %a, %b, %acc, %true, %true {two_ctas} :
70+
!ttg.memdesc<256x32xf16, #sharedA, #smem, mutable>,
71+
!ttg.memdesc<32x128xf16, #sharedB, #smem, mutable>,
72+
!ttg.memdesc<256x128xf32, #tmem, #ttng.tensor_memory, mutable>
73+
tt.return
74+
}
75+
}
76+
77+
// -----
78+
2879
#blocked = #ttg.blocked<{sizePerThread = [1, 32], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [0, 1], CGALayout = [[0, 1]]}>
2980
#slice0 = #ttg.slice<{dim = 0, parent = #blocked}>
3081
#slice1 = #ttg.slice<{dim = 1, parent = #blocked}>
@@ -388,16 +439,20 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 #smem = #ttg.shared_memory
 
 module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
-  // Wait included to model the end of the async lifetime. No extra cluster
-  // barriers should appear after the wait when reusing the same alloc.
+  // NB. Testing only. Note that in this program async_tma_copy_global
+  // and local_store are racing!
+  // Even though we have a wait_barrier, we should still emit a cluster
+  // barrier at the end of the kernel, since in that wait just one CTA
+  // waits on behalf of both CTAs. Otherwise CTA1 could exit the kernel
+  // before CTA0!
   // CHECK-LABEL: @no_cluster_when_same_allocation
   // CHECK: ttng.init_barrier
   // CHECK-NEXT: ttng.fence_mbarrier_init_release_cluster
   // CHECK-NEXT: ttng.cluster_arrive {relaxed = true}
   // CHECK-NEXT: ttng.cluster_wait
   // CHECK: ttng.wait_barrier
-  // CHECK-NOT: ttng.cluster_arrive
-  // CHECK-NOT: ttng.cluster_wait
+  // CHECK: ttng.cluster_arrive {relaxed = false}
+  // CHECK-NEXT: ttng.cluster_wait
   // CHECK: tt.return
   tt.func @no_cluster_when_same_allocation(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma>>) -> tensor<64x128xf16, #blocked> {
     %c0 = arith.constant 0 : i32
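The updated comment makes a subtle point: a `wait_barrier` on which only one CTA blocks is not a cluster-wide rendezvous, so the other CTA may exit first. A toy analogy in Python, with threads standing in for CTAs (purely illustrative; no Triton semantics are modeled):

```
import threading

mbarrier = threading.Event()

def cta0():
    mbarrier.wait()                        # one-sided wait: only CTA0 blocks
    print("cta0: consumed remote data, exiting")

def cta1():
    mbarrier.set()                           # CTA1 arrives...
    print("cta1: exited without waiting")    # ...but never waits for CTA0

threads = [threading.Thread(target=f) for f in (cta0, cta1)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```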
