Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ LogicalResult verifyMMAv5Op(Operation *op);
} // namespace mlir::triton::nvidia_gpu::impl

#define GET_ATTRDEF_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/OpsEnums.h.inc"

#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.h.inc"

#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.h.inc"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define TRITONNVIDIAGPU_ATTRDEFS

include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/EnumAttr.td"
include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td"
include "triton/Dialect/Triton/IR/TritonInterfaces.td"

Expand Down Expand Up @@ -74,4 +75,25 @@ def TTG_TensorMemoryScalesEncodingAttr : AttrDef<TritonNvidiaGPU_Dialect, "Tenso
let assemblyFormat = "`<` struct(params) `>`";
}


// TMA tensor addressing mode, mirroring the PTX `.tiled` / `.im2col`
// tensor modes. The string forms ("tiled" / "im2col") are what appears in
// the op's assembly format, e.g. `tensorMode = im2col`.
def TTNG_TensorModeAttr : I32EnumAttr<
"TensorMode", "",
[
I32EnumAttrCase<"TILED", 0, "tiled">,
I32EnumAttrCase<"IM2COL", 1, "im2col">
]> {
// Generated C++ enum is placed in the TritonNvidiaGPU dialect namespace.
let cppNamespace = "::mlir::triton::nvidia_gpu";
let description = [{
Enum attribute for TMA tensor mode.

TILED: Tiled mode for regular tensor memory access.
IM2COL: Im2col mode for convolution-friendly tensor memory access.

See:
- https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-mode
- https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode
}];
}


#endif
39 changes: 35 additions & 4 deletions include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#define TRITONNVIDIAGPU_OPS

include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUDialect.td"
include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.td"
include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td"
include "mlir/Dialect/Arith/IR/ArithBase.td"
include "triton/Dialect/Triton/IR/TritonTypes.td"
Expand Down Expand Up @@ -289,7 +290,7 @@ def TTNG_AsyncCopyMbarrierArriveOp : TTNG_Op<"async_copy_mbarrier_arrive"> {
}


def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local"> {
def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local", [AttrSizedOperandSegments]> {
let summary = "copy data based on descriptor from global memory to local memory asynchronously";

let description = [{
Expand All @@ -298,24 +299,54 @@ def TTNG_AsyncTMACopyGlobalToLocalOp : TTNG_Op<"async_tma_copy_global_to_local">
local memory pointed by the memory descriptor instead of a distributed
tensor. The data copied depends on the global memory descriptor pointed to
by `desc`.

The operation supports two tensor modes:
- TILED (default): Regular tiled tensor memory access
- See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-mode
- IM2COL: Im2col mode for convolution-friendly access patterns
- In IM2COL mode, 'coord' is the coordinates in the input tensor
- For example, for a 4D tensor (NHWC), 'coord' is [batch_idx, channel_idx, h, w]
- In IM2COL mode, additional `offsets` must be provided (uint16 values)
- For 3D tensors (NWC): 1 offset (offset_w)
- For 4D tensors (NHWC): 2 offsets (offset_w, offset_h)
- For 5D tensors (NDHWC): 3 offsets (offset_w, offset_h, offset_d)
- General rule: number of offsets = coord.size() - 2
- See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode

}];

let hasVerifier = 1;
let arguments = (ins
Arg<TT_TensorDescType, "", [MemRead<GlobalMemory>]>:$desc,
Variadic<I32>:$coord,
Variadic<I16>:$offsets,
Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$barrier,
Arg<TTG_MemDescType, "", [MemWrite<SharedMemory>]>:$result,
I1:$pred,
UnitAttr:$multicast,
DefaultValuedAttr<TT_CacheModifierAttr, "triton::CacheModifier::NONE">:$cache,
DefaultValuedAttr<TT_EvictionPolicyAttr, "triton::EvictionPolicy::NORMAL">:$evict,
DefaultValuedAttr<BoolAttr, "false">:$isVolatile
DefaultValuedAttr<BoolAttr, "false">:$isVolatile,
DefaultValuedAttr<TTNG_TensorModeAttr, "triton::nvidia_gpu::TensorMode::TILED">:$tensorMode
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How come this doesn't impact the lowering to llvm?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, the lowering defaults to tiled mode. Lowering for im2col mode will be added in a follow-up PR.

);

let builders = [
// Builder for TILED mode (no offsets required, attributes default to standard values)
OpBuilder<(ins "Value":$desc, "ValueRange":$coord, "Value":$barrier,
"Value":$result, "Value":$pred,
CArg<"bool", "false">:$multicast,
CArg<"triton::CacheModifier", "triton::CacheModifier::NONE">:$cache,
CArg<"triton::EvictionPolicy", "triton::EvictionPolicy::NORMAL">:$evict,
CArg<"bool", "false">:$isVolatile,
CArg<"triton::nvidia_gpu::TensorMode", "triton::nvidia_gpu::TensorMode::TILED">:$tensorMode), [{
build($_builder, $_state, desc, coord, /*offsets=*/ValueRange{}, barrier,
result, pred, multicast, cache, evict, isVolatile, tensorMode);
}]>
];

let assemblyFormat = [{
$desc `[` $coord `]` $result `,` $barrier `,` $pred
oilist(`cacheModifier` `=` $cache | `evictionPolicy` `=` $evict)
$desc `[` $coord `]` (`offsets` `=` `[` $offsets^ `]`)? $result `,` $barrier `,` $pred
oilist(`cacheModifier` `=` $cache | `evictionPolicy` `=` $evict | `tensorMode` `=` $tensorMode)
attr-dict `:` qualified(type($desc)) `,` qualified(type($barrier)) `->` qualified(type($result))
}];
}
Expand Down
1 change: 1 addition & 0 deletions lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,7 @@ LogicalResult impl::verifyMMAv5Op(Operation *op) {
// Attribute methods
//===----------------------------------------------------------------------===//
#define GET_ATTRDEF_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/OpsEnums.cpp.inc"
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.cpp.inc"

//===----------------------------------------------------------------------===//
Expand Down
72 changes: 60 additions & 12 deletions lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,32 +277,78 @@ static LogicalResult verifyAsyncTMAStoreOp(Operation *op,
}

static LogicalResult verifyAsyncTMACoords(Operation *op, ValueRange coords,
TypedValue<TensorDescType> desc) {
unsigned rank = desc.getType().getBlockType().getRank();
if (coords.size() != rank) {
return op->emitOpError("expected ")
<< rank << " coordinates, but got " << coords.size();
TypedValue<TensorDescType> desc,
TensorMode tensorMode) {
unsigned blockRank = desc.getType().getBlockType().getRank();

if (tensorMode == TensorMode::IM2COL) {
// For IM2COL mode, coordinates are for the full tensor (3D-5D)
// not the 2D block shape
if (coords.size() < 3)
return op->emitOpError(
"IM2COL mode requires at least 3D coordinates, but got ")
<< coords.size() << "D";
if (coords.size() > 5)
return op->emitOpError(
"IM2COL mode supports at most 5D coordinates, but got ")
<< coords.size() << "D";
} else {
// For TILED mode, coordinates must match the block rank
if (coords.size() != blockRank) {
return op->emitOpError("expected ")
<< blockRank << " coordinates, but got " << coords.size();
}
if (coords.size() < 1 || coords.size() > 5)
return op->emitOpError("must have between 1 and 5 coordinates");
}
return success();
}

// Verifies consistency between a TMA op's tensor mode, its coordinate
// operands, and its im2col offset operands.
//
// TILED mode takes no offsets. IM2COL mode requires exactly
// coords.size() - 2 offsets (3D/NWC -> 1, 4D/NHWC -> 2, 5D/NDHWC -> 3):
// the batch and channel dimensions carry no offset.
static LogicalResult verifyTMAMode(Operation *op, TensorMode tensorMode,
                                   ValueRange coords, ValueRange offsets) {
  // Check the coordinate count first: the size_t subtraction below would
  // otherwise underflow for fewer than 2 coordinates and produce a garbled
  // diagnostic ("requires 18446744073709551614 offsets").
  if (coords.size() < 1 || coords.size() > 5)
    return op->emitOpError("must have between 1 and 5 coordinates");

  if (tensorMode != TensorMode::IM2COL) {
    // TILED mode addresses the tensor with coordinates only.
    if (!offsets.empty())
      return op->emitOpError("TILED mode does not support offsets");
    return success();
  }

  if (offsets.empty())
    return op->emitOpError("IM2COL mode requires offsets to be provided");

  // An N-D tensor needs N - 2 offsets (e.g. 4 coords -> 2 offsets,
  // 5 coords -> 3 offsets).
  size_t expectedOffsets = coords.size() - 2;
  if (offsets.size() != expectedOffsets) {
    return op->emitOpError("IM2COL mode with ")
           << coords.size() << "D coordinates requires " << expectedOffsets
           << " offsets, but got " << offsets.size();
  }
  return success();
}

// -- AsyncTMACopyGlobalToLocalOp --
LogicalResult AsyncTMACopyGlobalToLocalOp::verify() {
if (failed(verifyAsyncTMACoords(*this, getCoord(), getDesc())))
if (failed(
verifyAsyncTMACoords(*this, getCoord(), getDesc(), getTensorMode())))
return failure();
auto resultType = getResult().getType();
if (failed(
verifyDescriptorLoadStoreOp(*this, getDesc().getType(), resultType)))
return failure();
return verifyAsyncTMALoadOp(*this, getDesc(), getBarrier(),
getResult().getType());
if (failed(verifyAsyncTMALoadOp(*this, getDesc(), getBarrier(),
getResult().getType())))
return failure();
if (failed(verifyTMAMode(*this, getTensorMode(), getCoord(), getOffsets())))
return failure();
return success();
}

// -- AsyncTMACopyLocalToGlobalOp --
LogicalResult AsyncTMACopyLocalToGlobalOp::verify() {
if (failed(verifyAsyncTMACoords(*this, getCoord(), getDesc())))
// Store ops only support TILED mode
if (failed(verifyAsyncTMACoords(*this, getCoord(), getDesc(),
TensorMode::TILED)))
return failure();
MemDescType srcType = getSrc().getType();
if (failed(verifyDescriptorLoadStoreOp(*this, getDesc().getType(), srcType)))
Expand All @@ -312,7 +358,9 @@ LogicalResult AsyncTMACopyLocalToGlobalOp::verify() {

// -- AsyncTMAReduceOp --
LogicalResult AsyncTMAReduceOp::verify() {
if (failed(verifyAsyncTMACoords(*this, getCoord(), getDesc())))
// Reduce ops only support TILED mode
if (failed(verifyAsyncTMACoords(*this, getCoord(), getDesc(),
TensorMode::TILED)))
return failure();
MemDescType srcType = getSrc().getType();
if (failed(verifyDescriptorLoadStoreOp(*this, getDesc().getType(), srcType)))
Expand Down
73 changes: 73 additions & 0 deletions test/TritonNvidiaGPU/invalid.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,79 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
tt.return
}
}
// -----

// Encoding aliases for the test case below (#blocked is unused here and is
// dropped by the parser; presumably kept for symmetry with sibling tests).
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#nvmma_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
// Negative test: an im2col TMA load with 4D coordinates but no `offsets`
// clause must be rejected by the verifier.
tt.func public @tma_im2col_missing_offsets(%arg0: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0_i32 = arith.constant 0 : i32
%0 = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared2, #smem, mutable>
// expected-error @below {{IM2COL mode requires offsets to be provided}}
ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32, %c0_i32, %c0_i32] %0, %1, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared2, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}
}

// -----

// Encoding aliases for the test case below (#blocked is unused here and is
// dropped by the parser; presumably kept for symmetry with sibling tests).
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#nvmma_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
// Negative test: 4D im2col requires coord.size() - 2 = 2 offsets; supplying
// only one must be rejected.
tt.func public @tma_im2col_wrong_offset_count(%arg0: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0_i32 = arith.constant 0 : i32
%c1_i16 = arith.constant 1 : i16
%0 = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared2, #smem, mutable>
// expected-error @below {{IM2COL mode with 4D coordinates requires 2 offsets, but got 1}}
ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32, %c0_i32, %c0_i32] offsets = [%c1_i16] %0, %1, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared2, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}
}

// -----

// Encoding aliases for the test case below (#blocked is unused here and is
// dropped by the parser; presumably kept for symmetry with sibling tests).
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#nvmma_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
// Negative test: the default TILED mode takes no `offsets` clause; passing
// one must be rejected.
tt.func public @tma_tiled_with_offsets(%arg0: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0_i32 = arith.constant 0 : i32
%c1_i16 = arith.constant 1 : i16
%0 = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared2, #smem, mutable>
// expected-error @below {{TILED mode does not support offsets}}
ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32] offsets = [%c1_i16] %0, %1, %true : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared2, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}
}

// -----

// Encoding aliases for the test case below (#blocked is unused here and is
// dropped by the parser; presumably kept for symmetry with sibling tests).
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
#nvmma_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared2 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
// Negative test: im2col addressing needs at least a 3D (NWC) tensor, so 2D
// coordinates must be rejected.
tt.func public @tma_im2col_2d_invalid(%arg0: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0_i32 = arith.constant 0 : i32
%0 = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%1 = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared2, #smem, mutable>
// expected-error @below {{IM2COL mode requires at least 3D coordinates, but got 2D}}
ttng.async_tma_copy_global_to_local %arg0[%c0_i32, %c0_i32] %0, %1, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared2, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}
}

// -----

Expand Down
62 changes: 62 additions & 0 deletions test/TritonNvidiaGPU/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,65 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
tt.return
}
}

// Tests for TMA im2col (3D/4D/5D) and tiled mode
// Positive round-trip tests: each op below must parse, verify, and print back
// with its `offsets` / `tensorMode` clauses intact (or absent for TILED).
#nvmma_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
#shared3 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
// 3D (NWC) im2col load: 3 coords, 1 offset.
// CHECK-LABEL: @tma_load_im2col_3d
// CHECK: ttng.async_tma_copy_global_to_local {{.*}} offsets = [{{.*}}] {{.*}} tensorMode = im2col
tt.func public @tma_load_im2col_3d(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0 = arith.constant 0 : i32
%off = arith.constant 1 : i16
%buf = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%bar = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.async_tma_copy_global_to_local %desc[%c0, %c0, %c0] offsets = [%off] %buf, %bar, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared3, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}

// 4D (NHWC) im2col load: 4 coords, 2 offsets.
// CHECK-LABEL: @tma_load_im2col_4d
// CHECK: ttng.async_tma_copy_global_to_local {{.*}} offsets = [{{.*}}, {{.*}}] {{.*}} tensorMode = im2col
tt.func public @tma_load_im2col_4d(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0 = arith.constant 0 : i32
%off1 = arith.constant 1 : i16
%off2 = arith.constant 2 : i16
%buf = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%bar = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.async_tma_copy_global_to_local %desc[%c0, %c0, %c0, %c0] offsets = [%off1, %off2] %buf, %bar, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared3, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}

// 5D (NDHWC) im2col load: 5 coords, 3 offsets.
// CHECK-LABEL: @tma_load_im2col_5d
// CHECK: ttng.async_tma_copy_global_to_local {{.*}} offsets = [{{.*}}, {{.*}}, {{.*}}] {{.*}} tensorMode = im2col
tt.func public @tma_load_im2col_5d(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0 = arith.constant 0 : i32
%off1 = arith.constant 1 : i16
%off2 = arith.constant 2 : i16
%off3 = arith.constant 3 : i16
%buf = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%bar = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.async_tma_copy_global_to_local %desc[%c0, %c0, %c0, %c0, %c0] offsets = [%off1, %off2, %off3] %buf, %bar, %true tensorMode = im2col : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared3, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}

// Default TILED mode: no offsets clause and no tensorMode clause should be
// printed (TILED is the attribute's default value).
// CHECK-LABEL: @tma_load_tiled_mode
// CHECK: ttng.async_tma_copy_global_to_local {{.*}}[{{.*}}, {{.*}}] %{{.*}}, %{{.*}}, {{.*}} : !tt.tensordesc
// CHECK-NOT: offsets
// CHECK-NOT: tensorMode
tt.func public @tma_load_tiled_mode(%desc: !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>) {
%true = arith.constant true
%c0 = arith.constant 0 : i32
%buf = ttg.local_alloc : () -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
%bar = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared3, #smem, mutable>
ttng.async_tma_copy_global_to_local %desc[%c0, %c0] %buf, %bar, %true : !tt.tensordesc<tensor<64x128xf16, #nvmma_128>>, !ttg.memdesc<1xi64, #shared3, #smem, mutable> -> !ttg.memdesc<64x128xf16, #nvmma_128, #smem, mutable>
tt.return
}
}
Loading