Skip to content

Commit 84214b6

Browse files
authored
[Gluon][Dialect] Tighten verifiers, add more helpful error messages (triton-lang#8981)
`DotOpMMASmemLoader::build` is now fallible if it cannot find an SMEM atom to implement the layout. Since the logic is fairly complex, it perhaps doesn't make sense as a verifier. This is piped through the lowering of `WarpGroupDotOp`, `TMEMCopyOp`, and the MMAv5 ops, which are now fallible too. * Add verifiers to `tma` functions in Gluon, especially `async_gather` and `async_scatter`, which will happily trigger an illegal instruction if runtime invariants are not satisfied * Verify `NVMMASharedEncodingAttr` is valid and fix all the unit tests * Misc other cleanups Some of these should eventually get moved up to Python for better UX, but it's sufficient for the error messages to be more actionable.
1 parent f171598 commit 84214b6

File tree

29 files changed

+239
-135
lines changed

29 files changed

+239
-135
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,15 @@ bool isInnermostContiguous(MemDescType type, unsigned numElems);
294294
LinearLayout inferReshapeLinearLayout(TensorOrMemDesc srcTy,
295295
ArrayRef<int64_t> dstShape);
296296

297+
FailureOr<SmallVector<int64_t>>
298+
getTMABlockShape(ArrayRef<int64_t> shapePerCTA, int elementBitWidth,
299+
int swizzleBytes, bool fp4Padded, bool isTransposed,
300+
bool packedSize, function_ref<InFlightDiagnostic()> emitError);
301+
SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
302+
int elementBitWidth, int swizzleBytes,
303+
bool fp4Padded, bool isTransposed,
304+
bool packedSize);
305+
297306
// Verify the types of operations that operate on memory.
298307
LogicalResult verifyMemoryOpTypes(Operation *op, ShapedType srcTy,
299308
ShapedType dstTy);

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ def NVMMASharedEncodingAttr : TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_share
457457
int getVec() const;
458458
}];
459459
let hasCustomAssemblyFormat = 1;
460+
let genVerifyDecl = 1;
460461
}
461462

462463
def AMDRotatingSharedEncodingAttr :

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,13 @@ triton::gpu::SharedEncodingTrait
3131
getEncodingFromDescriptor(Operation *op, RankedTensorType tensorType,
3232
Value desc);
3333

34-
SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
35-
int elementBitWidth, int swizzleBytes,
36-
bool fp4Padded, bool transposed,
37-
bool packedSize);
38-
3934
inline SmallVector<int64_t> getTMABlockShape(Attribute encoding,
4035
ArrayRef<int64_t> shapePerCTA,
4136
bool packedSize) {
4237
auto mmaEnc = cast<gpu::NVMMASharedEncodingAttr>(encoding);
43-
return getTMABlockShape(shapePerCTA, mmaEnc.getElementBitWidth(),
44-
mmaEnc.getSwizzlingByteWidth(), mmaEnc.getFp4Padded(),
45-
mmaEnc.getTransposed(), packedSize);
38+
return triton::gpu::getTMABlockShape(
39+
shapePerCTA, mmaEnc.getElementBitWidth(), mmaEnc.getSwizzlingByteWidth(),
40+
mmaEnc.getFp4Padded(), mmaEnc.getTransposed(), packedSize);
4641
}
4742

4843
inline SmallVector<int64_t> getTMABlockShape(RankedTensorType ty,

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2131,6 +2131,16 @@ void NVMMASharedEncodingAttr::print(AsmPrinter &printer) const {
21312131
printer << "}>";
21322132
}
21332133

2134+
LogicalResult
2135+
NVMMASharedEncodingAttr::verify(function_ref<InFlightDiagnostic()> emitError,
2136+
unsigned swizzlingByteWidth, bool transposed,
2137+
unsigned elementBitWidth, bool fp4Padded,
2138+
CGAEncodingAttr CGALayout) {
2139+
if (elementBitWidth == 0)
2140+
return emitError() << "elementBitWidth must be non-zero";
2141+
return success();
2142+
}
2143+
21342144
int NVMMASharedEncodingAttr::getVec() const {
21352145
if (getSwizzlingByteWidth() == 0)
21362146
return 1;
@@ -2469,8 +2479,8 @@ CGAEncodingAttr DotOperandEncodingAttr::getCGALayout() const {
24692479
LinearLayout(std::move(bases), dims, true));
24702480
}
24712481
LogicalResult DotOperandEncodingAttr::verify(
2472-
::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
2473-
unsigned opIdx, Attribute parent, unsigned kWidth) {
2482+
function_ref<::mlir::InFlightDiagnostic()> emitError, unsigned opIdx,
2483+
Attribute parent, unsigned kWidth) {
24742484
if (opIdx != 0 && opIdx != 1) {
24752485
return emitError() << "ttg.dot_op opIdx parameter can be 0 or 1, got: "
24762486
<< opIdx;
@@ -3963,6 +3973,48 @@ LinearLayout triton::gpu::inferReshapeLinearLayout(TensorOrMemDesc srcTy,
39633973
return dst;
39643974
}
39653975

3976+
FailureOr<SmallVector<int64_t>> triton::gpu::getTMABlockShape(
3977+
ArrayRef<int64_t> shapePerCTA, int elementBitWidth, int swizzleBytes,
3978+
bool fp4Padded, bool isTransposed, bool packedSize,
3979+
function_ref<InFlightDiagnostic()> emitError) {
3980+
SmallVector<int64_t> blockShape(shapePerCTA);
3981+
int contigDim = isTransposed ? 0 : blockShape.size() - 1;
3982+
if (fp4Padded)
3983+
blockShape[contigDim] *= 2;
3984+
// All dimensions must be at most 256
3985+
constexpr int64_t dimMax = 256;
3986+
for (auto &size : blockShape)
3987+
size = std::min(size, dimMax);
3988+
// Last dim must equal the swizzle byte size
3989+
if (swizzleBytes != 0) {
3990+
auto contigDimSize = (8 * swizzleBytes) / elementBitWidth;
3991+
if (blockShape[contigDim] < contigDimSize) {
3992+
return emitError() << "block shape along the contiguous dimension "
3993+
<< contigDim
3994+
<< " is too small for the swizzle byte size "
3995+
<< swizzleBytes << " in an NVMMASharedLayout, got "
3996+
<< blockShape[contigDim] << " but expected at least "
3997+
<< contigDimSize;
3998+
}
3999+
blockShape[contigDim] = contigDimSize;
4000+
}
4001+
if (fp4Padded && packedSize) {
4002+
blockShape[contigDim] /= 2;
4003+
}
4004+
return blockShape;
4005+
}
4006+
SmallVector<int64_t> triton::gpu::getTMABlockShape(
4007+
ArrayRef<int64_t> shapePerCTA, int elementBitWidth, int swizzleBytes,
4008+
bool fp4Padded, bool isTransposed, bool packedSize) {
4009+
return *getTMABlockShape(
4010+
shapePerCTA, elementBitWidth, swizzleBytes, fp4Padded, isTransposed,
4011+
packedSize, []() -> InFlightDiagnostic {
4012+
llvm::report_fatal_error(
4013+
"Block shape is too small for the swizzle byte "
4014+
"size in NVMMA Shared Layout.");
4015+
});
4016+
}
4017+
39664018
SetVector<int> triton::gpu::getPartitionIds(Operation *op) {
39674019
auto attrs = op->getAttr(kPartitionAttrName);
39684020
SmallVector<int> partitionIds;

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
192192
}
193193
}
194194

195+
if (auto enc = dyn_cast<NVMMASharedEncodingAttr>(encoding)) {
196+
SmallVector<int64_t> shapePerCTA(getShapePerCTA(enc, allocShape));
197+
auto blockShape = ArrayRef(shapePerCTA).take_back(enc.getRank());
198+
if (failed(getTMABlockShape(blockShape, enc.getElementBitWidth(),
199+
enc.getSwizzlingByteWidth(), enc.getFp4Padded(),
200+
enc.getTransposed(), /*packedSize=*/false,
201+
emitError)))
202+
return failure();
203+
}
204+
195205
return success();
196206
}
197207

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -870,7 +870,7 @@ LogicalResult TMEMCopyOp::verify() {
870870
return emitOpError("Incorrect tmem layout.");
871871
}
872872
if (tmemEnc.getBlockM() != 128) {
873-
return emitOpError("Tmem layout ahouls have M=128.");
873+
return emitOpError("Tmem layout must have blockM=128.");
874874
}
875875
if (nvmmaEnc && nvmmaEnc.getSwizzlingByteWidth() == 0) {
876876
return emitOpError("Source layout should be swizzled.");

lib/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.cpp

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -116,35 +116,6 @@ ttg::SharedEncodingTrait getEncodingFromDescriptor(Operation *op,
116116
return updateEncodingForShape(op, sharedEnc, tensorType);
117117
}
118118

119-
SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
120-
int elementBitWidth, int swizzleBytes,
121-
bool fp4Padded, bool isTransposed,
122-
bool packedSize) {
123-
SmallVector<int64_t> blockShape(shapePerCTA);
124-
int contigDim = isTransposed ? 0 : blockShape.size() - 1;
125-
if (fp4Padded) {
126-
blockShape[contigDim] *= 2;
127-
}
128-
// All dimensions must be at most 256
129-
constexpr int64_t dimMax = 256;
130-
for (auto &size : blockShape) {
131-
size = std::min(size, dimMax);
132-
}
133-
// Last dim must equal the swizzle byte size
134-
if (swizzleBytes != 0) {
135-
auto contigDimSize = (8 * swizzleBytes) / elementBitWidth;
136-
if (blockShape[contigDim] < contigDimSize) {
137-
llvm::report_fatal_error("Block shape is too small for the swizzle byte "
138-
"size in NVMMA Shared Layout.");
139-
}
140-
blockShape[contigDim] = contigDimSize;
141-
}
142-
if (fp4Padded && packedSize) {
143-
blockShape[contigDim] /= 2;
144-
}
145-
return blockShape;
146-
}
147-
148119
std::optional<int> getTMASwizzleMode(Operation *op, TensorDescType ty) {
149120
auto encoding = ty.getBlockType().getEncoding();
150121
auto mmaEncoding = dyn_cast<ttg::NVMMASharedEncodingAttr>(encoding);

python/examples/gluon/01-attention-forward.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,7 @@ def _attn_fwd_correction_epilogue(config, prog, s_tmem, M, corr_consumer, epi_pr
778778
o_tmem, o_bar, o_consumer = o_consumer.acquire()
779779

780780
# Shared memory subtile size is limited by the swizzle byte size.
781-
contigDimSize: gl.constexpr = o_smem.type.layout.swizzle_byte_width * 8 / o_smem.type.element_ty.primitive_bitwidth
781+
contigDimSize: gl.constexpr = o_smem.type.layout.swizzle_byte_width * 8 // o_smem.type.element_ty.primitive_bitwidth
782782
if o_smem.type.shape[1] // config.SPLIT_D_FACTOR >= contigDimSize:
783783
SPLIT_N_FACTOR: gl.constexpr = config.SPLIT_D_FACTOR
784784
else:

python/triton/experimental/gluon/language/nvidia/blackwell/tma.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,25 @@
2020
]
2121

2222

23+
def _check_gather_scatter(tensor_desc, x_offsets, smem, op_name, smem_name):
24+
# Tensor descriptor must be 2D and layout must match the shared memory layout.
25+
assert len(
26+
tensor_desc.block_shape
27+
) == 2, f"async {op_name} requires a 2D tensor descriptor, but got one with rank {len(tensor_desc.block_shape)}"
28+
assert tensor_desc.layout == smem.layout, f"tensor descriptor layout {tensor_desc.layout} does not match {smem_name} shared memory layout {smem.layout}"
29+
# Row offsets must be 1D and have at least 8 rows.
30+
assert len(
31+
x_offsets.shape
32+
) == 1, f"async {op_name} requires a 1D tensor of row offsets, but got one with rank {len(x_offsets.shape)}"
33+
assert x_offsets.shape[0] >= 8, f"async {op_name} requires at least 8 rows, but got {x_offsets.shape[0]}"
34+
# Block shape must be [1, Y] where Y >= min_cols.
35+
min_cols = 32 // tensor_desc.dtype.primitive_bitwidth * 8
36+
assert tensor_desc.block_shape[
37+
0] == 1, f"async {op_name} requires the tensor descriptor's block shape to have 1 row, but got {tensor_desc.block_shape}"
38+
assert tensor_desc.block_shape[
39+
1] >= min_cols, f"async {op_name} requires the tensor descriptor's block shape to have at least {min_cols} columns, but got {tensor_desc.block_shape[1]}"
40+
41+
2342
@builtin
2443
def async_gather(tensor_desc, x_offsets, y_offset, barrier, result, pred=True, _semantic=None):
2544
"""
@@ -33,6 +52,7 @@ def async_gather(tensor_desc, x_offsets, y_offset, barrier, result, pred=True, _
3352
result (tensor_memory_descriptor): Result shared memory, must have NVMMASharedLayout.
3453
pred (bool): Scalar predicate. Operation is skipped if predicate is False. Defaults to True.
3554
"""
55+
_check_gather_scatter(tensor_desc, x_offsets, result, "gather", "result")
3656
pred = _semantic.to_tensor(pred)
3757
y_offset = _semantic.to_tensor(y_offset)
3858
_semantic.builder.create_async_tma_gather(tensor_desc.handle, x_offsets.handle, y_offset.handle, barrier.handle,
@@ -50,5 +70,6 @@ def async_scatter(tensor_desc, x_offsets, y_offset, src, _semantic=None):
5070
y_offset (int): Scalar Y offset.
5171
src (tensor_memory_descriptor): The source data, must be in NVMMASharedLayout.
5272
"""
73+
_check_gather_scatter(tensor_desc, x_offsets, src, "scatter", "source")
5374
y_offset = _semantic.to_tensor(y_offset)
5475
_semantic.builder.create_async_tma_scatter(tensor_desc.handle, x_offsets.handle, y_offset.handle, src.handle)

python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def layout(self):
8787

8888
@builtin
8989
def async_copy_global_to_shared(tensor_desc, coord, barrier, result, pred=True, _semantic=None):
90+
assert tensor_desc.layout == result.layout, f"tensor descriptor layout {tensor_desc.layout} does not match result shared memory layout {result.layout}"
9091
coord = _semantic._convert_to_ir_values(coord, require_i64=False)
9192
pred = _semantic.to_tensor(pred)
9293
_semantic.builder.create_async_tma_copy_global_to_local(tensor_desc.handle, coord, barrier.handle, result.handle,
@@ -95,6 +96,7 @@ def async_copy_global_to_shared(tensor_desc, coord, barrier, result, pred=True,
9596

9697
@builtin
9798
def async_copy_shared_to_global(tensor_desc, coord, src, _semantic=None):
99+
assert tensor_desc.layout == src.layout, f"tensor descriptor layout {tensor_desc.layout} does not match source shared memory layout {src.layout}"
98100
coord = _semantic._convert_to_ir_values(coord, require_i64=False)
99101
_semantic.builder.create_async_tma_copy_local_to_global(tensor_desc.handle, coord, src.handle)
100102

0 commit comments

Comments
 (0)