triton-lang · jeffniu-openai · Jun 11, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026
diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
@@ -252,6 +252,8 @@ bool supportWMMA(triton::DotOp op);
 
 bool supportMMA(triton::DotOp op, int version);
 
+bool supportMMA(triton::DotOpInterface op, int version);
+
 bool supportMMA(Value value, int version);
 
 // Conversion from `srcTy` to `dstTy` involving the minimum amount of data

@@ -605,16 +605,33 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
             const LinearLayout &layout, RankedTensorType type,
             bool withCTAOffset);
 
-// Compute per-element shared-memory pointers for a local atomic/ldst update by
+struct LocalSharedMemoryAddress {
+  Value ptr;                  // Per-element shared-memory pointer.
+  std::optional<Value> ctaId; // Target CTA; set only for multi-CTA layouts.
+};
+
+// Compute per-element shared-memory addresses for a local atomic/ldst update by
 // replacing `coords[*][axis]` with `idxValues[*]` and mapping the resulting
-// logical coordinates back to shared-memory offsets.
+// logical coordinates, shifted by `offsets`, to shared-memory offsets and CTAs.
+SmallVector<LocalSharedMemoryAddress>
+computeLocalAddrs(Location loc, triton::gpu::MemDescType memDescTy,
+                  SharedMemoryObject smemObj, Type llvmElemTy,
+                  ArrayRef<Value> idxValues,
+                  ArrayRef<SmallVector<Value>> coords, unsigned axis,
+                  RewriterBase &rewriter, ArrayRef<Value> offsets = {});
+
 SmallVector<Value> computeLocalPtrs(Location loc,
                                     triton::gpu::MemDescType memDescTy,
                                     SharedMemoryObject smemObj, Type llvmElemTy,
                                     ArrayRef<Value> idxValues,
                                     ArrayRef<SmallVector<Value>> coords,
                                     unsigned axis, RewriterBase &rewriter);
 
+SmallVector<Value> loadLocalAddrs(Location loc, Type llvmElemTy,
+                                  ArrayRef<LocalSharedMemoryAddress> addrs,
+                                  RewriterBase &rewriter,
+                                  const TargetInfoBase &targetInfo);
+
 // Backend-agnostic preparation for lowering LocalAtomicScatterRMWOp.
 struct LocalAtomicScatterRMWInfo {
   RankedTensorType valuesTy;

diff --git a/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td b/include/triton/Dialect/Triton/IR/TritonOpInterfaces.td
@@ -58,7 +58,17 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
       /*desc=*/"Verify the dimensions of the A and B DotOp operands.",
       /*retType=*/"bool",
       /*methodName=*/"verifyDims",
-      /*args=*/(ins)>,
+      /*args=*/(ins),
+      /*methodBody=*/[{}],
+      /*defaultImpl=*/ [{
+        // A's last dimension must equal B's second-to-last dimension.
+        // Too-low ranks fail the check rather than index out of range.
+        auto aShape = cast<ShapedType>($_op.getA().getType()).getShape();
+        auto bShape = cast<ShapedType>($_op.getB().getType()).getShape();
+        if (aShape.empty() || bShape.size() < 2)
+          return false;
+        return aShape.back() == bShape[bShape.size() - 2];
+      }]>,
   InterfaceMethod<
       /*desc=*/"Verify the dimensions of the DotOp output.",
       /*retType=*/"bool",

diff --git a/include/triton/Dialect/TritonGPU/IR/Dialect.h b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -278,6 +278,12 @@ SmallVector<int64_t> getAllocationShapePerCTA(Type type);
 
 unsigned getNumCTAs(Attribute layout);
 
+// Returns the MMAv2 warp distribution for a matrix tile. This does not apply
+// dot-chain policy and may oversubscribe tiles with fewer instruction
+// repetitions than warps.
+SmallVector<unsigned> getMmaV2WarpsPerCTA(ArrayRef<int64_t> shape,
+                                          int numWarps);
+
 // Return the order that represents that the batch is in row-major or
 // column-major order for a batch of matrices of shape [*, m, n] with
 // len(shape) == rank.

@@ -5,6 +5,7 @@ include "triton/Dialect/TritonInstrument/IR/TritonInstrumentDialect.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUTypes.td"
 include "triton/Dialect/Triton/IR/TritonTypes.td"
 include "triton/Dialect/Triton/IR/TritonInterfaces.td"
+include "triton/Dialect/Triton/IR/TritonOpInterfaces.td"
 include "triton/Dialect/Triton/IR/TritonAttrDefs.td"
 include "mlir/IR/OpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -14,6 +15,7 @@ include "triton/Dialect/TritonInstrument/IR/TritonInstrumentAttrDefs.td"
 // Interfaces
 //
 def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
+def SharedMemory : Resource<"::mlir::triton::gpu::SharedMemory">;
 
 //
 // Ops
@@ -77,6 +79,29 @@ def TTI_ExperimentalClusterCTAIdOp
   let assemblyFormat = "attr-dict `:` type($result)";
 }
 
+def TTI_ExperimentalLocalGatherOp
+    : TTI_Op<"experimental_local_gather"> {
+  let summary = "Gather elements from shared memory with logical base offsets";
+  let description = [{
+    Gather elements from a shared memory descriptor using an index tensor along
+    one axis, after shifting the logical source coordinates by rank-sized scalar
+    offsets. This is intentionally private to instrumentation passes.
+  }];
+  let arguments = (ins
+    Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src, // gather source
+    TT_IntTensor:$indices,  // index tensor applied along $axis
+    Variadic<I32>:$offsets, // one scalar shift per logical dimension
+    I32Attr:$axis           // dimension addressed through $indices
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `[` $indices `]` `offsets` `=` `[` $offsets `]`
+    attr-dict `:` qualified(type($src)) `,` type($indices) `->` type($result)
+  }];
+  let hasVerifier = 1;
+}
+
 def TTI_ExperimentalGSanInitOp
     : TTI_Op<"experimental_gsan_init"> {
   let summary = "Initialize GSan thread";
@@ -210,6 +235,32 @@ def TTI_ExperimentalLockReleaseOp : TTI_Op<"experimental_lock_release", [MemoryE
 // ===== FPSan ops =====
 
 
+def TTI_DotI8Op : TTI_Op<"dot_i8", [
+  Pure,
+  DeclareOpInterfaceMethods<DotOpInterface>,
+  TypesMatchWith<"result's type matches accumulator's type",
+                 "d", "c", "$_self">
+]> {
+  let summary = "non-saturating NVIDIA MMAv2 i8 dot";
+  let description = [{
+    Performs a wrapping i8 matrix multiplication into an i32 accumulator using
+    NVIDIA MMAv2. The A and B operands have independent signedness.
+  }];
+  let arguments = (ins
+    RankedTensorOf<[I8]>:$a,
+    RankedTensorOf<[I8]>:$b,
+    RankedTensorOf<[I32]>:$c, // accumulator; its type is tied to $d's type
+    BoolAttr:$aSigned,        // presumably: interpret $a as signed i8
+    BoolAttr:$bSigned         // presumably: interpret $b as signed i8
+  );
+  let results = (outs RankedTensorOf<[I32]>:$d);
+  let assemblyFormat = [{
+    $a `,` $b `,` $c `,` `aSigned` `=` $aSigned `,` `bSigned` `=` $bSigned
+    attr-dict `:` type($a) `*` type($b) `->` type($d)
+  }];
+  let hasVerifier = 1;
+}
+
 def TTI_ExperimentalFPSanEmbedOp : TTI_Op<"experimental_fpsan_embed", [
   Pure,
   Elementwise,

diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -1242,6 +1242,12 @@ bool supportMMA(triton::DotOp op, int version) {
   return supportMMA(op.getA(), version) && supportMMA(op.getB(), version);
 }
 
+bool supportMMA(triton::DotOpInterface op, int version) {
+  if (auto dotOp = dyn_cast<triton::DotOp>(op.getOperation()))
+    return supportMMA(dotOp, version); // Reuse the triton::DotOp overload.
+  return supportMMA(op.getA(), version) && supportMMA(op.getB(), version);
+}
+
 bool supportMMA(Value value, int version) {
   // Tell whether a DotOp support MMA by the operand type(either $a or $b).
   // We cannot get both the operand types(in TypeConverter), here we assume the

diff --git a/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp
@@ -24,20 +24,26 @@ lowerLocalScGt(Location loc, MLIRContext *ctx, MemDescType memDescTy,
                unsigned axis, ArrayRef<Value> storeVals, RewriterBase &rewriter,
                const TargetInfoBase &targetInfo) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  bool isScatter = !storeVals.empty();
-  SmallVector<Value> ptrs = computeLocalPtrs(
+  SmallVector<LocalSharedMemoryAddress> addrs = computeLocalAddrs(
       loc, memDescTy, smemObj, llvmElemTy, idxValues, coords, axis, rewriter);
+  if (storeVals.empty())
+    return loadLocalAddrs(loc, llvmElemTy, addrs, rewriter, targetInfo);
 
-  SmallVector<Value> results;
-  if (!isScatter)
-    results.resize(coords.size());
+  Value currentCtaId;
+  if (!addrs.empty() && addrs.front().ctaId)
+    currentCtaId = targetInfo.getClusterCTAId(rewriter, loc);
 
-  for (auto [i, ptr] : llvm::enumerate(ptrs)) {
-    if (isScatter) {
-      targetInfo.storeShared(rewriter, loc, ptr, storeVals[i], b.true_val());
+  SmallVector<Value> results;
+  for (auto [i, addr] : llvm::enumerate(addrs)) {
+    if (addr.ctaId) {
+      Value isLocal = b.icmp_eq(*addr.ctaId, currentCtaId);
+      Value isRemote = b.icmp_ne(*addr.ctaId, currentCtaId);
+      targetInfo.storeShared(rewriter, loc, addr.ptr, storeVals[i], isLocal);
+      targetInfo.storeDShared(rewriter, loc, addr.ptr, addr.ctaId, storeVals[i],
+                              isRemote);
     } else {
-      results[i] =
-          targetInfo.loadShared(rewriter, loc, ptr, llvmElemTy, b.true_val());
+      targetInfo.storeShared(rewriter, loc, addr.ptr, storeVals[i],
+                             b.true_val());
     }
   }
 
@@ -267,13 +273,6 @@ struct LocalGatherOpConversion : public ConvertOpToLLVMPattern<LocalGatherOp> {
     auto loc = op.getLoc();
     auto *ctx = op.getContext();
     auto memDescTy = cast<MemDescType>(op.getSrc().getType());
-    // TODO: PartitionedSharedEncoding lowering will be enabled in subsequent
-    // PRs.
-    if (isa<triton::gpu::PartitionedSharedEncodingAttr>(
-            memDescTy.getEncoding())) {
-      return rewriter.notifyMatchFailure(
-          op, "PartitionedSharedEncoding not yet supported in lowering");
-    }
     auto regTy = cast<RankedTensorType>(op.getType());
     auto typeConverter = getTypeConverter();
 
@@ -316,13 +315,6 @@ struct LocalScatterOpConversion
     auto loc = op.getLoc();
     auto *ctx = op.getContext();
     auto memDescTy = cast<MemDescType>(op.getDst().getType());
-    // TODO: PartitionedSharedEncoding lowering will be enabled in subsequent
-    // PRs.
-    if (isa<triton::gpu::PartitionedSharedEncodingAttr>(
-            memDescTy.getEncoding())) {
-      return rewriter.notifyMatchFailure(
-          op, "PartitionedSharedEncoding not yet supported in lowering");
-    }
     auto valuesTy = cast<RankedTensorType>(op.getValues().getType());
     auto typeConverter = getTypeConverter();
 

@@ -540,12 +540,12 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
   return emitIndices(loc, rewriter, target, ll, type, withCTAOffset);
 }
 
-SmallVector<Value> computeLocalPtrs(Location loc,
-                                    triton::gpu::MemDescType memDescTy,
-                                    SharedMemoryObject smemObj, Type llvmElemTy,
-                                    ArrayRef<Value> idxValues,
-                                    ArrayRef<SmallVector<Value>> coords,
-                                    unsigned axis, RewriterBase &rewriter) {
+SmallVector<LocalSharedMemoryAddress>
+computeLocalAddrs(Location loc, triton::gpu::MemDescType memDescTy,
+                  SharedMemoryObject smemObj, Type llvmElemTy,
+                  ArrayRef<Value> idxValues,
+                  ArrayRef<SmallVector<Value>> coords, unsigned axis,
+                  RewriterBase &rewriter, ArrayRef<Value> offsets) {
   MLIRContext *ctx = memDescTy.getContext();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
@@ -561,12 +561,15 @@ SmallVector<Value> computeLocalPtrs(Location loc,
     allDims.push_back(str_attr("dim" + Twine(dim)));
 
   auto kOffset = str_attr("offset");
+  auto kBlock = str_attr("block");
+  bool useBlockId = invSharedLayout.hasOutDim(kBlock) &&
+                    invSharedLayout.getOutDimSize(kBlock) > 1;
   // Get the subslice affine offset (non-zero for memdesc subslices)
   Value affineOffset = smemObj.getShmemOffset(loc, rewriter, memDescTy);
   auto bitwidth = getIntOrFloatOrPtrBitWidth(llvmElemTy);
 
-  SmallVector<Value> ptrs;
-  ptrs.reserve(coords.size());
+  SmallVector<LocalSharedMemoryAddress> addrs;
+  addrs.reserve(coords.size());
 
   for (auto [i, idxVal] : llvm::enumerate(idxValues)) {
     Value idx = idxVal;
@@ -578,9 +581,12 @@ SmallVector<Value> computeLocalPtrs(Location loc,
       idx = b.zext(i32_ty, idx);
     }
 
-    // Copy coordinates and replace the axis coordinate with the index value
+    // Copy coordinates, replace the axis coordinate with the index value, then
+    // shift coordinate d by optional offsets[d]; needs offsets.size() <= rank.
     SmallVector<Value> indices(coords[i]);
     indices[axis] = idx;
+    for (auto [dim, offset] : llvm::enumerate(offsets))
+      indices[dim] = b.add(indices[dim], offset);
 
     // Apply inverted shared layout to compute offset
     SmallVector<std::pair<StringAttr, Value>> inputs;
@@ -589,15 +595,18 @@ SmallVector<Value> computeLocalPtrs(Location loc,
 
     auto outputs = applyLinearLayout(loc, rewriter, invSharedLayout, inputs);
 
-    // Extract the offset value
+    // Extract the offset and target CTA.
     Value offset = nullptr;
+    Value blockId = nullptr;
     for (auto [name, value] : outputs) {
-      if (name == kOffset) {
+      if (name == kOffset)
         offset = value;
-        break;
-      }
+      else if (name == kBlock)
+        blockId = value;
     }
     assert(offset && "expected offset output from inverted shared layout");
+    assert((!useBlockId || blockId) &&
+           "expected block output from multi-CTA shared layout");
 
     // For subslices, the physical offset is computed as:
     //   physical_offset = L⁻¹(coords) ⊕ L⁻¹(subslice_logical_offset)
@@ -626,10 +635,47 @@ SmallVector<Value> computeLocalPtrs(Location loc,
       ptr = b.gep(smemObj.getBase().getType(), llvmElemTy, smemObj.getBase(),
                   offset);
     }
-    ptrs.push_back(ptr);
+    addrs.push_back(
+        {ptr, useBlockId ? std::optional<Value>(blockId) : std::nullopt});
   }
 
-  return ptrs;
+  return addrs;
+}
+
+SmallVector<Value> computeLocalPtrs(Location loc,
+                                    triton::gpu::MemDescType memDescTy,
+                                    SharedMemoryObject smemObj, Type llvmElemTy,
+                                    ArrayRef<Value> idxValues,
+                                    ArrayRef<SmallVector<Value>> coords,
+                                    unsigned axis, RewriterBase &rewriter) {
+  return llvm::map_to_vector( // Keep only ptr; ctaId is dropped.
+      computeLocalAddrs(loc, memDescTy, smemObj, llvmElemTy, idxValues, coords,
+                        axis, rewriter), // offsets omitted, so no shift
+      [](const LocalSharedMemoryAddress &addr) { return addr.ptr; });
+}
+
+SmallVector<Value> loadLocalAddrs(Location loc, Type llvmElemTy,
+                                  ArrayRef<LocalSharedMemoryAddress> addrs,
+                                  RewriterBase &rewriter,
+                                  const TargetInfoBase &targetInfo) {
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
+  Value currentCtaId; // Stays null when the addresses carry no CTA id.
+  if (!addrs.empty() && addrs.front().ctaId) // assumes every entry agrees
+    currentCtaId = targetInfo.getClusterCTAId(rewriter, loc);
+
+  return llvm::map_to_vector( // One loaded value per address, in order.
+      addrs, [&](const LocalSharedMemoryAddress &addr) -> Value {
+        if (!addr.ctaId) // No target CTA: load with an always-true predicate.
+          return targetInfo.loadShared(rewriter, loc, addr.ptr, llvmElemTy,
+                                       b.true_val());
+        Value isLocal = b.icmp_eq(*addr.ctaId, currentCtaId);
+        Value local = // Predicated on the target CTA being this CTA.
+            targetInfo.loadShared(rewriter, loc, addr.ptr, llvmElemTy, isLocal);
+        Value remote = targetInfo.loadDShared( // Predicated on the opposite.
+            rewriter, loc, addr.ptr, addr.ctaId, llvmElemTy,
+            b.icmp_ne(*addr.ctaId, currentCtaId));
+        return b.select(isLocal, local, remote); // Keep the matching load.
+      });
 }
 
 FailureOr<LocalAtomicScatterRMWInfo> prepareLocalAtomicScatterRMW(