triton-lang
diff --git a/‎docs/python-api/triton.language.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/python-api/triton.language.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/triton/Analysis/Utility.h‎
Lines changed: 15 additions & 0 deletions b/‎include/triton/Analysis/Utility.h‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h‎
Lines changed: 4 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 4 additions & 0 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎include/triton/Dialect/Triton/IR/TritonOps.td‎
Lines changed: 50 additions & 0 deletions b/‎include/triton/Dialect/Triton/IR/TritonOps.td‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 5 additions & 1 deletion b/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎lib/Analysis/Allocation.cpp‎
Lines changed: 4 additions & 0 deletions b/‎lib/Analysis/Allocation.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎lib/Analysis/Utility.cpp‎
Lines changed: 63 additions & 0 deletions b/‎lib/Analysis/Utility.cpp‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
@@ -152,6 +152,7 @@ Scan/Sort Ops
     sort
     topk
     gather
+    scatter
 
 Atomic Ops
 ----------
 
@@ -146,6 +146,21 @@ class GatherLoweringHelper {
   RankedTensorType dstTy;
 };
 
+// Helper class for lowering `tt.scatter` operations. This class shares lowering
+// logic between shared memory allocation and LLVM codegen.
+//
+// The helper only stores the op handle it is constructed from; both queries
+// below are answered from that op.
+class ScatterLoweringHelper {
+public:
+  // Wrap `scatterOp` so that its scratch-size and locality queries can be made.
+  ScatterLoweringHelper(triton::ScatterOp scatterOp);
+
+  // Get the shared memory scratch size required by this op.
+  unsigned getScratchSizeInBytes();
+  // Determine if the scatter can be performed completely within a warp.
+  bool isWarpLocal();
+
+private:
+  // The scatter operation being analyzed.
+  triton::ScatterOp scatterOp;
+};
+
 // This struct represents the factorization of a warp-local layout conversion
 // into three components: a register-only permutation, a lane-only permutation,
 // and a set of swaps between lane and register basis vectors. Algebraically, it
 
@@ -76,6 +76,10 @@ void populateGatherOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                     RewritePatternSet &patterns,
                                     const TargetInfoBase &targetInfo,
                                     PatternBenefit benefit);
+// Scatter counterpart of populateGatherOpToLLVMPatterns, with the same
+// parameter list. Presumably registers the `tt.scatter` lowering patterns into
+// `patterns`; the definition is not part of this header -- TODO confirm.
+void populateScatterOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                     RewritePatternSet &patterns,
+                                     const TargetInfoBase &targetInfo,
+                                     PatternBenefit benefit);
 
 void populateConvertLayoutOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                            const TargetInfoBase &targetInfo,
 
@@ -14,6 +14,10 @@ class TargetInfoBase {
 
   virtual Value ballot(RewriterBase &rewriter, Location loc, Type type,
                        Value cmp) const = 0;
+  // Returns the subset of lanes in `activeMask` that have the same `value` as
+  // the current lane. This hook is pure virtual, so every backend must provide
+  // an implementation (using native match-any support where it is available).
+  // NOTE(review): `maskType` is presumably the type of the returned mask --
+  // confirm against the backend implementations.
+  virtual Value matchAny(RewriterBase &rewriter, Location loc, Type maskType,
+                         Value value, Value activeMask) const = 0;
 
   // Emit a block/CTA level barrier that guarantees visibility for the
   // target address space
 
@@ -960,6 +960,56 @@ def TT_GatherOp : TT_Op<"gather", [Pure,
   let hasVerifier = 1;
 }
 
+//
+// Scatter Op
+//
+// `tt.scatter`: produces `result` from `dst`, `indices` and `src` along `axis`
+// as specified in the description below. The op is marked Pure and declares
+// InferTypeOpInterface methods, whose implementation is not in this file.
+def TT_ScatterOp : TT_Op<"scatter", [Pure,
+    DeclareOpInterfaceMethods<InferTypeOpInterface>]> {
+  let summary = "local scatter operation";
+  let description = [{
+    Scatter elements from the source tensor into the destination tensor using
+    the indices tensor along a single specified axis. The source and indices
+    tensors must have the same shape. The output tensor has the same shape as
+    the destination tensor.
+
+    For each source position I, this writes:
+      out[I[0], ..., indices[I], ..., I[n]] = src[I]
+
+    If a reduction region is provided, then multiple source elements that map
+    to the same destination are combined using the region. When `include_self`
+    is true, the original destination value is included in the reduction for
+    indices that are written. When it is false, only source values are
+    combined, and destinations with no writes keep their original values.
+
+    The `efficient_layout` attribute is set when the compiler has determined an
+    optimized layout for the operation, indicating that it should not be
+    changed.
+  }];
+
+  // `include_self` defaults to true. `reduce_kind` is optional; the scratch
+  // size computation in ScatterLoweringHelper treats its presence the same as
+  // a non-empty combine region.
+  let arguments = (ins
+    TT_Tensor:$dst,
+    TT_IntTensor:$indices,
+    TT_Tensor:$src,
+    I32Attr:$axis,
+    DefaultValuedAttr<BoolAttr, "true">:$include_self,
+    OptionalAttr<StrAttr>:$reduce_kind,
+    UnitAttr:$efficient_layout
+  );
+  let results = (outs TT_Tensor:$result);
+  // The reduction region referred to in the description; it may be empty.
+  let regions = (region AnyRegion:$combineOp);
+
+  // The verifiers and the assembly parser/printer are hand-written in C++ and
+  // are not shown in this file.
+  let hasVerifier = 1;
+  let hasRegionVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
+// Terminator for the combine region of `tt.scatter`. `HasParent` restricts it
+// to appear directly inside a ScatterOp; it takes a variadic list of operands
+// of any type.
+def TT_ScatterReturnOp: TT_Op<"scatter.return",
+                              [HasParent<"ScatterOp">, Pure, Terminator, ReturnLike]> {
+    let summary = "terminator for scatter reduction operator";
+    let arguments = (ins Variadic<AnyType>:$result);
+    let assemblyFormat = "$result attr-dict `:` type($result)";
+}
+
 //
 // Print Op
 //
 
@@ -268,7 +268,7 @@ def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality",
 
   let description = [{
     The aim of this pass is to reduce cross-thread communication for certain
-    operations, like reductions, reshapes, and gathers.
+    operations, like reductions, reshapes, gathers, and scatters.
 
     For reduction operations, this pass attempts to adjust the reduction size
     (or layout) to avoid splitting the reduction operation between multiple
@@ -281,6 +281,10 @@ def TritonGPUOptimizeThreadLocality : Pass<"tritongpu-optimize-thread-locality",
     heuristics to determine when it is appropriate to assign specific layouts
     and trigger their respective codegen paths. For now, the pass only attempts
     to apply layouts that result in warp-synchronous gathers.
+
+    For scatters, this pass applies the same strategy and attempts to assign
+    layouts that keep source/index updates warp-synchronous with their
+    destination columns.
   }];
 
   let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
 
@@ -86,6 +86,10 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
     GatherLoweringHelper helper(gatherOp);
     return helper.getScratchSizeInBytes();
   }
+  if (auto scatterOp = dyn_cast<ScatterOp>(op)) {
+    ScatterLoweringHelper helper(scatterOp);
+    return helper.getScratchSizeInBytes();
+  }
   if (auto histogram = dyn_cast<HistogramOp>(op)) {
     auto dstTy = histogram.getType();
     int threadsPerWarp = gpu::TritonGPUDialect::getThreadsPerWarp(
 
@@ -885,6 +885,69 @@ bool GatherLoweringHelper::isWarpLocal() {
          idxLayout.sublayout(kLane, otherDims);
 }
 
+// Stores the op handle; no analysis is performed at construction time.
+ScatterLoweringHelper::ScatterLoweringHelper(triton::ScatterOp scatterOp)
+    : scatterOp(scatterOp) {}
+
+// Computes the shared-memory scratch size, in bytes, requested for this
+// scatter.
+//
+// The scatter writes into a temporary output tensor in shared memory before
+// materializing the final output registers, so the buffer holds one slot per
+// destination element. When the op combines colliding writes (a non-empty
+// combine region or a `reduce_kind` attribute), one extra i32 per element is
+// added.
+//
+// NOTE(review): isWarpLocal() is not consulted here, so scratch is requested
+// even when the scatter is warp-local -- confirm this is intended.
+unsigned ScatterLoweringHelper::getScratchSizeInBytes() {
+  RankedTensorType dstTy = scatterOp.getDst().getType();
+  auto numElems = product(dstTy.getShape());
+  unsigned bytesPerElem = ceil<unsigned>(dstTy.getElementTypeBitWidth(), 8);
+  unsigned scratchBytes = numElems * bytesPerElem;
+
+  bool combinesWrites =
+      !scatterOp.getCombineOp().empty() || scatterOp.getReduceKindAttr();
+  if (!combinesWrites)
+    return scratchBytes;
+
+  // Extra i32 per element for CAS-based locks/flags.
+  scratchBytes += numElems * sizeof(int32_t);
+  return scratchBytes;
+}
+
+// Reports whether the scatter can be performed completely within a warp, i.e.
+// all source/index writes for any destination column can be serviced by a
+// single warp.
+//
+// Three conditions on the linear layouts of dst, src and indices are checked:
+//   1. the scatter dimension does not depend on the warp or block index,
+//   2. the non-scatter dimensions ("columns") are distributed over warps and
+//      blocks identically in all three tensors,
+//   3. the columns are distributed over lanes identically in all three
+//      tensors (required to keep the codegen simple).
+bool ScatterLoweringHelper::isWarpLocal() {
+  RankedTensorType dstType = scatterOp.getDst().getType();
+  RankedTensorType srcType = scatterOp.getSrc().getType();
+  RankedTensorType idxType = scatterOp.getIndices().getType();
+  LinearLayout dstLayout = toLinearLayout(dstType);
+  LinearLayout srcLayout = toLinearLayout(srcType);
+  LinearLayout idxLayout = toLinearLayout(idxType);
+
+  Builder b(scatterOp.getContext());
+  StringAttr kBlock = b.getStringAttr("block");
+  StringAttr kWarp = b.getStringAttr("warp");
+  StringAttr kLane = b.getStringAttr("lane");
+
+  // Output dimensions of a linear layout are named "dim<N>"; build every such
+  // name the same way.
+  auto dimName = [&b](unsigned dim) {
+    return b.getStringAttr("dim" + Twine(dim));
+  };
+  const unsigned axis = scatterOp.getAxis();
+  StringAttr kScatterDim = dimName(axis);
+
+  // The scatter dimension must be invariant with respect to warp/block in all
+  // participating tensors.
+  for (LinearLayout *layout : {&dstLayout, &srcLayout, &idxLayout}) {
+    if (!layout->sublayoutIsZero({kBlock, kWarp}, kScatterDim))
+      return false;
+  }
+
+  // Collect every dimension except the scatter axis.
+  SmallVector<StringAttr> otherDims;
+  for (unsigned dim = 0, rank = dstType.getRank(); dim < rank; ++dim) {
+    if (dim != axis)
+      otherDims.push_back(dimName(dim));
+  }
+
+  // Source/index and destination columns must line up identically across
+  // warps. The destination sublayout is computed once and reused.
+  LinearLayout dstWarpCols = dstLayout.sublayout({kBlock, kWarp}, otherDims);
+  if (dstWarpCols != srcLayout.sublayout({kBlock, kWarp}, otherDims) ||
+      dstWarpCols != idxLayout.sublayout({kBlock, kWarp}, otherDims))
+    return false;
+
+  // Require lane ownership of columns to match for simpler codegen.
+  LinearLayout dstLaneCols = dstLayout.sublayout(kLane, otherDims);
+  return dstLaneCols == srcLayout.sublayout(kLane, otherDims) &&
+         dstLaneCols == idxLayout.sublayout(kLane, otherDims);
+}
+
 unsigned getNumScratchElements(ArrayRef<unsigned> shape) {
   if (shape.empty())
     return 0;
 
@@ -18,6 +18,7 @@ add_triton_library(TritonGPUToLLVM
     PrintOpToLLVM.cpp
     ReduceOpToLLVM.cpp
     ScanOpToLLVM.cpp
+    ScatterOpToLLVM.cpp
     SPMDOpToLLVM.cpp
     TypeConverter.cpp
     Utility.cpp
-Original file line number
+Diff line change
     sort
     topk
     gather
 +    scatter
 Atomic Ops
 ----------