
Commit 229f05a

[BACKEND] Implement support for cross-CTA tt.reduce
The title of this PR is a bit of a lie: even though the lowering is now implemented to support cross-CTA reductions, it depends on `convert_layout` supporting cross-CTA conversions, which it does not yet do for LinearLayouts. We should generalise `convert_layout` first and then enable cross-CTA reductions here. We should also emit the correct cross-CTA barrier from `targetInfo` in the case of cross-CTA memory reuse.

In this PR, we take the chance to also generalise the lowering to avoid convert_layouts whenever possible.

stack-info: PR: #9221, branch: lezcano/stack/8
1 parent 7687a5e commit 229f05a

7 files changed

Lines changed: 287 additions & 196 deletions

File tree

include/triton/Analysis/Allocation.h

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,10 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
 
+unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
+                                       const LinearLayout &dstLayout,
+                                       int bitwidth);
+
 unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                        RankedTensorType dstTy);
 
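Since `AllocationAnalysisScratchSizeFn` is a plain `std::function` callback, the default implementation declared above can be wrapped by downstream users. A minimal sketch, not part of this commit and assuming the declarations above live in `mlir::triton` as in the diff (`makeVerboseScratchSizeFn` is a hypothetical helper):

#include "triton/Analysis/Allocation.h"

using namespace mlir;

// Hypothetical wrapper around the default scratch-size callback: compute the
// scratch size as usual and emit a remark for ops that need shared memory.
triton::AllocationAnalysisScratchSizeFn makeVerboseScratchSizeFn() {
  return [](Operation *op) -> unsigned {
    unsigned bytes = triton::defaultAllocationAnalysisScratchSizeFn(op);
    if (bytes > 0)
      op->emitRemark() << "op requires " << bytes << " bytes of scratch memory";
    return bytes;
  };
}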

include/triton/Analysis/Utility.h

Lines changed: 3 additions & 6 deletions
@@ -61,8 +61,6 @@ class ReduceOpHelper {
 
   RankedTensorType getSrcTy() { return srcTy; }
 
-  bool isWarpSynchronous();
-
   unsigned getInterWarpSizeWithUniqueData();
 
   unsigned getIntraWarpSizeWithUniqueData();
@@ -71,6 +69,9 @@ class ReduceOpHelper {
 
   bool isAssociative();
 
+  // Get the shared memory scratch size required by this reduce op.
+  unsigned getScratchSizeInBytes();
+
   InThreadVectorizeOpKind getInThreadVectorizeOpKind(unsigned axisPack);
 
   static triton::ColumnAction
@@ -92,10 +93,6 @@ class ReduceOpHelper {
                                InThreadVectorizeOpKind kind,
                                Value lhs, Value rhs);
 
-  SmallVector<unsigned>
-  getScratchBytesForCvt(const triton::LinearLayout &srcLayout,
-                        const triton::LinearLayout &dstLayout);
-
 private:
   triton::ReduceOp op;
   RankedTensorType srcTy;

lib/Analysis/Allocation.cpp

Lines changed: 16 additions & 46 deletions
@@ -31,49 +31,27 @@ namespace mlir {
 //===----------------------------------------------------------------------===//
 namespace triton {
 
-unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
-                                       RankedTensorType dstTy) {
-  auto *ctx = srcTy.getContext();
-  auto srcLayout = gpu::toLinearLayout(srcTy);
-  auto dstLayout = gpu::toLinearLayout(dstTy);
-  srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
-  dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
-  auto bitwidth = getBitwidth(srcTy);
-  auto smem = gpu::optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
+unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
+                                       const LinearLayout &dstLayout,
+                                       int bitwidth) {
+  auto *ctx = srcLayout.getInDimNames().begin()->getContext();
+  auto srcLayoutNoBroadcast =
+      actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
+  auto dstLayoutNoBroadcast =
+      actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
+  auto smem = gpu::optimalSwizzlingLdSt(srcLayoutNoBroadcast,
+                                        dstLayoutNoBroadcast, bitwidth);
   auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
   return smem.getTotalOutDimSize() / reps;
 }
 
-namespace {
-constexpr int64_t kReduceScratchAlign = 16;
-
-Type getReduceMemElemTy(Type elemTy, MLIRContext *ctx) {
-  if (elemTy.isIntOrFloat() && elemTy.getIntOrFloatBitWidth() < 8)
-    return IntegerType::get(ctx, 8);
-  return elemTy;
-}
-
-int64_t getReduceScratchSizeBytes(triton::ReduceOp op,
-                                  ArrayRef<unsigned> bytesPerOperand) {
-  std::vector<unsigned> indices(op.getNumOperands());
-  std::iota(indices.begin(), indices.end(), 0);
-  auto *ctx = op.getContext();
-  std::sort(indices.begin(), indices.end(), [&](unsigned i, unsigned j) {
-    auto lhsTy = getReduceMemElemTy(op.getElementTypes()[i], ctx);
-    auto rhsTy = getReduceMemElemTy(op.getElementTypes()[j], ctx);
-    return getIntOrFloatOrPtrBitWidth(lhsTy) >
-           getIntOrFloatOrPtrBitWidth(rhsTy);
-  });
-  // Aling to 16 bytes to allow for vectorisation
-  int64_t offset = 0;
-  for (unsigned idx : indices) {
-    offset += llvm::alignTo(bytesPerOperand[idx], kReduceScratchAlign);
-  }
-  return offset;
+unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
+                                       RankedTensorType dstTy) {
+  return getNumScratchElemsSwizzledCvt(gpu::toLinearLayout(srcTy),
+                                       gpu::toLinearLayout(dstTy),
+                                       getBitwidth(srcTy));
 }
 
-} // namespace
-
 // Both `atomic_cas` and `atomic_rmw` may need scratch memory to store values
 // because Triton's block-based programming model ensures that
 // all threads sharing the same partition of the tensor see the same values,
@@ -100,15 +78,7 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 
 unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
   if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
-    ReduceOpHelper helper(reduceOp);
-    if (helper.isWarpSynchronous())
-      return 0;
-
-    auto regLl = ReduceOpHelper::reducedRegLaneLayout(helper.getSrcTy(),
-                                                      reduceOp.getAxis());
-    auto tmpLl = ReduceOpHelper::getInterLayout(regLl, reduceOp.getAxis());
-    auto bytesRegToTmp = helper.getScratchBytesForCvt(regLl, tmpLl);
-    return getReduceScratchSizeBytes(reduceOp, bytesRegToTmp);
+    return ReduceOpHelper(reduceOp).getScratchSizeInBytes();
   }
   if (auto scanOp = dyn_cast<ScanOp>(op)) {
     ScanLoweringHelper helper(scanOp);

lib/Analysis/Utility.cpp

Lines changed: 112 additions & 51 deletions
@@ -78,10 +78,6 @@ unsigned ReduceOpHelper::getIntraWarpSizeWithUniqueData() {
   return getThreadsPerWarp(srcEncoding, srcShape)[axis];
 }
 
-bool ReduceOpHelper::isWarpSynchronous() {
-  return getWarpsPerCTA(srcEncoding, srcShape)[axis] == 1;
-}
-
 bool ReduceOpHelper::isReduceWithinCTA() {
   // TODO: Support reduce across CTAS
   // Layout optimization passes such as PlanCTAPass and
@@ -109,6 +105,35 @@ bool ReduceOpHelper::isAssociative() {
   return !hasNoAssociativeOp;
 }
 
+unsigned ReduceOpHelper::getScratchSizeInBytes() {
+  auto kLane = StringAttr::get(op.getContext(), "lane");
+
+  auto isReduced = [axis = axis](const LinearLayout &layout) {
+    return layout.getOutDimSizes().begin()[axis] == 1;
+  };
+  auto regLl = reducedRegLaneLayout(srcTy, axis);
+
+  // All the inputs have the same layout so, since we order them from largest
+  // bitsize to smallest, and the first one is aligned, by induction, they are
+  // all aligned, so we don't need to align the byte numbers returned here.
+  unsigned bytesRegToTmp = 0;
+  while (!isReduced(regLl)) {
+    auto tmpLl = getInterLayout(regLl, axis);
+    // We take the maximum of the elements and multiply by the total bitwidth.
+    // We do this as otherwise it's quite tricky to find the correct
+    // BaseOffsets in the lowering.
+    int bytes = 0;
+    for (auto inputTy : op.getInputTypes()) {
+      auto nelem =
+          getNumScratchElemsSwizzledCvt(regLl, tmpLl, getBitwidth(inputTy));
+      bytes += nelem * (getBitwidth(inputTy) / 8);
+    }
+    bytesRegToTmp = std::max<unsigned>(bytesRegToTmp, bytes);
+    regLl = zeroBasesAlongDimAndReorder(tmpLl, axis, kLane);
+  }
+  return bytesRegToTmp;
+}
+
 ReduceOpHelper::InThreadVectorizeOpKind
 ReduceOpHelper::getInThreadVectorizeOpKind(unsigned axisPack) {
   Operation *reduceOperation = op.getOperation();
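To make the byte accounting in `getScratchSizeInBytes` concrete, here is a standalone worked example (not from the commit; the element counts are made up) of its sum-over-operands, max-over-rounds computation for a reduce with an f32 and an f16 operand:

#include <algorithm>
#include <cstdio>

int main() {
  // Assumed operand bitwidths (f32, f16) and, per reduction round, the number
  // of scratch elements that getNumScratchElemsSwizzledCvt would return.
  unsigned bitwidths[2] = {32, 16};
  unsigned nelemPerRound[2] = {128, 32}; // hypothetical values
  unsigned bytesRegToTmp = 0;
  for (unsigned round = 0; round < 2; ++round) {
    // Sum the bytes over all operands for this round...
    unsigned bytes = 0;
    for (unsigned bw : bitwidths)
      bytes += nelemPerRound[round] * (bw / 8);
    // ...and keep the maximum over rounds, as in getScratchSizeInBytes.
    bytesRegToTmp = std::max(bytesRegToTmp, bytes);
  }
  // Round 0: 128 * (4 + 2) = 768; round 1: 32 * 6 = 192; max = 768.
  std::printf("%u bytes\n", bytesRegToTmp);
}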
@@ -291,26 +316,90 @@ LinearLayout ReduceOpHelper::getInterLayout(const LinearLayout &layout,
   auto *ctx = layout.getOutDimNames().begin()->getContext();
   auto kLane = mlir::StringAttr::get(ctx, "lane");
   auto kWarp = mlir::StringAttr::get(ctx, "warp");
-  auto regBases = layout.getBases();
-  auto linearAttr = triton::gpu::LinearEncodingAttr::get(ctx, layout);
-  int laneBits = layout.getInDimSizeLog2(kLane);
-  int neededLaneBits = llvm::Log2_32(linearAttr.getWarpsPerCTA()[axis]);
-  // TODO move to verifier
-  assert(neededLaneBits <= laneBits && "NYI: more inter-warps than lanes");
-  // Move the warp axis bases we need to reduce into lane bases, while
-  // keeping non-axis components in their original in-dim.
-  auto &laneBases = regBases[kLane];
-  auto &warpBases = regBases[kWarp];
-  int moved = 0;
-  for (auto &warpBasis : warpBases) {
-    if (warpBasis[axis] == 0)
-      continue;
-    assert(moved < neededLaneBits && "unexpected warp axis bases count");
-    std::swap(laneBases[moved], warpBasis);
-    moved++;
+  auto kBlock = mlir::StringAttr::get(ctx, "block");
+  auto bases = layout.getBases();
+  auto &laneBases = bases[kLane];
+  auto &warpBases = bases[kWarp];
+  auto &blockBases = bases[kBlock];
+
+  auto collectAxisBases = [&](ArrayRef<std::vector<int32_t>> bases) {
+    SmallVector<unsigned> out;
+    for (unsigned i = 0; i < bases.size(); ++i) {
+      if (bases[i][axis] != 0)
+        out.push_back(i);
+    }
+    return out;
+  };
+
+  SmallVector<unsigned> warpAxisBases = collectAxisBases(warpBases);
+  SmallVector<unsigned> blockAxisBases = collectAxisBases(blockBases);
+
+  SmallVector<unsigned> zeroLaneBases;
+  for (unsigned i = 0; i < laneBases.size(); ++i) {
+    if (llvm::all_of(laneBases[i], [](int32_t v) { return v == 0; }))
+      zeroLaneBases.push_back(i);
   }
 
-  return LinearLayout(std::move(regBases), to_vector(layout.getOutDimNames()));
+  auto axisSize = to_vector(layout.getOutDimSizes())[axis];
+  auto totalAxisBases = warpAxisBases.size() + blockAxisBases.size();
+
+  // First try to place all warp/block axis bases into lane bases that are
+  // currently zero. If we can do this we will be able to perform the full
+  // reduction with just one convert_layout
+  if (zeroLaneBases.size() >= totalAxisBases) {
+    unsigned laneIdx = 0;
+    for (unsigned idx : warpAxisBases) {
+      std::swap(laneBases[zeroLaneBases[laneIdx]], warpBases[idx]);
+      ++laneIdx;
+    }
+    for (unsigned idx : blockAxisBases) {
+      std::swap(laneBases[zeroLaneBases[laneIdx]], blockBases[idx]);
+      ++laneIdx;
+    }
+    return LinearLayout(std::move(bases), to_vector(layout.getOutDimNames()));
+  }
+
+  // If we can fit all the bases inside the lane dimension, we can perform the
+  // reduction with two convert_layouts
+  // The first cvt to move the relevant bases to the lane dimension
+  // The second to move all the bases we moved out of the lane dimension back to
+  // their original positions
+  if (warpAxisBases.size() + blockAxisBases.size() <= laneBases.size()) {
+    assert(totalAxisBases <= laneBases.size() &&
+           "unexpected lane base count for axis layout");
+    unsigned laneIdx = 0;
+    for (unsigned idx : warpAxisBases) {
+      std::swap(laneBases[laneIdx], warpBases[idx]);
+      ++laneIdx;
+    }
+    for (unsigned idx : blockAxisBases) {
+      std::swap(laneBases[laneIdx], blockBases[idx]);
+      ++laneIdx;
+    }
+    return LinearLayout(std::move(bases), to_vector(layout.getOutDimNames()));
+  }
+
+  // Assumptions (easily relaxed if AMD needs it)
+  // We assume that
+  // max number of warps * max number of blocks <= (max number of lanes)^2
+  // We check this in logarithmic space (number of bases)
+  // This is true in nvidia as the max numbers are warps=64 ctas=16 so that
+  // 64 * 16 = 1024 = 32 * 32 = laneBases.size() * laneBases.size()
+  // This implies that, even if we have to perform 3 cvt_layouts, we can perform
+  // first one that does not cross CTAs, and then two that may cross CTAs
+  assert(blockBases.size() <= laneBases.size());
+  assert(warpBases.size() + blockBases.size() <= 2 * laneBases.size());
+
+  // Otherwise, fit as many warp bases as possible into the lane dimension
+  unsigned laneIdx = 0;
+  for (unsigned idx : warpAxisBases) {
+    std::swap(laneBases[laneIdx], warpBases[idx]);
+    ++laneIdx;
+    if (laneIdx >= laneBases.size())
+      break;
+  }
+
+  return LinearLayout(std::move(bases), to_vector(layout.getOutDimNames()));
 }
 
 LinearLayout ReduceOpHelper::reducedRegLaneLayout(RankedTensorType srcTy,
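The core trick in `getInterLayout` above is swapping the warp/block bases that act on the reduced axis into the lane dimension, so that after a `convert_layout` the cross-warp (or cross-CTA) reduction becomes a plain intra-warp one. A self-contained sketch of that swap on raw base vectors, for illustration only, using plain `std::vector` in place of `LinearLayout`:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using Bases = std::vector<std::vector<int32_t>>;

// Swap every warp basis with a nonzero component along `axis` into laneBases,
// mirroring the fallback path of getInterLayout.
void moveAxisBasesToLanes(Bases &laneBases, Bases &warpBases, unsigned axis) {
  unsigned laneIdx = 0;
  for (auto &warpBasis : warpBases) {
    if (warpBasis[axis] == 0)
      continue; // this basis does not move data along the reduced axis
    assert(laneIdx < laneBases.size() && "more axis bases than lane bases");
    std::swap(laneBases[laneIdx++], warpBasis);
  }
}

int main() {
  // 2-D layout, reducing axis 0. Both lane bases are zero (broadcasted
  // lanes); one warp basis acts on axis 0.
  Bases laneBases = {{0, 0}, {0, 0}};
  Bases warpBases = {{4, 0}, {0, 2}};
  moveAxisBasesToLanes(laneBases, warpBases, /*axis=*/0);
  // Now laneBases == {{4, 0}, {0, 0}} and warpBases == {{0, 0}, {0, 2}}:
  // the warp dimension no longer moves data along the reduced axis.
  return 0;
}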
@@ -320,9 +409,7 @@ LinearLayout ReduceOpHelper::reducedRegLaneLayout(RankedTensorType srcTy,
   auto kLane = StringAttr::get(ctx, "lane");
   auto kWarp = StringAttr::get(ctx, "warp");
 
-  auto reduced = triton::gpu::toLinearLayout(srcTy);
-  reduced = reduced.sublayout({kReg, kLane, kWarp},
-                              to_vector(reduced.getOutDimNames()));
+  auto reduced = toLinearLayout(srcTy);
   reduced = actionRemoveBroadcastedRegs(reduced).apply(reduced);
 
   reduced = moveAxisBasesToFront(reduced, axis).apply(reduced);
@@ -332,32 +419,6 @@ LinearLayout ReduceOpHelper::reducedRegLaneLayout(RankedTensorType srcTy,
   return reduced;
 }
 
-SmallVector<unsigned>
-ReduceOpHelper::getScratchBytesForCvt(const LinearLayout &srcLayout,
-                                      const LinearLayout &dstLayout) {
-  SmallVector<unsigned> bytes(srcElementTypes.size(), 0);
-  auto *ctx = op.getContext();
-  SmallVector<int64_t> shape;
-  shape.reserve(srcLayout.getNumOutDims());
-  for (auto dim : srcLayout.getOutDimNames()) {
-    shape.push_back(srcLayout.getOutDimSize(dim));
-  }
-  auto srcEnc = triton::gpu::LinearEncodingAttr::get(ctx, srcLayout);
-  auto dstEnc = triton::gpu::LinearEncodingAttr::get(ctx, dstLayout);
-  for (unsigned i = 0; i < srcElementTypes.size(); ++i) {
-    auto elemTy = srcElementTypes[i];
-    if (elemTy.isIntOrFloat() && elemTy.getIntOrFloatBitWidth() < 8)
-      elemTy = IntegerType::get(ctx, 8);
-    auto srcTy = RankedTensorType::get(shape, elemTy, srcEnc);
-    auto dstTy = RankedTensorType::get(shape, elemTy, dstEnc);
-    if (!cvtNeedsSharedMemory(srcTy, dstTy))
-      continue;
-    auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
-    bytes[i] = elems * getBitwidth(srcTy) / 8;
-  }
-  return bytes;
-}
-
 ScanLoweringHelper::ScanLoweringHelper(triton::ScanOp op) : scanOp(op) {
   auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
   srcShape = firstTy.getShape();
