Skip to content

Commit eb2406c

Browse files
committed
[BACKEND] Implement support for cross-CTA tt.reduce
The title of this PR is a bit of a lie. Even though the lowering is now implemented to support cross-CTA reductions, it depends on `convert_layout` supporting them, and it doesn't currently support LinearLayouts. We should generalise this one first and then enable it here. We should also emit the correct cross-CTA barrier from `targetInfo` in the case of cross-CTA memory reuse. In this PR, we take the chance to also generalise the lowering to avoid convert layouts whenever possible. stack-info: PR: #9221, branch: lezcano/stack/8
1 parent 7ce04be commit eb2406c

8 files changed

Lines changed: 292 additions & 198 deletions

File tree

include/triton/Analysis/Allocation.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ using AllocationAnalysisScratchSizeFn = std::function<unsigned(Operation *)>;
2020

2121
unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op);
2222

23-
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
24-
RankedTensorType dstTy);
23+
unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
24+
const LinearLayout &dstLayout,
25+
int bitwidth);
2526

2627
} // namespace triton
2728

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@ class ReduceOpHelper {
4444

4545
RankedTensorType getSrcTy() { return srcTy; }
4646

47-
bool isWarpSynchronous();
48-
4947
unsigned getInterWarpSizeWithUniqueData();
5048

5149
unsigned getIntraWarpSizeWithUniqueData();
@@ -67,10 +65,6 @@ class ReduceOpHelper {
6765
static triton::LinearLayout reducedRegLaneLayout(RankedTensorType srcTy,
6866
unsigned axis);
6967

70-
SmallVector<unsigned>
71-
getScratchBytesForCvt(const triton::LinearLayout &srcLayout,
72-
const triton::LinearLayout &dstLayout);
73-
7468
private:
7569
triton::ReduceOp op;
7670
RankedTensorType srcTy;

lib/Analysis/Allocation.cpp

Lines changed: 41 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -31,49 +31,20 @@ namespace mlir {
3131
//===----------------------------------------------------------------------===//
3232
namespace triton {
3333

34-
unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
35-
RankedTensorType dstTy) {
36-
auto *ctx = srcTy.getContext();
37-
auto srcLayout = gpu::toLinearLayout(srcTy);
38-
auto dstLayout = gpu::toLinearLayout(dstTy);
39-
srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
40-
dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
41-
auto bitwidth = getBitwidth(srcTy);
42-
auto smem = gpu::optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
34+
unsigned getNumScratchElemsSwizzledCvt(const LinearLayout &srcLayout,
35+
const LinearLayout &dstLayout,
36+
int bitwidth) {
37+
auto *ctx = srcLayout.getInDimNames().begin()->getContext();
38+
auto srcLayoutNoBroadcast =
39+
actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
40+
auto dstLayoutNoBroadcast =
41+
actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
42+
auto smem = gpu::optimalSwizzlingLdSt(srcLayoutNoBroadcast,
43+
dstLayoutNoBroadcast, bitwidth);
4344
auto reps = smem.getInDimSize(StringAttr::get(ctx, "reps"));
4445
return smem.getTotalOutDimSize() / reps;
4546
}
4647

47-
namespace {
48-
constexpr int64_t kReduceScratchAlign = 16;
49-
50-
Type getReduceMemElemTy(Type elemTy, MLIRContext *ctx) {
51-
if (elemTy.isIntOrFloat() && elemTy.getIntOrFloatBitWidth() < 8)
52-
return IntegerType::get(ctx, 8);
53-
return elemTy;
54-
}
55-
56-
int64_t getReduceScratchSizeBytes(triton::ReduceOp op,
57-
ArrayRef<unsigned> bytesPerOperand) {
58-
std::vector<unsigned> indices(op.getNumOperands());
59-
std::iota(indices.begin(), indices.end(), 0);
60-
auto *ctx = op.getContext();
61-
std::sort(indices.begin(), indices.end(), [&](unsigned i, unsigned j) {
62-
auto lhsTy = getReduceMemElemTy(op.getElementTypes()[i], ctx);
63-
auto rhsTy = getReduceMemElemTy(op.getElementTypes()[j], ctx);
64-
return getIntOrFloatOrPtrBitWidth(lhsTy) >
65-
getIntOrFloatOrPtrBitWidth(rhsTy);
66-
});
67-
// Align to 16 bytes to allow for vectorisation
68-
int64_t offset = 0;
69-
for (unsigned idx : indices) {
70-
offset += llvm::alignTo(bytesPerOperand[idx], kReduceScratchAlign);
71-
}
72-
return offset;
73-
}
74-
75-
} // namespace
76-
7748
// Both `atomic_cas` and `atomic_rmw` may need scratch memory to store values
7849
// because Triton's block-based programming model ensures that
7950
// all threads sharing the same partition of the tensor see the same values,
@@ -100,15 +71,35 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
10071

10172
unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
10273
if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
103-
ReduceOpHelper helper(reduceOp);
104-
if (helper.isWarpSynchronous())
105-
return 0;
74+
auto srcTy = ReduceOpHelper(reduceOp).getSrcTy();
75+
auto axis = reduceOp.getAxis();
76+
auto kLane = StringAttr::get(reduceOp.getContext(), "lane");
10677

107-
auto regLl = ReduceOpHelper::reducedRegLaneLayout(helper.getSrcTy(),
108-
reduceOp.getAxis());
109-
auto tmpLl = ReduceOpHelper::getInterLayout(regLl, reduceOp.getAxis());
110-
auto bytesRegToTmp = helper.getScratchBytesForCvt(regLl, tmpLl);
111-
return getReduceScratchSizeBytes(reduceOp, bytesRegToTmp);
78+
auto isReduced = [axis](const LinearLayout &layout) {
79+
return layout.getOutDimSizes().begin()[axis] == 1;
80+
};
81+
auto regLl =
82+
ReduceOpHelper::reducedRegLaneLayout(srcTy, reduceOp.getAxis());
83+
84+
// All the inputs have the same layout so, since we order them from largest
85+
// bitsize to smallest, and the first one is aligned, by induction, they are
86+
// all aligned, so we don't need to align the byte numbers returned here.
87+
auto bytesRegToTmp = 0;
88+
while (!isReduced(regLl)) {
89+
auto tmpLl = ReduceOpHelper::getInterLayout(regLl, axis);
90+
// We take the maximum of the elements and multiply by the total bitwidth
91+
// We do this as otherwise it's quite tricky to find the correct
92+
// BaseOffsets in the lowering
93+
int bytes = 0;
94+
for (auto inputTy : reduceOp.getInputTypes()) {
95+
auto nelem =
96+
getNumScratchElemsSwizzledCvt(regLl, tmpLl, getBitwidth(inputTy));
97+
bytes += nelem * (getBitwidth(inputTy) / 8);
98+
}
99+
bytesRegToTmp = std::max<unsigned>(bytesRegToTmp, bytes);
100+
regLl = ReduceOpHelper::zeroBasesAlongDimAndReorder(tmpLl, axis, kLane);
101+
}
102+
return bytesRegToTmp;
112103
}
113104
if (auto scanOp = dyn_cast<ScanOp>(op)) {
114105
ScanLoweringHelper helper(scanOp);
@@ -131,7 +122,9 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
131122
if (!cvtNeedsSharedMemory(srcTy, dstTy))
132123
return 0;
133124
// The generic pass uses swizzling
134-
auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
125+
auto elems = getNumScratchElemsSwizzledCvt(gpu::toLinearLayout(srcTy),
126+
gpu::toLinearLayout(dstTy),
127+
getBitwidth(srcTy));
135128
return elems * getBitwidth(srcTy) / 8;
136129
}
137130
if (isa<AtomicRMWOp, AtomicCASOp>(op)) {

lib/Analysis/Utility.cpp

Lines changed: 91 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -76,10 +76,6 @@ unsigned ReduceOpHelper::getIntraWarpSizeWithUniqueData() {
7676
return getThreadsPerWarp(srcEncoding, srcShape)[axis];
7777
}
7878

79-
bool ReduceOpHelper::isWarpSynchronous() {
80-
return getWarpsPerCTA(srcEncoding, srcShape)[axis] == 1;
81-
}
82-
8379
bool ReduceOpHelper::isReduceWithinCTA() {
8480
// TODO: Support reduce across CTAS
8581
// Layout optimization passes such as PlanCTAPass and
@@ -155,23 +151,97 @@ LinearLayout ReduceOpHelper::getInterLayout(const LinearLayout &layout,
155151
auto *ctx = layout.getOutDimNames().begin()->getContext();
156152
auto kLane = mlir::StringAttr::get(ctx, "lane");
157153
auto kWarp = mlir::StringAttr::get(ctx, "warp");
154+
auto kBlock = mlir::StringAttr::get(ctx, "block");
158155
auto regBases = layout.getBases();
159-
auto linearAttr = triton::gpu::LinearEncodingAttr::get(ctx, layout);
160-
int laneBits = layout.getInDimSizeLog2(kLane);
161-
int neededLaneBits = llvm::Log2_32(linearAttr.getWarpsPerCTA()[axis]);
162-
// TODO move to verifier
163-
assert(neededLaneBits <= laneBits && "NYI: more inter-warps than lanes");
164-
// Move the warp axis bases we need to reduce into lane bases, while
165-
// keeping non-axis components in their original in-dim.
166-
auto &laneBases = regBases[kLane];
167-
auto &warpBases = regBases[kWarp];
168-
int moved = 0;
169-
for (auto &warpBasis : warpBases) {
170-
if (warpBasis[axis] == 0)
171-
continue;
172-
assert(moved < neededLaneBits && "unexpected warp axis bases count");
173-
std::swap(laneBases[moved], warpBasis);
174-
moved++;
156+
auto laneIt = regBases.find(kLane);
157+
auto warpIt = regBases.find(kWarp);
158+
auto blockIt = regBases.find(kBlock);
159+
if (laneIt == regBases.end() || warpIt == regBases.end()) {
160+
return layout;
161+
}
162+
163+
auto &laneBases = laneIt->second;
164+
auto &warpBases = warpIt->second;
165+
auto &blockBases = blockIt->second;
166+
167+
auto collectAxisBases = [&](const std::vector<std::vector<int32_t>> &bases,
168+
SmallVector<unsigned> &out) {
169+
for (unsigned i = 0; i < bases.size(); ++i) {
170+
if (bases[i][axis] != 0)
171+
out.push_back(i);
172+
}
173+
};
174+
175+
SmallVector<unsigned> warpAxisBases;
176+
collectAxisBases(warpBases, warpAxisBases);
177+
SmallVector<unsigned> blockAxisBases;
178+
collectAxisBases(blockBases, blockAxisBases);
179+
180+
SmallVector<unsigned> zeroLaneBases;
181+
for (unsigned i = 0; i < laneBases.size(); ++i) {
182+
if (llvm::all_of(laneBases[i], [](int32_t v) { return v == 0; }))
183+
zeroLaneBases.push_back(i);
184+
}
185+
186+
auto axisSize = to_vector(layout.getOutDimSizes())[axis];
187+
auto totalAxisBases = warpAxisBases.size() + blockAxisBases.size();
188+
189+
// First try to place all warp/block axis bases into lane bases that are
190+
// currently zero. If we can do this we will be able to perform the full
191+
// reduction with just one convert_layout
192+
if (zeroLaneBases.size() >= totalAxisBases) {
193+
unsigned laneIdx = 0;
194+
for (unsigned idx : warpAxisBases) {
195+
std::swap(laneBases[zeroLaneBases[laneIdx]], warpBases[idx]);
196+
++laneIdx;
197+
}
198+
for (unsigned idx : blockAxisBases) {
199+
std::swap(laneBases[zeroLaneBases[laneIdx]], blockBases[idx]);
200+
++laneIdx;
201+
}
202+
return LinearLayout(std::move(regBases),
203+
to_vector(layout.getOutDimNames()));
204+
}
205+
206+
// If we can fit all the bases inside the lane dimension, we can perform the
207+
// reduction with two convert_layouts
208+
// The first cvt to move the relevant bases to the lane dimension
209+
// The second to move all the bases we moved out of the lane dimension back to
210+
// their original positions
211+
if (warpAxisBases.size() + blockAxisBases.size() <= laneBases.size()) {
212+
assert(totalAxisBases <= laneBases.size() &&
213+
"unexpected lane base count for axis layout");
214+
unsigned laneIdx = 0;
215+
for (unsigned idx : warpAxisBases) {
216+
std::swap(laneBases[laneIdx], warpBases[idx]);
217+
++laneIdx;
218+
}
219+
for (unsigned idx : blockAxisBases) {
220+
std::swap(laneBases[laneIdx], blockBases[idx]);
221+
++laneIdx;
222+
}
223+
return LinearLayout(std::move(regBases),
224+
to_vector(layout.getOutDimNames()));
225+
}
226+
227+
// Assumptions (easily relaxed if AMD needs it)
228+
// We assume that
229+
// max number of warps * max number of blocks <= (max number of lanes)^2
230+
// We check this in logarithmic space (number of bases)
231+
// This is true on NVIDIA, as the max numbers are warps=64, ctas=16, so that
232+
// 64 * 16 = 1024 = 32 * 32 = laneBases.size() * laneBases.size()
233+
// This implies that, even if we have to perform 3 cvt_layouts, we can perform
234+
// first one that does not cross CTAs, and then two that may cross CTAs
235+
assert(blockBases.size() <= laneBases.size());
236+
assert(warpBases.size() + blockBases.size() <= 2 * laneBases.size());
237+
238+
// Otherwise, fit as many warp bases as possible into the lane dimension
239+
unsigned laneIdx = 0;
240+
for (unsigned idx : warpAxisBases) {
241+
std::swap(laneBases[laneIdx], warpBases[idx]);
242+
++laneIdx;
243+
if (laneIdx >= laneBases.size())
244+
break;
175245
}
176246

177247
return LinearLayout(std::move(regBases), to_vector(layout.getOutDimNames()));
@@ -184,9 +254,7 @@ LinearLayout ReduceOpHelper::reducedRegLaneLayout(RankedTensorType srcTy,
184254
auto kLane = StringAttr::get(ctx, "lane");
185255
auto kWarp = StringAttr::get(ctx, "warp");
186256

187-
auto reduced = triton::gpu::toLinearLayout(srcTy);
188-
reduced = reduced.sublayout({kReg, kLane, kWarp},
189-
to_vector(reduced.getOutDimNames()));
257+
auto reduced = toLinearLayout(srcTy);
190258
reduced = actionRemoveBroadcastedRegs(reduced).apply(reduced);
191259
reduced = makeAxisContiguous(reduced, axis).apply(reduced);
192260
reduced = zeroBasesAlongDimAndReorder(reduced, axis, kReg);
@@ -195,32 +263,6 @@ LinearLayout ReduceOpHelper::reducedRegLaneLayout(RankedTensorType srcTy,
195263
return reduced;
196264
}
197265

198-
SmallVector<unsigned>
199-
ReduceOpHelper::getScratchBytesForCvt(const LinearLayout &srcLayout,
200-
const LinearLayout &dstLayout) {
201-
SmallVector<unsigned> bytes(srcElementTypes.size(), 0);
202-
auto *ctx = op.getContext();
203-
SmallVector<int64_t> shape;
204-
shape.reserve(srcLayout.getNumOutDims());
205-
for (auto dim : srcLayout.getOutDimNames()) {
206-
shape.push_back(srcLayout.getOutDimSize(dim));
207-
}
208-
auto srcEnc = triton::gpu::LinearEncodingAttr::get(ctx, srcLayout);
209-
auto dstEnc = triton::gpu::LinearEncodingAttr::get(ctx, dstLayout);
210-
for (unsigned i = 0; i < srcElementTypes.size(); ++i) {
211-
auto elemTy = srcElementTypes[i];
212-
if (elemTy.isIntOrFloat() && elemTy.getIntOrFloatBitWidth() < 8)
213-
elemTy = IntegerType::get(ctx, 8);
214-
auto srcTy = RankedTensorType::get(shape, elemTy, srcEnc);
215-
auto dstTy = RankedTensorType::get(shape, elemTy, dstEnc);
216-
if (!cvtNeedsSharedMemory(srcTy, dstTy))
217-
continue;
218-
auto elems = getNumScratchElemsSwizzledCvt(srcTy, dstTy);
219-
bytes[i] = elems * getBitwidth(srcTy) / 8;
220-
}
221-
return bytes;
222-
}
223-
224266
ScanLoweringHelper::ScanLoweringHelper(triton::ScanOp op) : scanOp(op) {
225267
auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
226268
srcShape = firstTy.getShape();

0 commit comments

Comments
 (0)