Commit 977774b

[BACKEND] Support generic multi-cta convert_layouts
We generalise the swizzling algorithm to work with blocks, and we generalise most of the memory lowerings to support layouts with blocks, removing the legacy lowering. The generic swizzling algorithm for blocks might be fine as-is, but we didn't try to be particularly clever, so there may be some performance left on the table; we can look into this at a later point if it becomes relevant. In the process we also activate multi-CTA reductions and test both features there.

TODO: Add some funky tests that exercise `convert_layout` on its own, not just the `convert_layout` within the reduction.
TODO: Check how to perform multi-CTA barriers on AMD, and perhaps merge cluster barriers into ttg.barrier, predicate broadcasting blocks, etc.

stack-info: PR: #9317, branch: lezcano/stack/9
1 parent eb2406c commit 977774b

25 files changed

Lines changed: 357 additions & 404 deletions
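At its core, the change picks the weakest synchronization that is still correct for a given convert_layout. The snippet below condenses the emitBarrier lambda added in ConvertLayoutOpToLLVM.cpp (see that file's hunks further down); isCvtDimSync and clusterBarrier are the new APIs this commit introduces:

bool isWarpSync = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
bool isBlockSync = mlir::isCvtDimSync(srcLayout, dstLayout, kBlock);
auto emitBarrier = [&]() {
  if (isWarpSync) {
    targetInfo.warpSync(loc, rewriter); // data stays within each warp
  } else if (isBlockSync) {
    // data stays within each CTA
    targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
  } else {
    targetInfo.clusterBarrier(loc, rewriter); // data crosses CTAs
  }
};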

include/triton/Analysis/Utility.h

Lines changed: 2 additions & 2 deletions
@@ -389,8 +389,8 @@ template <typename T> class CallGraph {
 // Create a basic DataFlowSolver with constant and dead code analysis included.
 std::unique_ptr<DataFlowSolver> createDataFlowSolver();
 
-bool isCvtWarpSync(const triton::LinearLayout &srcLayout,
-                   const triton::LinearLayout &dstLayout);
+bool isCvtDimSync(const triton::LinearLayout &srcLayout,
+                  const triton::LinearLayout &dstLayout, StringAttr dim);
 
 } // namespace mlir

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 2 additions & 0 deletions
@@ -20,6 +20,8 @@ class TargetInfoBase {
   // target address space
   virtual void barrier(Location loc, RewriterBase &rewriter,
                        triton::gpu::AddrSpace targets) const = 0;
+  // Emit a cluster-level barrier when supported. Defaults to CTA barrier.
+  virtual void clusterBarrier(Location loc, RewriterBase &rewriter) const = 0;
   // Insert a warp syncronization barrier that also guarantees local address
   // space visibility at warp level when supported by the backend.
   // Backends that do not support warp-level barriers should conservatively
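For reference, a minimal sketch of how an NVIDIA-style TargetInfo subclass might implement the new hook, assuming the MLIR NVVM dialect's nvvm.cluster.arrive/nvvm.cluster.wait ops; the backend side is not part of the hunks shown on this page:

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

// Sketch only: synchronize all CTAs in the cluster so that writes to
// distributed shared memory are visible to the subsequent loads.
void TargetInfo::clusterBarrier(Location loc, RewriterBase &rewriter) const {
  rewriter.create<mlir::NVVM::ClusterArriveOp>(loc, rewriter.getUnitAttr());
  rewriter.create<mlir::NVVM::ClusterWaitOp>(loc, rewriter.getUnitAttr());
}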

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 18 additions & 15 deletions
@@ -15,6 +15,8 @@
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/STLExtras.h"
 
+#include <optional>
+
 #define DEBUG_TYPE "ttgpu_to_llvm"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

@@ -561,17 +563,18 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
 // calcPaddedOffset is a lambda that takes a base offset (mlir::Value)
 // and computes a new offset (mlir::Value) by applying padding based on
 // shared memory layout.
-SmallVector<Value> lowerLdSt(
-    Location loc, MLIRContext *ctx, LinearLayout cvt,
-    ArrayRef<Value> valsArray, // Input for store, output for load
-    Type llvmElemTy, Value smemBase,
-    ArrayRef<std::pair<unsigned, unsigned>> paddingShifts, Value affineOffset,
-    uint64_t maskSpanAffineOffset, Value laneId, Value warpId,
-    RewriterBase &rewriter, const TargetInfoBase &targetInfo,
-    std::optional<int> maybeMaxVecElems,
-    std::function<SmallVector<Value>(RewriterBase &, Location, ArrayRef<Value>,
-                                     Value, int, VectorType)>
-        lowerInst);
+SmallVector<Value>
+lowerLdSt(Location loc, MLIRContext *ctx, LinearLayout cvt,
+          ArrayRef<Value> valsArray, // Input for store, output for load
+          Type llvmElemTy, Value smemBase,
+          ArrayRef<std::pair<unsigned, unsigned>> paddingShifts, Value affineOffset,
+          uint64_t maskSpanAffineOffset, Value laneId, Value warpId,
+          RewriterBase &rewriter, const TargetInfoBase &targetInfo,
+          std::optional<int> maybeMaxVecElems,
+          std::function<SmallVector<Value>(RewriterBase &, Location,
+                                           ArrayRef<Value>, Value, int,
+                                           VectorType, std::optional<Value>)>
+              lowerInst);
 
 // Lower local_load/local_store via ld.shared/st.shared
 SmallVector<Value>

@@ -613,10 +616,10 @@ void makeAllWarpGroupsIsolatedFromAbove(Operation *op);
 // Set the correct loop annotation on LLVM branch ops.
 void fixUpLoopAnnotation(ModuleOp mod);
 
-void transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
-                                  const TargetInfoBase &targetInfo,
-                                  const LLVMTypeConverter *typeConverter,
-                                  RewriterBase &rewriter);
+void transferSwizzlingLocalMem(triton::gpu::ConvertLayoutOp op, Value src,
+                               const TargetInfoBase &targetInfo,
+                               const LLVMTypeConverter *typeConverter,
+                               RewriterBase &rewriter);
 
 SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
                                     ArrayRef<Value> args,
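Note the lowerInst callback gains a trailing std::optional<Value>. Judging from the multi-CTA context, it plausibly carries a remote block operand when an access targets another CTA's shared memory; this diff does not confirm that, so the sketch below is a hypothetical CTA-local store callback that simply rejects it:

// Hypothetical callback matching the new signature; maybeBlock is assumed
// (not confirmed by this diff) to be set only for cross-CTA accesses.
auto lowerInst = [&](RewriterBase &rewriter, Location loc,
                     ArrayRef<Value> vals, Value shmemAddr, int idx,
                     VectorType vecTy,
                     std::optional<Value> maybeBlock) -> SmallVector<Value> {
  assert(!maybeBlock && "CTA-local sketch: remote accesses not handled");
  Value vec = packLLVector(loc, vals, rewriter); // Triton helper
  rewriter.create<LLVM::StoreOp>(loc, vec, shmemAddr);
  return {}; // stores yield no values; a load would return the elements
};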

lib/Analysis/Membar.cpp

Lines changed: 2 additions & 1 deletion
@@ -324,7 +324,8 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
       auto dstTy = cast<RankedTensorType>(cvt.getType());
       auto srcLayout = triton::gpu::toLinearLayout(srcTy);
       auto dstLayout = triton::gpu::toLinearLayout(dstTy);
-      isWarpSync = mlir::isCvtWarpSync(srcLayout, dstLayout);
+      auto kWarp = StringAttr::get(op->getContext(), "warp");
+      isWarpSync = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
     }
 
     if (!curBlockInfo.syncReadSlices.empty() ||

lib/Analysis/Utility.cpp

Lines changed: 16 additions & 10 deletions
@@ -1258,17 +1258,23 @@ std::unique_ptr<DataFlowSolver> createDataFlowSolver() {
   return solver;
 }
 
-bool isCvtWarpSync(const triton::LinearLayout &srcLayout,
-                   const triton::LinearLayout &dstLayout) {
-  // We can use warp.sync when the warp dimension in the convert is trival
-  // and there is no broadcasting at a warp level (otherwise reads may be
-  // wrong)
+bool isCvtDimSync(const triton::LinearLayout &srcLayout,
+                  const triton::LinearLayout &dstLayout, StringAttr dim) {
+  // We can use a dimension-level sync when the conversion is trivial over that
+  // dimension and there is no broadcasting over it.
   auto *ctx = srcLayout.getInDimNames().begin()->getContext();
-  auto comp = dstLayout.invertAndCompose(srcLayout);
   auto kWarp = StringAttr::get(ctx, "warp");
-  return comp.isTrivialOver(kWarp) &&
-         srcLayout.getFreeVariableMasks()[kWarp] == 0 &&
-         dstLayout.getFreeVariableMasks()[kWarp] == 0;
+  auto kBlock = StringAttr::get(ctx, "block");
+  assert((dim == kWarp || dim == kBlock) && "expected dim to be warp or block");
+  assert(srcLayout.hasInDim(dim) && dstLayout.hasInDim(dim) &&
+         "expected dim to be present in both layouts");
+  auto parentTrivial = true;
+  if (dim == kWarp) {
+    parentTrivial = isCvtDimSync(srcLayout, dstLayout, kBlock);
+  }
+  auto comp = dstLayout.invertAndCompose(srcLayout);
+  return parentTrivial && comp.isTrivialOver(dim) &&
+         srcLayout.getFreeVariableMasks()[dim] == 0 &&
+         dstLayout.getFreeVariableMasks()[dim] == 0;
 }
-
 } // namespace mlir
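A small usage sketch (srcLayout/dstLayout assumed in scope, as in the Membar.cpp caller above): because the warp query recurses into the block query first, warp-level synchronizability implies block-level synchronizability:

auto kWarp = StringAttr::get(ctx, "warp");
auto kBlock = StringAttr::get(ctx, "block");
bool warpSyncOk = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
bool blockSyncOk = mlir::isCvtDimSync(srcLayout, dstLayout, kBlock);
// Guaranteed by the recursion in isCvtDimSync: warpSyncOk implies blockSyncOk.
assert(!warpSyncOk || blockSyncOk);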

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 41 additions & 45 deletions
@@ -4,6 +4,8 @@
 #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 
+#include <optional>
+
 #include "triton/Analysis/Allocation.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Utility.h"

@@ -53,15 +55,10 @@ struct ConvertLayoutOpConversion
     assert(to_vector(conversion.getInDimNames()) ==
            to_vector(conversion.getOutDimNames()));
     auto dims = conversion.getInDimNames();
-    if (llvm::is_contained(dims, kBlock)) {
-      // Case 1: Transfer between values in different CTAs.
-      // This requires moving values through distributed shared memory.
-      return rewriter.notifyMatchFailure(
-          op, "NYI: Transfer between different CTAs");
-    } else if (llvm::is_contained(dims, kWarp)) {
-      // Case 2: Transfer between values in the same CTA, in which case we move
-      // values through shared memory.
-      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
+    if (llvm::is_contained(dims, kBlock) || llvm::is_contained(dims, kWarp)) {
+      // Transfer between values in the same CTA, or across CTAs. We move values
+      // through (distributed) shared memory.
+      transferSwizzlingLocalMem(op, adaptor.getSrc(), rewriter);
       return success();
     } else if (llvm::is_contained(dims, kLane)) {
       // Case 3. Transfer between values in the same warp, in which case we try

@@ -70,7 +67,7 @@ struct ConvertLayoutOpConversion
       if (cvtNeedsWarpShuffle(srcTy, dstTy))
         return transferWithinWarp(op, adaptor, rewriter);
 
-      transferWithinBlockSwizzling(op, adaptor.getSrc(), rewriter);
+      transferSwizzlingLocalMem(op, adaptor.getSrc(), rewriter);
       return success();
     } else if (llvm::is_contained(dims, kRegister)) {
       // Case 4. Transfer between values in the same thread, in which case we

@@ -107,7 +104,7 @@ struct ConvertLayoutOpConversion
     return success();
   }
 
-  SmallVector<Value> transferWithinBlockSwizzlingImpl(
+  SmallVector<Value> transferSwizzlingLocalMemImpl(
      Location loc, ConversionPatternRewriter &rewriter,
      const LinearLayout &srcLayout, const LinearLayout &dstLayout,
      ArrayRef<Value> inVals, Type llvmElemTy, Value smemBase) const {

@@ -123,8 +120,8 @@ struct ConvertLayoutOpConversion
        return b.ptrtoint(llvmElemTyPtr, v).getResult();
      }));
      auto outVals =
-          transferWithinBlockSwizzlingImpl(loc, rewriter, srcLayout, dstLayout,
-                                           newInVals, llvmElemTyPtr, smemBase);
+          transferSwizzlingLocalMemImpl(loc, rewriter, srcLayout, dstLayout,
+                                        newInVals, llvmElemTyPtr, smemBase);
      for (auto &v : outVals) {
        v = b.inttoptr(llvmElemTy, v);
      }

@@ -137,7 +134,7 @@ struct ConvertLayoutOpConversion
      auto i8ElemTy = i8_ty;
      auto newInVals = llvm::to_vector(llvm::map_range(
          inVals, [&](Value v) { return b.zext(i8ElemTy, v).getResult(); }));
-      auto outVals = transferWithinBlockSwizzlingImpl(
+      auto outVals = transferSwizzlingLocalMemImpl(
          loc, rewriter, srcLayout, dstLayout, newInVals, i8ElemTy, smemBase);
      for (auto &v : outVals) {
        v = b.trunc(llvmElemTy, v);

@@ -150,26 +147,29 @@ struct ConvertLayoutOpConversion
    if (!removeBroadcastSrc.isIdentity()) {
      auto prmtSrc = removeBroadcastSrc.apply(srcLayout);
      auto newInVals = removeBroadcastSrc.apply(inVals);
-      return transferWithinBlockSwizzlingImpl(loc, rewriter, prmtSrc, dstLayout,
-                                              newInVals, llvmElemTy, smemBase);
+      return transferSwizzlingLocalMemImpl(loc, rewriter, prmtSrc, dstLayout,
+                                           newInVals, llvmElemTy, smemBase);
    }
 
    // Remove broadcasting in dst
    auto removeBroadcastDst = actionRemoveBroadcastedRegs(dstLayout);
    if (!removeBroadcastDst.isIdentity()) {
      auto prmtDst = removeBroadcastDst.apply(dstLayout);
-      auto outVals = transferWithinBlockSwizzlingImpl(
+      auto outVals = transferSwizzlingLocalMemImpl(
          loc, rewriter, srcLayout, prmtDst, inVals, llvmElemTy, smemBase);
      return broadcastAs(outVals, dstLayout);
    }
 
    // At this point we have a type that's at least 8-bit
    // and we don't have broadcasting in the registers
+
    auto bitwidth = llvmElemTy.getIntOrFloatBitWidth();
    auto smem = optimalSwizzlingLdSt(srcLayout, dstLayout, bitwidth);
 
    // Extract reps from smem
    auto kReg = str_attr("register");
+    auto kWarp = StringAttr::get(ctx, "warp");
+    auto kBlock = StringAttr::get(ctx, "block");
    auto kReps = str_attr("reps");
    auto nReps = smem.getInDimSize(kReps);
    auto reps = LinearLayout::identity1D(nReps, kReg, kReps);

@@ -191,8 +191,11 @@ struct ConvertLayoutOpConversion
    auto storeCvt = *divideRight(totalStoreCvt, reps);
    auto loadCvt = *divideRight(totalLoadCvt, reps);
    auto kOffset = str_attr("offset");
-    storeCvt = storeCvt.reshapeOuts({{kOffset, storeCvt.getTotalOutDimSize()}});
-    loadCvt = loadCvt.reshapeOuts({{kOffset, loadCvt.getTotalOutDimSize()}});
+    auto nBlock = storeCvt.getInDimSize(kBlock);
+    storeCvt = storeCvt.reshapeOuts(
+        {{kOffset, storeCvt.getTotalOutDimSize() / nBlock}, {kBlock, nBlock}});
+    loadCvt = loadCvt.reshapeOuts(
+        {{kOffset, loadCvt.getTotalOutDimSize() / nBlock}, {kBlock, nBlock}});
 
    auto tileSize = storeCvt.getInDimSize(kReg);
 
@@ -201,28 +204,30 @@ struct ConvertLayoutOpConversion
    auto affineOffset = b.i32_val(0);
    auto maskSpanAffineOffset = 0;
 
-    bool isWarpSync = mlir::isCvtWarpSync(srcLayout, dstLayout);
-    for (int i = 0; i < nReps; ++i) {
-      if (i > 0) {
-        if (isWarpSync) {
-          targetInfo.warpSync(loc, rewriter);
-        } else {
-          targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
-        }
+    bool isWarpSync = mlir::isCvtDimSync(srcLayout, dstLayout, kWarp);
+    bool isBlockSync = mlir::isCvtDimSync(srcLayout, dstLayout, kBlock);
+    auto emitBarrier = [&]() {
+      if (isWarpSync) {
+        targetInfo.warpSync(loc, rewriter);
+      } else if (isBlockSync) {
+        targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
+      } else {
+        targetInfo.clusterBarrier(loc, rewriter);
      }
+    };
+
+    for (int i = 0; i < nReps; ++i) {
+      if (i > 0)
+        emitBarrier();
      auto tileInVals =
          ArrayRef<Value>(permutedInVals).slice(i * tileSize, tileSize);
      // Store
      lowerLdStShared(loc, ctx, storeCvt, tileInVals, llvmElemTy, smemBase,
                      /*paddingShifts=*/{}, affineOffset, maskSpanAffineOffset,
                      rewriter, targetInfo);
-      if (isWarpSync) {
-        targetInfo.warpSync(loc, rewriter);
-      } else {
-        targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
-      }
+      emitBarrier();
      // Load
-      SmallVector<Value> tileOutVals = lowerLdStShared(
+      auto tileOutVals = lowerLdStShared(
          loc, ctx, loadCvt, {}, llvmElemTy, smemBase, /*paddingShifts=*/{},
          affineOffset, maskSpanAffineOffset, rewriter, targetInfo);
      llvm::append_range(outVals, tileOutVals);

@@ -233,30 +238,21 @@ struct ConvertLayoutOpConversion
    return outVals;
  }
 
-  void transferWithinBlockSwizzling(ConvertLayoutOp op, Value src,
-                                    ConversionPatternRewriter &rewriter) const {
+  void transferSwizzlingLocalMem(ConvertLayoutOp op, Value src,
+                                 ConversionPatternRewriter &rewriter) const {
    auto loc = op.getLoc();
    auto *ctx = op.getContext();
    auto srcTy = op.getSrc().getType();
    auto dstTy = op.getType();
 
-    // Remove the kBlock dimension from the layout as it's the identity in the
-    // cvt
    auto srcLayout = toLinearLayout(srcTy);
    auto dstLayout = toLinearLayout(dstTy);
-    auto kReg = str_attr("register");
-    auto kLane = str_attr("lane");
-    auto kWarp = str_attr("warp");
-    srcLayout = srcLayout.sublayout({kReg, kLane, kWarp},
-                                    to_vector(srcLayout.getOutDimNames()));
-    dstLayout = dstLayout.sublayout({kReg, kLane, kWarp},
-                                    to_vector(dstLayout.getOutDimNames()));
 
    auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
    auto smemBase =
        LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
    auto inVals = unpackLLElements(loc, src, rewriter);
-    auto outVals = transferWithinBlockSwizzlingImpl(
+    auto outVals = transferSwizzlingLocalMemImpl(
        loc, rewriter, srcLayout, dstLayout, inVals, llvmElemTy, smemBase);
 
    Value result =
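To make the reshapeOuts step concrete, here is a standalone toy example (sizes invented) of splitting a flat offset output into a per-CTA offset plus a block dimension, mirroring the storeCvt/loadCvt reshape above:

// A conversion covering 2048 shared-memory slots across nBlock = 2 CTAs is
// reshaped to 1024 offsets per CTA plus a 2-way kBlock output dimension.
auto kReg = StringAttr::get(ctx, "register");
auto kOffset = StringAttr::get(ctx, "offset");
auto kBlock = StringAttr::get(ctx, "block");
auto cvt = LinearLayout::identity1D(2048, kReg, kOffset);
int nBlock = 2;
auto split = cvt.reshapeOuts(
    {{kOffset, cvt.getTotalOutDimSize() / nBlock}, {kBlock, nBlock}});
assert(split.getOutDimSize(kOffset) == 1024);
assert(split.getOutDimSize(kBlock) == 2);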

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 2 additions & 4 deletions
@@ -150,11 +150,10 @@ LogicalResult lowerLocalStore(Location loc, MLIRContext *ctx, Value regVal,
    cvt = regLayout.invertAndCompose(sharedLayout);
  }
  auto kBlock = str_attr("block");
-  // NYI. We would need to emit a map.shared::cluster instruction.
+  // We could support it by removing this check if we ever want to
  if (!cvt.isTrivialOver({kBlock})) {
    return failure();
  }
-  cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
  lowerLocalLdSt(loc, ctx, cvt, inVals, llvmElemTy, memDescTy, smemObj,
                 rewriter, targetInfo);

@@ -287,11 +286,10 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
      cvt = regLayout.invertAndCompose(sharedLayout);
    }
    auto kBlock = str_attr("block");
-    // NYI. We would need to emit a map.shared::cluster instruction.
+    // We could support it by removing this check if we ever want to
    if (!cvt.isTrivialOver({kBlock})) {
      return failure();
    }
-    cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
 
    auto outVals = lowerLocalLdSt(loc, ctx, cvt, {}, llvmElemTy, memDescTy,
                                  smemObj, rewriter, targetInfo, op);
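With the sublayout call gone, cvt keeps its kBlock input dimension, and the isTrivialOver check is what guarantees the local lowering never has to address another CTA's shared memory. A hypothetical probe (not part of the patch) mirroring that guard:

// Local load/store lowering applies only when no data would cross CTAs.
static bool canLowerLocally(const triton::LinearLayout &cvt, MLIRContext *ctx) {
  auto kBlock = StringAttr::get(ctx, "block");
  return cvt.isTrivialOver({kBlock});
}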

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 12 additions & 15 deletions
@@ -37,16 +37,6 @@ struct ReduceOpConversion
    auto accs = unpackInputs(loc, op, adaptor, rewriter);
    unsigned axis = op.getAxis();
 
-    // The lowering already supports cross-CTA reductions in principle
-    // We are only missing:
-    // - Supporting them in convert_layout for LinearLayouts
-    // - Emitting cross-CTA barriers between convert_layouts when the second
-    //   convert_layout crosses CTAs
-    // After this, we can uncomment the tests in test_reduce_funky_layout
-    if (!helper.isReduceWithinCTA()) {
-      return failure();
-    }
-
    auto *ctx = op.getContext();
 
    // Remove block as we don't currently support it

@@ -80,15 +70,18 @@ struct ReduceOpConversion
    // That is, they fit in 2 rounds of warp reductions
    // Even more, if we do two rounds, getInterLayout will make sure that the
    // first one does not cross CTAs
+    auto kBlock = StringAttr::get(ctx, "block");
+    bool lastCvtCrossesCTAs = false;
    int i = 0;
    while (to_vector(regLl.getOutDimSizes())[axis] != 1) {
      LinearLayout tmpLl = ReduceOpHelper::getInterLayout(regLl, axis);
 
      // Emit a barrier if we are reusing the shmem
      if (i > 0) {
-        sync(rewriter, loc);
+        sync(rewriter, loc, lastCvtCrossesCTAs);
      }
      accs = convertLayoutValues(loc, rewriter, op, regLl, tmpLl, accs);
+      lastCvtCrossesCTAs = !mlir::isCvtDimSync(regLl, tmpLl, kBlock);
 
      std::tie(regLl, accs) =
          reduceWithinWarps(op, std::move(tmpLl), std::move(accs), rewriter);

@@ -105,7 +98,7 @@ struct ReduceOpConversion
    auto outputLayout = triton::gpu::toLinearLayout(resultTy);
    if (regLl != outputLayout) {
      // Reuse the shmem
-      sync(rewriter, loc);
+      sync(rewriter, loc, lastCvtCrossesCTAs);
      accs =
          convertLayoutValues(loc, rewriter, op, regLl, outputLayout, accs);
    }

@@ -276,9 +269,13 @@ struct ReduceOpConversion
    return srcValues;
  }
 
-  void sync(ConversionPatternRewriter &rewriter, Location loc) const {
-    auto b = TritonLLVMOpBuilder(loc, rewriter);
-    b.barrier(triton::gpu::AddrSpace::Local);
+  void sync(ConversionPatternRewriter &rewriter, Location loc,
+            bool crossCTA) const {
+    if (crossCTA) {
+      targetInfo.clusterBarrier(loc, rewriter);
+    } else {
+      targetInfo.barrier(loc, rewriter, triton::gpu::AddrSpace::Local);
+    }
  }
 
  // Reduce along op axis for elements that are in the same thread. The
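Condensing the control flow above: the barrier protecting shared-memory reuse only needs to be cluster-wide when the previous convert moved data across CTAs, which is exactly what lastCvtCrossesCTAs records. In outline (reductionDone/notFirstRound are illustrative stand-ins for the loop conditions in the real code):

bool lastCvtCrossesCTAs = false;
while (!reductionDone()) {   // stand-in for the getOutDimSizes()[axis] test
  if (notFirstRound())       // stand-in for the `i > 0` shmem-reuse test
    sync(rewriter, loc, /*crossCTA=*/lastCvtCrossesCTAs);
  accs = convertLayoutValues(loc, rewriter, op, regLl, tmpLl, accs);
  // If this convert was not already block-synchronized, its shared-memory
  // traffic crossed CTAs, so the *next* barrier must be a cluster barrier.
  lastCvtCrossesCTAs = !mlir::isCvtDimSync(regLl, tmpLl, kBlock);
}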
