Skip to content

Commit 0bc402c

Browse files
authored
[ConSan] First pass at improving ConSan compile times (#9366)
When compiling `01-attention-forward.py` with consan enabled, before: ``` # times.ir_initialization=790100 0.79 sec # stage='ttgir' duration=182893 0.18 sec # stage='llir' duration=99643940 99.6 sec # stage='llvmir' duration=27658211 27.7 sec # stage='ptx' duration=11120513 11.1 sec # stage='cubin' duration=1149355658 19.15 min ``` After: ``` # times.ir_initialization=796533 0.79 sec # stage='ttgir' duration=184720 0.18 sec # stage='llir' duration=16957192 17.0 sec # stage='llvmir' duration=3735579 3.74 sec # stage='ptx' duration=1972309 1.97 sec # stage='cubin' duration=34357675 34.36 sec ``` This PR does quite a number of things at once: * Custom CanonicalizeLLVMIR pass that adds a pattern for `select %false|%true, %a, %b` since LLVM dialect is missing this (and is opposed to adding it) * Cache global constants to avoid creating many copies of the same string when lowering asserts * Fix warp specialize lowering to handle function calls and deduplicate barrier lowering code between NVIDIA and AMD backends. To support function calls, non-kernel functions are rewritten to accept a barrier handle argument that is passed down from the call site * Rewrite `createMultiColumnMask` to generate a constant tensor rather than computing it from a bunch of `make_range` and masking. This single function was generating gigabytes of IR * Pick warp-local layouts in consan instrumentation. Previously, consan used thread-local layouts where every thread has a copy of the tensor. This was to avoid using shared memory. We can switch to warp-local layouts where each warp has a copy of the tensor distributed across its threads to reduce the generated IR (and register usage) by a factor 32, plus some extra IR needed for shuffles. * To support warp-local layouts, I added a two flags: `uniform` to `tt.assert` and replaced `tti.experiment_assert_in_thread` with a `tt.reduce` + `tt.assert uniform`. 
Uniform just means that only the first thread in the warp group will trigger the assert, since the condition is uniform. * I added an `always_use_warp_shuffle` function-level flag to force `convert_layout` lowering to use warp shuffles even when the performance heuristic picks shared memory, so that the layout conversions inside consan helpers avoid shared memory * Changed the lowering of `arith.constant` with a non-splat dense elements attribute to generate a constant global array from which each thread loads * Moved generation of global stores in the main function into a helper function to reduce bloat by deduplicating. This also enables separate compilation later.
1 parent 4804627 commit 0bc402c

32 files changed

Lines changed: 1079 additions & 753 deletions

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
8686
mlir::triton::gpu::registerAllocateSharedMemoryPass();
8787
mlir::triton::gpu::registerTritonGPUAllocateWarpGroups();
8888
mlir::triton::gpu::registerTritonGPUGlobalScratchAllocationPass();
89+
mlir::triton::gpu::registerCanonicalizeLLVMIR();
8990
mlir::triton::registerConvertWarpSpecializeToLLVM();
9091
mlir::triton::registerConvertTritonGPUToLLVMPass();
9192
mlir::triton::registerConvertNVGPUToLLVMPass();

include/triton/Conversion/TritonGPUToLLVM/Passes.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,8 @@ def TritonGPUAllocateWarpGroups : Pass<"tritongpu-allocate-warp-groups", "mlir::
4242
}];
4343
}
4444

45+
def CanonicalizeLLVMIR : Pass<"canonicalize-llvm-ir", "mlir::LLVM::LLVMFuncOp"> {
46+
let summary = "Canonicalize LLVM IR";
47+
}
48+
4549
#endif

include/triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,7 @@ void populatePrintOpToLLVMPattern(LLVMTypeConverter &typeConverter,
103103
PatternBenefit benefit);
104104

105105
void populateInstrumentationToLLVMPatterns(LLVMTypeConverter &typeConverter,
106-
const TargetInfoBase &targetInfo,
107-
RewritePatternSet &patterns,
108-
PatternBenefit benefit);
106+
RewritePatternSet &patterns);
109107

110108
} // namespace triton
111109
} // namespace mlir

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,9 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
342342

343343
// Multiply a square layout with 1 input and output dimension with a vector
344344
Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x);
345+
346+
// Whether the convert layout should be forced to use warp shuffles.
347+
bool cvtAlwaysUseWarpShuffle(triton::gpu::ConvertLayoutOp cvt);
345348
} // namespace gpu
346349

347350
} // namespace triton
@@ -442,6 +445,9 @@ Value linearize(RewriterBase &rewriter, Location loc, ArrayRef<Value> multiDim,
442445
size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
443446
ArrayRef<unsigned> order);
444447

448+
GlobalOp getOrInsertGlobalConstant(RewriterBase &rewriter, ModuleOp module,
449+
Type type, Attribute content, StringRef key);
450+
445451
Value addStringToModule(Location loc, RewriterBase &rewriter, StringRef key,
446452
StringRef content);
447453

@@ -630,6 +636,14 @@ SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
630636
mlir::TypeID::get<TerminatorOp>(), loc);
631637
}
632638

639+
// #prevBlock
640+
// if (condition) {
641+
// #ifBlock
642+
// }
643+
// #thenBlock
644+
std::tuple</*prevBlock=*/Block *, /*ifBlock=*/Block *, /*thenBlock=*/Block *>
645+
createIfBlock(ConversionPatternRewriter &b, Location loc, Value cnd);
646+
633647
void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
634648
ConversionPatternRewriter &rewriter,
635649
SmallVector<Value> &resultVals,

include/triton/Conversion/TritonGPUToLLVM/WarpSpecializeUtility.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,39 @@
1010
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
1111
#include "llvm/ADT/SetVector.h"
1212
#include <functional>
13+
#include <optional>
1314

1415
namespace mlir {
1516
namespace triton {
1617

1718
// Forward declaration
1819
class TritonLLVMIRRewriter;
1920

21+
//===----------------------------------------------------------------------===//
22+
// lowerWarpSpecializeBarriers
23+
//===----------------------------------------------------------------------===//
24+
25+
class WarpSpecializeBarrierHelper {
26+
public:
27+
virtual ~WarpSpecializeBarrierHelper() = default;
28+
29+
virtual bool isBarrierOp(Operation *op) const = 0;
30+
virtual Type getBarrierHandleType(MLIRContext *ctx) const = 0;
31+
virtual FailureOr<Value>
32+
getBarrierHandle(TritonLLVMIRRewriter &b,
33+
std::optional<unsigned> partitionIdx) = 0;
34+
virtual void createBarrier(TritonLLVMIRRewriter &b, unsigned numWarps,
35+
Value handle) = 0;
36+
LogicalResult createBarrier(TritonLLVMIRRewriter &b, unsigned numWarps,
37+
std::optional<unsigned> partitionIdx);
38+
};
39+
40+
// Assign hardware barriers to each warp group and rewrite warp group barriers
41+
// into named barrier instructions. There is a maximum number of named barriers.
42+
LogicalResult
43+
lowerWarpSpecializeBarriers(ModuleOp module,
44+
WarpSpecializeBarrierHelper &barrierHelper);
45+
2046
//===----------------------------------------------------------------------===//
2147
// convertOpTypes
2248
//===----------------------------------------------------------------------===//

include/triton/Dialect/TritonInstrument/IR/FunctionBuilder.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ namespace mlir::triton {
2222
class FuncOp;
2323

2424
namespace instrument {
25+
std::string mangleType(Type t);
2526

2627
class ManglingArgs {
2728
public:
28-
using Arg = std::variant<Type, int, std::string>;
29+
using Arg = std::variant<Type, uint64_t, std::string>;
2930

3031
ManglingArgs() = default;
3132
ManglingArgs(const ManglingArgs &) = default;
@@ -51,9 +52,8 @@ class ManglingArgs {
5152

5253
std::string mangleArg(Arg arg) const {
5354
if (auto type = std::get_if<Type>(&arg)) {
54-
auto hash = static_cast<uint64_t>(mlir::hash_value(*type));
55-
return std::string("_T") + llvm::utohexstr(hash);
56-
} else if (auto intVal = std::get_if<int>(&arg)) {
55+
return std::string("_") + mangleType(*type);
56+
} else if (auto intVal = std::get_if<uint64_t>(&arg)) {
5757
return std::string("_I") + std::to_string(*intVal);
5858
} else if (auto stringVal = std::get_if<std::string>(&arg)) {
5959
return *stringVal;
@@ -74,18 +74,14 @@ class ManglingArgs {
7474
SmallVector<Arg> args;
7575
};
7676

77-
/// Utility to mangle helper function names produced by the instrumentation
78-
/// passes. The mangled name encodes the base name, number of warps and the
79-
/// participating types.
80-
std::string mangleInstrumentHelperName(const std::string &baseName,
81-
int numWarps,
82-
llvm::ArrayRef<Type> types);
83-
8477
class FunctionBuilder {
8578
public:
8679
FunctionBuilder(ModuleOp module, AuxDataMap &auxData)
8780
: module(module), auxData(auxData) {}
8881

82+
// Create a function that fills a global tensor with a scalar value.
83+
void createFillGlobalTensorCall(ImplicitLocOpBuilder &b, Value ptr,
84+
RankedTensorType type, Value scalar);
8985
// setWaiting: mark the base thread as waiting on the given barrier phase and
9086
// record that phase for deadlock detection.
9187
void createSetWaitingCall(ImplicitLocOpBuilder &b, Value mbar, int thread,

include/triton/Dialect/TritonInstrument/IR/TritonInstrumentOps.td

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,17 @@ class TTI_Op<string mnemonic, list<Trait> traits = []> :
2121
Op<TritonInstrument_Dialect, mnemonic, traits> {
2222
}
2323

24-
def TTI_ExperimentalAssertInThreadOp : TTI_Op<"experimental_assert_in_thread", [MemoryEffects<[MemWrite<GlobalMemory>]>]> {
25-
let summary = "assert the condition within the current thread";
24+
def TTI_ExperimentalAssertUniformOp : TTI_Op<"experimental_assert_uniform", [MemoryEffects<[MemWrite<GlobalMemory>]>]> {
25+
let summary = "assert the uniform condition";
2626
let description = [{
27-
Assert that the condition is true given all the values are available in the current thread.
28-
If the condition is false, the message is printed, and the program is aborted.
29-
If check_any is true, any of the values in the condition must be true. Otherwise, all the
30-
values in the condition must be true.
27+
Assert that the condition is true given all threads in the warp group have
28+
the same value, so only one thread needs to evaluate the assert and print
29+
the message.
3130
}];
32-
let arguments = (ins AnyTypeOf<[I1, I1Tensor]>:$condition, StrAttr:$message, BoolAttr:$check_any);
33-
let assemblyFormat = "$condition `,` $message attr-dict `:` type($condition)";
31+
let arguments = (ins I1:$condition, StrAttr:$message);
32+
let assemblyFormat = "$condition `,` $message attr-dict-with-keyword";
3433
}
3534

36-
3735
def TTI_ExperimentalBufferDescriptorsOp
3836
: TTI_Op<"experimental_buffer_descriptors", [Pure]> {
3937
let summary = "define an array of buffer descriptors";

include/triton/Dialect/TritonInstrument/IR/Utility.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <array>
1010

1111
namespace mlir::triton::instrument {
12+
class FunctionBuilder;
1213

1314
constexpr int numMemTypes = getMaxEnumValForMemType() + 1;
1415

@@ -22,18 +23,22 @@ namespace CommitKind {
2223
enum Kind { None = -1, AsyncCp = 0, Wgmma, TmaStore, NumCommitKinds };
2324
}
2425

26+
void createAssertInThread(ImplicitLocOpBuilder &b, Value condition,
27+
StringRef message);
2528
Operation *createStoreScratchMemory(OpBuilder &b, Location loc, Value alloc,
2629
Value tensor, RankedTensorType tensorType);
2730
Value createLoadScratchMemory(OpBuilder &b, Location loc, Value alloc,
2831
RankedTensorType tensorType);
2932
Value expandOuterSlicedDim(OpBuilder &b, Location loc, Value tensor);
33+
RankedTensorType getIntTensorType(Region *region, ArrayRef<int64_t> shape,
34+
unsigned bitWidth);
3035
TypedValue<RankedTensorType> createConstIntTensor(OpBuilder &builder,
3136
Location loc, int64_t val,
3237
RankedTensorType tensorType,
3338
bool isSigned = false);
3439
FuncOp getEntryPoint(ModuleOp module);
3540
gpu::DistributedEncodingTrait
36-
getSingleDimSliceEncoding(gpu::BlockedEncodingAttr encoding, int dim);
41+
getSingleDimSliceEncoding(gpu::DistributedEncodingTrait encoding, int dim);
3742

3843
struct ValueType {
3944
Value value;
@@ -82,7 +87,8 @@ struct AuxDataMap {
8287
RegionToValueMap waiting;
8388
std::array<bool, numMemTypes> hasNonTrivialAliasing{};
8489

85-
void populateAndPassToWarpSpecialize(ModuleOp module);
90+
void populateAndPassToWarpSpecialize(ModuleOp module,
91+
FunctionBuilder &funcBuilder);
8692

8793
private:
8894
void getBuffersAndBarriers(

lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp

Lines changed: 5 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,7 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
3131
rewriter, loc, elemTy,
3232
rewriter.getZeroAttr(elemTy))));
3333
} else {
34-
assert(false && "Unsupported type for assert");
35-
return failure();
34+
return op->emitError("Unsupported type for assert");
3635
}
3736
}
3837
llAssert(op, condition, adaptor.getMessage(), rewriter);
@@ -49,11 +48,11 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
4948
}
5049
// op: the op at which the assert is inserted. Unlike printf, we need to
5150
// know about the op to split the block.
52-
void llAssert(Operation *op, Value condition, StringRef message,
51+
void llAssert(AssertOp op, Value condition, StringRef message,
5352
ConversionPatternRewriter &rewriter) const {
5453

55-
auto ctx = rewriter.getContext();
5654
auto loc = op->getLoc();
55+
auto b = TritonLLVMOpBuilder(loc, rewriter);
5756

5857
StringRef file = "unknown";
5958
StringRef func = "unknown";
@@ -72,24 +71,13 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
7271
col = fileLineColLoc.getColumn();
7372
}
7473

75-
// #block1
76-
// if (condition) {
77-
// #block2
78-
// __assertfail(message);
79-
// }
80-
// #block3
81-
Block *prevBlock = op->getBlock();
74+
auto [prevBlock, ifBlock, thenBlock] =
75+
createIfBlock(rewriter, loc, condition);
8276

83-
Block *ifBlock = rewriter.splitBlock(prevBlock, op->getIterator());
8477
rewriter.setInsertionPointToStart(ifBlock);
8578
targetInfo.assertFail(rewriter, loc, message, file, func, line);
8679

8780
// Split a block after the call.
88-
Block *thenBlock = rewriter.splitBlock(ifBlock, op->getIterator());
89-
rewriter.setInsertionPointToEnd(ifBlock);
90-
LLVM::BrOp::create(rewriter, loc, thenBlock);
91-
rewriter.setInsertionPointToEnd(prevBlock);
92-
LLVM::CondBrOp::create(rewriter, loc, condition, ifBlock, thenBlock);
9381
rewriter.setInsertionPointToStart(thenBlock);
9482
}
9583

lib/Conversion/TritonGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ add_triton_library(TritonGPUToLLVM
55
AllocateSharedMemoryUtility.cpp
66
AllocateWarpGroups.cpp
77
AssertOpToLLVM.cpp
8+
CanonicalizeLLVMIR.cpp
89
ControlFlowOpToLLVM.cpp
910
ConvertLayoutOpToLLVM.cpp
1011
ElementwiseOpToLLVM.cpp

0 commit comments

Comments
 (0)