Skip to content

Commit ad5ae7a

Browse files
authored
Merge branch 'main' into sjw/tensdesc-names
2 parents ee6c54a + fb5c197 commit ad5ae7a

90 files changed

Lines changed: 1609 additions & 619 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ jobs:
8585
~/.triton/json
8686
key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
8787
- name: Install dependencies
88-
run: apt-get install -y clang lld ccache
88+
run: apt-get update && apt-get install -y clang lld ccache
8989
- name: Inspect cache directories
9090
run: |
9191
mkdir -p ~/.triton

.github/workflows/wheels.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ permissions: read-all
1212
jobs:
1313

1414
Build-Wheels:
15-
timeout-minutes: 120
15+
timeout-minutes: 180
1616
runs-on: ${{ matrix.config.runs_on }}
1717

1818
strategy:
@@ -99,12 +99,12 @@ jobs:
9999
path: ./wheelhouse/*.whl
100100

101101
- name: Install Azure CLI
102-
if: ${{ steps.check-version.outputs.new_commit == 'true' }}
102+
if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
103103
run: |
104104
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
105105
106106
- name: Azure login
107-
if: ${{ steps.check-version.outputs.new_commit == 'true' }}
107+
if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
108108
uses: azure/login@v2
109109
with:
110110
client-id: ${{ secrets.AZURE_CLIENT_ID }}
@@ -113,20 +113,20 @@ jobs:
113113

114114
- id: generate-token
115115
name: Generate token
116-
if: ${{ steps.check-version.outputs.new_commit == 'true' }}
116+
if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
117117
run: |
118118
AZ_TOKEN=$(az account get-access-token --query accessToken)
119119
echo "::add-mask::$AZ_TOKEN"
120120
echo "access_token=$AZ_TOKEN" >> "$GITHUB_OUTPUT"
121121
122122
- name: Publish wheels to Azure DevOps
123-
if: ${{ steps.check-version.outputs.new_commit == 'true' }}
123+
if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
124124
run: |
125125
python3 -m pip install twine
126126
python3 -m twine upload -r Triton-Nightly -u TritonArtifactsSP -p ${{ steps.generate-token.outputs.access_token }} --config-file utils/nightly.pypirc --non-interactive --verbose wheelhouse/*
127127
128128
- name: Azure Logout
129-
if: ${{ steps.check-version.outputs.new_commit == 'true' && (success() || failure()) }}
129+
if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && (success() || failure()) }}
130130
run: |
131131
az logout
132132
az cache purge

include/triton/Dialect/TritonInstrument/IR/FunctionBuilder.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,10 +93,23 @@ class FunctionBuilder {
9393
// matching barrier phases.
9494
void createCheckAllActiveWaitingCall(ImplicitLocOpBuilder &b, int activeMask,
9595
Value pred, Operation *insertPoint);
96+
// verifyBarrierCanInit: ensure the barrier is currently invalidated before
97+
// initializing it again.
98+
void createVerifyBarrierCanInitCall(ImplicitLocOpBuilder &b, Value mbar,
99+
Operation *insertPoint);
100+
// verifyBarrierInitialized: ensure the barrier has been initialized and not
101+
// invalidated before it is used.
102+
void createVerifyBarrierInitializedCall(ImplicitLocOpBuilder &b, Value mbar,
103+
Value pred, Operation *insertPoint);
96104
// initBarrierState: Initialize the tracked barrier state to phase 0 and set
97-
// both the initial and current arrival counts.
105+
// both the initial and current arrival counts. A zero state denotes an
106+
// invalidated/uninitialized barrier.
98107
void createInitBarrierStateCall(ImplicitLocOpBuilder &b, Value mbar,
99108
int count, Operation *insertPoint);
109+
// invalidateBarrierState: clear the tracked barrier lifecycle state and any
110+
// waiting bits for the barrier.
111+
void createInvalidateBarrierStateCall(ImplicitLocOpBuilder &b, Value mbar,
112+
Operation *insertPoint);
100113
// verifyBarrierArrive: Check that applying the arrive count would not drive
101114
// the tracked current count negative. Triggers an assertion on failure.
102115
void createVerifyBarrierArriveCall(ImplicitLocOpBuilder &b, Value mbar,
@@ -145,6 +158,16 @@ class FunctionBuilder {
145158
void createTrackVisibleReadsCall(ImplicitLocOpBuilder &b, Value mbar,
146159
int thread, Value pred, MemType memType,
147160
Operation *insertPoint);
161+
// clearBarrierWriteTracking: clear all write tracking associated with the
162+
// given barrier row.
163+
void createClearBarrierWriteTrackingCall(ImplicitLocOpBuilder &b, Value mbar,
164+
Value pred, MemType memType,
165+
Operation *insertPoint);
166+
// clearBarrierReadTracking: clear all read tracking associated with the
167+
// given barrier row.
168+
void createClearBarrierReadTrackingCall(ImplicitLocOpBuilder &b, Value mbar,
169+
Value pred, MemType memType,
170+
Operation *insertPoint);
148171
// transferVisibleWrites: transfer write visibility tracked by a barrier to
149172
// all threads in threadMask.
150173
void createTransferVisibleWritesCall(ImplicitLocOpBuilder &b, Value mbar,

include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ LinearLayout getTileLayout(MLIRContext *ctx, TMemAccessAtom atom, bool unpacked,
117117

118118
TMemAllocation getTmemAllocSizes(gpu::MemDescType memDescType);
119119

120+
uint32_t getTMemSubSliceOffset(gpu::MemDescType memDescType, int32_t nOffset);
121+
120122
SmallVector<gpu::DistributedEncodingTrait>
121123
getTmemCompatibleLayouts(gpu::MemDescType memType, unsigned numWarps,
122124
ArrayRef<int64_t> ctaSplit = {1, 1});

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -896,7 +896,8 @@ def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEf
896896
}];
897897
}
898898

899-
def TTNG_TMEMSubSliceOp : TTNG_Op<"tmem_subslice", [Pure]> {
899+
def TTNG_TMEMSubSliceOp : TTNG_Op<"tmem_subslice", [Pure,
900+
MemDescViewTrait]> {
900901
let summary = "Take a subslice of a tensor memory allocation";
901902
let description = [{
902903
This operation takes a subslice of a tensor memory allocation and returns a new descriptor

lib/Analysis/BufferRegion.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,12 @@ llvm::DenseSet<Value> getBarrierOperands(Operation *op) {
6363
if (auto initBarrierOp = dyn_cast<ttng::InitBarrierOp>(op)) {
6464
return {initBarrierOp.getOperand()};
6565
}
66+
if (auto waitBarrierOp = dyn_cast<ttng::WaitBarrierOp>(op)) {
67+
return {waitBarrierOp.getAlloc()};
68+
}
69+
if (auto arriveBarrierOp = dyn_cast<ttng::ArriveBarrierOp>(op)) {
70+
return {arriveBarrierOp.getAlloc()};
71+
}
6672
if (auto barrierExpectOp = dyn_cast<ttng::BarrierExpectOp>(op)) {
6773
return {barrierExpectOp.getAlloc()};
6874
}
@@ -269,7 +275,8 @@ LogicalResult BufferRegionAnalysis::visitOperation(
269275
if (auto tmemSubsliceOp = dyn_cast<ttng::TMEMSubSliceOp>(op)) {
270276
RegionInfo in = operands[0]->getValue();
271277
uint32_t subBufferSize = getMemDescSize(tmemSubsliceOp.getType());
272-
uint32_t relativeOffset = tmemSubsliceOp.getN();
278+
uint32_t relativeOffset = ttng::getTMemSubSliceOffset(
279+
tmemSubsliceOp.getType(), tmemSubsliceOp.getN());
273280
for (auto &region : in.regions) {
274281
regionInfo.regions.insert(
275282
{region.baseOffset + relativeOffset, subBufferSize});
@@ -326,8 +333,8 @@ bool BufferRegionAnalysis::isMemoryAccessOperation(Operation *op) {
326333
ttng::TMEMStoreOp, ttg::AsyncCopyGlobalToLocalOp,
327334
ttng::AsyncTMACopyGlobalToLocalOp, ttng::AsyncTMACopyLocalToGlobalOp,
328335
ttng::AsyncTMAGatherOp, ttng::AsyncTMAScatterOp, ttng::InitBarrierOp,
329-
ttng::BarrierExpectOp, ttng::InvalBarrierOp, ttng::WaitBarrierOp>(
330-
op)) {
336+
ttng::BarrierExpectOp, ttng::InvalBarrierOp, ttng::WaitBarrierOp,
337+
ttng::ArriveBarrierOp>(op)) {
331338
return true;
332339
}
333340
// Allocations with operands write to the memory.

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -549,13 +549,9 @@ struct MemDescSubsliceOpConversion
549549
matchAndRewrite(triton::gpu::MemDescSubsliceOp op, OpAdaptor adaptor,
550550
ConversionPatternRewriter &rewriter) const override {
551551
Location loc = op->getLoc();
552-
auto *ctx = op->getContext();
553552
auto b = TritonLLVMOpBuilder(loc, rewriter);
554553
auto srcTy = op.getSrc().getType();
555-
auto destTy = op.getResult().getType();
556554
auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
557-
auto layoutOrder = getOrder(srcTy);
558-
auto enc = srcTy.getEncoding();
559555

560556
// PartitionedSharedEncoding is not yet supported for memdesc_subslice
561557
if (isa<PartitionedSharedEncodingAttr>(srcTy.getEncoding())) {

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,9 @@ LogicalResult MemDescIndexOp::verify() {
881881
if (srcTy.getElementType() != dstTy.getElementType()) {
882882
return emitError("result element type must match desc element type");
883883
}
884+
if (srcTy.getEncoding() != dstTy.getEncoding()) {
885+
return emitError("src and result must have the same encoding");
886+
}
884887
// memdesc_index reduces rank by 1 and preserves the trailing shape.
885888
bool correctRank = srcTy.getRank() == dstTy.getRank() + 1;
886889
if (!correctRank) {
@@ -955,6 +958,9 @@ LogicalResult MemDescSubsliceOp::verify() {
955958
if (srcTy.getElementType() != dstTy.getElementType()) {
956959
return emitError("result element type must match desc element type");
957960
}
961+
if (srcTy.getEncoding() != dstTy.getEncoding()) {
962+
return emitError("src and result must have the same encoding");
963+
}
958964
if (getOffsets().size() != srcTy.getRank()) {
959965
return emitError("offsets must have the same rank as input");
960966
}
@@ -993,6 +999,9 @@ LogicalResult MemDescSubsliceOp::verify() {
993999
if (offset & (dstTy.getDimSize(dim) - 1)) {
9941000
return emitError("The split offset may not touch the tile");
9951001
}
1002+
if (offset >= srcTy.getDimSize(dim)) {
1003+
return emitError("The split offset may not exceed the source shape");
1004+
}
9961005
}
9971006
}
9981007

lib/Dialect/TritonGPU/IR/Types.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,17 +128,20 @@ LogicalResult MemDescType::verify(function_ref<InFlightDiagnostic()> emitError,
128128
<< "bitwidth * colStride must be less than or equal to 32. Got "
129129
<< bitwidth << " and " << enc.getColStride();
130130
}
131-
shape = shape.take_back(2);
131+
// Takes subslices into account and figures out whether we can construct
132+
// the linear layout at all
132133
allocShape = allocShape.take_back(2);
133134
auto ctaSplit = enc.getCGALayout().getCTASplitNum();
135+
auto blockN = std::min<int32_t>(enc.getBlockN(), shape.back());
134136
if (allocShape[0] < enc.getBlockM() * ctaSplit[0] ||
135-
allocShape[1] < enc.getBlockN() * ctaSplit[1]) {
137+
allocShape[1] < blockN * ctaSplit[1]) {
136138
return emitError() << "the allocation shape must be at least "
137139
<< enc.getBlockM() * ctaSplit[0] << "x"
138-
<< enc.getBlockN() * ctaSplit[1] << ". Got "
139-
<< allocShape;
140+
<< blockN * ctaSplit[1] << ". Got " << allocShape;
140141
}
142+
// Checks the layout of the allocation
141143
auto ll = toLinearLayout(allocShape, enc);
144+
// Sanity check that the layout is of the right shape
142145
auto dims = standardOutDimNames(ctx, 2);
143146
if (ll.getOutDimSize(dims[0]) != allocShape[0] ||
144147
ll.getOutDimSize(dims[1]) != allocShape[1]) {

0 commit comments

Comments (0)