Skip to content

Commit 89d154d

Browse files
authored
[Canonicalize] Transform ptr_to_int->add->int_to_ptr to addptr (#9971)
Add canonicalization pattern for IntToPtrOp that recognizes the pattern: int_to_ptr(addi(ptr_to_int(ptr), constant_offset)) and transforms it to: addptr(ptr, element_offset) where element_offset = constant_offset / element_size_bytes. This pattern appears when performing pointer arithmetic via integer operations (e.g., adding byte offsets to pointers). By canonicalizing to addptr, AxisInfoAnalysis can correctly track contiguity, enabling proper vectorization for operations like async_copy_local_to_global. The pattern only applies when: - The offset is a compile-time constant (IntegerAttr or SplatElementsAttr) - The byte offset is evenly divisible by the element size Added to both standard canonicalize and gluon-canonicalize passes. Tests added for positive cases (f32, f16, commutative) and negative cases (non-constant offset, indivisible offset).
1 parent 486d972 commit 89d154d

4 files changed

Lines changed: 183 additions & 0 deletions

File tree

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ def TT_IntToPtrOp : TT_Op<"int_to_ptr", [Elementwise,
4848
let results = (outs TT_PtrLike:$result);
4949

5050
let assemblyFormat = "$src attr-dict `:` type($src) `->` type($result)";
51+
52+
let hasCanonicalizer = 1;
5153
}
5254

5355
def TT_PtrToIntOp : TT_Op<"ptr_to_int", [Elementwise,

lib/Dialect/Gluon/Transforms/Canonicalize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ void Canonicalize::runOnOperation() {
5757
StoreOp::getCanonicalizationPatterns(patterns, ctx);
5858
BroadcastOp::getCanonicalizationPatterns(patterns, ctx);
5959
ExpandDimsOp::getCanonicalizationPatterns(patterns, ctx);
60+
IntToPtrOp::getCanonicalizationPatterns(patterns, ctx);
6061
ttg::WarpSpecializeOp::getCanonicalizationPatterns(patterns, ctx);
6162
ttg::WarpSpecializePartitionsOp::getCanonicalizationPatterns(patterns, ctx);
6263

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,113 @@ void MakeTensorDescOp::build(OpBuilder &builder, OperationState &state,
10951095
return build(builder, state, descTy, base, shape, strides, paddingAttr);
10961096
}
10971097

1098+
//-- IntToPtrOp --
1099+
// Pattern 1: int_to_ptr(ptr_to_int(ptr)) -> ptr
1100+
// Eliminates round-trip pointer conversions
1101+
struct CanonicalizeIntToPtrOfPtrToInt : public OpRewritePattern<IntToPtrOp> {
1102+
CanonicalizeIntToPtrOfPtrToInt(MLIRContext *context)
1103+
: OpRewritePattern<IntToPtrOp>(context, 1) {}
1104+
1105+
LogicalResult matchAndRewrite(IntToPtrOp intToPtrOp,
1106+
PatternRewriter &rewriter) const override {
1107+
// Match: int_to_ptr(ptr_to_int(ptr))
1108+
auto ptrToIntOp = intToPtrOp.getSrc().getDefiningOp<PtrToIntOp>();
1109+
if (!ptrToIntOp)
1110+
return failure();
1111+
1112+
// Replace with the original pointer
1113+
rewriter.replaceOp(intToPtrOp, ptrToIntOp.getSrc());
1114+
return success();
1115+
}
1116+
};
1117+
1118+
// Pattern 2: int_to_ptr(addi(val, constant_offset)) -> addptr(int_to_ptr(val),
1119+
// element_offset). Only when offset is constant and divisible by element size
1120+
struct CanonicalizeIntToPtrWithAdd : public OpRewritePattern<IntToPtrOp> {
1121+
CanonicalizeIntToPtrWithAdd(MLIRContext *context)
1122+
: OpRewritePattern<IntToPtrOp>(context, 1) {}
1123+
1124+
LogicalResult matchAndRewrite(IntToPtrOp intToPtrOp,
1125+
PatternRewriter &rewriter) const override {
1126+
// Match: int_to_ptr(addi(val, constant_offset))
1127+
auto addOp = intToPtrOp.getSrc().getDefiningOp<arith::AddIOp>();
1128+
if (!addOp)
1129+
return failure();
1130+
1131+
Value intValue = addOp.getLhs();
1132+
Value offsetValue = addOp.getRhs();
1133+
1134+
// Get the element size from the result pointer type
1135+
auto resultType = intToPtrOp.getType();
1136+
auto ptrType = cast<PointerType>(getElementTypeOrSelf(resultType));
1137+
int64_t elemSizeBits = triton::getPointeeBitWidth(ptrType);
1138+
int64_t elemSizeBytes = std::max<int64_t>(1, elemSizeBits / 8);
1139+
1140+
// Check if offset is a constant (either directly or via splat)
1141+
// Only apply canonicalization for constant offsets
1142+
std::optional<int64_t> constantByteOffset;
1143+
if (auto constOp = offsetValue.getDefiningOp<arith::ConstantOp>()) {
1144+
if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue())) {
1145+
constantByteOffset = intAttr.getValue().getSExtValue();
1146+
} else if (auto splatAttr =
1147+
dyn_cast<SplatElementsAttr>(constOp.getValue())) {
1148+
constantByteOffset =
1149+
splatAttr.getSplatValue<IntegerAttr>().getValue().getSExtValue();
1150+
}
1151+
}
1152+
1153+
if (!constantByteOffset.has_value())
1154+
return failure(); // Only handle constant offsets
1155+
1156+
// Check if the byte offset is divisible by element size
1157+
if (constantByteOffset.value() % elemSizeBytes != 0)
1158+
return failure();
1159+
1160+
// Compute element offset at compile time
1161+
int64_t elementOffset = constantByteOffset.value() / elemSizeBytes;
1162+
1163+
// Create int_to_ptr(val) for the base
1164+
auto loc = intToPtrOp.getLoc();
1165+
Value basePtr = IntToPtrOp::create(rewriter, loc, resultType, intValue);
1166+
1167+
// Create the element offset constant
1168+
Value elementOffsetValue;
1169+
1170+
// Get the integer type from the offset value to match its type
1171+
Type offsetElemType;
1172+
if (auto tensorType = dyn_cast<RankedTensorType>(offsetValue.getType())) {
1173+
offsetElemType = tensorType.getElementType();
1174+
} else {
1175+
offsetElemType = offsetValue.getType();
1176+
}
1177+
1178+
if (auto tensorType = dyn_cast<RankedTensorType>(resultType)) {
1179+
// Create a splat constant for tensor types, matching the offset's type
1180+
auto offsetAttr = rewriter.getIntegerAttr(offsetElemType, elementOffset);
1181+
auto splatType = RankedTensorType::get(
1182+
tensorType.getShape(), offsetElemType, tensorType.getEncoding());
1183+
auto splatAttr = SplatElementsAttr::get(splatType, offsetAttr);
1184+
elementOffsetValue = arith::ConstantOp::create(rewriter, loc, splatAttr);
1185+
} else {
1186+
// Scalar case
1187+
elementOffsetValue = arith::ConstantOp::create(
1188+
rewriter, loc,
1189+
rewriter.getIntegerAttr(offsetElemType, elementOffset));
1190+
}
1191+
1192+
// Replace with addptr(int_to_ptr(val), element_offset)
1193+
rewriter.replaceOpWithNewOp<AddPtrOp>(intToPtrOp, resultType, basePtr,
1194+
elementOffsetValue);
1195+
return success();
1196+
}
1197+
};
1198+
1199+
void IntToPtrOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                             MLIRContext *context) {
  // Register both int_to_ptr canonicalizations: round-trip elimination and
  // rewriting of constant integer offsets into addptr.
  results.add<CanonicalizeIntToPtrOfPtrToInt>(context);
  results.add<CanonicalizeIntToPtrWithAdd>(context);
}
1204+
10981205
// The following ops, including `call`, `func`, and `return` are copied and
10991206
// modified from
11001207
// https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/Func/IR/FuncOps.cpp

test/Triton/canonicalize.mlir

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,76 @@ tt.func @fold_transpose_constant() -> tensor<128x16xf32> {
173173
// CHECK-NEXT: tt.return %[[cst]] : tensor<128x16xf32>
174174
tt.return %r : tensor<128x16xf32>
175175
}
176+
// -----
177+
178+
// CHECK-LABEL: @canonicalize_int_to_ptr_of_ptr_to_int
// The ptr -> int -> ptr round trip must fold away entirely.
tt.func @canonicalize_int_to_ptr_of_ptr_to_int(%ptr: tensor<64x!tt.ptr<f32>>) -> tensor<64x!tt.ptr<f32>> {
  // CHECK-NOT: tt.ptr_to_int
  // CHECK-NOT: tt.int_to_ptr
  // CHECK: tt.return %{{.*}} : tensor<64x!tt.ptr<f32>>
  %as_int = tt.ptr_to_int %ptr : tensor<64x!tt.ptr<f32>> -> tensor<64xi64>
  %round_trip = tt.int_to_ptr %as_int : tensor<64xi64> -> tensor<64x!tt.ptr<f32>>
  tt.return %round_trip : tensor<64x!tt.ptr<f32>>
}
188+
189+
// -----
190+
191+
// CHECK-LABEL: @canonicalize_int_to_ptr_with_constant_offset_f32
// int_to_ptr(addi(ptr_to_int(ptr), constant)) becomes addptr(ptr, element_offset);
// a 16-byte offset on f32 (4 bytes each) is 4 elements.
tt.func @canonicalize_int_to_ptr_with_constant_offset_f32(%base: tensor<128x!tt.ptr<f32>>) -> tensor<128x!tt.ptr<f32>> {
  // CHECK: %[[OFFSET:.*]] = arith.constant dense<4> : tensor<128xi64>
  // CHECK-NEXT: %[[RESULT:.*]] = tt.addptr %{{.*}}, %[[OFFSET]] : tensor<128x!tt.ptr<f32>>, tensor<128xi64>
  %bytes = arith.constant dense<16> : tensor<128xi64>
  %base_int = tt.ptr_to_int %base : tensor<128x!tt.ptr<f32>> -> tensor<128xi64>
  %shifted = arith.addi %base_int, %bytes : tensor<128xi64>
  %out = tt.int_to_ptr %shifted : tensor<128xi64> -> tensor<128x!tt.ptr<f32>>
  // CHECK-NEXT: tt.return %[[RESULT]] : tensor<128x!tt.ptr<f32>>
  tt.return %out : tensor<128x!tt.ptr<f32>>
}
204+
205+
// -----
206+
207+
// CHECK-LABEL: @canonicalize_int_to_ptr_with_constant_offset_f16
// A 32-byte offset on f16 (2 bytes each) is 16 elements.
tt.func @canonicalize_int_to_ptr_with_constant_offset_f16(%base: tensor<1024x!tt.ptr<f16>>) -> tensor<1024x!tt.ptr<f16>> {
  // CHECK: %[[OFFSET:.*]] = arith.constant dense<16> : tensor<1024xi64>
  // CHECK-NEXT: %[[RESULT:.*]] = tt.addptr %{{.*}}, %[[OFFSET]] : tensor<1024x!tt.ptr<f16>>, tensor<1024xi64>
  %bytes = arith.constant dense<32> : tensor<1024xi64>
  %base_int = tt.ptr_to_int %base : tensor<1024x!tt.ptr<f16>> -> tensor<1024xi64>
  %shifted = arith.addi %base_int, %bytes : tensor<1024xi64>
  %out = tt.int_to_ptr %shifted : tensor<1024xi64> -> tensor<1024x!tt.ptr<f16>>
  // CHECK-NEXT: tt.return %[[RESULT]] : tensor<1024x!tt.ptr<f16>>
  tt.return %out : tensor<1024x!tt.ptr<f16>>
}
219+
220+
// -----
221+
222+
// CHECK-LABEL: @no_canonicalize_non_constant_offset
// Negative case: a runtime (non-constant) offset must be left untouched.
tt.func @no_canonicalize_non_constant_offset(%base: tensor<128x!tt.ptr<f32>>, %offset: tensor<128xi64>) -> tensor<128x!tt.ptr<f32>> {
  // CHECK: tt.ptr_to_int
  // CHECK-NEXT: arith.addi
  // CHECK-NEXT: tt.int_to_ptr
  %base_int = tt.ptr_to_int %base : tensor<128x!tt.ptr<f32>> -> tensor<128xi64>
  %shifted = arith.addi %base_int, %offset : tensor<128xi64>
  %out = tt.int_to_ptr %shifted : tensor<128xi64> -> tensor<128x!tt.ptr<f32>>
  tt.return %out : tensor<128x!tt.ptr<f32>>
}
233+
234+
// -----
235+
236+
// CHECK-LABEL: @no_canonicalize_indivisible_offset
// Negative case: 7 bytes is not a multiple of sizeof(f32) = 4, so the
// byte offset cannot be expressed as a whole element offset.
tt.func @no_canonicalize_indivisible_offset(%base: tensor<128x!tt.ptr<f32>>) -> tensor<128x!tt.ptr<f32>> {
  // CHECK: tt.ptr_to_int
  // CHECK-NEXT: arith.addi
  // CHECK-NEXT: tt.int_to_ptr
  %bytes = arith.constant dense<7> : tensor<128xi64>
  %base_int = tt.ptr_to_int %base : tensor<128x!tt.ptr<f32>> -> tensor<128xi64>
  %shifted = arith.addi %base_int, %bytes : tensor<128xi64>
  %out = tt.int_to_ptr %shifted : tensor<128xi64> -> tensor<128x!tt.ptr<f32>>
  tt.return %out : tensor<128x!tt.ptr<f32>>
}

0 commit comments

Comments
 (0)