[NFC] Correct async_copy_global_to_local operand type inference conditions (#6448)

awayzjj · web-flow · commit 7a83ed78d95e · 2025-04-25T01:26:50.000Z
The `async_copy_global_to_local` operation currently exhibits incorrect
type inference behavior when the optional `mask` or `other` operands are
provided.

Adjust the operand count checks to:
- `<= 2` for `mask`, since it is the 3rd operand
- `<= 3` for `other`, since it is the 4th operand

---------

Co-authored-by: junjian.zhan <junjian.zhan@iluvatar.com>
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -77,12 +77,10 @@ def TTG_AsyncCommitGroupOp : TTG_Op<"async_commit_group"> {
 
 def TTG_AsyncCopyGlobalToLocalOp : TTG_Op<"async_copy_global_to_local", [
   AttrSizedOperandSegments,
-  TypesMatchWith<"infer mask type from src type",
-                 "src", "mask", "getI1SameShape($_self)",
-                 "($_op.getOperands().size() <= 3) || std::equal_to<>()">,
-  TypesMatchWith<"infer other type from src type",
-                 "src", "other", "getPointeeType($_self)",
-                 "($_op.getOperands().size() <= 4) || std::equal_to<>()">
+  OptionalTypesMatchWith<"infer mask type from src type",
+                 "src", "mask", "getI1SameShape($_self)">,
+  OptionalTypesMatchWith<"infer other type from src type",
+                 "src", "other", "getPointeeType($_self)">,
 ]> {
   let summary = "copy data from global memory to local memory asynchronously";
 
diff --git a/test/TritonGPU/invalid.mlir b/test/TritonGPU/invalid.mlir
@@ -345,3 +345,37 @@ tt.func @partition_no_terminator() {
   } : () -> ()
   tt.return
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  tt.func @async_copy_invalid_mask_type(%input: tensor<64x64x!tt.ptr<f16>, #blocked>,
+    %view: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>,
+    %invalid_mask: tensor<64x64xi32, #blocked> // expected-note {{prior use here}}
+  ) {
+    // expected-error @+1 {{expects different type than prior uses}}
+    %token = ttg.async_copy_global_to_local %input, %view mask %invalid_mask
+      : tensor<64x64x!tt.ptr<f16>, #blocked> -> <64x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+tt.func @async_copy_invalid_other_type(%input: tensor<64x64x!tt.ptr<f16>, #blocked>,
+    %view: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>,
+    %mask: tensor<64x64xi1, #blocked>,
+    %invalid_other: tensor<64x64xf32, #blocked> // expected-note {{prior use here}}
+  ) {
+  // expected-error @+1 {{expects different type than prior uses}}
+  %token = ttg.async_copy_global_to_local %input, %view mask %mask other %invalid_other : tensor<64x64x!tt.ptr<f16>, #blocked> -> <64x64xf16, #shared, #smem, mutable>
+  tt.return
+}
+}