Skip to content

Commit df82d98

Browse files
zhanglx13 and claude authored
[AMD][gfx9] Use asyncmark/wait_asyncmark for CDNA3/CDNA4 buffer_load_to_lds (#9883)
## Problem On CDNA3 (gfx942) and CDNA4 (gfx950), there are two issues with `buffer_load_to_lds` synchronization: **1. Conservative waits before `ds_read` due to sync modeling** LLVM models `buffer_load_to_lds` as a synchronous operation that writes to LDS. When it sees a subsequent `ds_read` from LDS, it assumes a potential data hazard and inserts a conservative `s_waitcnt vmcnt(0)` — even when the addresses are disjoint. The current workaround annotates `buffer_load_to_lds` and LDS loads with no-alias attributes so LLVM skips the barrier. This is fragile and relies on alias analysis heuristics. **2. Conflicting wait count management between Triton and LLVM** `buffer_load_to_lds` and regular `buffer_load` share the same hardware `vmcnt` counter. Triton manages the wait counts for `buffer_load_to_lds`, while LLVM independently manages wait counts for `buffer_load` — without accounting for the outstanding `buffer_load_to_lds` instructions on the same counter. This leads to LLVM inserting very conservative `s_waitcnt vmcnt(0)` before some `ds_read` instructions, because it doesn't know how many total outstanding loads (of both types) are pending. The no-alias workaround cannot solve this problem. ## Solution LLVM recently added `asyncmark` / `wait_asyncmark` intrinsics that properly solve both issues. When using the async-variant intrinsics (`llvm.amdgcn.raw.ptr.buffer.load.lds.async`, `llvm.amdgcn.global.load.lds.async`) alongside these markers, LLVM models LDS-bound loads as async operations rather than synchronous ones. This means: - LLVM no longer treats `buffer_load_to_lds` as an immediate LDS write, so it doesn't insert spurious waits before `ds_read` (solving issue 1) - LLVM has full visibility into both load types and can compute precise wait counts on the shared `vmcnt` counter (solving issue 2) This is a strictly better solution than the no-alias workaround for both issues. 
This PR switches CDNA3/CDNA4 to use the async intrinsic variants and asyncmark-based synchronization: - **BufferOpsEmitter**: Emit `rocdl.raw.ptr.buffer.load.async.lds` instead of `rocdl.raw.ptr.buffer.load.lds` - **LoadStoreOpToLLVM**: Emit `rocdl.global.load.async.lds` for global-to-LDS copies, `rocdl.asyncmark` for commit groups, and `rocdl.wait.asyncmark` for async waits - **UpdateAsyncWaitCount**: With asyncmark, LLVM handles vmcnt computation — Triton passes through the commit group count directly and skips the expensive `ModuleAxisInfoAnalysis` - **TargetInfo**: Add `useAsyncMarks()` feature flag to centralize the arch check (makes future arch enablement a one-line change) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 28b3589 commit df82d98

13 files changed

Lines changed: 253 additions & 221 deletions

test/Conversion/amd/async-ops-alias-scopes.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
1616
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>
1717
%mask = tt.splat %maskVal : i1 -> tensor<64x1xi1, #blocked>
1818

19-
// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
19+
// COMMON: rocdl.global.load.async.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
2020
// Check that store for 'other' has alias information set
2121
// COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
2222
%0 = ttg.async_copy_global_to_local %ptr, %arg1 mask %mask other %other : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>
@@ -41,7 +41,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
4141
%mask = tt.splat %maskVal : i1 -> tensor<8x64xi1, #blocked>
4242
%other = arith.constant dense<1.000000e+00> : tensor<8x64xf32, #blocked>
4343

44-
// COMMON: rocdl.raw.ptr.buffer.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
44+
// COMMON: rocdl.raw.ptr.buffer.load.async.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
4545
// Check that store for 'other' has alias information set
4646
// COMMON: llvm.store {{.*}} {alias_scopes = [[[$LOCAL_LOAD_SCOPE]]], {{.*}}, noalias_scopes = [[[$ASYNC_COPY_SCOPE]]]
4747
%65 = amdg.buffer_load_to_local %arg1[%arg2] mask=%mask other=%other into %arg3 : <f32>[tensor<8x64xi32, #blocked>] tensor<8x64xf32, #blocked> -> <8x64xf32, #shared, #smem, mutable>
@@ -107,7 +107,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.targ
107107
// We need the splat to allow the AxisAnalysis to work during lowering
108108
%ptr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x1x!tt.ptr<f32>, #blocked>
109109

110-
// COMMON: rocdl.global.load.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
110+
// COMMON: rocdl.global.load.async.lds {{.*}} {alias_scopes = [[[$ASYNC_COPY_SCOPE]]]
111111
%0 = ttg.async_copy_global_to_local %ptr, %arg1 : tensor<64x1x!tt.ptr<f32>, #blocked> -> <64x1xf32, #shared, #smem, mutable>
112112
%1 = ttg.async_commit_group tokens %0
113113

test/Conversion/amd/async_ops_to_llvm.mlir

Lines changed: 53 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck %s --check-prefix=GFX950
2-
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --verify-diagnostics | FileCheck %s
2+
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s
33

44
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
55
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
@@ -11,9 +11,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
1111
%arg2: !ttg.memdesc<32x64xf32, #shared, #smem, mutable>) {
1212
// We need the splat to allow the AxisAnalysis to work during lowering
1313
%1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x64x!tt.ptr<f32>, #blocked>
14-
// Each thread needs to load 8 elements and we load 1 (sizePerThread) per global.load.lds
15-
// CHECK-COUNT-8: rocdl.global.load.lds
16-
// CHECK-NOT: rocdl.global.load.lds
14+
// Each thread needs to load 8 elements and we load 1 (sizePerThread) per load.
15+
// CDNA3/CDNA4 use the async variant so LLVM tracks via asyncmark.
16+
// CHECK-COUNT-8: rocdl.global.load.async.lds
17+
// CHECK-NOT: rocdl.global.load.async.lds
1718
%2 = ttg.async_copy_global_to_local %1, %arg2 : tensor<32x64x!tt.ptr<f32>, #blocked> -> <32x64xf32, #shared, #smem, mutable>
1819
tt.return
1920
}
@@ -31,9 +32,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
3132
%arg2: !ttg.memdesc<32x64xf32, #shared, #smem, mutable>) {
3233
// We need the splat to allow the AxisAnalysis to work during lowering
3334
%1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x64x!tt.ptr<f32>, #blocked>
34-
// Each thread needs to load 8 elements and we load 1 () per global.load.lds
35-
// CHECK-COUNT-8: rocdl.global.load.lds
36-
// CHECK-NOT: rocdl.global.load.lds
35+
// Each thread needs to load 8 elements and we load 1 () per load
36+
// CHECK-COUNT-8: rocdl.global.load.async.lds
37+
// CHECK-NOT: rocdl.global.load.async.lds
3738
%2 = ttg.async_copy_global_to_local %1, %arg2 : tensor<32x64x!tt.ptr<f32>, #blocked> -> <32x64xf32, #shared, #smem, mutable>
3839
tt.return
3940
}
@@ -56,9 +57,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
5657
%4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
5758
%5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>
5859

59-
// Each thread needs to load 8 elements and we load 2 (sizePerThread) per global.load.lds
60-
// CHECK-COUNT-4: rocdl.global.load.lds
61-
// CHECK-NOT: rocdl.global.load.lds
60+
// Each thread needs to load 8 elements and we load 2 (sizePerThread) per load
61+
// CHECK-COUNT-4: rocdl.global.load.async.lds
62+
// CHECK-NOT: rocdl.global.load.async.lds
6263
%6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
6364
tt.return
6465
}
@@ -69,61 +70,31 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
6970
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
7071
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
7172
#smem = #ttg.shared_memory
72-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
73-
// GFX950-LABEL: async_copy_vectorized_8xf16
74-
tt.func public @async_copy_vectorized_8xf16(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
75-
%arg1: i32 {tt.divisibility = 16 : i32},
76-
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
77-
// We need the index calculation so AxisAnalysis sees that we can vectorize the load
78-
%1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
79-
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
80-
%3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
81-
%4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
82-
%5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>
83-
84-
// Each thread needs to load 8 elements and we load 8 (sizePerThread) per global.load.lds
85-
// GFX950: rocdl.global.load.lds
86-
// GFX950-next: llvm.return
87-
88-
// GFX942 does not support vectorization > 4bytes
89-
// expected-error@+1 {{failed to legalize operation 'ttg.async_copy_global_to_local' that was explicitly marked illegal}}
90-
%6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
91-
tt.return
92-
}
93-
}
94-
95-
// -----
96-
97-
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
98-
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
99-
#smem = #ttg.shared_memory
100-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
73+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
10174
// CHECK-LABEL: async_wait
75+
// GFX950-LABEL: async_wait
10276
tt.func public @async_wait(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
10377
%arg1: i32 {tt.divisibility = 16 : i32},
10478
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
105-
// The waitcnt stores all counters in one i32 bits 15:14 and 3:0 store the vmcnt we have to wait on
106-
// CHECK: rocdl.s.waitcnt -49168
107-
// CHECK: rocdl.s.waitcnt 49279
108-
// CHECK: rocdl.s.barrier
109-
amdg.async_wait {num_inst = 0 : i32}
110-
// CHECK: rocdl.s.waitcnt -49167
111-
// CHECK: rocdl.s.waitcnt 49279
112-
// CHECK: rocdl.s.barrier
113-
amdg.async_wait {num_inst = 1 : i32}
114-
// CHECK: rocdl.s.waitcnt -2
115-
// CHECK: rocdl.s.waitcnt 49279
116-
// CHECK: rocdl.s.barrier
117-
amdg.async_wait {num_inst = 62 : i32}
118-
// CHECK: rocdl.s.waitcnt -1
119-
// CHECK: rocdl.s.waitcnt 49279
120-
// CHECK: rocdl.s.barrier
121-
amdg.async_wait {num_inst = 63 : i32}
122-
// Check that we clamp values > 63
123-
// CHECK: rocdl.s.waitcnt -1
124-
// CHECK: rocdl.s.waitcnt 49279
125-
// CHECK: rocdl.s.barrier
126-
amdg.async_wait {num_inst = 64 : i32}
79+
// CDNA3/CDNA4 lower ttg.async_wait directly to wait_asyncmark.
80+
// The commit group count is passed through without clamping since
81+
// LLVM will compute the final waitcnt.
82+
// CHECK: rocdl.wait.asyncmark 0
83+
// GFX950: rocdl.wait.asyncmark 0
84+
ttg.async_wait {num = 0 : i32}
85+
// CHECK: rocdl.wait.asyncmark 1
86+
// GFX950: rocdl.wait.asyncmark 1
87+
ttg.async_wait {num = 1 : i32}
88+
// CHECK: rocdl.wait.asyncmark 62
89+
// GFX950: rocdl.wait.asyncmark 62
90+
ttg.async_wait {num = 62 : i32}
91+
// CHECK: rocdl.wait.asyncmark 63
92+
// GFX950: rocdl.wait.asyncmark 63
93+
ttg.async_wait {num = 63 : i32}
94+
// No clamping — LLVM handles it based on instruction count
95+
// CHECK: rocdl.wait.asyncmark 64
96+
// GFX950: rocdl.wait.asyncmark 64
97+
ttg.async_wait {num = 64 : i32}
12798
tt.return
12899
}
129100
}
@@ -133,13 +104,19 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
133104
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
134105
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
135106
#smem = #ttg.shared_memory
136-
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
107+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
137108
// CHECK-LABEL: async_commit_group
109+
// GFX950-LABEL: async_commit_group
138110
tt.func public @async_commit_group(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
139111
%arg1: i32 {tt.divisibility = 16 : i32},
140112
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
113+
// CDNA3/CDNA4 emit asyncmark for async group tracking
114+
// CHECK: rocdl.asyncmark
141115
// CHECK: llvm.mlir.constant(0 : i32) : i32
142116
// CHECK-NEXT: llvm.return
117+
// GFX950: rocdl.asyncmark
118+
// GFX950: llvm.mlir.constant(0 : i32) : i32
119+
// GFX950-NEXT: llvm.return
143120
ttg.async_commit_group
144121
tt.return
145122
}
@@ -179,25 +156,25 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
179156
// Note that mask/other alignment is 1 so we need 4 conditionals
180157

181158
// CHECK: llvm.cond_br
182-
// CHECK: rocdl.global.load.lds
159+
// CHECK: rocdl.global.load.async.lds
183160
// CHECK-NEXT: llvm.br
184161
// CHECK: llvm.cond_br
185162
// CHECK: llvm.store
186163

187164
// CHECK: llvm.cond_br
188-
// CHECK: rocdl.global.load.lds
165+
// CHECK: rocdl.global.load.async.lds
189166
// CHECK-NEXT: llvm.br
190167
// CHECK: llvm.cond_br
191168
// CHECK: llvm.store
192169

193170
// CHECK: llvm.cond_br
194-
// CHECK: rocdl.global.load.lds
171+
// CHECK: rocdl.global.load.async.lds
195172
// CHECK-NEXT: llvm.br
196173
// CHECK: llvm.cond_br
197174
// CHECK: llvm.store
198175

199176
// CHECK: llvm.cond_br
200-
// CHECK: rocdl.global.load.lds
177+
// CHECK: rocdl.global.load.async.lds
201178
// CHECK-NEXT: llvm.br
202179
// CHECK: llvm.cond_br
203180
// CHECK: llvm.store
@@ -243,31 +220,31 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
243220
// CHECK: rocdl.ds_bpermute
244221
// CHECK: rocdl.ballot
245222
// CHECK: llvm.cond_br
246-
// CHECK: rocdl.global.load.lds
223+
// CHECK: rocdl.global.load.async.lds
247224
// CHECK-NEXT: llvm.br
248225
// CHECK: llvm.cond_br
249226
// CHECK: llvm.store
250227

251228
// CHECK: rocdl.ds_bpermute
252229
// CHECK: rocdl.ballot
253230
// CHECK: llvm.cond_br
254-
// CHECK: rocdl.global.load.lds
231+
// CHECK: rocdl.global.load.async.lds
255232
// CHECK-NEXT: llvm.br
256233
// CHECK: llvm.cond_br
257234
// CHECK: llvm.store
258235

259236
// CHECK: rocdl.ds_bpermute
260237
// CHECK: rocdl.ballot
261238
// CHECK: llvm.cond_br
262-
// CHECK: rocdl.global.load.lds
239+
// CHECK: rocdl.global.load.async.lds
263240
// CHECK-NEXT: llvm.br
264241
// CHECK: llvm.cond_br
265242
// CHECK: llvm.store
266243

267244
// CHECK: rocdl.ds_bpermute
268245
// CHECK: rocdl.ballot
269246
// CHECK: llvm.cond_br
270-
// CHECK: rocdl.global.load.lds
247+
// CHECK: rocdl.global.load.async.lds
271248
// CHECK-NEXT: llvm.br
272249
// CHECK: llvm.cond_br
273250
// CHECK: llvm.store
@@ -292,13 +269,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 16 : i32, ttg.sha
292269
// Each thread needs to load 1 element and we load 1 (sizePerThread) per global.load.lds
293270

294271
// CHECK: llvm.getelementptr
295-
// CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 0
272+
// CHECK: rocdl.global.load.async.lds {{.*}}, {{.*}}, 4, 0, 0
296273
%2 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = ca: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
297274
// CHECK: llvm.getelementptr
298-
// CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 3
275+
// CHECK: rocdl.global.load.async.lds {{.*}}, {{.*}}, 4, 0, 3
299276
%3 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cg: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
300277
// CHECK: llvm.getelementptr
301-
// CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4, 0, 17
278+
// CHECK: rocdl.global.load.async.lds {{.*}}, {{.*}}, 4, 0, 17
302279
%4 = ttg.async_copy_global_to_local %1, %arg2 cacheModifier = cv: tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable>
303280
tt.return
304281
}
@@ -313,7 +290,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
313290
// CHECK-LABEL: async_copy_contiguity_hint
314291
tt.func @async_copy_contiguity_hint(%v: tensor<256x!tt.ptr<f16>, #blocked>, %smem: !ttg.memdesc<256xf16, #shared1D, #smem, mutable>) {
315292
// Check we load 4 bytes at a time
316-
// CHECK: rocdl.global.load.lds {{.*}}, {{.*}}, 4
293+
// CHECK: rocdl.global.load.async.lds {{.*}}, {{.*}}, 4
317294
%0 = ttg.async_copy_global_to_local %v, %smem {contiguity = 2 : i32} : tensor<256x!tt.ptr<f16>, #blocked> -> !ttg.memdesc<256xf16, #shared1D, #smem, mutable>
318295
tt.return
319296
}
@@ -332,7 +309,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
332309
%1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x64x!tt.ptr<f32>, #blocked>
333310
%2 = ttg.memdesc_subslice %arg2 [0, 0] : !ttg.memdesc<32x128xf32, #shared, #smem, mutable> -> !ttg.memdesc<32x64xf32, #shared, #smem, mutable, 32x128>
334311
// We slice in the fastest dim but each warp loads one row, therefore we can write coalesced into LDS
335-
// CHECK: rocdl.global.load.lds
312+
// CHECK: rocdl.global.load.async.lds
336313
%3 = ttg.async_copy_global_to_local %1, %2 : tensor<32x64x!tt.ptr<f32>, #blocked> -> <32x64xf32, #shared, #smem, mutable, 32x128>
337314
tt.return
338315
}
@@ -351,7 +328,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
351328
%1 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x32x!tt.ptr<f32>, #blocked>
352329
%2 = ttg.memdesc_subslice %arg2 [0, 0] : !ttg.memdesc<64x32xf32, #shared, #smem, mutable> -> !ttg.memdesc<32x32xf32, #shared, #smem, mutable, 64x32>
353330
// We slice into the slowest dim which does not break coalesced writes into LDS
354-
// CHECK: rocdl.global.load.lds
331+
// CHECK: rocdl.global.load.async.lds
355332
%3 = ttg.async_copy_global_to_local %1, %2 : tensor<32x32x!tt.ptr<f32>, #blocked> -> <32x32xf32, #shared, #smem, mutable, 64x32>
356333
tt.return
357334
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --verify-diagnostics
2+
3+
// GFX942 does not support vectorization > 4bytes for direct-to-LDS loads
4+
5+
#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
6+
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
7+
#smem = #ttg.shared_memory
8+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
9+
tt.func public @async_copy_vectorized_8xf16_error(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
10+
%arg1: i32 {tt.divisibility = 16 : i32},
11+
%arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
12+
%1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
13+
%2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
14+
%3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
15+
%4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
16+
%5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>
17+
18+
// expected-error@+1 {{failed to legalize operation 'ttg.async_copy_global_to_local' that was explicitly marked illegal}}
19+
%6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
20+
tt.return
21+
}
22+
}

0 commit comments

Comments (0)