Skip to content

Commit bcf113d

Browse files
committed
[WaveTransform] Fix FileCheck patterns for more tests
Fixed the check patterns for some more control flow tests when the wave transform flow is enabled. Tests fixed in this commit: - loop-on-function-argument.ll - loop_exit_with_xor.ll - lower-brcond-with-xor.ll - lower-control-flow-live-variables-update.mir - machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll - machine-sink-temporal-divergence-swdev407790.ll - madmk.ll - masked-load-vectortypes.ll - mdt-preserving-crash.ll - memcpy-crash-issue63986.ll - memmove-var-size.ll - memory-legalizer-single-wave-workgroup-memops.ll - memset-param-combinations.ll - memset-pattern.ll - mfma-loop.ll - mfma-no-register-aliasing.ll - misaligned-vgpr-regsequence.mir - mmra.ll Disabled late wave transform: - lower-control-flow-live-variables-update.xfail.mir. Disabled the wave transform (-amdgpu-late-wave-transform=0) since the test expects the legacy SI_IF/SI_END_CF pseudos, which are incompatible with the wave transform.
1 parent 7b675ca commit bcf113d

19 files changed

Lines changed: 4269 additions & 3089 deletions

llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
3-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
3+
; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
44

55
define void @loop_on_argument(i1 %arg) {
66
; IR-LABEL: @loop_on_argument(
@@ -21,17 +21,23 @@ define void @loop_on_argument(i1 %arg) {
2121
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2222
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
2323
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
24+
; CHECK-NEXT: s_mov_b64 s[4:5], -1
25+
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
26+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
2427
; CHECK-NEXT: s_mov_b64 s[4:5], 0
25-
; CHECK-NEXT: v_mov_b32_e32 v0, 0
2628
; CHECK-NEXT: .LBB0_1: ; %loop
2729
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
28-
; CHECK-NEXT: s_and_b64 s[6:7], exec, vcc
29-
; CHECK-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
30-
; CHECK-NEXT: global_store_dword v[0:1], v0, off
30+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
31+
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
32+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v2
33+
; CHECK-NEXT: s_xor_b64 s[6:7], exec, vcc
34+
; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
35+
; CHECK-NEXT: global_store_dword v[0:1], v1, off
3136
; CHECK-NEXT: s_waitcnt vmcnt(0)
32-
; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
37+
; CHECK-NEXT: s_mov_b64 exec, vcc
38+
; CHECK-NEXT: ; divergent control-flow edge
3339
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
34-
; CHECK-NEXT: ; %bb.2: ; %exit
40+
; CHECK-NEXT: .LBB0_2: ; %exit
3541
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
3642
; CHECK-NEXT: s_setpc_b64 s[30:31]
3743
entry:

llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2-
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
33

44
; Where the mask of lanes wanting to exit the loop on this iteration is not
55
; obviously already masked by exec (in this case, the xor with -1 inserted by
@@ -10,26 +10,34 @@ define void @needs_and(i32 %arg) {
1010
; GCN-LABEL: needs_and:
1111
; GCN: ; %bb.0: ; %entry
1212
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13-
; GCN-NEXT: s_mov_b32 s10, 1
13+
; GCN-NEXT: s_mov_b32 s8, 1
1414
; GCN-NEXT: s_mov_b64 s[6:7], 0
15+
; GCN-NEXT: s_mov_b64 s[4:5], -1
1516
; GCN-NEXT: s_branch .LBB0_2
1617
; GCN-NEXT: .LBB0_1: ; %endif
1718
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
18-
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
19-
; GCN-NEXT: s_and_b64 s[4:5], exec, vcc
20-
; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
21-
; GCN-NEXT: s_add_i32 s10, s10, 1
22-
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
19+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
20+
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
21+
; GCN-NEXT: s_add_i32 s8, s8, 1
22+
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
23+
; GCN-NEXT: s_xor_b64 s[4:5], exec, vcc
24+
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
25+
; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5]
26+
; GCN-NEXT: s_mov_b64 exec, vcc
27+
; GCN-NEXT: ; divergent control-flow edge
2328
; GCN-NEXT: s_cbranch_execz .LBB0_4
2429
; GCN-NEXT: .LBB0_2: ; %loop
2530
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
26-
; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0
27-
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
28-
; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
31+
; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
32+
; GCN-NEXT: s_xor_b64 s[10:11], s[4:5], exec
33+
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
34+
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s8, v0
35+
; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
36+
; GCN-NEXT: s_mov_b64 exec, s[10:11]
37+
; GCN-NEXT: ; divergent control-flow edge
2938
; GCN-NEXT: s_cbranch_execz .LBB0_1
30-
; GCN-NEXT: ; %bb.3: ; %then
39+
; GCN-NEXT: .LBB0_3: ; %then
3140
; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
32-
; GCN-NEXT: s_nop 1
3341
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
3442
; GCN-NEXT: s_branch .LBB0_1
3543
; GCN-NEXT: .LBB0_4: ; %loopexit
@@ -65,16 +73,19 @@ define void @doesnt_need_and(i32 %arg) {
6573
; GCN: ; %bb.0: ; %entry
6674
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6775
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
76+
; GCN-NEXT: s_mov_b64 s[4:5], -1
6877
; GCN-NEXT: s_mov_b32 s6, 0
6978
; GCN-NEXT: s_mov_b64 s[4:5], 0
7079
; GCN-NEXT: .LBB1_1: ; %loop
7180
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
7281
; GCN-NEXT: s_add_i32 s6, s6, 1
73-
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
74-
; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
75-
; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
82+
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
83+
; GCN-NEXT: s_xor_b64 s[8:9], exec, vcc
84+
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
85+
; GCN-NEXT: s_mov_b64 exec, vcc
86+
; GCN-NEXT: ; divergent control-flow edge
7687
; GCN-NEXT: s_cbranch_execnz .LBB1_1
77-
; GCN-NEXT: ; %bb.2: ; %loopexit
88+
; GCN-NEXT: .LBB1_2: ; %loopexit
7889
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
7990
; GCN-NEXT: s_waitcnt vmcnt(0)
8091
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -100,30 +111,38 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
100111
; GCN: ; %bb.0: ; %entry
101112
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102113
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
103-
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1
104-
; GCN-NEXT: s_mov_b32 s10, 1
105-
; GCN-NEXT: s_mov_b64 s[6:7], 0
114+
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
115+
; GCN-NEXT: s_mov_b64 s[4:5], -1
116+
; GCN-NEXT: s_mov_b32 s8, 1
117+
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
118+
; GCN-NEXT: s_mov_b64 s[4:5], 0
106119
; GCN-NEXT: s_branch .LBB2_2
107120
; GCN-NEXT: .LBB2_1: ; %endif
108121
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
109-
; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
110-
; GCN-NEXT: s_and_b64 s[4:5], exec, vcc
111-
; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
112-
; GCN-NEXT: s_add_i32 s10, s10, 1
113-
; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
122+
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
123+
; GCN-NEXT: s_add_i32 s8, s8, 1
124+
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
125+
; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc
126+
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
127+
; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
128+
; GCN-NEXT: s_mov_b64 exec, vcc
129+
; GCN-NEXT: ; divergent control-flow edge
114130
; GCN-NEXT: s_cbranch_execz .LBB2_4
115131
; GCN-NEXT: .LBB2_2: ; %loop
116132
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
117-
; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0
118-
; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
133+
; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
134+
; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec
135+
; GCN-NEXT: s_xor_b64 s[6:7], exec, s[10:11]
136+
; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
137+
; GCN-NEXT: s_mov_b64 exec, s[10:11]
138+
; GCN-NEXT: ; divergent control-flow edge
119139
; GCN-NEXT: s_cbranch_execz .LBB2_1
120-
; GCN-NEXT: ; %bb.3: ; %then
140+
; GCN-NEXT: .LBB2_3: ; %then
121141
; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
122-
; GCN-NEXT: s_nop 2
123142
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
124143
; GCN-NEXT: s_branch .LBB2_1
125144
; GCN-NEXT: .LBB2_4: ; %loopexit
126-
; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
145+
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
127146
; GCN-NEXT: s_waitcnt vmcnt(0)
128147
; GCN-NEXT: s_setpc_b64 s[30:31]
129148
entry:

llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --debug-counter=dagcombine=0 -start-before=unify-loop-exits %s -o - | FileCheck %s
2+
; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --debug-counter=dagcombine=0 -start-before=unify-loop-exits %s -o - | FileCheck %s
33

44
define amdgpu_kernel void @test(i32 %N, ptr addrspace(1) %p) {
55
; CHECK-LABEL: test:
66
; CHECK: ; %bb.0: ; %entry
77
; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0
88
; CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0
9-
; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc
9+
; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1
10+
; CHECK-NEXT: s_and_b64 s[2:3], exec, s[0:1]
11+
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1
12+
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec
13+
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[0:1]
14+
; CHECK-NEXT: s_mov_b64 exec, s[0:1]
15+
; CHECK-NEXT: ; divergent control-flow edge
1016
; CHECK-NEXT: s_endpgm
1117
entry:
1218
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)