11; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2- ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
2+ ; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
33
44; Where the mask of lanes wanting to exit the loop on this iteration is not
55; obviously already masked by exec (in this case, the xor with -1 inserted by
@@ -10,26 +10,34 @@ define void @needs_and(i32 %arg) {
1010; GCN-LABEL: needs_and:
1111; GCN: ; %bb.0: ; %entry
1212; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
13- ; GCN-NEXT: s_mov_b32 s10, 1
13+ ; GCN-NEXT: s_mov_b32 s8, 1
1414; GCN-NEXT: s_mov_b64 s[6:7], 0
15+ ; GCN-NEXT: s_mov_b64 s[4:5], -1
1516; GCN-NEXT: s_branch .LBB0_2
1617; GCN-NEXT: .LBB0_1: ; %endif
1718; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
18- ; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
19- ; GCN-NEXT: s_and_b64 s[4:5], exec, vcc
20- ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
21- ; GCN-NEXT: s_add_i32 s10, s10, 1
22- ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
19+ ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
20+ ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
21+ ; GCN-NEXT: s_add_i32 s8, s8, 1
22+ ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
23+ ; GCN-NEXT: s_xor_b64 s[4:5], exec, vcc
24+ ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
25+ ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5]
26+ ; GCN-NEXT: s_mov_b64 exec, vcc
27+ ; GCN-NEXT: ; divergent control-flow edge
2328; GCN-NEXT: s_cbranch_execz .LBB0_4
2429; GCN-NEXT: .LBB0_2: ; %loop
2530; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
26- ; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0
27- ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
28- ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
31+ ; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
32+ ; GCN-NEXT: s_xor_b64 s[10:11], s[4:5], exec
33+ ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[10:11]
34+ ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s8, v0
35+ ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec
36+ ; GCN-NEXT: s_mov_b64 exec, s[10:11]
37+ ; GCN-NEXT: ; divergent control-flow edge
2938; GCN-NEXT: s_cbranch_execz .LBB0_1
30- ; GCN-NEXT: ; %bb.3: ; %then
39+ ; GCN-NEXT: .LBB0_3: ; %then
3140; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
32- ; GCN-NEXT: s_nop 1
3341; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
3442; GCN-NEXT: s_branch .LBB0_1
3543; GCN-NEXT: .LBB0_4: ; %loopexit
@@ -65,16 +73,19 @@ define void @doesnt_need_and(i32 %arg) {
6573; GCN: ; %bb.0: ; %entry
6674; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6775; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
76+ ; GCN-NEXT: s_mov_b64 s[4:5], -1
6877; GCN-NEXT: s_mov_b32 s6, 0
6978; GCN-NEXT: s_mov_b64 s[4:5], 0
7079; GCN-NEXT: .LBB1_1: ; %loop
7180; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
7281; GCN-NEXT: s_add_i32 s6, s6, 1
73- ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
74- ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
75- ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
82+ ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
83+ ; GCN-NEXT: s_xor_b64 s[8:9], exec, vcc
84+ ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
85+ ; GCN-NEXT: s_mov_b64 exec, vcc
86+ ; GCN-NEXT: ; divergent control-flow edge
7687; GCN-NEXT: s_cbranch_execnz .LBB1_1
77- ; GCN-NEXT: ; %bb.2: ; %loopexit
88+ ; GCN-NEXT: .LBB1_2: ; %loopexit
7889; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
7990; GCN-NEXT: s_waitcnt vmcnt(0)
8091; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -100,30 +111,38 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
100111; GCN: ; %bb.0: ; %entry
101112; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
102113; GCN-NEXT: v_and_b32_e32 v1, 1, v1
103- ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1
104- ; GCN-NEXT: s_mov_b32 s10, 1
105- ; GCN-NEXT: s_mov_b64 s[6:7], 0
114+ ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
115+ ; GCN-NEXT: s_mov_b64 s[4:5], -1
116+ ; GCN-NEXT: s_mov_b32 s8, 1
117+ ; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
118+ ; GCN-NEXT: s_mov_b64 s[4:5], 0
106119; GCN-NEXT: s_branch .LBB2_2
107120; GCN-NEXT: .LBB2_1: ; %endif
108121; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
109- ; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
110- ; GCN-NEXT: s_and_b64 s[4:5], exec, vcc
111- ; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7]
112- ; GCN-NEXT: s_add_i32 s10, s10, 1
113- ; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
122+ ; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
123+ ; GCN-NEXT: s_add_i32 s8, s8, 1
124+ ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
125+ ; GCN-NEXT: s_xor_b64 s[6:7], exec, vcc
126+ ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
127+ ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
128+ ; GCN-NEXT: s_mov_b64 exec, vcc
129+ ; GCN-NEXT: ; divergent control-flow edge
114130; GCN-NEXT: s_cbranch_execz .LBB2_4
115131; GCN-NEXT: .LBB2_2: ; %loop
116132; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
117- ; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], s10, v0
118- ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
133+ ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
134+ ; GCN-NEXT: s_xor_b64 s[10:11], vcc, exec
135+ ; GCN-NEXT: s_xor_b64 s[6:7], exec, s[10:11]
136+ ; GCN-NEXT: s_and_b64 s[6:7], s[6:7], exec
137+ ; GCN-NEXT: s_mov_b64 exec, s[10:11]
138+ ; GCN-NEXT: ; divergent control-flow edge
119139; GCN-NEXT: s_cbranch_execz .LBB2_1
120- ; GCN-NEXT: ; %bb.3: ; %then
140+ ; GCN-NEXT: .LBB2_3: ; %then
121141; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1
122- ; GCN-NEXT: s_nop 2
123142; GCN-NEXT: buffer_store_dword v0, off, s[4:7], s4
124143; GCN-NEXT: s_branch .LBB2_1
125144; GCN-NEXT: .LBB2_4: ; %loopexit
126- ; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
145+ ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
127146; GCN-NEXT: s_waitcnt vmcnt(0)
128147; GCN-NEXT: s_setpc_b64 s[30:31]
129148entry:
0 commit comments