ROCm
diff --git a/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll b/llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll
Lines changed: 13 additions & 7 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
Lines changed: 48 additions & 29 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll b/llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll
Lines changed: 8 additions & 2 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
 
 define void @loop_on_argument(i1 %arg) {
 ; IR-LABEL: @loop_on_argument(
@@ -21,17 +21,23 @@ define void @loop_on_argument(i1 %arg) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CHECK-NEXT:    s_mov_b64 s[4:5], -1
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_and_b64 s[6:7], exec, vcc
-; CHECK-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; CHECK-NEXT:    global_store_dword v[0:1], v0, off
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v2
+; CHECK-NEXT:    s_xor_b64 s[6:7], exec, vcc
+; CHECK-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; CHECK-NEXT:    global_store_dword v[0:1], v1, off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_mov_b64 exec, vcc
+; CHECK-NEXT:    ; divergent control-flow edge
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_1
-; CHECK-NEXT:  ; %bb.2: ; %exit
+; CHECK-NEXT:  .LBB0_2: ; %exit
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
 
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn--amdpal -mcpu=gfx803 < %s | FileCheck -check-prefix=GCN %s
 
 ; Where the mask of lanes wanting to exit the loop on this iteration is not
 ; obviously already masked by exec (in this case, the xor with -1 inserted by
@@ -10,26 +10,34 @@ define void @needs_and(i32 %arg) {
 ; GCN-LABEL: needs_and:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s10, 1
+; GCN-NEXT:    s_mov_b32 s8, 1
 ; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_branch .LBB0_2
 ; GCN-NEXT:  .LBB0_1: ; %endif
 ; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GCN-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-NEXT:    s_add_i32 s10, s10, 1
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GCN-NEXT:    s_add_i32 s8, s8, 1
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, vcc
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[4:5]
+; GCN-NEXT:    s_mov_b64 exec, vcc
+; GCN-NEXT:    ; divergent control-flow edge
 ; GCN-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-NEXT:  .LBB0_2: ; %loop
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], s10, v0
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v0
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; GCN-NEXT:    s_xor_b64 s[10:11], s[4:5], exec
+; GCN-NEXT:    s_xor_b64 s[4:5], exec, s[10:11]
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, s8, v0
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GCN-NEXT:    s_mov_b64 exec, s[10:11]
+; GCN-NEXT:    ; divergent control-flow edge
 ; GCN-NEXT:    s_cbranch_execz .LBB0_1
-; GCN-NEXT:  ; %bb.3: ; %then
+; GCN-NEXT:  .LBB0_3: ; %then
 ; GCN-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
 ; GCN-NEXT:    s_branch .LBB0_1
 ; GCN-NEXT:  .LBB0_4: ; %loopexit
@@ -65,16 +73,19 @@ define void @doesnt_need_and(i32 %arg) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:  .LBB1_1: ; %loop
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    s_add_i32 s6, s6, 1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GCN-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
+; GCN-NEXT:    s_xor_b64 s[8:9], exec, vcc
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GCN-NEXT:    s_mov_b64 exec, vcc
+; GCN-NEXT:    ; divergent control-flow edge
 ; GCN-NEXT:    s_cbranch_execnz .LBB1_1
-; GCN-NEXT:  ; %bb.2: ; %loopexit
+; GCN-NEXT:  .LBB1_2: ; %loopexit
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -100,30 +111,38 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v1
-; GCN-NEXT:    s_mov_b32 s10, 1
-; GCN-NEXT:    s_mov_b64 s[6:7], 0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GCN-NEXT:    s_mov_b64 s[4:5], -1
+; GCN-NEXT:    s_mov_b32 s8, 1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GCN-NEXT:    s_mov_b64 s[4:5], 0
 ; GCN-NEXT:    s_branch .LBB2_2
 ; GCN-NEXT:  .LBB2_1: ; %endif
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; GCN-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GCN-NEXT:    s_and_b64 s[4:5], exec, vcc
-; GCN-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
-; GCN-NEXT:    s_add_i32 s10, s10, 1
-; GCN-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_add_i32 s8, s8, 1
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GCN-NEXT:    s_xor_b64 s[6:7], exec, vcc
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_mov_b64 exec, vcc
+; GCN-NEXT:    ; divergent control-flow edge
 ; GCN-NEXT:    s_cbranch_execz .LBB2_4
 ; GCN-NEXT:  .LBB2_2: ; %loop
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT:    v_cmp_gt_u32_e64 s[4:5], s10, v0
-; GCN-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GCN-NEXT:    s_xor_b64 s[10:11], vcc, exec
+; GCN-NEXT:    s_xor_b64 s[6:7], exec, s[10:11]
+; GCN-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GCN-NEXT:    s_mov_b64 exec, s[10:11]
+; GCN-NEXT:    ; divergent control-flow edge
 ; GCN-NEXT:    s_cbranch_execz .LBB2_1
-; GCN-NEXT:  ; %bb.3: ; %then
+; GCN-NEXT:  .LBB2_3: ; %then
 ; GCN-NEXT:    ; in Loop: Header=BB2_2 Depth=1
-; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], s4
 ; GCN-NEXT:    s_branch .LBB2_1
 ; GCN-NEXT:  .LBB2_4: ; %loopexit
-; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
 
@@ -1,12 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --debug-counter=dagcombine=0 -start-before=unify-loop-exits %s -o - | FileCheck %s
+; RUN: llc -amdgpu-late-wave-transform=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --debug-counter=dagcombine=0 -start-before=unify-loop-exits %s -o - | FileCheck %s
 
 define amdgpu_kernel void @test(i32 %N, ptr addrspace(1) %p) {
 ; CHECK-LABEL: test:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; CHECK-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[0:1]
+; CHECK-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; CHECK-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; CHECK-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; CHECK-NEXT:    s_mov_b64 exec, s[0:1]
+; CHECK-NEXT:    ; divergent control-flow edge
 ; CHECK-NEXT:    s_endpgm
 entry:
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()