-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[MachineSink] Lower SplitEdgeProbabilityThreshold #127666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/guy-david/machine-sink-powerpc
Are you sure you want to change the base?
[MachineSink] Lower SplitEdgeProbabilityThreshold #127666
Conversation
b20f2dc
to
725de8f
Compare
@llvm/pr-subscribers-debuginfo @llvm/pr-subscribers-backend-powerpc — Author: Guy David (guy-david). Changes: Requires #128745. Lower it slightly below the likelihood of a null-check being taken, which is set to 37.5% (see PtrUntakenProb). On M4 Pro:
On Ryzen9 5950X:
I looked into the disassembly of Patch is 226.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127666.diff 51 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 82acb780cfb72..81459cf65d6c2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -82,7 +82,7 @@ static cl::opt<unsigned> SplitEdgeProbabilityThreshold(
"If the branch threshold is higher than this threshold, we allow "
"speculative execution of up to 1 instruction to avoid branching to "
"splitted critical edge"),
- cl::init(40), cl::Hidden);
+ cl::init(35), cl::Hidden);
static cl::opt<unsigned> SinkLoadInstsPerBlockThreshold(
"machine-sink-load-instrs-threshold",
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index fb6575cc0ee83..fdc087e9c1991 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -632,20 +632,18 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
;
; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: cbz w2, .LBB5_3
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: cbz w2, .LBB5_9
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT: cmp w2, #16
; CHECK-GI-NEXT: mov w8, w2
-; CHECK-GI-NEXT: b.hs .LBB5_4
+; CHECK-GI-NEXT: b.hs .LBB5_3
; CHECK-GI-NEXT: // %bb.2:
; CHECK-GI-NEXT: mov w10, #0 // =0x0
; CHECK-GI-NEXT: mov x9, xzr
; CHECK-GI-NEXT: fmov s0, w10
-; CHECK-GI-NEXT: b .LBB5_8
-; CHECK-GI-NEXT: .LBB5_3:
-; CHECK-GI-NEXT: mov w0, wzr
-; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_4: // %vector.ph
+; CHECK-GI-NEXT: b .LBB5_7
+; CHECK-GI-NEXT: .LBB5_3: // %vector.ph
; CHECK-GI-NEXT: lsl w9, w1, #8
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
@@ -654,7 +652,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: dup v2.8h, w9
; CHECK-GI-NEXT: and x9, x8, #0xfffffff0
; CHECK-GI-NEXT: mov x11, x9
-; CHECK-GI-NEXT: .LBB5_5: // %vector.body
+; CHECK-GI-NEXT: .LBB5_4: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8]
; CHECK-GI-NEXT: subs x11, x11, #16
@@ -663,29 +661,31 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT: mla v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: mla v1.8h, v2.8h, v4.8h
-; CHECK-GI-NEXT: b.ne .LBB5_5
-; CHECK-GI-NEXT: // %bb.6: // %middle.block
+; CHECK-GI-NEXT: b.ne .LBB5_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT: cmp x9, x8
; CHECK-GI-NEXT: addv h0, v0.8h
-; CHECK-GI-NEXT: b.ne .LBB5_8
-; CHECK-GI-NEXT: // %bb.7:
-; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: b.ne .LBB5_7
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_8: // %for.body.preheader1
+; CHECK-GI-NEXT: .LBB5_7: // %for.body.preheader1
; CHECK-GI-NEXT: sxtb w10, w1
-; CHECK-GI-NEXT: sub x8, x8, x9
+; CHECK-GI-NEXT: sub x11, x8, x9
; CHECK-GI-NEXT: add x9, x0, x9
-; CHECK-GI-NEXT: .LBB5_9: // %for.body
+; CHECK-GI-NEXT: .LBB5_8: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrb w11, [x9], #1
+; CHECK-GI-NEXT: ldrb w8, [x9], #1
; CHECK-GI-NEXT: fmov w12, s0
-; CHECK-GI-NEXT: subs x8, x8, #1
-; CHECK-GI-NEXT: mul w11, w11, w10
-; CHECK-GI-NEXT: add w0, w11, w12, uxth
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: b.ne .LBB5_9
-; CHECK-GI-NEXT: // %bb.10: // %for.cond.cleanup
+; CHECK-GI-NEXT: subs x11, x11, #1
+; CHECK-GI-NEXT: mul w8, w8, w10
+; CHECK-GI-NEXT: add w8, w8, w12, uxth
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: b.ne .LBB5_8
+; CHECK-GI-NEXT: .LBB5_9: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
entry:
%conv2 = sext i8 %B to i16
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index 07ee87e880aff..1ca98f6015c11 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -412,6 +412,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: .cfi_def_cfa w29, 16
; CHECK-APPLE-NEXT: .cfi_offset w30, -8
; CHECK-APPLE-NEXT: .cfi_offset w29, -16
+; CHECK-APPLE-NEXT: movi d0, #0000000000000000
; CHECK-APPLE-NEXT: cbz w0, LBB3_2
; CHECK-APPLE-NEXT: ; %bb.1: ; %gen_error
; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10
@@ -420,10 +421,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: fmov s0, #1.00000000
; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1
; CHECK-APPLE-NEXT: strb w8, [x0, #8]
-; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-APPLE-NEXT: ret
-; CHECK-APPLE-NEXT: LBB3_2:
-; CHECK-APPLE-NEXT: movi d0, #0000000000000000
+; CHECK-APPLE-NEXT: LBB3_2: ; %common.ret
; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0c9ff3eee8231..70caf812ea6c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -200,6 +200,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -330,15 +331,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -358,7 +356,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index df645888626c6..2fcbc41895f03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -194,6 +194,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s7, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -322,15 +323,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s7, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -348,7 +346,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index f5a901b024ef5..c9a5a92188256 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -193,6 +193,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -318,15 +319,12 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -345,7 +343,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2be4b52198b45..06e51387c8f21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -190,6 +190,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -314,15 +315,12 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -339,7 +337,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s2, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
index 1a76cae68f164..9e84d979e8547 100644
--- a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
@@ -34,18 +34,14 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.5(0x30000000), %bb.2(0x50000000)
+ ; CHECK-NEXT: successors: %bb.4(0x30000000), %bb.2(0x50000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_3]], [[S_MOV_B32_1]], implicit $exec
; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_LT_I32_e64_]], implicit-def $scc
- ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
- ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: successors: %bb.4(0x80000000)
- ; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_LT_I32_e64_]]
- ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -64,7 +60,7 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.5, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.1, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[PHI]], implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index f9ffa5ae57f3e..dfbb5f6a64042 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -9,44 +9,34 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_addc_u32 s13, s13, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[20:27], s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_mov_b32 s12, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_cmp_lg_u32 s40, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_8
-; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT: s_cmp_eq_u32 s42, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
-; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT: s_cmp_lg_u32 s43, 0
-; CHECK-NEXT: s_mov_b32 s17, 0
-; CHECK-NEXT: s_cselect_b32 s12, -1, 0
-; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_5
-; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_mov_b32 s36, 0
-; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_6
-; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: s_mov_b32 s14, s12
-; CHECK-NEXT: s_mov_b32 s15, s12
-; CHECK-NEXT: s_mov_b32 s13, s12
-; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmp_lg_u32 s24, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_mov_b64 s[38:39], s[22:23]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[20:21]
; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
-; CHECK-NEXT: s_mov_b32 s36, 1.0
-; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: .LBB0_2: ; %if.end13.i.i
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_cmp_eq_u32 s26, 0
; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_6
+; CHECK-NEXT: ; %bb.3: ; %if.else251.i.i
+; CHECK-NEXT: s_cmp_lg_u32 s27, 0
+; CHECK-NEXT: s_mov_b32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s12, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_8
+; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_mov_b32 s36, 0
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_7
-; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_6
+; CHECK-NEXT: .LBB0_5: ; %if.end273.i.i
; CHECK-NEXT: s_add_u32 s12, s8, 40
; CHECK-NEXT: s_addc_u32 s13, s9, 0
; CHECK-NEXT: s_getpc_b64 s[18:19]
@@ -72,13 +62,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
; CHECK-NEXT: s_mov_b32 s39, s36
-; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i
+; CHECK-NEXT: .LBB0_6: ; %if.end294.i.i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
+; CHECK-NEXT: .LBB0_7: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v4, 0
@@ -88,6 +78,16 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB0_8: ; %if.then263.i.i
+; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s25, 0
+; CHECK-NEXT: s_mov_b32 s36, 1.0
+; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: s_mov_b32 s37, s36
+; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_5
+; CHECK-NEXT: s_branch .LBB0_6
entry:
%cmp5.i.i = icmp eq i32 %cmp5.i.i.arg, 0
br i1 %cmp5.i.i, label %if.end13.i.i, label %kernel_direct_lighting.exit
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..ce0b79b0b358c 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -848,12 +848,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], -1
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
-; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
@@ -873,8 +874,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_cbranch_execnz .LB...
[truncated]
|
@llvm/pr-subscribers-llvm-globalisel — Author: Guy David (guy-david). Changes: Requires #128745. Lower it slightly below the likelihood of a null-check being taken, which is set to 37.5% (see PtrUntakenProb). On M4 Pro:
On Ryzen9 5950X:
I looked into the disassembly of Patch is 226.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127666.diff 51 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 82acb780cfb72..81459cf65d6c2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -82,7 +82,7 @@ static cl::opt<unsigned> SplitEdgeProbabilityThreshold(
"If the branch threshold is higher than this threshold, we allow "
"speculative execution of up to 1 instruction to avoid branching to "
"splitted critical edge"),
- cl::init(40), cl::Hidden);
+ cl::init(35), cl::Hidden);
static cl::opt<unsigned> SinkLoadInstsPerBlockThreshold(
"machine-sink-load-instrs-threshold",
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index fb6575cc0ee83..fdc087e9c1991 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -632,20 +632,18 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
;
; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: cbz w2, .LBB5_3
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: cbz w2, .LBB5_9
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT: cmp w2, #16
; CHECK-GI-NEXT: mov w8, w2
-; CHECK-GI-NEXT: b.hs .LBB5_4
+; CHECK-GI-NEXT: b.hs .LBB5_3
; CHECK-GI-NEXT: // %bb.2:
; CHECK-GI-NEXT: mov w10, #0 // =0x0
; CHECK-GI-NEXT: mov x9, xzr
; CHECK-GI-NEXT: fmov s0, w10
-; CHECK-GI-NEXT: b .LBB5_8
-; CHECK-GI-NEXT: .LBB5_3:
-; CHECK-GI-NEXT: mov w0, wzr
-; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_4: // %vector.ph
+; CHECK-GI-NEXT: b .LBB5_7
+; CHECK-GI-NEXT: .LBB5_3: // %vector.ph
; CHECK-GI-NEXT: lsl w9, w1, #8
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
@@ -654,7 +652,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: dup v2.8h, w9
; CHECK-GI-NEXT: and x9, x8, #0xfffffff0
; CHECK-GI-NEXT: mov x11, x9
-; CHECK-GI-NEXT: .LBB5_5: // %vector.body
+; CHECK-GI-NEXT: .LBB5_4: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8]
; CHECK-GI-NEXT: subs x11, x11, #16
@@ -663,29 +661,31 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT: mla v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: mla v1.8h, v2.8h, v4.8h
-; CHECK-GI-NEXT: b.ne .LBB5_5
-; CHECK-GI-NEXT: // %bb.6: // %middle.block
+; CHECK-GI-NEXT: b.ne .LBB5_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT: cmp x9, x8
; CHECK-GI-NEXT: addv h0, v0.8h
-; CHECK-GI-NEXT: b.ne .LBB5_8
-; CHECK-GI-NEXT: // %bb.7:
-; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: b.ne .LBB5_7
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_8: // %for.body.preheader1
+; CHECK-GI-NEXT: .LBB5_7: // %for.body.preheader1
; CHECK-GI-NEXT: sxtb w10, w1
-; CHECK-GI-NEXT: sub x8, x8, x9
+; CHECK-GI-NEXT: sub x11, x8, x9
; CHECK-GI-NEXT: add x9, x0, x9
-; CHECK-GI-NEXT: .LBB5_9: // %for.body
+; CHECK-GI-NEXT: .LBB5_8: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrb w11, [x9], #1
+; CHECK-GI-NEXT: ldrb w8, [x9], #1
; CHECK-GI-NEXT: fmov w12, s0
-; CHECK-GI-NEXT: subs x8, x8, #1
-; CHECK-GI-NEXT: mul w11, w11, w10
-; CHECK-GI-NEXT: add w0, w11, w12, uxth
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: b.ne .LBB5_9
-; CHECK-GI-NEXT: // %bb.10: // %for.cond.cleanup
+; CHECK-GI-NEXT: subs x11, x11, #1
+; CHECK-GI-NEXT: mul w8, w8, w10
+; CHECK-GI-NEXT: add w8, w8, w12, uxth
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: b.ne .LBB5_8
+; CHECK-GI-NEXT: .LBB5_9: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
entry:
%conv2 = sext i8 %B to i16
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index 07ee87e880aff..1ca98f6015c11 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -412,6 +412,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: .cfi_def_cfa w29, 16
; CHECK-APPLE-NEXT: .cfi_offset w30, -8
; CHECK-APPLE-NEXT: .cfi_offset w29, -16
+; CHECK-APPLE-NEXT: movi d0, #0000000000000000
; CHECK-APPLE-NEXT: cbz w0, LBB3_2
; CHECK-APPLE-NEXT: ; %bb.1: ; %gen_error
; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10
@@ -420,10 +421,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: fmov s0, #1.00000000
; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1
; CHECK-APPLE-NEXT: strb w8, [x0, #8]
-; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-APPLE-NEXT: ret
-; CHECK-APPLE-NEXT: LBB3_2:
-; CHECK-APPLE-NEXT: movi d0, #0000000000000000
+; CHECK-APPLE-NEXT: LBB3_2: ; %common.ret
; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0c9ff3eee8231..70caf812ea6c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -200,6 +200,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -330,15 +331,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -358,7 +356,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index df645888626c6..2fcbc41895f03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -194,6 +194,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s7, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -322,15 +323,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s7, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -348,7 +346,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index f5a901b024ef5..c9a5a92188256 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -193,6 +193,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -318,15 +319,12 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -345,7 +343,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2be4b52198b45..06e51387c8f21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -190,6 +190,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -314,15 +315,12 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -339,7 +337,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s2, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
index 1a76cae68f164..9e84d979e8547 100644
--- a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
@@ -34,18 +34,14 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.5(0x30000000), %bb.2(0x50000000)
+ ; CHECK-NEXT: successors: %bb.4(0x30000000), %bb.2(0x50000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_3]], [[S_MOV_B32_1]], implicit $exec
; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_LT_I32_e64_]], implicit-def $scc
- ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
- ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: successors: %bb.4(0x80000000)
- ; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_LT_I32_e64_]]
- ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -64,7 +60,7 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.5, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.1, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[PHI]], implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index f9ffa5ae57f3e..dfbb5f6a64042 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -9,44 +9,34 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_addc_u32 s13, s13, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[20:27], s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_mov_b32 s12, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_cmp_lg_u32 s40, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_8
-; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT: s_cmp_eq_u32 s42, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
-; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT: s_cmp_lg_u32 s43, 0
-; CHECK-NEXT: s_mov_b32 s17, 0
-; CHECK-NEXT: s_cselect_b32 s12, -1, 0
-; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_5
-; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_mov_b32 s36, 0
-; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_6
-; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: s_mov_b32 s14, s12
-; CHECK-NEXT: s_mov_b32 s15, s12
-; CHECK-NEXT: s_mov_b32 s13, s12
-; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmp_lg_u32 s24, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_mov_b64 s[38:39], s[22:23]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[20:21]
; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
-; CHECK-NEXT: s_mov_b32 s36, 1.0
-; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: .LBB0_2: ; %if.end13.i.i
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_cmp_eq_u32 s26, 0
; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_6
+; CHECK-NEXT: ; %bb.3: ; %if.else251.i.i
+; CHECK-NEXT: s_cmp_lg_u32 s27, 0
+; CHECK-NEXT: s_mov_b32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s12, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_8
+; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_mov_b32 s36, 0
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_7
-; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_6
+; CHECK-NEXT: .LBB0_5: ; %if.end273.i.i
; CHECK-NEXT: s_add_u32 s12, s8, 40
; CHECK-NEXT: s_addc_u32 s13, s9, 0
; CHECK-NEXT: s_getpc_b64 s[18:19]
@@ -72,13 +62,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
; CHECK-NEXT: s_mov_b32 s39, s36
-; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i
+; CHECK-NEXT: .LBB0_6: ; %if.end294.i.i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
+; CHECK-NEXT: .LBB0_7: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v4, 0
@@ -88,6 +78,16 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB0_8: ; %if.then263.i.i
+; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s25, 0
+; CHECK-NEXT: s_mov_b32 s36, 1.0
+; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: s_mov_b32 s37, s36
+; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_5
+; CHECK-NEXT: s_branch .LBB0_6
entry:
%cmp5.i.i = icmp eq i32 %cmp5.i.i.arg, 0
br i1 %cmp5.i.i, label %if.end13.i.i, label %kernel_direct_lighting.exit
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..ce0b79b0b358c 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -848,12 +848,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], -1
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
-; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
@@ -873,8 +874,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_cbranch_execnz .LB...
[truncated]
|
@llvm/pr-subscribers-backend-amdgpu Author: Guy David (guy-david) ChangesRequires #128745. Lower it slightly below the likeliness of a null-check to be true which is set to 37.5% (see PtrUntakenProb). On M4 Pro:
On Ryzen9 5950X:
I looked into the disassembly of Patch is 226.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127666.diff 51 Files Affected:
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 82acb780cfb72..81459cf65d6c2 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -82,7 +82,7 @@ static cl::opt<unsigned> SplitEdgeProbabilityThreshold(
"If the branch threshold is higher than this threshold, we allow "
"speculative execution of up to 1 instruction to avoid branching to "
"splitted critical edge"),
- cl::init(40), cl::Hidden);
+ cl::init(35), cl::Hidden);
static cl::opt<unsigned> SinkLoadInstsPerBlockThreshold(
"machine-sink-load-instrs-threshold",
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index fb6575cc0ee83..fdc087e9c1991 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -632,20 +632,18 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
;
; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: cbz w2, .LBB5_3
+; CHECK-GI-NEXT: mov w8, wzr
+; CHECK-GI-NEXT: cbz w2, .LBB5_9
; CHECK-GI-NEXT: // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT: cmp w2, #16
; CHECK-GI-NEXT: mov w8, w2
-; CHECK-GI-NEXT: b.hs .LBB5_4
+; CHECK-GI-NEXT: b.hs .LBB5_3
; CHECK-GI-NEXT: // %bb.2:
; CHECK-GI-NEXT: mov w10, #0 // =0x0
; CHECK-GI-NEXT: mov x9, xzr
; CHECK-GI-NEXT: fmov s0, w10
-; CHECK-GI-NEXT: b .LBB5_8
-; CHECK-GI-NEXT: .LBB5_3:
-; CHECK-GI-NEXT: mov w0, wzr
-; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_4: // %vector.ph
+; CHECK-GI-NEXT: b .LBB5_7
+; CHECK-GI-NEXT: .LBB5_3: // %vector.ph
; CHECK-GI-NEXT: lsl w9, w1, #8
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
@@ -654,7 +652,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: dup v2.8h, w9
; CHECK-GI-NEXT: and x9, x8, #0xfffffff0
; CHECK-GI-NEXT: mov x11, x9
-; CHECK-GI-NEXT: .LBB5_5: // %vector.body
+; CHECK-GI-NEXT: .LBB5_4: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8]
; CHECK-GI-NEXT: subs x11, x11, #16
@@ -663,29 +661,31 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT: mla v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT: mla v1.8h, v2.8h, v4.8h
-; CHECK-GI-NEXT: b.ne .LBB5_5
-; CHECK-GI-NEXT: // %bb.6: // %middle.block
+; CHECK-GI-NEXT: b.ne .LBB5_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
; CHECK-GI-NEXT: add v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT: cmp x9, x8
; CHECK-GI-NEXT: addv h0, v0.8h
-; CHECK-GI-NEXT: b.ne .LBB5_8
-; CHECK-GI-NEXT: // %bb.7:
-; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: b.ne .LBB5_7
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov w8, s0
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
-; CHECK-GI-NEXT: .LBB5_8: // %for.body.preheader1
+; CHECK-GI-NEXT: .LBB5_7: // %for.body.preheader1
; CHECK-GI-NEXT: sxtb w10, w1
-; CHECK-GI-NEXT: sub x8, x8, x9
+; CHECK-GI-NEXT: sub x11, x8, x9
; CHECK-GI-NEXT: add x9, x0, x9
-; CHECK-GI-NEXT: .LBB5_9: // %for.body
+; CHECK-GI-NEXT: .LBB5_8: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrb w11, [x9], #1
+; CHECK-GI-NEXT: ldrb w8, [x9], #1
; CHECK-GI-NEXT: fmov w12, s0
-; CHECK-GI-NEXT: subs x8, x8, #1
-; CHECK-GI-NEXT: mul w11, w11, w10
-; CHECK-GI-NEXT: add w0, w11, w12, uxth
-; CHECK-GI-NEXT: fmov s0, w0
-; CHECK-GI-NEXT: b.ne .LBB5_9
-; CHECK-GI-NEXT: // %bb.10: // %for.cond.cleanup
+; CHECK-GI-NEXT: subs x11, x11, #1
+; CHECK-GI-NEXT: mul w8, w8, w10
+; CHECK-GI-NEXT: add w8, w8, w12, uxth
+; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: b.ne .LBB5_8
+; CHECK-GI-NEXT: .LBB5_9: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov w0, w8
; CHECK-GI-NEXT: ret
entry:
%conv2 = sext i8 %B to i16
diff --git a/llvm/test/CodeGen/AArch64/swifterror.ll b/llvm/test/CodeGen/AArch64/swifterror.ll
index 07ee87e880aff..1ca98f6015c11 100644
--- a/llvm/test/CodeGen/AArch64/swifterror.ll
+++ b/llvm/test/CodeGen/AArch64/swifterror.ll
@@ -412,6 +412,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: .cfi_def_cfa w29, 16
; CHECK-APPLE-NEXT: .cfi_offset w30, -8
; CHECK-APPLE-NEXT: .cfi_offset w29, -16
+; CHECK-APPLE-NEXT: movi d0, #0000000000000000
; CHECK-APPLE-NEXT: cbz w0, LBB3_2
; CHECK-APPLE-NEXT: ; %bb.1: ; %gen_error
; CHECK-APPLE-NEXT: mov w0, #16 ; =0x10
@@ -420,10 +421,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
; CHECK-APPLE-NEXT: fmov s0, #1.00000000
; CHECK-APPLE-NEXT: mov w8, #1 ; =0x1
; CHECK-APPLE-NEXT: strb w8, [x0, #8]
-; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
-; CHECK-APPLE-NEXT: ret
-; CHECK-APPLE-NEXT: LBB3_2:
-; CHECK-APPLE-NEXT: movi d0, #0000000000000000
+; CHECK-APPLE-NEXT: LBB3_2: ; %common.ret
; CHECK-APPLE-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
; CHECK-APPLE-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0c9ff3eee8231..70caf812ea6c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -200,6 +200,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -330,15 +331,12 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s0, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -358,7 +356,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index df645888626c6..2fcbc41895f03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -194,6 +194,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[0:1], 0
; CHECK-NEXT: s_mov_b32 s7, 1
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
@@ -322,15 +323,12 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s0, s7, 1
; CHECK-NEXT: s_and_b32 s0, s0, 1
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
; CHECK-NEXT: s_sub_i32 s0, 0, s4
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -348,7 +346,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index f5a901b024ef5..c9a5a92188256 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -193,6 +193,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -318,15 +319,12 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -345,7 +343,7 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2be4b52198b45..06e51387c8f21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -190,6 +190,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cmp_ne_u64_e64 vcc, s[4:5], 0
; CHECK-NEXT: s_mov_b32 s6, 1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s2
+; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_cbranch_vccz .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_mov_b32_e32 v0, s3
@@ -314,15 +315,12 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CHECK-NEXT: s_branch .LBB1_3
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
-; CHECK-NEXT: .LBB1_3: ; %Flow
+; CHECK-NEXT: .LBB1_2: ; %Flow
; CHECK-NEXT: s_xor_b32 s1, s6, 1
; CHECK-NEXT: s_and_b32 s1, s1, 1
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB1_5
-; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_cbranch_scc1 .LBB1_4
+; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
; CHECK-NEXT: s_sub_i32 s1, 0, s2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -339,7 +337,7 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, s2, v0
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s2, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT: .LBB1_5:
+; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
index 1a76cae68f164..9e84d979e8547 100644
--- a/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/artificial-terminators.mir
@@ -34,18 +34,14 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: successors: %bb.5(0x30000000), %bb.2(0x50000000)
+ ; CHECK-NEXT: successors: %bb.4(0x30000000), %bb.2(0x50000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[V_ADD_U32_e64_3]], [[S_MOV_B32_1]], implicit $exec
; CHECK-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 $exec_lo, [[V_CMP_LT_I32_e64_]], implicit-def $scc
- ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
- ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: bb.5:
- ; CHECK-NEXT: successors: %bb.4(0x80000000)
- ; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[V_CMP_LT_I32_e64_]]
- ; CHECK-NEXT: S_BRANCH %bb.4
+ ; CHECK-NEXT: $exec_lo = S_MOV_B32_term [[S_XOR_B32_]]
+ ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.3(0x40000000)
@@ -64,7 +60,7 @@ body: |
; CHECK-NEXT: S_BRANCH %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4:
- ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.5, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[COPY3]], %bb.1, [[S_OR_B32_]], %bb.2, [[S_OR_B32_]], %bb.3
; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[PHI]], implicit-def $scc
; CHECK-NEXT: S_ENDPGM 0
bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
index f9ffa5ae57f3e..dfbb5f6a64042 100644
--- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
+++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll
@@ -9,44 +9,34 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_addc_u32 s13, s13, 0
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
-; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[20:27], s[8:9], 0x0
; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
-; CHECK-NEXT: s_mov_b32 s12, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_cmp_lg_u32 s40, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_8
-; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i
-; CHECK-NEXT: s_cmp_eq_u32 s42, 0
-; CHECK-NEXT: s_cbranch_scc1 .LBB0_4
-; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i
-; CHECK-NEXT: s_cmp_lg_u32 s43, 0
-; CHECK-NEXT: s_mov_b32 s17, 0
-; CHECK-NEXT: s_cselect_b32 s12, -1, 0
-; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_5
-; CHECK-NEXT: ; %bb.3:
; CHECK-NEXT: s_mov_b32 s36, 0
-; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccz .LBB0_6
-; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: s_mov_b32 s14, s12
-; CHECK-NEXT: s_mov_b32 s15, s12
-; CHECK-NEXT: s_mov_b32 s13, s12
-; CHECK-NEXT: s_mov_b64 s[38:39], s[14:15]
-; CHECK-NEXT: s_mov_b64 s[36:37], s[12:13]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmp_lg_u32 s24, 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
+; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_mov_b64 s[38:39], s[22:23]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[20:21]
; CHECK-NEXT: s_branch .LBB0_7
-; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i
-; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s41, 0
-; CHECK-NEXT: s_mov_b32 s36, 1.0
-; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: .LBB0_2: ; %if.end13.i.i
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_cmp_eq_u32 s26, 0
; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_6
+; CHECK-NEXT: ; %bb.3: ; %if.else251.i.i
+; CHECK-NEXT: s_cmp_lg_u32 s27, 0
+; CHECK-NEXT: s_mov_b32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s12, -1, 0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_8
+; CHECK-NEXT: ; %bb.4:
+; CHECK-NEXT: s_mov_b32 s36, 0
; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_7
-; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_6
+; CHECK-NEXT: .LBB0_5: ; %if.end273.i.i
; CHECK-NEXT: s_add_u32 s12, s8, 40
; CHECK-NEXT: s_addc_u32 s13, s9, 0
; CHECK-NEXT: s_getpc_b64 s[18:19]
@@ -72,13 +62,13 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_mov_b32 s37, s36
; CHECK-NEXT: s_mov_b32 s38, s36
; CHECK-NEXT: s_mov_b32 s39, s36
-; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i
+; CHECK-NEXT: .LBB0_6: ; %if.end294.i.i
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit
+; CHECK-NEXT: .LBB0_7: ; %kernel_direct_lighting.exit
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20
; CHECK-NEXT: v_mov_b32_e32 v0, s36
; CHECK-NEXT: v_mov_b32_e32 v4, 0
@@ -88,6 +78,16 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5]
; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB0_8: ; %if.then263.i.i
+; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s25, 0
+; CHECK-NEXT: s_mov_b32 s36, 1.0
+; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000
+; CHECK-NEXT: s_mov_b32 s37, s36
+; CHECK-NEXT: s_mov_b32 s38, s36
+; CHECK-NEXT: s_mov_b32 s39, s36
+; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12
+; CHECK-NEXT: s_cbranch_vccz .LBB0_5
+; CHECK-NEXT: s_branch .LBB0_6
entry:
%cmp5.i.i = icmp eq i32 %cmp5.i.i.arg, 0
br i1 %cmp5.i.i, label %if.end13.i.i, label %kernel_direct_lighting.exit
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index d61c4b46596c0..ce0b79b0b358c 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -848,12 +848,13 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX9-SDAG-NEXT: s_add_u32 s0, s0, s17
; GFX9-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-SDAG-NEXT: s_mov_b64 s[6:7], -1
; GFX9-SDAG-NEXT: s_mov_b32 s33, 0
-; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-SDAG-NEXT: s_mov_b32 s4, 0
-; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_6
+; GFX9-SDAG-NEXT: s_movk_i32 s32, 0x1000
+; GFX9-SDAG-NEXT: s_cbranch_scc0 .LBB7_4
; GFX9-SDAG-NEXT: ; %bb.1: ; %bb.1
; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
@@ -873,8 +874,11 @@ define amdgpu_kernel void @test_dynamic_stackalloc_kernel_control_flow(i32 %n, i
; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 1
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s6
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_cbranch_execnz .LB...
[truncated]
|
725de8f
to
0846b5c
Compare
2001146
to
fdcf246
Compare
0846b5c
to
f95740d
Compare
fdcf246
to
40b923a
Compare
f95740d
to
955685d
Compare
40b923a
to
cebcc2f
Compare
955685d
to
ddb2ff2
Compare
cebcc2f
to
95af9b3
Compare
Lower it slightly below the likelihood of a null-check being true, which is set to 37.5% (see PtrUntakenProb). Otherwise, the pass will split the edge and create another basic block with an unconditional branch, which might make the CFG more complex and result in suboptimal block placement. Note that if multiple instructions can be sunk from the same edge, then a split will occur regardless of this change.
ddb2ff2
to
f2418cc
Compare
Requires #128745.
Lower it slightly below the likelihood of a null-check being true, which is set to 37.5% (see PtrUntakenProb).
Otherwise, the pass will split the edge and create another basic block (an else clause which wasn't there to begin with) and an unconditional branch, which makes the CFG more complex and can result in suboptimal block placement.
Note that if multiple instructions can be sunk from the same edge, then a split will occur regardless of this change.
On M4 Pro:
On Ryzen9 5950X:
I looked into the disassembly of
BM_MemCmp<1, GreaterThanZero, None>
in MemFunctions.test
and it has not changed.