-
Notifications
You must be signed in to change notification settings - Fork 13.3k
DAG: Fix vector bin op scalarize defining a partially undef vector #122459
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DAG: Fix vector bin op scalarize defining a partially undef vector #122459
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
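@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-selectiondag Author: Matt Arsenault (arsenm) Changes: This avoids some of the pending regressions after AMDGPU implements isExtractVecEltCheap. In a case like shl <value, undef>, splat k, because the second operand was fully defined, we would fall through and use the splat value for the first operand, losing the undef high bits. This would result in an additional instruction to handle the high bits. Add some reduced testcases for different opcodes for one of the regressions. Full diff: https://github.com/llvm/llvm-project/pull/122459.diff 2 Files Affected: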
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index da3c834417d6b2..712e52ee8fc921 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27525,8 +27525,12 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
// If all lanes but 1 are undefined, no need to splat the scalar result.
// TODO: Keep track of undefs and use that info in the general case.
if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
- count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
- count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
+ // This is assuming if either input is undef, the result will fold out.
+ //
+ // TODO: Do we need to check if the opcode/operand propagates undef?
+ // Should we ignore operation identity values?
+ ((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) ||
+ (count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) {
// bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
// build_vec ..undef, (bo X, Y), undef...
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index aa3e05fdbdb36a..02e30b6c68e994 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -156,3 +156,334 @@ define <2 x i16> @trunc_v2i64_arg_to_v2i16(<2 x i64> %arg0) #0 {
%trunc = trunc <2 x i64> %arg0 to <2 x i16>
ret <2 x i16> %trunc
}
+
+; Test for regression where an unnecessary v_alignbit_b32 was inserted
+; on the final result, due to losing the fact that the upper half of
+; the lhs vector was undef.
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = lshr <2 x i32> %undef.hi.elt, splat (i32 16)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshr_b32_e32 v0, 16, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_lshr_rhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e64 v0, v0, 16
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = lshr <2 x i32> splat (i32 16), %undef.hi.elt
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %ashr = ashr <2 x i32> %undef.hi.elt, splat (i32 16)
+ %trunc = trunc <2 x i32> %ashr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_ashr_i32_e32 v0, -4, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_ashr_rhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_ashrrev_i32_e64 v0, v0, -4
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = ashr <2 x i32> splat (i32 -4), %undef.hi.elt
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_add_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_add_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = add <2 x i32> %undef.hi.elt, splat (i32 16)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_rhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshl_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_rhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = shl <2 x i32> splat (i32 2), %undef.hi.elt
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sub_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_add_i32_e32 v0, vcc, -16, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sub_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = sub <2 x i32> %undef.hi.elt, splat (i32 16)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0xffff
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_xor_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_xor_b32_e32 v0, 17, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_xor_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_xor_b32_e32 v0, 17, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = xor <2 x i32> %undef.hi.elt, splat (i32 17)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_shl_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xfffc, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_shl_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 2, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %shl = shl <2 x i32> %undef.hi.elt, splat (i32 2)
+ %trunc = trunc <2 x i32> %shl to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_lo_u32 v0, v0, 18
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_lo_u32 v0, v0, 18
+; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0x38e38e39
+; SI-NEXT: v_mul_hi_i32 v0, v0, s4
+; SI-NEXT: v_lshrrev_b32_e32 v1, 31, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_sdiv_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x38e38e39
+; VI-NEXT: v_mul_hi_i32 v0, v0, s4
+; VI-NEXT: v_lshrrev_b32_e32 v1, 31, v0
+; VI-NEXT: v_ashrrev_i32_e32 v0, 2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = sdiv <2 x i32> %undef.hi.elt, splat (i32 18)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_srem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0x38e38e39
+; SI-NEXT: v_mul_hi_i32 v1, v0, s4
+; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; SI-NEXT: v_mul_lo_u32 v1, v1, 18
+; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_srem_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x38e38e39
+; VI-NEXT: v_mul_hi_i32 v1, v0, s4
+; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; VI-NEXT: v_ashrrev_i32_e32 v1, 2, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; VI-NEXT: v_mul_lo_u32 v1, v1, 18
+; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = srem <2 x i32> %undef.hi.elt, splat (i32 18)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+
+define <2 x i16> @vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0x38e38e39
+; SI-NEXT: v_mul_hi_u32 v0, v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: v_bfe_u32 v0, v0, 2, 16
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_udiv_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x38e38e39
+; VI-NEXT: v_mul_hi_u32 v0, v0, s4
+; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = udiv <2 x i32> %undef.hi.elt, splat (i32 18)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
+
+define <2 x i16> @vector_trunc_high_bits_undef_urem_lhs_alignbit_regression(i32 %arg0) {
+; SI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s4, 0x38e38e39
+; SI-NEXT: v_mul_hi_u32 v1, v0, s4
+; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; SI-NEXT: v_mul_lo_u32 v1, v1, 18
+; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: vector_trunc_high_bits_undef_urem_lhs_alignbit_regression:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x38e38e39
+; VI-NEXT: v_mul_hi_u32 v1, v0, s4
+; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
+; VI-NEXT: v_mul_lo_u32 v1, v1, 18
+; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+ %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0
+ %lshr = urem <2 x i32> %undef.hi.elt, splat (i32 18)
+ %trunc = trunc <2 x i32> %lshr to <2 x i16>
+ ret <2 x i16> %trunc
+}
|
You can test this locally with the following command: git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 4c2e4ea18fd0031636994cf81fd03d82f59b7d27 ea735413cd1f49e142f9c618c7940b9835adcfea llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/test/CodeGen/AMDGPU/trunc-combine.ll The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
} Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
} Please refer to the Undefined Behavior Manual for more information. |
52e6f81
to
d12bd25
Compare
// TODO: Do we need to check if the opcode/operand propagates undef? | ||
// Should we ignore operation identity values? | ||
((count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) || | ||
(count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The opcode specific undef handling does worry me - how well does this work if we completely scalarize all elements and leave getNode() to handle the undefs / constant handling etc? Similar to what getKnownUndefForVectorBinop does in TargetLowering
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't actually know why this combine is doing any of this. It works best to skip the whole special handling, and just use INSERT_VECTOR_ELT
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, in the next commit I find a miscompile from this; something is wrong.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just doing getNode and checking if the result folded to undef seems to work. I do see some regressions in the test I was looking at, but I think these are actually undef handling bug fixes (e.g. xor undef, undef was previously assumed to return undef but we fold that to 0)
0fd42cf
to
4c6f15f
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with one minor (update comments)
SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC); | ||
SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC); | ||
SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags()); | ||
|
||
// If all lanes but 1 are undefined, no need to splat the scalar result. | ||
// TODO: Keep track of undefs and use that info in the general case. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think both the comment and TODO need updating
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't check for all-but-one undef lanes though anymore?
This avoids some of the pending regressions after AMDGPU implements isExtractVecEltCheap. In a case like shl <value, undef>, splat k, because the second operand was fully defined, we would fall through and use the splat value for the first operand, losing the undef high bits. This would result in an additional instruction to handle the high bits. Add some reduced testcases for different opcodes for one of the regressions.
It folds to undef or a constant and seems to have the best result anyway rather than overdefining by splatting the variable value.
ea73541
to
c41456e
Compare
This avoids some of the pending regressions after AMDGPU implements
isExtractVecEltCheap.
In a case like shl <value, undef>, splat k, because the second operand
was fully defined, we would fall through and use the splat value for the
first operand, losing the undef high bits. This would result in an additional
instruction to handle the high bits. Add some reduced testcases for different
opcodes for one of the regressions.