Skip to content

Commit 620794b

Browse files
committed
Adjust DemandedElts
1 parent 6846661 commit 620794b

File tree

3 files changed

+99
-23
lines changed

3 files changed

+99
-23
lines changed

Diff for: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

+27-6
Original file line numberDiff line numberDiff line change
@@ -5632,9 +5632,6 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
56325632
(SNaN && !C->getValueAPF().isSignaling());
56335633
}
56345634

5635-
if (Op.isUndef())
5636-
return true;
5637-
56385635
unsigned Opcode = Op.getOpcode();
56395636
switch (Opcode) {
56405637
case ISD::FADD:
@@ -5755,9 +5752,33 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
57555752
}
57565753
return isKnownNeverNaN(Src, SNaN, Depth + 1);
57575754
}
5758-
case ISD::INSERT_SUBVECTOR:
5759-
return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5760-
isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5755+
case ISD::INSERT_SUBVECTOR: {
5756+
SDValue BaseVector = Op.getOperand(0);
5757+
SDValue SubVector = Op.getOperand(1);
5758+
EVT BaseVectorVT = BaseVector.getValueType();
5759+
if (BaseVectorVT.isFixedLengthVector()) {
5760+
unsigned Idx = Op.getConstantOperandVal(2);
5761+
unsigned NumBaseVectorElts = BaseVectorVT.getVectorNumElements();
5762+
unsigned NumSubVectorElts =
5763+
SubVector.getValueType().getVectorNumElements();
5764+
5765+
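// Intent: clear the bits at the position where the subvector will be
// inserted, so that only the base-vector lanes that survive the insert
// remain demanded. NOTE(review): APInt::reverseBits() mirrors the bit order
// of the mask; it does not complement it. The two only coincide for
// symmetric layouts (e.g. a v2 inserted into a v4 at index 0 or 2) -
// confirm whether a bitwise NOT of the shifted mask was meant here.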
5766+
APInt DemandedMask = APInt::getAllOnes(NumSubVectorElts)
5767+
.zext(NumBaseVectorElts)
5768+
.shl(Idx)
5769+
.reverseBits();
5770+
APInt DemandedSrcElts = DemandedElts & DemandedMask;
5771+
5772+
// If DemandedSrcElts is zero, we only need to check that the subvector is
5773+
// never NaN.
5774+
if (DemandedSrcElts.isZero())
5775+
return isKnownNeverNaN(SubVector, SNaN, Depth + 1);
5776+
return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
5777+
isKnownNeverNaN(SubVector, SNaN, Depth + 1);
5778+
}
5779+
return isKnownNeverNaN(BaseVector, SNaN, Depth + 1) &&
5780+
isKnownNeverNaN(SubVector, SNaN, Depth + 1);
5781+
}
57615782
case ISD::BUILD_VECTOR: {
57625783
unsigned NumElts = Op.getNumOperands();
57635784
for (unsigned I = 0; I != NumElts; ++I)

Diff for: llvm/test/CodeGen/AMDGPU/clamp.ll

+17-17
Original file line numberDiff line numberDiff line change
@@ -2986,14 +2986,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
29862986
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
29872987
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
29882988
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
2989+
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
29892990
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
29902991
; GFX6-NEXT: s_waitcnt vmcnt(0)
29912992
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2992-
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
29932993
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
2994-
; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
2995-
; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
2996-
; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
2994+
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
2995+
; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
2996+
; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
29972997
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
29982998
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
29992999
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3006,20 +3006,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
30063006
; GFX8: ; %bb.0:
30073007
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
30083008
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3009+
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
30093010
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
30103011
; GFX8-NEXT: v_mov_b32_e32 v1, s3
30113012
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
30123013
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
30133014
; GFX8-NEXT: flat_load_dword v3, v[0:1]
30143015
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3015-
; GFX8-NEXT: v_mov_b32_e32 v4, s0
30163016
; GFX8-NEXT: v_mov_b32_e32 v1, s1
30173017
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
30183018
; GFX8-NEXT: s_waitcnt vmcnt(0)
30193019
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
30203020
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
30213021
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
3022-
; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
3022+
; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
30233023
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
30243024
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
30253025
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -3747,16 +3747,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
37473747
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
37483748
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
37493749
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3750+
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
37503751
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
37513752
; GFX6-NEXT: s_waitcnt vmcnt(0)
37523753
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
37533754
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
37543755
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
3755-
; GFX6-NEXT: v_max_f32_e32 v3, s0, v3
3756-
; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
3756+
; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
37573757
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
37583758
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
3759-
; GFX6-NEXT: v_min_f32_e32 v2, s0, v2
3759+
; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
37603760
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
37613761
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
37623762
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -3779,9 +3779,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
37793779
; GFX8-NEXT: s_waitcnt vmcnt(0)
37803780
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
37813781
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
3782-
; GFX8-NEXT: v_max_f16_e32 v2, s0, v2
3782+
; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
37833783
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
3784-
; GFX8-NEXT: v_min_f16_e32 v3, s0, v3
3784+
; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
37853785
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
37863786
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
37873787
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3845,14 +3845,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
38453845
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
38463846
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
38473847
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
3848+
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
38483849
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
38493850
; GFX6-NEXT: s_waitcnt vmcnt(0)
38503851
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
3851-
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
38523852
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
3853-
; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
3854-
; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
3855-
; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
3853+
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
3854+
; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
3855+
; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
38563856
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
38573857
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
38583858
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3865,20 +3865,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
38653865
; GFX8: ; %bb.0:
38663866
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
38673867
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
3868+
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
38683869
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
38693870
; GFX8-NEXT: v_mov_b32_e32 v1, s3
38703871
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
38713872
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
38723873
; GFX8-NEXT: flat_load_dword v3, v[0:1]
38733874
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
3874-
; GFX8-NEXT: v_mov_b32_e32 v4, s0
38753875
; GFX8-NEXT: v_mov_b32_e32 v1, s1
38763876
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
38773877
; GFX8-NEXT: s_waitcnt vmcnt(0)
38783878
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38793879
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
38803880
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
3881-
; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
3881+
; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
38823882
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
38833883
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
38843884
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2

Diff for: llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll

+55
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
10571057
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
10581058
; SDAG-GFX1100-TRUE16: ; %bb.0:
10591059
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060+
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
1061+
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
1062+
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
1063+
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
1064+
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
1065+
; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
1066+
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v5, v6 op_sel_hi:[1,1,1] clamp
10601067
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10611068
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
10621069
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
@@ -1070,6 +1077,34 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
10701077
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
10711078
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
10721079
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1080+
; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0
1081+
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
1082+
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1083+
; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
1084+
; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
1085+
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
1086+
;
1087+
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
1088+
; SDAG-GFX900: ; %bb.0:
1089+
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1090+
; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
1091+
; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
1092+
; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
1093+
; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
1094+
; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
1095+
; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
1096+
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
1097+
;
1098+
; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
1099+
; SDAG-GFX906: ; %bb.0:
1100+
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1101+
; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
1102+
; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
1103+
; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
1104+
; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
1105+
; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
1106+
; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
1107+
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
10731108
;
10741109
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
10751110
; SDAG-VI: ; %bb.0:
@@ -1143,6 +1178,26 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
11431178
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
11441179
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
11451180
;
1181+
; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
1182+
; GISEL-GFX900: ; %bb.0:
1183+
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1184+
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
1185+
; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
1186+
; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
1187+
; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
1188+
; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
1189+
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
1190+
;
1191+
; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
1192+
; GISEL-GFX906: ; %bb.0:
1193+
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1194+
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
1195+
; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
1196+
; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
1197+
; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
1198+
; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
1199+
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
1200+
;
11461201
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
11471202
; GISEL-VI: ; %bb.0:
11481203
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

0 commit comments

Comments
 (0)