From 4566628a98447c863f90965aa5ff360d54a34c9b Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Thu, 12 Sep 2024 17:00:13 +0800 Subject: [PATCH 1/2] SelectionDAG: Support nofpclass Currently SelectionDAG ignores the nofpclass information from arguments. For example: define dso_local float @f(float noundef nofpclass(nan zero) %a, float noundef nofpclass(nan zero) %b) #0 { entry: %cond = tail call float @llvm.maximumnum.f32(float %a, float %b) ret float %cond } In SelectionDAG::isKnownNeverNaN, false is returned. TODO: 1) bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) needs to process hasNoSNaN; 2) bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) needs to process Zero and SignedZero. These 2 problems will be fixed with other PRs. --- llvm/include/llvm/CodeGen/SelectionDAG.h | 8 +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 33 +++++++++++-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 49 +++++++++++++++++-- .../SelectionDAG/SelectionDAGBuilder.cpp | 37 +++++++++++++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 4 ++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll | 1 - llvm/test/CodeGen/AMDGPU/reduction.ll | 28 +++-------- llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 41 +++++----------- 8 files changed, 142 insertions(+), 59 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index d6c2c36a0d482..1ee3e9382a246 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -2136,6 +2136,14 @@ class SelectionDAG { /// positive or negative zero. bool isKnownNeverZeroFloat(SDValue Op) const; + /// Test whether the given floating point SDValue is known to never be + /// positive zero. + bool isKnownNeverPosZeroFloat(SDValue Op) const; + + /// Test whether the given floating point SDValue is known to never be + /// negative zero. 
+ bool isKnownNeverNegZeroFloat(SDValue Op) const; + /// Test whether the given SDValue is known to contain non-zero value(s). bool isKnownNeverZero(SDValue Op, unsigned Depth = 0) const; diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 6067b3b29ea18..737927aba67ed 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -383,6 +383,7 @@ struct SDNodeFlags { bool Exact : 1; bool Disjoint : 1; bool NonNeg : 1; + // deprecated: Use NoQNaNs && NoSNaNs bool NoNaNs : 1; bool NoInfs : 1; bool NoSignedZeros : 1; @@ -400,6 +401,11 @@ struct SDNodeFlags { // Instructions with attached 'unpredictable' metadata on IR level. bool Unpredictable : 1; + bool NoQNaNs : 1; + bool NoSNaNs : 1; + bool NoPosZeros : 1; + bool NoNegZeros : 1; + public: /// Default constructor turns off all optimization flags. SDNodeFlags() @@ -407,12 +413,15 @@ struct SDNodeFlags { Disjoint(false), NonNeg(false), NoNaNs(false), NoInfs(false), NoSignedZeros(false), AllowReciprocal(false), AllowContract(false), ApproximateFuncs(false), AllowReassociation(false), NoFPExcept(false), - Unpredictable(false) {} + Unpredictable(false), NoQNaNs(false), NoSNaNs(false), NoPosZeros(false), + NoNegZeros(false) {} /// Propagate the fast-math-flags from an IR FPMathOperator. 
void copyFMF(const FPMathOperator &FPMO) { - setNoNaNs(FPMO.hasNoNaNs()); + setNoSNaNs(FPMO.hasNoNaNs()); + setNoQNaNs(FPMO.hasNoNaNs()); setNoInfs(FPMO.hasNoInfs()); + setNoNegZeros(FPMO.hasNoSignedZeros()); setNoSignedZeros(FPMO.hasNoSignedZeros()); setAllowReciprocal(FPMO.hasAllowReciprocal()); setAllowContract(FPMO.hasAllowContract()); @@ -426,8 +435,20 @@ struct SDNodeFlags { void setExact(bool b) { Exact = b; } void setDisjoint(bool b) { Disjoint = b; } void setNonNeg(bool b) { NonNeg = b; } - void setNoNaNs(bool b) { NoNaNs = b; } + [[deprecated("Use SetSNaNs() and SetQNaNs()")]] void setNoNaNs(bool b) { + NoNaNs = NoQNaNs = NoSNaNs = b; + } + void setNoQNaNs(bool b) { + NoQNaNs = b; + NoNaNs = (NoQNaNs && NoSNaNs); + } + void setNoSNaNs(bool b) { + NoSNaNs = b; + NoNaNs = (NoQNaNs && NoSNaNs); + } void setNoInfs(bool b) { NoInfs = b; } + void setNoPosZeros(bool b) { NoPosZeros = b; } + void setNoNegZeros(bool b) { NoNegZeros = b; } void setNoSignedZeros(bool b) { NoSignedZeros = b; } void setAllowReciprocal(bool b) { AllowReciprocal = b; } void setAllowContract(bool b) { AllowContract = b; } @@ -442,8 +463,12 @@ struct SDNodeFlags { bool hasExact() const { return Exact; } bool hasDisjoint() const { return Disjoint; } bool hasNonNeg() const { return NonNeg; } - bool hasNoNaNs() const { return NoNaNs; } + bool hasNoNaNs() const { return (NoSNaNs && NoQNaNs); } + bool hasNoSNaNs() const { return NoSNaNs; } + bool hasNoQNaNs() const { return NoQNaNs; } bool hasNoInfs() const { return NoInfs; } + bool hasNoPosZeros() const { return NoPosZeros; } + bool hasNoNegZeros() const { return NoNegZeros; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } bool hasAllowContract() const { return AllowContract; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9b96dbb666198..9d61567a28364 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5435,7 +5435,12 @@ bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const { bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const { // If we're told that NaNs won't happen, assume they won't. - if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs()) + if (getTarget().Options.NoNaNsFPMath) + return true; + SDNodeFlags OpFlags = Op->getFlags(); + if (SNaN && OpFlags.hasNoSNaNs()) + return true; + if (OpFlags.hasNoSNaNs() && OpFlags.hasNoQNaNs()) return true; if (Depth >= MaxRecursionDepth) @@ -5569,11 +5574,39 @@ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const { assert(Op.getValueType().isFloatingPoint() && "Floating point type expected"); + SDNodeFlags OpFlags = Op->getFlags(); + if (OpFlags.hasNoPosZeros() && OpFlags.hasNoNegZeros()) + return true; + // If the value is a constant, we can obviously see if it is a zero or not. return ISD::matchUnaryFpPredicate( Op, [](ConstantFPSDNode *C) { return !C->isZero(); }); } +bool SelectionDAG::isKnownNeverPosZeroFloat(SDValue Op) const { + assert(Op.getValueType().isFloatingPoint() && "Floating point type expected"); + + SDNodeFlags OpFlags = Op->getFlags(); + if (OpFlags.hasNoPosZeros()) + return true; + + // If the value is a constant, we can obviously see if it is a zero or not. + return ISD::matchUnaryFpPredicate( + Op, [](ConstantFPSDNode *C) { return !C->isZero() || C->isNegative(); }); +} + +bool SelectionDAG::isKnownNeverNegZeroFloat(SDValue Op) const { + assert(Op.getValueType().isFloatingPoint() && "Floating point type expected"); + + SDNodeFlags OpFlags = Op->getFlags(); + if (OpFlags.hasNoNegZeros()) + return true; + + // If the value is a constant, we can obviously see if it is a zero or not. 
+ return ISD::matchUnaryFpPredicate( + Op, [](ConstantFPSDNode *C) { return !C->isZero() || !C->isNegative(); }); +} + bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const { if (Depth >= MaxRecursionDepth) return false; // Limit search depth. @@ -7490,6 +7523,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N2.getOpcode() != ISD::DELETED_NODE && N3.getOpcode() != ISD::DELETED_NODE && "Operand is DELETED_NODE!"); + SDNodeFlags NewFlags = Flags; // Perform various simplifications. switch (Opcode) { case ISD::FMA: @@ -7535,6 +7569,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert((!VT.isVector() || VT.getVectorElementCount() == N1.getValueType().getVectorElementCount()) && "SETCC vector element counts must match!"); + if (N1->getFlags().hasNoNaNs() && N2->getFlags().hasNoNaNs()) { + NewFlags.setNoQNaNs(true); + NewFlags.setNoSNaNs(true); + } // Use FoldSetCC to simplify SETCC's. if (SDValue V = FoldSetCC(VT, N1, N2, cast(N3)->get(), DL)) return V; @@ -7548,6 +7586,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::SELECT: case ISD::VSELECT: + if ((N1->getFlags().hasNoNaNs() && N2->getFlags().hasNoNaNs()) || + N3->getFlags().hasNoNaNs()) { + NewFlags.setNoQNaNs(true); + NewFlags.setNoSNaNs(true); + } if (SDValue V = simplifySelect(N1, N2, N3)) return V; break; @@ -7654,12 +7697,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, AddNodeIDNode(ID, Opcode, VTs, Ops); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) { - E->intersectFlagsWith(Flags); + E->intersectFlagsWith(NewFlags); return SDValue(E, 0); } N = newSDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs); - N->setFlags(Flags); + N->setFlags(NewFlags); createOperands(N, Ops); CSEMap.InsertNode(N, IP); } else { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 
25213f587116d..0bfb0c14dd902 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3708,8 +3708,24 @@ void SelectionDAGBuilder::visitSelect(const User &I) { bool Negate = false; SDNodeFlags Flags; - if (auto *FPOp = dyn_cast(&I)) + SelectInst *NewI = dyn_cast(cast(I).clone()); + if (auto *FPOp = dyn_cast(&I)) { Flags.copyFMF(*FPOp); + if (Cond->getFlags().hasNoNaNs() || + (LHSVal->getFlags().hasNoNaNs() && RHSVal->getFlags().hasNoNaNs())) { + FastMathFlags FMF = FPOp->getFastMathFlags(); + FMF.setNoNaNs(true); + NewI->setFastMathFlags(FMF); + CmpInst *CmpCond = dyn_cast(NewI->getCondition()); + if (isa(CmpCond)) { + FastMathFlags CondFMF = CmpCond->getFastMathFlags(); + CondFMF.setNoNaNs(true); + CmpCond->setFastMathFlags(CondFMF); + } + Flags.setNoQNaNs(true); + Flags.setNoSNaNs(true); + } + } Flags.setUnpredictable( cast(I).getMetadata(LLVMContext::MD_unpredictable)); @@ -3735,7 +3751,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) { // so we can't lower to FMINIMUM/FMAXIMUM because those nodes specify that // -0.0 is less than +0.0. 
const Value *LHS, *RHS; - auto SPR = matchSelectPattern(&I, LHS, RHS); + auto SPR = matchSelectPattern(NewI, LHS, RHS); ISD::NodeType Opc = ISD::DELETED_NODE; switch (SPR.Flavor) { case SPF_UMAX: Opc = ISD::UMAX; break; @@ -3798,6 +3814,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) { BaseOps.clear(); } } + NewI->deleteValue(); if (IsUnaryAbs) { for (unsigned i = 0; i != NumValues; ++i) { @@ -11775,6 +11792,22 @@ void SelectionDAGISel::LowerArguments(const Function &F) { AssertOp = ISD::AssertSext; else if (Arg.hasAttribute(Attribute::ZExt)) AssertOp = ISD::AssertZext; + if (Arg.hasAttribute(Attribute::NoFPClass)) { + SDNodeFlags InValFlags = InVals[i]->getFlags(); + bool NoSNaN = ((Arg.getNoFPClass() & llvm::fcSNan) == llvm::fcSNan); + bool NoQNaN = ((Arg.getNoFPClass() & llvm::fcQNan) == llvm::fcQNan); + InValFlags.setNoSNaNs(NoSNaN); + InValFlags.setNoQNaNs(NoQNaN); + bool NoPosZeros = + ((Arg.getNoFPClass() & llvm::fcPosZero) == llvm::fcPosZero); + bool NoNegZeros = + ((Arg.getNoFPClass() & llvm::fcNegZero) == llvm::fcNegZero); + InValFlags.setNoPosZeros(NoPosZeros); + InValFlags.setNoNegZeros(NoNegZeros); + InValFlags.setNoInfs((Arg.getNoFPClass() & llvm::fcInf) == + llvm::fcInf); + InVals[i]->setFlags(InValFlags); + } ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, PartVT, VT, nullptr, NewRoot, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a2a232ed93b72..617946e4bc2a7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8422,6 +8422,10 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node, Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) && "Wrong opcode"); + EVT VT = Node->getValueType(0); + if (VT.isVector() && isOperationLegal(Opcode, VT.getScalarType())) + return SDValue(); + if (Node->getFlags().hasNoNaNs()) { ISD::CondCode Pred = Opcode == ISD::FMINNUM ? 
ISD::SETLT : ISD::SETGT; SDValue Op1 = Node->getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll index 64948c374e4dd..10cab8f4a6529 100644 --- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -248,7 +248,6 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b ; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] %b.nnan.add = fadd nnan float %b, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 53a036b617725..1663330bd1b9f 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -498,18 +498,11 @@ entry: ; XVI-NEXT: s_setpc_b64 ; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 -; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 -; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} +; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], v0, v1{{$}} ; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 -; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 - -; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] -; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] +; VI-DAG: v_max_f16_sdwa [[MAX0:v[0-9]+]], v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], v0, v1 ; VI: 
v_max_f16_e32 v0, [[MAX1]], [[MAX0]] define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { entry: @@ -537,19 +530,12 @@ entry: ; XVI-NEXT: s_setpc_b64 ; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1 -; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0 -; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}} +; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], v0, v1{{$}} ; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0 -; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1 - -; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]] -; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]] -; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]] +; VI-DAG: v_min_f16_sdwa [[MIN0:v[0-9]+]], v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_min_f16_e32 [[MIN1:v[0-9]+]], v0, v1 +; VI: v_min_f16_e32 v0, [[MIN1]], [[MIN0]] define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index 41d9a867c0a96..fde92a0605e5d 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -411,23 +411,16 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; SSE2-NEXT: divss %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: js .LBB9_2 -; SSE2-NEXT: # %bb.1: -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: .LBB9_2: -; 
SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: cmpunordss %xmm3, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: andps %xmm3, %xmm4 -; SSE2-NEXT: js .LBB9_4 -; SSE2-NEXT: # %bb.3: +; SSE2-NEXT: js .LBB9_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: jmp .LBB9_3 +; SSE2-NEXT: .LBB9_1: +; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: .LBB9_4: -; SSE2-NEXT: maxss %xmm1, %xmm3 -; SSE2-NEXT: andnps %xmm3, %xmm2 -; SSE2-NEXT: orps %xmm4, %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: .LBB9_3: +; SSE2-NEXT: maxss %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: test_fmaximum_combine_cmps: @@ -437,15 +430,11 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: js .LBB9_1 ; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vmovaps %xmm0, %xmm2 -; AVX1-NEXT: jmp .LBB9_3 +; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq ; AVX1-NEXT: .LBB9_1: ; AVX1-NEXT: vmovaps %xmm1, %xmm2 -; AVX1-NEXT: vmovaps %xmm0, %xmm1 -; AVX1-NEXT: .LBB9_3: -; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX512F-LABEL: test_fmaximum_combine_cmps: @@ -459,8 +448,6 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} ; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} ; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0 -; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1 -; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_fmaximum_combine_cmps: @@ -490,9 +477,7 @@ define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { ; X86-NEXT: vmovaps %xmm0, %xmm2 ; X86-NEXT: vmovaps %xmm1, %xmm0 ; X86-NEXT: .LBB9_3: -; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 -; X86-NEXT: vcmpunordss %xmm0, %xmm0, 
%xmm2 -; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax From 41e074082e00ed93dc7ac7855672e4554b9e03ed Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Sun, 22 Sep 2024 14:26:41 +0800 Subject: [PATCH 2/2] Add fcmp+select testcase --- llvm/test/CodeGen/X86/fcmp-nofpclass.ll | 118 ++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 llvm/test/CodeGen/X86/fcmp-nofpclass.ll diff --git a/llvm/test/CodeGen/X86/fcmp-nofpclass.ll b/llvm/test/CodeGen/X86/fcmp-nofpclass.ll new file mode 100644 index 0000000000000..e30b74197f781 --- /dev/null +++ b/llvm/test/CodeGen/X86/fcmp-nofpclass.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86 + +define dso_local noundef float @maxs(float noundef nofpclass(nan) %a, float noundef nofpclass(nan) %b) local_unnamed_addr #0 { +; SSE2-LABEL: maxs: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: maxs: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: maxs: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +entry: + %cmp = fcmp ogt float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + +define dso_local noundef <8 x float> @maxs_v8f32(<8 x float> noundef nofpclass(nan) %a, 
<8 x float> noundef nofpclass(nan) %b) local_unnamed_addr #0 { +; SSE2-LABEL: maxs_v8f32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: maxps %xmm2, %xmm0 +; SSE2-NEXT: maxps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; AVX-LABEL: maxs_v8f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; X86-LABEL: maxs_v8f32: +; X86: # %bb.0: # %entry +; X86-NEXT: vmaxps %ymm1, %ymm0, %ymm0 +; X86-NEXT: retl +entry: + %cmp = fcmp ogt <8 x float> %a, %b + %cond = select <8 x i1> %cmp, <8 x float> %a, <8 x float> %b + ret <8 x float> %cond +} + +define dso_local noundef float @maxd(float noundef nofpclass(nan) %a, float noundef nofpclass(nan) %b) local_unnamed_addr #0 { +; SSE2-LABEL: maxd: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: maxss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; AVX-LABEL: maxd: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: maxd: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vmovss %xmm0, (%esp) +; X86-NEXT: flds (%esp) +; X86-NEXT: popl %eax +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +entry: + %cmp = fcmp ogt float %a, %b + %cond = select i1 %cmp, float %a, float %b + ret float %cond +} + +define dso_local noundef <8 x double> @mind_v8f32(<8 x double> noundef nofpclass(nan) %a, <8 x double> noundef nofpclass(nan) %b) local_unnamed_addr #0 { +; SSE2-LABEL: mind_v8f32: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: maxpd %xmm4, %xmm0 +; SSE2-NEXT: maxpd %xmm5, %xmm1 +; SSE2-NEXT: maxpd %xmm6, %xmm2 +; SSE2-NEXT: maxpd %xmm7, %xmm3 +; SSE2-NEXT: retq +; +; AVX-LABEL: mind_v8f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vmaxpd %ymm3, %ymm1, %ymm1 +; AVX-NEXT: retq +; +; X86-LABEL: mind_v8f32: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: 
.cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-32, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmaxpd %ymm2, %ymm0, %ymm0 +; X86-NEXT: vmaxpd 8(%ebp), %ymm1, %ymm1 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +entry: + %cmp = fcmp ogt <8 x double> %a, %b + %cond = select <8 x i1> %cmp, <8 x double> %a, <8 x double> %b + ret <8 x double> %cond +}