Optimize count leading ones if promoted type #99591
Conversation
Force-pushed from 14b59c7 to 83b525e.
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-globalisel

Author: None (v01dXYZ)

Changes: First suggestions to solve #96455. Only tested with scalar types (X86, AArch64, RISCV); for SelectionDAG with AMDGPU, see the full PR description at the end of this page.

Full diff: https://github.com/llvm/llvm-project/pull/99591.diff

8 Files Affected:
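For context, the pattern this PR optimises — count leading ones written as ctlz of a bitwise NOT — has the following shape (taken from the tests added in this diff; comments added for illustration):

define i8 @ctlo_i8(i8 %x) {
  %tmp1 = xor i8 %x, -1                             ; bitwise NOT of %x
  %tmp2 = call i8 @llvm.ctlz.i8(i8 %tmp1, i1 false) ; leading zeros of ~%x == leading ones of %x
  ret i8 %tmp2
}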
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 22d0708f54786..37ba7d7b0b23e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2298,8 +2298,29 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
if (match(CountZeros->getOperand(1), m_One()))
return false;
- // If it's cheap to speculate, there's nothing to do.
Type *Ty = CountZeros->getType();
+ EVT VTy = TLI->getValueType(*DL, Ty);
+
+  // Do not despeculate (ctlz (xor op -1)) if the operand type is promoted,
+  // as legalisation would later transform it to:
+  //
+  //   (ctlz (xor (shl (anyext op) shiftamount)
+  //              -1))
+ //
+ // Despeculation is not only useless but also not wanted with SelectionDAG
+ // as XOR and CTLZ would be in different basic blocks.
+ ConstantInt *C;
+ Value *Op0;
+ Value *Op1;
+ if ((TLI->getTypeAction(CountZeros->getContext(), VTy) ==
+ TargetLowering::TypePromoteInteger ||
+ TLI->getOperationAction(ISD::CTLZ, VTy) == TargetLowering::Promote) &&
+ match(CountZeros->getOperand(0), m_Xor(m_Value(Op0), m_Value(Op1))) &&
+ ((C = dyn_cast<ConstantInt>(Op0)) || (C = dyn_cast<ConstantInt>(Op1))) &&
+ C->isMinusOne())
+ return false;
+
+ // If it's cheap to speculate, there's nothing to do.
auto IntrinsicID = CountZeros->getIntrinsicID();
if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3f1094e0ac703..a334b14e0cc8b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2356,6 +2356,35 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
return Legalized;
}
+static bool extendCtlzNot(const MachineInstr &MI, MachineIRBuilder &MIRBuilder,
+ MachineRegisterInfo &MRI, LLT WideTy) {
+ Register XorSrc;
+ Register CstReg;
+ if (!mi_match(MI.getOperand(1).getReg(), MRI,
+ m_GXor(m_Reg(XorSrc), m_Reg(CstReg))))
+ return false;
+
+  auto OptCst = getIConstantVRegValWithLookThrough(CstReg, MRI);
+  // Bail out unless the xor mask is a constant all-ones value (i.e. a NOT);
+  // dereferencing the optional without this check would be unsafe.
+  if (!OptCst || !OptCst->Value.isAllOnes())
+    return false;
+
+ auto AllOnes = MIRBuilder.buildConstant(
+ WideTy, APInt::getAllOnes(WideTy.getSizeInBits()));
+ auto Res = MIRBuilder.buildAnyExt(WideTy, XorSrc);
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT CurTy = MRI.getType(SrcReg);
+ unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
+ Res = MIRBuilder.buildShl(WideTy, Res,
+ MIRBuilder.buildConstant(WideTy, SizeDiff));
+ Res = MIRBuilder.buildXor(WideTy, Res, AllOnes);
+ Res = MIRBuilder.buildCTLZ_ZERO_UNDEF(MI.getOperand(0), Res);
+
+ return true;
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
switch (MI.getOpcode()) {
@@ -2449,6 +2478,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
LLT CurTy = MRI.getType(SrcReg);
unsigned NewOpc = MI.getOpcode();
+
+ if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
+ MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
+ extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
+ MI.eraseFromParent();
+ return Legalized;
+ }
if (NewOpc == TargetOpcode::G_CTTZ) {
// The count is the same in the larger type except if the original
// value was zero. This can be handled by setting the bit just off
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d6a0dd9ae9b20..f062ad2543dd9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -5049,6 +5049,40 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
return MidVT;
}
+// (CTLZ (XOR Op -1)) --> (TRUNCATE (CTLZ_ZERO_UNDEF
+// (XOR (SHIFT (ANYEXTEND Op1)
+// ShiftAmount)
+// -1)))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, MVT OVT,
+ MVT NVT, SelectionDAG &DAG) {
+ SDValue NotOp = Node->getOperand(0);
+ if (NotOp.getOpcode() != ISD::XOR)
+ return false;
+
+ SDValue SrcOp = NotOp->getOperand(0);
+ SDValue CstOp = NotOp->getOperand(1);
+
+ ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOp);
+
+ if (!Cst || !Cst->isAllOnes())
+ return false;
+
+ auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ auto ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+ SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+
+ SDValue NCstOp =
+ DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+
+ Result = DAG.getNode(NotOp->getOpcode(), dl, NVT, NSrcOp, NCstOp,
+ NotOp->getFlags());
+ Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
+ Result = DAG.getNode(ISD::TRUNCATE, dl, OVT, Result);
+ return true;
+}
+
void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
LLVM_DEBUG(dbgs() << "Trying to promote node\n");
SmallVector<SDValue, 8> Results;
@@ -5084,6 +5118,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
case ISD::CTPOP: {
+ // If the operand of CTLZ is NOT, push the extend in the NOT.
+ if (Node->getOpcode() == ISD::CTLZ &&
+ ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+ Results.push_back(Tmp1);
+ break;
+ }
+
// Zero extend the argument unless its cttz, then use any_extend.
if (Node->getOpcode() == ISD::CTTZ ||
Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
@@ -5115,6 +5156,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
break;
}
case ISD::CTLZ_ZERO_UNDEF: {
+ if (ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
+ Results.push_back(Tmp1);
+ break;
+ }
// We know that the argument is unlikely to be zero, hence we can take a
// different approach as compared to ISD::CTLZ
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index fed5ebcc3c903..e55fed038f576 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -638,6 +638,47 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
return Result;
}
+// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op1)
+// ShiftAmount)
+// -1))
+static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, EVT OVT,
+ EVT NVT, SelectionDAG &DAG) {
+ SDValue NotOp = Node->getOperand(0);
+ if (NotOp.getOpcode() != ISD::XOR)
+ return false;
+
+ SDValue SrcOp = NotOp->getOperand(0);
+ SDValue CstOp = NotOp->getOperand(1);
+
+ if (!isAllOnesOrAllOnesSplat(CstOp))
+ return false;
+
+ auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
+ unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ auto ShiftConst =
+ DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
+
+ SDValue NCstOp =
+ DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
+ if (!Node->isVPOpcode()) {
+ SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
+
+ Result = DAG.getNode(ISD::XOR, dl, NVT, NSrcOp, NCstOp);
+ Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
+ } else {
+ SDValue Mask = Node->getOperand(1);
+ SDValue EVL = Node->getOperand(2);
+
+ SDValue NSrcOp =
+ DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
+
+ Result = DAG.getNode(ISD::VP_XOR, dl, NVT, NSrcOp, NCstOp, Mask, EVL);
+ Result = DAG.getNode(ISD::VP_CTLZ_ZERO_UNDEF, dl, NVT, Result, Mask, EVL);
+ }
+
+ return true;
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
EVT OVT = N->getValueType(0);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
@@ -656,6 +697,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
}
unsigned CtlzOpcode = N->getOpcode();
+ // If the operand of CTLZ is NOT, push the extend in the NOT.
+ if (SDValue Res;
+ (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
+ CtlzOpcode == ISD::VP_CTLZ || CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) &&
+ ExtendCtlzNot(N, Res, dl, OVT, NVT, DAG)) {
+ return Res;
+ }
+
if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
// Subtract off the extra leading bits in the bigger type.
SDValue ExtractLeadingBits = DAG.getConstant(
diff --git a/llvm/test/CodeGen/AArch64/ctlo.ll b/llvm/test/CodeGen/AArch64/ctlo.ll
new file mode 100644
index 0000000000000..5f15f540f458d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ctlo.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @ctlo_i8(i8 %x) {
+; CHECK-LABEL: ctlo_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
+ ret i8 %tmp2
+}
+
+define i8 @ctlo_i8_undef(i8 %x) {
+; CHECK-LABEL: ctlo_i8_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #24
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i8 %x, -1
+ %tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
+ ret i8 %tmp2
+}
+
+define i16 @ctlo_i16(i16 %x) {
+; CHECK-LABEL: ctlo_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
+ ret i16 %tmp2
+}
+
+define i16 @ctlo_i16_undef(i16 %x) {
+; CHECK-LABEL: ctlo_i16_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #-1 // =0xffffffff
+; CHECK-NEXT: eor w8, w8, w0, lsl #16
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i16 %x, -1
+ %tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
+ ret i16 %tmp2
+}
+
+define i32 @ctlo_i32(i32 %x) {
+; CHECK-LABEL: ctlo_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
+ ret i32 %tmp2
+}
+
+define i32 @ctlo_i32_undef(i32 %x) {
+; CHECK-LABEL: ctlo_i32_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn w8, w0
+; CHECK-NEXT: clz w0, w8
+; CHECK-NEXT: ret
+ %tmp1 = xor i32 %x, -1
+ %tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
+ ret i32 %tmp2
+}
+
+define i64 @ctlo_i64(i64 %x) {
+; CHECK-LABEL: ctlo_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
+ ret i64 %tmp2
+}
+
+define i64 @ctlo_i64_undef(i64 %x) {
+; CHECK-LABEL: ctlo_i64_undef:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvn x8, x0
+; CHECK-NEXT: clz x0, x8
+; CHECK-NEXT: ret
+ %tmp1 = xor i64 %x, -1
+ %tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
+ ret i64 %tmp2
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index f17cec231f323..e993ecfcdf3b8 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i8:
; LA32: # %bb.0:
-; LA32-NEXT: ori $a1, $zero, 255
-; LA32-NEXT: andn $a0, $a1, $a0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -24
+; LA32-NEXT: slli.w $a0, $a0, 24
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i8:
; LA64: # %bb.0:
-; LA64-NEXT: ori $a1, $zero, 255
-; LA64-NEXT: andn $a0, $a1, $a0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -56
+; LA64-NEXT: slli.d $a0, $a0, 56
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i8 %a, -1
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
; LA32-LABEL: test_not_ctlz_i16:
; LA32: # %bb.0:
-; LA32-NEXT: nor $a0, $a0, $zero
-; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
-; LA32-NEXT: clz.w $a0, $a0
-; LA32-NEXT: addi.w $a0, $a0, -16
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: clo.w $a0, $a0
; LA32-NEXT: ret
;
; LA64-LABEL: test_not_ctlz_i16:
; LA64: # %bb.0:
-; LA64-NEXT: nor $a0, $a0, $zero
-; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
-; LA64-NEXT: clz.d $a0, $a0
-; LA64-NEXT: addi.d $a0, $a0, -48
+; LA64-NEXT: slli.d $a0, $a0, 48
+; LA64-NEXT: clo.d $a0, $a0
; LA64-NEXT: ret
%neg = xor i16 %a, -1
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
index 58882525e55c4..6f89489bb39d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll
@@ -2624,6 +2624,58 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va, i1 true, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i9> %v
}
+define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
+define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-NEXT: vnot.v v8, v8, v0.t
+; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
+; CHECK-NEXT: li a0, 142
+; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+;
+; CHECK-ZVBB-LABEL: vp_ctlo_zero_undef_nxv1i9:
+; CHECK-ZVBB: # %bb.0:
+; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
+; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
+; CHECK-ZVBB-NEXT: ret
+ %va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
+ %v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 true, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i9> %v
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 7431f94f0fdf2..020d6d1b80136 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
-; X86-CLZ-NEXT: addl $-24, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
-; X64-CLZ-NEXT: addl $-24, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1
@@ -89,20 +87,18 @@ define i8 @ctlo_i8_undef(i8 %x) {
;
; X86-CLZ-LABEL: ctlo_i8_undef:
; X86-CLZ: # %bb.0:
-; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT: notb %al
-; X86-CLZ-NEXT: movzbl %al, %eax
+; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-CLZ-NEXT: shll $24, %eax
+; X86-CLZ-NEXT: notl %eax
; X86-CLZ-NEXT: lzcntl %eax, %eax
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X86-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlo_i8_undef:
; X64-CLZ: # %bb.0:
-; X64-CLZ-NEXT: notb %dil
-; X64-CLZ-NEXT: movzbl %dil, %eax
-; X64-CLZ-NEXT: shll $24, %eax
-; X64-CLZ-NEXT: lzcntl %eax, %eax
+; X64-CLZ-NEXT: shll $24, %edi
+; X64-CLZ-NEXT: notl %edi
+; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
; X64-CLZ-NEXT: retq
%tmp1 = xor i8 %x, -1
It is a bit weird that this patch contains both CGP and SDAG changes. Can we split it into multiple patches?
Force-pushed from 83b525e to f6f2161.
Thanks for your review. Done.

The remaining task is the test for CGP (especially for vector types).
Force-pushed from 6b7bb51 to b6e18db.
Force-pushed from b6e18db to 7c23984.
…omotion For count leading/trailing ones, i.e. (CTLZ/CTTZ (XOR Op -1)), legalisation should be able to optimise this case when a promotion is necessary. Despeculation should not be applied here, as it would separate the XOR and the CTLZ/CTTZ into two different basic blocks. This is particularly problematic with SelectionDAG.
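To make the despeculation problem concrete, here is a rough, hand-written sketch (not part of the patch) of what despeculateCountZeros produces for a ctlz whose is-zero-poison flag is false. The xor stays in the entry block while the intrinsic is moved behind a zero check, so SelectionDAG, which works one basic block at a time, can no longer see the combined xor+ctlz pattern:

entry:
  %not = xor i16 %x, -1
  %cmpz = icmp eq i16 %not, 0
  br i1 %cmpz, label %cond.end, label %cond.false

cond.false:                                          ; ctlz now lives apart from the xor
  %clz = call i16 @llvm.ctlz.i16(i16 %not, i1 true)
  br label %cond.end

cond.end:
  %res = phi i16 [ 16, %entry ], [ %clz, %cond.false ]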
Force-pushed from 7c23984 to 4800bb4.
I added some tests for CGP covering two different cases.

There is still a remaining issue with X86 when LZCNT is not available, as DAG legalisation is custom for i8 (I tried to mark …).

Note also that there is no optimisation for counting trailing ones here, only for counting leading ones, although CGP takes both cases into account in this PR. Tell me if you think it is relevant to add it.
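For reference, the count-trailing-ones analogue that CGP now also recognises, but that legalisation does not yet optimise, would look like this (illustrative only; the function name is made up):

define i8 @ctto_i8(i8 %x) {
  %not = xor i8 %x, -1                          ; bitwise NOT
  %r = call i8 @llvm.cttz.i8(i8 %not, i1 false) ; trailing zeros of ~%x == trailing ones of %x
  ret i8 %r
}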
// Despeculation is not only useless but also not wanted with SelectionDAG
// as XOR and CTLZ/CTTZ would be in different basic blocks.
EVT VTy = TLI->getValueType(*DL, Ty);
int ISDOpcode = IntrinsicID == Intrinsic::ctlz ? ISD::CTLZ : ISD::CTTZ;
Do you need to account for the zero_or_undef operand?
No, because of the check above that early-exits if Operand[1] matches m_One().
(CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op) ShiftAmount) -1))

The optimisation also applies to CTLZ_ZERO_UNDEF, VP_CTLZ, and VP_CTLZ_ZERO_UNDEF.

Fixes llvm#96455
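A worked example of why the rewrite is sound (editor's illustration; i8 promoted to i32, so ShiftAmount = 24): take x = 0b11100101, whose leading-ones count is 3. (ANYEXTEND x) << 24 places x in the top 8 bits and fills the low 24 bits with zeros; XOR with -1 then leaves ~x = 0b00011010 in the top byte and all-ones below. CTLZ of that value counts the leading zeros of ~x within the top byte — exactly the leading ones of x, here 3. Because the low 24 bits are all ones, the result can never exceed 8, so the operand is never zero and the ZERO_UNDEF variant is safe without any guard.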
Force-pushed from 4800bb4 to 8b95ef8.
Ping
// Despeculation is not only useless but also not wanted with SelectionDAG
// as XOR and CTLZ/CTTZ would be in different basic blocks.
EVT VTy = TLI->getValueType(*DL, Ty);
int ISDOpcode = IntrinsicID == Intrinsic::ctlz ? ISD::CTLZ : ISD::CTTZ;
Why int instead of unsigned?
In this file, other parts use int ISDOpcode = ..., which is why I used int. But it seems there is no other place in the code base that stores an ISD::<Variant> value in an int.
; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
; CHECK-ZVBB-NEXT: ret
%va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
It would be very unusual to see a vp.ctlz and a regular xor. It would be more likely to be a vp.xor and a vp.ctlz.
I added support for vp.xor when it has the same mask and vector length as vp.ctlz. But I let a bug get through (not taking into account the possible case of (fixed) CTLZ with VP_XOR).
Maybe you want me to not try to match VP_CTLZ + XOR and only keep VP_CTLZ + VP_XOR?
> Maybe you want me to not try to match VP_CTLZ + XOR and only keep VP_CTLZ + VP_XOR?

Yeah, not much point in matching a mix of VP and non-VP. It won't scale well if we try to do that with every optimization.
Force-pushed from fefcd7a to 3adc473.
Force-pushed from 3adc473 to 2654579.
llvm/test/CodeGen/AArch64/ctlo.ll (Outdated)
define i8 @ctlo_i8_undef(i8 %x) {
; CHECK-LABEL: ctlo_i8_undef:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
This looks worse than the current codegen. Or at least not an obvious improvement.
ctlo_i8_undef: // @ctlo_i8_undef
.cfi_startproc
// %bb.0:
mvn w8, w0
lsl w8, w8, #24
clz w0, w8
ret
Have you investigated doing this as a DAGCombine after type legalization? It's very unusual to change type legalization behavior based on surrounding instructions.
@topperc Currently, the type legaliser not only extends the operands but also needs to add a subtract node. Using a DAGCombiner would mean detecting this pattern and reversing it, instead of trying to prevent its creation (as this PR does). I understand your point, though, as this PR adds another exception to type legalisation. I'm not experienced enough to have a worthy opinion about which one is better. I'll try to implement it with a DAGCombiner in another PR.
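For the record, the identity such a combine would reverse (editor's illustration for i8 promoted to i32): ctlz_i32(zext(~x)) - 24 == ctlz_i32(~((anyext x) << 24)). The left-hand side is what straightforward promotion emits — widen, count, then subtract the 24 extra leading zeros — and is where the extra subtract node comes from; the right-hand side needs no subtract, because shifting x into the top byte and inverting fills the low 24 bits with ones.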
Do not optimise for CTLZ_ZERO_UNDEF
Do not optimise for VP_CTLZ + XOR. That way, the non-VP and VP cases are more clearly separated, which is easier to follow.
… promotion (#102877) This PR is related to #99591. Instead of modifying how legalisation occurs depending on surrounding instructions, we refine after legalisation. This PR has two parts:

* `SDPatternMatch/MatchContext`: slightly modify the code for matching operands (used by `m_Node(...)`) and unary/binary/ternary patterns to make it compatible with `VPMatchContext`, where previously only `m_Opc` was supported. Some tests were added to ensure no regressions.
* `DAGCombiner`: add a `foldSubCtlzNot` which detects and rewrites the patterns using a matching context.

Remaining tasks:

- [ ] GlobalISel
- [ ] Currently the pattern matching occurs even before legalisation. Should I restrict it to specific stages instead?
- [ ] Style: add a `visitVP_SUB`? Move `foldSubCtlzNot` elsewhere for style consistency?

@topperc

---------

Co-authored-by: v01dxyz <[email protected]>
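In the s-expression notation used by the comments in this patch, the combine in #102877 reverses the promoted form roughly as follows (a sketch only; the exact DAG pattern matched by foldSubCtlzNot differs in detail, since it must see through whatever masking the legaliser actually emitted):

(SUB (CTLZ (ZERO_EXTEND (XOR Op -1))) Delta)
  --> (CTLZ_ZERO_UNDEF (XOR (SHL (ANYEXTEND Op) Delta) -1))

where Delta is the difference in bit width between the promoted and the original type.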
First suggestions to solve #96455.

Tested with:
- X86
- AArch64
- RISCV (and VP_)

For SelectionDAG with AMDGPU, I tried to replace the custom legaliser for CTLZ (which is necessary to convert it to i32 instead of i16 when there is no custom legaliser), but I didn't manage to get it to work when there is an ANYEXTEND user. That's why the commit is reverted. GlobalISel works fine though.