Skip to content

Commit 83b525e

Browse files
author
v01dxyz
committed
[CodeGen] Legalisation with promotion: optimise count leading ones
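When the operand is promoted, rewrite (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHL (ANYEXTEND Op) ShiftAmount) -1)), where ShiftAmount is the difference in bit width between the promoted type and the original type. The optimisation also applies to CTLZ_ZERO_UNDEF, VP_CTLZ and VP_CTLZ_ZERO_UNDEF. Fixes #96455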
1 parent 626c7ce commit 83b525e

File tree

8 files changed

+323
-32
lines changed

8 files changed

+323
-32
lines changed

Diff for: llvm/lib/CodeGen/CodeGenPrepare.cpp

+22-1
Original file line numberDiff line numberDiff line change
@@ -2298,8 +2298,29 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
22982298
if (match(CountZeros->getOperand(1), m_One()))
22992299
return false;
23002300

2301-
// If it's cheap to speculate, there's nothing to do.
23022301
Type *Ty = CountZeros->getType();
2302+
EVT VTy = TLI->getValueType(*DL, Ty);
2303+
2304+
// do not despeculate if we have (ctlz (xor op -1)) if the operand is
2305+
// promoted as legalisation would later transform to:
2306+
//
2307+
// (ctlz (lshift (xor (extend op) -1)
2308+
// lshiftamount))
2309+
//
2310+
// Despeculation is not only useless but also not wanted with SelectionDAG
2311+
// as XOR and CTLZ would be in different basic blocks.
2312+
ConstantInt *C;
2313+
Value *Op0;
2314+
Value *Op1;
2315+
if ((TLI->getTypeAction(CountZeros->getContext(), VTy) ==
2316+
TargetLowering::TypePromoteInteger ||
2317+
TLI->getOperationAction(ISD::CTLZ, VTy) == TargetLowering::Promote) &&
2318+
match(CountZeros->getOperand(0), m_Xor(m_Value(Op0), m_Value(Op1))) &&
2319+
((C = dyn_cast<ConstantInt>(Op0)) || (C = dyn_cast<ConstantInt>(Op1))) &&
2320+
C->isMinusOne())
2321+
return false;
2322+
2323+
// If it's cheap to speculate, there's nothing to do.
23032324
auto IntrinsicID = CountZeros->getIntrinsicID();
23042325
if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz(Ty)) ||
23052326
(IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz(Ty)))

Diff for: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp

+36
Original file line numberDiff line numberDiff line change
@@ -2356,6 +2356,35 @@ LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
23562356
return Legalized;
23572357
}
23582358

2359+
static bool extendCtlzNot(const MachineInstr &MI, MachineIRBuilder &MIRBuilder,
2360+
MachineRegisterInfo &MRI, LLT WideTy) {
2361+
Register XorSrc;
2362+
Register CstReg;
2363+
if (!mi_match(MI.getOperand(1).getReg(), MRI,
2364+
m_GXor(m_Reg(XorSrc), m_Reg(CstReg))))
2365+
return false;
2366+
2367+
auto OptCst = getIConstantVRegValWithLookThrough(CstReg, MRI);
2368+
APInt Cst = OptCst->Value;
2369+
2370+
if (!Cst.isAllOnes())
2371+
return false;
2372+
2373+
auto AllOnes = MIRBuilder.buildConstant(
2374+
WideTy, APInt::getAllOnes(WideTy.getSizeInBits()));
2375+
auto Res = MIRBuilder.buildAnyExt(WideTy, XorSrc);
2376+
2377+
Register SrcReg = MI.getOperand(1).getReg();
2378+
LLT CurTy = MRI.getType(SrcReg);
2379+
unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2380+
Res = MIRBuilder.buildShl(WideTy, Res,
2381+
MIRBuilder.buildConstant(WideTy, SizeDiff));
2382+
Res = MIRBuilder.buildXor(WideTy, Res, AllOnes);
2383+
Res = MIRBuilder.buildCTLZ_ZERO_UNDEF(MI.getOperand(0), Res);
2384+
2385+
return true;
2386+
}
2387+
23592388
LegalizerHelper::LegalizeResult
23602389
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
23612390
switch (MI.getOpcode()) {
@@ -2449,6 +2478,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
24492478
auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
24502479
LLT CurTy = MRI.getType(SrcReg);
24512480
unsigned NewOpc = MI.getOpcode();
2481+
2482+
if ((MI.getOpcode() == TargetOpcode::G_CTLZ ||
2483+
MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) &&
2484+
extendCtlzNot(MI, MIRBuilder, MRI, WideTy)) {
2485+
MI.eraseFromParent();
2486+
return Legalized;
2487+
}
24522488
if (NewOpc == TargetOpcode::G_CTTZ) {
24532489
// The count is the same in the larger type except if the original
24542490
// value was zero. This can be handled by setting the bit just off

Diff for: llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp

+45
Original file line numberDiff line numberDiff line change
@@ -5049,6 +5049,40 @@ static MVT getPromotedVectorElementType(const TargetLowering &TLI,
50495049
return MidVT;
50505050
}
50515051

5052+
// (CTLZ (XOR Op -1)) --> (TRUNCATE (CTLZ_ZERO_UNDEF
5053+
// (XOR (SHIFT (ANYEXTEND Op1)
5054+
// ShiftAmount)
5055+
// -1)))
5056+
static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, MVT OVT,
5057+
MVT NVT, SelectionDAG &DAG) {
5058+
SDValue NotOp = Node->getOperand(0);
5059+
if (NotOp.getOpcode() != ISD::XOR)
5060+
return false;
5061+
5062+
SDValue SrcOp = NotOp->getOperand(0);
5063+
SDValue CstOp = NotOp->getOperand(1);
5064+
5065+
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOp);
5066+
5067+
if (!Cst || !Cst->isAllOnes())
5068+
return false;
5069+
5070+
auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
5071+
unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
5072+
auto ShiftConst =
5073+
DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
5074+
SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
5075+
5076+
SDValue NCstOp =
5077+
DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
5078+
5079+
Result = DAG.getNode(NotOp->getOpcode(), dl, NVT, NSrcOp, NCstOp,
5080+
NotOp->getFlags());
5081+
Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
5082+
Result = DAG.getNode(ISD::TRUNCATE, dl, OVT, Result);
5083+
return true;
5084+
}
5085+
50525086
void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
50535087
LLVM_DEBUG(dbgs() << "Trying to promote node\n");
50545088
SmallVector<SDValue, 8> Results;
@@ -5084,6 +5118,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
50845118
case ISD::CTTZ_ZERO_UNDEF:
50855119
case ISD::CTLZ:
50865120
case ISD::CTPOP: {
5121+
// If the operand of CTLZ is NOT, push the extend in the NOT.
5122+
if (Node->getOpcode() == ISD::CTLZ &&
5123+
ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
5124+
Results.push_back(Tmp1);
5125+
break;
5126+
}
5127+
50875128
// Zero extend the argument unless its cttz, then use any_extend.
50885129
if (Node->getOpcode() == ISD::CTTZ ||
50895130
Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
@@ -5115,6 +5156,10 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
51155156
break;
51165157
}
51175158
case ISD::CTLZ_ZERO_UNDEF: {
5159+
if (ExtendCtlzNot(Node, Tmp1, dl, OVT, NVT, DAG)) {
5160+
Results.push_back(Tmp1);
5161+
break;
5162+
}
51185163
// We know that the argument is unlikely to be zero, hence we can take a
51195164
// different approach as compared to ISD::CTLZ
51205165

Diff for: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

+49
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,47 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
638638
return Result;
639639
}
640640

641+
// (CTLZ (XOR Op -1)) --> (CTLZ_ZERO_UNDEF (XOR (SHIFT (ANYEXTEND Op1)
642+
// ShiftAmount)
643+
// -1))
644+
static bool ExtendCtlzNot(SDNode *Node, SDValue &Result, SDLoc &dl, EVT OVT,
645+
EVT NVT, SelectionDAG &DAG) {
646+
SDValue NotOp = Node->getOperand(0);
647+
if (NotOp.getOpcode() != ISD::XOR)
648+
return false;
649+
650+
SDValue SrcOp = NotOp->getOperand(0);
651+
SDValue CstOp = NotOp->getOperand(1);
652+
653+
if (!isAllOnesOrAllOnesSplat(CstOp))
654+
return false;
655+
656+
auto ExtSrc = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, SrcOp);
657+
unsigned SHLAmount = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
658+
auto ShiftConst =
659+
DAG.getShiftAmountConstant(SHLAmount, ExtSrc.getValueType(), dl);
660+
661+
SDValue NCstOp =
662+
DAG.getConstant(APInt::getAllOnes(NVT.getScalarSizeInBits()), dl, NVT);
663+
if (!Node->isVPOpcode()) {
664+
SDValue NSrcOp = DAG.getNode(ISD::SHL, dl, NVT, ExtSrc, ShiftConst);
665+
666+
Result = DAG.getNode(ISD::XOR, dl, NVT, NSrcOp, NCstOp);
667+
Result = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Result);
668+
} else {
669+
SDValue Mask = Node->getOperand(1);
670+
SDValue EVL = Node->getOperand(2);
671+
672+
SDValue NSrcOp =
673+
DAG.getNode(ISD::VP_SHL, dl, NVT, ExtSrc, ShiftConst, Mask, EVL);
674+
675+
Result = DAG.getNode(ISD::VP_XOR, dl, NVT, NSrcOp, NCstOp, Mask, EVL);
676+
Result = DAG.getNode(ISD::VP_CTLZ_ZERO_UNDEF, dl, NVT, Result, Mask, EVL);
677+
}
678+
679+
return true;
680+
}
681+
641682
SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
642683
EVT OVT = N->getValueType(0);
643684
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
@@ -656,6 +697,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
656697
}
657698

658699
unsigned CtlzOpcode = N->getOpcode();
700+
// If the operand of CTLZ is NOT, push the extend in the NOT.
701+
if (SDValue Res;
702+
(CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::CTLZ_ZERO_UNDEF ||
703+
CtlzOpcode == ISD::VP_CTLZ || CtlzOpcode == ISD::VP_CTLZ_ZERO_UNDEF) &&
704+
ExtendCtlzNot(N, Res, dl, OVT, NVT, DAG)) {
705+
return Res;
706+
}
707+
659708
if (CtlzOpcode == ISD::CTLZ || CtlzOpcode == ISD::VP_CTLZ) {
660709
// Subtract off the extra leading bits in the bigger type.
661710
SDValue ExtractLeadingBits = DAG.getConstant(

Diff for: llvm/test/CodeGen/AArch64/ctlo.ll

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s --mtriple=aarch64 -verify-machineinstrs | FileCheck %s
3+
; RUN: llc < %s --mtriple=aarch64 -global-isel -verify-machineinstrs | FileCheck %s
4+
5+
declare i8 @llvm.ctlz.i8(i8, i1)
6+
declare i16 @llvm.ctlz.i16(i16, i1)
7+
declare i32 @llvm.ctlz.i32(i32, i1)
8+
declare i64 @llvm.ctlz.i64(i64, i1)
9+
10+
define i8 @ctlo_i8(i8 %x) {
11+
; CHECK-LABEL: ctlo_i8:
12+
; CHECK: // %bb.0:
13+
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
14+
; CHECK-NEXT: eor w8, w8, w0, lsl #24
15+
; CHECK-NEXT: clz w0, w8
16+
; CHECK-NEXT: ret
17+
%tmp1 = xor i8 %x, -1
18+
%tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
19+
ret i8 %tmp2
20+
}
21+
22+
define i8 @ctlo_i8_undef(i8 %x) {
23+
; CHECK-LABEL: ctlo_i8_undef:
24+
; CHECK: // %bb.0:
25+
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
26+
; CHECK-NEXT: eor w8, w8, w0, lsl #24
27+
; CHECK-NEXT: clz w0, w8
28+
; CHECK-NEXT: ret
29+
%tmp1 = xor i8 %x, -1
30+
%tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 true )
31+
ret i8 %tmp2
32+
}
33+
34+
define i16 @ctlo_i16(i16 %x) {
35+
; CHECK-LABEL: ctlo_i16:
36+
; CHECK: // %bb.0:
37+
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
38+
; CHECK-NEXT: eor w8, w8, w0, lsl #16
39+
; CHECK-NEXT: clz w0, w8
40+
; CHECK-NEXT: ret
41+
%tmp1 = xor i16 %x, -1
42+
%tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
43+
ret i16 %tmp2
44+
}
45+
46+
define i16 @ctlo_i16_undef(i16 %x) {
47+
; CHECK-LABEL: ctlo_i16_undef:
48+
; CHECK: // %bb.0:
49+
; CHECK-NEXT: mov w8, #-1 // =0xffffffff
50+
; CHECK-NEXT: eor w8, w8, w0, lsl #16
51+
; CHECK-NEXT: clz w0, w8
52+
; CHECK-NEXT: ret
53+
%tmp1 = xor i16 %x, -1
54+
%tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 true )
55+
ret i16 %tmp2
56+
}
57+
58+
define i32 @ctlo_i32(i32 %x) {
59+
; CHECK-LABEL: ctlo_i32:
60+
; CHECK: // %bb.0:
61+
; CHECK-NEXT: mvn w8, w0
62+
; CHECK-NEXT: clz w0, w8
63+
; CHECK-NEXT: ret
64+
%tmp1 = xor i32 %x, -1
65+
%tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 false )
66+
ret i32 %tmp2
67+
}
68+
69+
define i32 @ctlo_i32_undef(i32 %x) {
70+
; CHECK-LABEL: ctlo_i32_undef:
71+
; CHECK: // %bb.0:
72+
; CHECK-NEXT: mvn w8, w0
73+
; CHECK-NEXT: clz w0, w8
74+
; CHECK-NEXT: ret
75+
%tmp1 = xor i32 %x, -1
76+
%tmp2 = call i32 @llvm.ctlz.i32( i32 %tmp1, i1 true )
77+
ret i32 %tmp2
78+
}
79+
80+
define i64 @ctlo_i64(i64 %x) {
81+
; CHECK-LABEL: ctlo_i64:
82+
; CHECK: // %bb.0:
83+
; CHECK-NEXT: mvn x8, x0
84+
; CHECK-NEXT: clz x0, x8
85+
; CHECK-NEXT: ret
86+
%tmp1 = xor i64 %x, -1
87+
%tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 false )
88+
ret i64 %tmp2
89+
}
90+
91+
define i64 @ctlo_i64_undef(i64 %x) {
92+
; CHECK-LABEL: ctlo_i64_undef:
93+
; CHECK: // %bb.0:
94+
; CHECK-NEXT: mvn x8, x0
95+
; CHECK-NEXT: clz x0, x8
96+
; CHECK-NEXT: ret
97+
%tmp1 = xor i64 %x, -1
98+
%tmp2 = call i64 @llvm.ctlz.i64( i64 %tmp1, i1 true )
99+
ret i64 %tmp2
100+
}

Diff for: llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll

+8-16
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
8989
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
9090
; LA32-LABEL: test_not_ctlz_i8:
9191
; LA32: # %bb.0:
92-
; LA32-NEXT: ori $a1, $zero, 255
93-
; LA32-NEXT: andn $a0, $a1, $a0
94-
; LA32-NEXT: clz.w $a0, $a0
95-
; LA32-NEXT: addi.w $a0, $a0, -24
92+
; LA32-NEXT: slli.w $a0, $a0, 24
93+
; LA32-NEXT: clo.w $a0, $a0
9694
; LA32-NEXT: ret
9795
;
9896
; LA64-LABEL: test_not_ctlz_i8:
9997
; LA64: # %bb.0:
100-
; LA64-NEXT: ori $a1, $zero, 255
101-
; LA64-NEXT: andn $a0, $a1, $a0
102-
; LA64-NEXT: clz.d $a0, $a0
103-
; LA64-NEXT: addi.d $a0, $a0, -56
98+
; LA64-NEXT: slli.d $a0, $a0, 56
99+
; LA64-NEXT: clo.d $a0, $a0
104100
; LA64-NEXT: ret
105101
%neg = xor i8 %a, -1
106102
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
110106
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
111107
; LA32-LABEL: test_not_ctlz_i16:
112108
; LA32: # %bb.0:
113-
; LA32-NEXT: nor $a0, $a0, $zero
114-
; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
115-
; LA32-NEXT: clz.w $a0, $a0
116-
; LA32-NEXT: addi.w $a0, $a0, -16
109+
; LA32-NEXT: slli.w $a0, $a0, 16
110+
; LA32-NEXT: clo.w $a0, $a0
117111
; LA32-NEXT: ret
118112
;
119113
; LA64-LABEL: test_not_ctlz_i16:
120114
; LA64: # %bb.0:
121-
; LA64-NEXT: nor $a0, $a0, $zero
122-
; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
123-
; LA64-NEXT: clz.d $a0, $a0
124-
; LA64-NEXT: addi.d $a0, $a0, -48
115+
; LA64-NEXT: slli.d $a0, $a0, 48
116+
; LA64-NEXT: clo.d $a0, $a0
125117
; LA64-NEXT: ret
126118
%neg = xor i16 %a, -1
127119
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)

0 commit comments

Comments
 (0)