Skip to content

Commit 13db390

Browse files
author
v01dxyz
committed
[DAGCombine] Count leading ones: refine post-legalisation
Detect and rewrite patterns created by DAG/Type Legalisation when CTLZ is used for counting leading ones. Replace a SUB + CTLZ + ZERO_EXTEND with a CTLZ_ZERO_UNDEF + SHL. The VP path is supported too.

DAG Legalisation Pattern:
  (sub (ctlz (zeroextend (not Src))) BitWidthDiff)
  if BitWidthDiff == BitWidth(Node) - BitWidth(Src)
  --> (ctlz_zero_undef (not (shl (anyextend Src) BitWidthDiff)))

Type Legalisation Pattern:
  (sub (ctlz (and (xor Src XorMask) AndMask)) BitWidthDiff)
  if AndMask has only trailing ones
  and MaskBitWidth(AndMask) == BitWidth(Node) - BitWidthDiff
  and XorMask has at least as many trailing ones as AndMask
  --> (ctlz_zero_undef (not (shl Src BitWidthDiff)))
1 parent 81785b3 commit 13db390

File tree

5 files changed

+99
-64
lines changed

5 files changed

+99
-64
lines changed

Diff for: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

+49
Original file line numberDiff line numberDiff line change
@@ -3755,6 +3755,50 @@ SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N, const SDLoc &DL) {
37553755
return SDValue();
37563756
}
37573757

3758+
template <class MatchContextClass>
3759+
static SDValue foldSubCtlzNot(SDNode *N, SelectionDAG &DAG) {
3760+
const SDLoc DL(N);
3761+
SDValue N0 = N->getOperand(0);
3762+
EVT VT = N0.getValueType();
3763+
unsigned BitWidth = VT.getScalarSizeInBits();
3764+
3765+
MatchContextClass Matcher(DAG, DAG.getTargetLoweringInfo(), N);
3766+
3767+
APInt AndMask;
3768+
APInt XorMask;
3769+
APInt BitWidthDiff;
3770+
3771+
SDValue CtlzOp;
3772+
SDValue Src;
3773+
3774+
if (!sd_context_match(
3775+
N, Matcher, m_Sub(m_Ctlz(m_Value(CtlzOp)), m_ConstInt(BitWidthDiff))))
3776+
return SDValue();
3777+
3778+
if (sd_context_match(CtlzOp, Matcher, m_ZExt(m_Not(m_Value(Src))))) {
3779+
// (sub (ctlz (zero_extend (not Op)) BitWidthDiff))
3780+
if ((BitWidth - Src.getValueType().getScalarSizeInBits()) != BitWidthDiff)
3781+
return SDValue();
3782+
3783+
Src = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Src);
3784+
} else if (sd_context_match(CtlzOp, Matcher,
3785+
m_And(m_Xor(m_Value(Src), m_ConstInt(XorMask)),
3786+
m_ConstInt(AndMask)))) {
3787+
// (sub (ctlz (and (xor Op XorMask) AndMask)) BitWidthDiff)
3788+
unsigned AndMaskWidth = BitWidth - BitWidthDiff.getZExtValue();
3789+
if (!(AndMask.isMask(AndMaskWidth) && XorMask.countr_one() >= AndMaskWidth))
3790+
return SDValue();
3791+
} else
3792+
return SDValue();
3793+
3794+
SDValue ShiftConst = DAG.getShiftAmountConstant(BitWidthDiff, VT, DL);
3795+
SDValue LShift = Matcher.getNode(ISD::SHL, DL, VT, Src, ShiftConst);
3796+
SDValue Not =
3797+
Matcher.getNode(ISD::XOR, DL, VT, LShift, DAG.getAllOnesConstant(DL, VT));
3798+
3799+
return Matcher.getNode(ISD::CTLZ_ZERO_UNDEF, DL, VT, Not);
3800+
}
3801+
37583802
// Since it may not be valid to emit a fold to zero for vector initializers
37593803
// check if we can before folding.
37603804
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
@@ -3779,6 +3823,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
37793823
return N;
37803824
};
37813825

3826+
if (SDValue V = foldSubCtlzNot<EmptyMatchContext>(N, DAG))
3827+
return V;
3828+
37823829
// fold (sub x, x) -> 0
37833830
// FIXME: Refactor this and xor and other similar operations together.
37843831
if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
@@ -26900,6 +26947,8 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) {
2690026947
return visitVP_SELECT(N);
2690126948
case ISD::VP_MUL:
2690226949
return visitMUL<VPMatchContext>(N);
26950+
case ISD::VP_SUB:
26951+
return foldSubCtlzNot<VPMatchContext>(N, DAG);
2690326952
default:
2690426953
break;
2690526954
}

Diff for: llvm/test/CodeGen/AArch64/ctlo.ll

+28-14
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,20 @@ declare i32 @llvm.ctlz.i32(i32, i1)
88
declare i64 @llvm.ctlz.i64(i64, i1)
99

1010
define i8 @ctlo_i8(i8 %x) {
11-
; CHECK-LABEL: ctlo_i8:
12-
; CHECK: // %bb.0:
13-
; CHECK-NEXT: mov w8, #255 // =0xff
14-
; CHECK-NEXT: bic w8, w8, w0
15-
; CHECK-NEXT: clz w8, w8
16-
; CHECK-NEXT: sub w0, w8, #24
17-
; CHECK-NEXT: ret
11+
; CHECK-SD-LABEL: ctlo_i8:
12+
; CHECK-SD: // %bb.0:
13+
; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
14+
; CHECK-SD-NEXT: eor w8, w8, w0, lsl #24
15+
; CHECK-SD-NEXT: clz w0, w8
16+
; CHECK-SD-NEXT: ret
17+
;
18+
; CHECK-GI-LABEL: ctlo_i8:
19+
; CHECK-GI: // %bb.0:
20+
; CHECK-GI-NEXT: mov w8, #255 // =0xff
21+
; CHECK-GI-NEXT: bic w8, w8, w0
22+
; CHECK-GI-NEXT: clz w8, w8
23+
; CHECK-GI-NEXT: sub w0, w8, #24
24+
; CHECK-GI-NEXT: ret
1825
%tmp1 = xor i8 %x, -1
1926
%tmp2 = call i8 @llvm.ctlz.i8( i8 %tmp1, i1 false )
2027
ret i8 %tmp2
@@ -41,13 +48,20 @@ define i8 @ctlo_i8_undef(i8 %x) {
4148
}
4249

4350
define i16 @ctlo_i16(i16 %x) {
44-
; CHECK-LABEL: ctlo_i16:
45-
; CHECK: // %bb.0:
46-
; CHECK-NEXT: mov w8, #65535 // =0xffff
47-
; CHECK-NEXT: bic w8, w8, w0
48-
; CHECK-NEXT: clz w8, w8
49-
; CHECK-NEXT: sub w0, w8, #16
50-
; CHECK-NEXT: ret
51+
; CHECK-SD-LABEL: ctlo_i16:
52+
; CHECK-SD: // %bb.0:
53+
; CHECK-SD-NEXT: mov w8, #-1 // =0xffffffff
54+
; CHECK-SD-NEXT: eor w8, w8, w0, lsl #16
55+
; CHECK-SD-NEXT: clz w0, w8
56+
; CHECK-SD-NEXT: ret
57+
;
58+
; CHECK-GI-LABEL: ctlo_i16:
59+
; CHECK-GI: // %bb.0:
60+
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
61+
; CHECK-GI-NEXT: bic w8, w8, w0
62+
; CHECK-GI-NEXT: clz w8, w8
63+
; CHECK-GI-NEXT: sub w0, w8, #16
64+
; CHECK-GI-NEXT: ret
5165
%tmp1 = xor i16 %x, -1
5266
%tmp2 = call i16 @llvm.ctlz.i16( i16 %tmp1, i1 false )
5367
ret i16 %tmp2

Diff for: llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll

+8-16
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,14 @@ define i64 @test_ctlz_i64(i64 %a) nounwind {
8989
define i8 @test_not_ctlz_i8(i8 %a) nounwind {
9090
; LA32-LABEL: test_not_ctlz_i8:
9191
; LA32: # %bb.0:
92-
; LA32-NEXT: ori $a1, $zero, 255
93-
; LA32-NEXT: andn $a0, $a1, $a0
94-
; LA32-NEXT: clz.w $a0, $a0
95-
; LA32-NEXT: addi.w $a0, $a0, -24
92+
; LA32-NEXT: slli.w $a0, $a0, 24
93+
; LA32-NEXT: clo.w $a0, $a0
9694
; LA32-NEXT: ret
9795
;
9896
; LA64-LABEL: test_not_ctlz_i8:
9997
; LA64: # %bb.0:
100-
; LA64-NEXT: ori $a1, $zero, 255
101-
; LA64-NEXT: andn $a0, $a1, $a0
102-
; LA64-NEXT: clz.d $a0, $a0
103-
; LA64-NEXT: addi.d $a0, $a0, -56
98+
; LA64-NEXT: slli.d $a0, $a0, 56
99+
; LA64-NEXT: clo.d $a0, $a0
104100
; LA64-NEXT: ret
105101
%neg = xor i8 %a, -1
106102
%tmp = call i8 @llvm.ctlz.i8(i8 %neg, i1 false)
@@ -110,18 +106,14 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind {
110106
define i16 @test_not_ctlz_i16(i16 %a) nounwind {
111107
; LA32-LABEL: test_not_ctlz_i16:
112108
; LA32: # %bb.0:
113-
; LA32-NEXT: nor $a0, $a0, $zero
114-
; LA32-NEXT: bstrpick.w $a0, $a0, 15, 0
115-
; LA32-NEXT: clz.w $a0, $a0
116-
; LA32-NEXT: addi.w $a0, $a0, -16
109+
; LA32-NEXT: slli.w $a0, $a0, 16
110+
; LA32-NEXT: clo.w $a0, $a0
117111
; LA32-NEXT: ret
118112
;
119113
; LA64-LABEL: test_not_ctlz_i16:
120114
; LA64: # %bb.0:
121-
; LA64-NEXT: nor $a0, $a0, $zero
122-
; LA64-NEXT: bstrpick.d $a0, $a0, 15, 0
123-
; LA64-NEXT: clz.d $a0, $a0
124-
; LA64-NEXT: addi.d $a0, $a0, -48
115+
; LA64-NEXT: slli.d $a0, $a0, 48
116+
; LA64-NEXT: clo.d $a0, $a0
125117
; LA64-NEXT: ret
126118
%neg = xor i16 %a, -1
127119
%tmp = call i16 @llvm.ctlz.i16(i16 %neg, i1 false)

Diff for: llvm/test/CodeGen/RISCV/rvv/ctlz-vp.ll

+8-26
Original file line numberDiff line numberDiff line change
@@ -2627,34 +2627,24 @@ define <vscale x 1 x i9> @vp_ctlz_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
26272627
define <vscale x 1 x i9> @vp_ctlo_nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
26282628
; CHECK-LABEL: vp_ctlo_nxv1i9:
26292629
; CHECK: # %bb.0:
2630-
; CHECK-NEXT: li a1, 511
2631-
; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
2632-
; CHECK-NEXT: vxor.vx v8, v8, a1
26332630
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
2634-
; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
2631+
; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
2632+
; CHECK-NEXT: vnot.v v8, v8, v0.t
26352633
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
26362634
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
26372635
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
26382636
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
26392637
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
26402638
; CHECK-NEXT: li a0, 142
26412639
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
2642-
; CHECK-NEXT: li a0, 16
2643-
; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
2644-
; CHECK-NEXT: li a0, 7
2645-
; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
26462640
; CHECK-NEXT: ret
26472641
;
26482642
; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9:
26492643
; CHECK-ZVBB: # %bb.0:
2650-
; CHECK-ZVBB-NEXT: li a1, 511
2651-
; CHECK-ZVBB-NEXT: vsetvli a2, zero, e16, mf4, ta, ma
2652-
; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1
26532644
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
2654-
; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
2645+
; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
2646+
; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
26552647
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
2656-
; CHECK-ZVBB-NEXT: li a0, 7
2657-
; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
26582648
; CHECK-ZVBB-NEXT: ret
26592649
%va.not = xor <vscale x 1 x i9> %va, splat (i9 -1)
26602650
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)
@@ -2694,32 +2684,24 @@ define <vscale x 1 x i9> @vp_ctlo_zero_undef_nxv1i9(<vscale x 1 x i9> %va, <vsca
26942684
define <vscale x 1 x i9> @vp_ctlo_nxv1i9_vp_xor(<vscale x 1 x i9> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
26952685
; CHECK-LABEL: vp_ctlo_nxv1i9_vp_xor:
26962686
; CHECK: # %bb.0:
2697-
; CHECK-NEXT: li a1, 511
26982687
; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
2699-
; CHECK-NEXT: vxor.vx v8, v8, a1, v0.t
2700-
; CHECK-NEXT: vand.vx v8, v8, a1, v0.t
2688+
; CHECK-NEXT: vsll.vi v8, v8, 7, v0.t
2689+
; CHECK-NEXT: vnot.v v8, v8, v0.t
27012690
; CHECK-NEXT: vfwcvt.f.xu.v v9, v8, v0.t
27022691
; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
27032692
; CHECK-NEXT: vsrl.vi v8, v9, 23, v0.t
27042693
; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
27052694
; CHECK-NEXT: vnsrl.wi v8, v8, 0, v0.t
27062695
; CHECK-NEXT: li a0, 142
27072696
; CHECK-NEXT: vrsub.vx v8, v8, a0, v0.t
2708-
; CHECK-NEXT: li a0, 16
2709-
; CHECK-NEXT: vminu.vx v8, v8, a0, v0.t
2710-
; CHECK-NEXT: li a0, 7
2711-
; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
27122697
; CHECK-NEXT: ret
27132698
;
27142699
; CHECK-ZVBB-LABEL: vp_ctlo_nxv1i9_vp_xor:
27152700
; CHECK-ZVBB: # %bb.0:
2716-
; CHECK-ZVBB-NEXT: li a1, 511
27172701
; CHECK-ZVBB-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
2718-
; CHECK-ZVBB-NEXT: vxor.vx v8, v8, a1, v0.t
2719-
; CHECK-ZVBB-NEXT: vand.vx v8, v8, a1, v0.t
2702+
; CHECK-ZVBB-NEXT: vsll.vi v8, v8, 7, v0.t
2703+
; CHECK-ZVBB-NEXT: vnot.v v8, v8, v0.t
27202704
; CHECK-ZVBB-NEXT: vclz.v v8, v8, v0.t
2721-
; CHECK-ZVBB-NEXT: li a0, 7
2722-
; CHECK-ZVBB-NEXT: vsub.vx v8, v8, a0, v0.t
27232705
; CHECK-ZVBB-NEXT: ret
27242706
%va.not = call <vscale x 1 x i9> @llvm.vp.xor.nxv1i9(<vscale x 1 x i9> %va, <vscale x 1 x i9> splat (i9 -1), <vscale x 1 x i1> %m, i32 %evl)
27252707
%v = call <vscale x 1 x i9> @llvm.vp.ctlz.nxv1i9(<vscale x 1 x i9> %va.not, i1 false, <vscale x 1 x i1> %m, i32 %evl)

Diff for: llvm/test/CodeGen/X86/ctlo.ll

+6-8
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,18 @@ define i8 @ctlo_i8(i8 %x) {
4646
;
4747
; X86-CLZ-LABEL: ctlo_i8:
4848
; X86-CLZ: # %bb.0:
49-
; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
50-
; X86-CLZ-NEXT: notb %al
51-
; X86-CLZ-NEXT: movzbl %al, %eax
49+
; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
50+
; X86-CLZ-NEXT: shll $24, %eax
51+
; X86-CLZ-NEXT: notl %eax
5252
; X86-CLZ-NEXT: lzcntl %eax, %eax
53-
; X86-CLZ-NEXT: addl $-24, %eax
5453
; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax
5554
; X86-CLZ-NEXT: retl
5655
;
5756
; X64-CLZ-LABEL: ctlo_i8:
5857
; X64-CLZ: # %bb.0:
59-
; X64-CLZ-NEXT: notb %dil
60-
; X64-CLZ-NEXT: movzbl %dil, %eax
61-
; X64-CLZ-NEXT: lzcntl %eax, %eax
62-
; X64-CLZ-NEXT: addl $-24, %eax
58+
; X64-CLZ-NEXT: shll $24, %edi
59+
; X64-CLZ-NEXT: notl %edi
60+
; X64-CLZ-NEXT: lzcntl %edi, %eax
6361
; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax
6462
; X64-CLZ-NEXT: retq
6563
%tmp1 = xor i8 %x, -1

0 commit comments

Comments
 (0)