Skip to content

Commit 4487a39

Browse files
authored
[AArch64] Extend SBC combine to handle CSET HI (llvm#192708)
`performSubWithBorrowCombine` previously only matched `CSET LO` (unsigned <). This change extends it to also handle `CSET HI` (unsigned >) by swapping the `SUBS` operands, since `a > b` is equivalent to `b < a`. This resolves the FIXME left in the test by llvm#165271.
1 parent 8eff63f commit 4487a39

2 files changed

Lines changed: 213 additions & 7 deletions

File tree

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23147,6 +23147,7 @@ static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
2314723147
// Attempt to combine the following patterns:
2314823148
// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b)
2314923149
// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b)
23150+
// Also handles CSET HI by swapping the CMP operands (a > b ≡ b < a).
2315023151
// The CSET may be preceded by a ZEXT.
2315123152
static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
2315223153
if (N->getOpcode() != ISD::SUB)
@@ -23159,16 +23160,33 @@ static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
2315923160
SDValue N1 = N->getOperand(1);
2316023161
if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse())
2316123162
N1 = N1.getOperand(0);
23162-
if (!N1.hasOneUse() || getCSETCondCode(N1) != AArch64CC::LO)
23163+
auto CC = getCSETCondCode(N1);
23164+
if (!N1.hasOneUse() || (CC != AArch64CC::LO && CC != AArch64CC::HI))
2316323165
return SDValue();
2316423166

2316523167
SDValue Flags = N1.getOperand(3);
2316623168
if (Flags.getOpcode() != AArch64ISD::SUBS)
2316723169
return SDValue();
2316823170

23169-
SDLoc DL(N);
2317023171
SDValue N0 = N->getOperand(0);
23171-
if (N0->getOpcode() == ISD::SUB)
23172+
bool CanFoldSub = N0.getOpcode() == ISD::SUB;
23173+
23174+
// For HI (unsigned >), swap the SUBS operands to obtain LO (unsigned <).
23175+
if (CC == AArch64CC::HI) {
23176+
if (!Flags.hasOneUse())
23177+
return SDValue();
23178+
// Skip when the inner SUB can't be folded and the swap would cost a mov.
23179+
auto *RHSC = dyn_cast<ConstantSDNode>(Flags.getOperand(1));
23180+
if ((!CanFoldSub || !N0.hasOneUse()) && RHSC &&
23181+
isLegalCmpImmed(RHSC->getAPIntValue()))
23182+
return SDValue();
23183+
Flags = DAG.getNode(AArch64ISD::SUBS, SDLoc(Flags), Flags->getVTList(),
23184+
Flags.getOperand(1), Flags.getOperand(0))
23185+
.getValue(1);
23186+
}
23187+
23188+
SDLoc DL(N);
23189+
if (CanFoldSub)
2317223190
return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0),
2317323191
N0.getOperand(1), Flags);
2317423192
return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT),

llvm/test/CodeGen/AArch64/sbc.ll

Lines changed: 192 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,27 +129,215 @@ define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) {
129129
ret i32 %res
130130
}
131131

132-
; FIXME: This case could be supported with reversed operands to the CMP.
133132
define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) {
134133
; CHECK-SD-LABEL: test_ugt:
135134
; CHECK-SD: // %bb.0:
135+
; CHECK-SD-NEXT: cmp w1, w0
136+
; CHECK-SD-NEXT: sbc w0, w2, w3
137+
; CHECK-SD-NEXT: ret
138+
;
139+
; CHECK-GI-LABEL: test_ugt:
140+
; CHECK-GI: // %bb.0:
141+
; CHECK-GI-NEXT: cmp w0, w1
142+
; CHECK-GI-NEXT: sub w9, w2, w3
143+
; CHECK-GI-NEXT: cset w8, hi
144+
; CHECK-GI-NEXT: sub w0, w9, w8
145+
; CHECK-GI-NEXT: ret
146+
%cc = icmp ugt i32 %a, %b
147+
%carry = zext i1 %cc to i32
148+
%sub = sub i32 %x, %y
149+
%res = sub i32 %sub, %carry
150+
ret i32 %res
151+
}
152+
153+
define i64 @test_ugt_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) {
154+
; CHECK-SD-LABEL: test_ugt_mixed_i32_i64:
155+
; CHECK-SD: // %bb.0:
156+
; CHECK-SD-NEXT: cmp w1, w0
157+
; CHECK-SD-NEXT: sbc x0, x2, x3
158+
; CHECK-SD-NEXT: ret
159+
;
160+
; CHECK-GI-LABEL: test_ugt_mixed_i32_i64:
161+
; CHECK-GI: // %bb.0:
162+
; CHECK-GI-NEXT: cmp w0, w1
163+
; CHECK-GI-NEXT: sub x9, x2, x3
164+
; CHECK-GI-NEXT: cset w8, hi
165+
; CHECK-GI-NEXT: sub x0, x9, x8
166+
; CHECK-GI-NEXT: ret
167+
%cc = icmp ugt i32 %a, %b
168+
%carry = zext i1 %cc to i64
169+
%sub = sub i64 %x, %y
170+
%res = sub i64 %sub, %carry
171+
ret i64 %res
172+
}
173+
174+
define i32 @test_ugt_multi_use_flags(i32 %a, i32 %b, i32 %x, i32 %y, i32 %z) {
175+
; CHECK-SD-LABEL: test_ugt_multi_use_flags:
176+
; CHECK-SD: // %bb.0:
136177
; CHECK-SD-NEXT: cmp w0, w1
137178
; CHECK-SD-NEXT: sub w8, w2, w3
138179
; CHECK-SD-NEXT: cset w9, hi
139-
; CHECK-SD-NEXT: sub w0, w8, w9
180+
; CHECK-SD-NEXT: cmp w0, w1
181+
; CHECK-SD-NEXT: sub w8, w8, w9
182+
; CHECK-SD-NEXT: csel w0, w8, w4, eq
140183
; CHECK-SD-NEXT: ret
141184
;
142-
; CHECK-GI-LABEL: test_ugt:
185+
; CHECK-GI-LABEL: test_ugt_multi_use_flags:
143186
; CHECK-GI: // %bb.0:
144187
; CHECK-GI-NEXT: cmp w0, w1
145188
; CHECK-GI-NEXT: sub w9, w2, w3
146189
; CHECK-GI-NEXT: cset w8, hi
147-
; CHECK-GI-NEXT: sub w0, w9, w8
190+
; CHECK-GI-NEXT: sub w8, w9, w8
191+
; CHECK-GI-NEXT: csel w0, w8, w4, eq
148192
; CHECK-GI-NEXT: ret
149193
%cc = icmp ugt i32 %a, %b
150194
%carry = zext i1 %cc to i32
151195
%sub = sub i32 %x, %y
152196
%res = sub i32 %sub, %carry
197+
%cc2 = icmp eq i32 %a, %b
198+
%sel = select i1 %cc2, i32 %res, i32 %z
199+
ret i32 %sel
200+
}
201+
202+
define i32 @test_ugt_42(i32 %a, i32 %x, i32 %y) {
203+
; CHECK-SD-LABEL: test_ugt_42:
204+
; CHECK-SD: // %bb.0:
205+
; CHECK-SD-NEXT: mov w8, #42 // =0x2a
206+
; CHECK-SD-NEXT: cmp w8, w0
207+
; CHECK-SD-NEXT: sbc w0, w1, w2
208+
; CHECK-SD-NEXT: ret
209+
;
210+
; CHECK-GI-LABEL: test_ugt_42:
211+
; CHECK-GI: // %bb.0:
212+
; CHECK-GI-NEXT: cmp w0, #42
213+
; CHECK-GI-NEXT: sub w9, w1, w2
214+
; CHECK-GI-NEXT: cset w8, hi
215+
; CHECK-GI-NEXT: sub w0, w9, w8
216+
; CHECK-GI-NEXT: ret
217+
%cc = icmp ugt i32 %a, 42
218+
%carry = zext i1 %cc to i32
219+
%sub = sub i32 %x, %y
220+
%res = sub i32 %sub, %carry
221+
ret i32 %res
222+
}
223+
224+
define i32 @test_only_borrow_ugt_42(i32 %a, i32 %x) {
225+
; CHECK-LABEL: test_only_borrow_ugt_42:
226+
; CHECK: // %bb.0:
227+
; CHECK-NEXT: cmp w0, #42
228+
; CHECK-NEXT: cset w8, hi
229+
; CHECK-NEXT: sub w0, w1, w8
230+
; CHECK-NEXT: ret
231+
%cc = icmp ugt i32 %a, 42
232+
%carry = zext i1 %cc to i32
233+
%res = sub i32 %x, %carry
234+
ret i32 %res
235+
}
236+
237+
define i32 @test_only_borrow_ugt_42_combine(i32 %a, i32 %x) {
238+
; CHECK-SD-LABEL: test_only_borrow_ugt_42_combine:
239+
; CHECK-SD: // %bb.0:
240+
; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
241+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
242+
; CHECK-SD-NEXT: .cfi_offset w19, -8
243+
; CHECK-SD-NEXT: .cfi_offset w30, -16
244+
; CHECK-SD-NEXT: subs w0, w0, #42
245+
; CHECK-SD-NEXT: cset w8, hi
246+
; CHECK-SD-NEXT: sub w19, w1, w8
247+
; CHECK-SD-NEXT: bl use
248+
; CHECK-SD-NEXT: mov w0, w19
249+
; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
250+
; CHECK-SD-NEXT: ret
251+
;
252+
; CHECK-GI-LABEL: test_only_borrow_ugt_42_combine:
253+
; CHECK-GI: // %bb.0:
254+
; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
255+
; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
256+
; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
257+
; CHECK-GI-NEXT: .cfi_offset w19, -8
258+
; CHECK-GI-NEXT: .cfi_offset w20, -16
259+
; CHECK-GI-NEXT: .cfi_offset w30, -32
260+
; CHECK-GI-NEXT: cmp w0, #42
261+
; CHECK-GI-NEXT: sub w0, w0, #42
262+
; CHECK-GI-NEXT: mov w19, w1
263+
; CHECK-GI-NEXT: cset w20, hi
264+
; CHECK-GI-NEXT: bl use
265+
; CHECK-GI-NEXT: sub w0, w19, w20
266+
; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
267+
; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
268+
; CHECK-GI-NEXT: ret
269+
%cc = icmp ugt i32 %a, 42
270+
%carry = zext i1 %cc to i32
271+
%res = sub i32 %x, %carry
272+
%sub42 = sub i32 %a, 42
273+
call void @use(i32 %sub42)
274+
ret i32 %res
275+
}
276+
277+
define i32 @test_ugt_huge_imm(i32 %a, i32 %x, i32 %y) {
278+
; CHECK-SD-LABEL: test_ugt_huge_imm:
279+
; CHECK-SD: // %bb.0:
280+
; CHECK-SD-NEXT: mov w8, #52501 // =0xcd15
281+
; CHECK-SD-NEXT: movk w8, #1883, lsl #16
282+
; CHECK-SD-NEXT: cmp w8, w0
283+
; CHECK-SD-NEXT: sbc w0, w1, w2
284+
; CHECK-SD-NEXT: ret
285+
;
286+
; CHECK-GI-LABEL: test_ugt_huge_imm:
287+
; CHECK-GI: // %bb.0:
288+
; CHECK-GI-NEXT: mov w8, #52501 // =0xcd15
289+
; CHECK-GI-NEXT: sub w9, w1, w2
290+
; CHECK-GI-NEXT: movk w8, #1883, lsl #16
291+
; CHECK-GI-NEXT: cmp w0, w8
292+
; CHECK-GI-NEXT: cset w8, hi
293+
; CHECK-GI-NEXT: sub w0, w9, w8
294+
; CHECK-GI-NEXT: ret
295+
%cc = icmp ugt i32 %a, 123456789
296+
%carry = zext i1 %cc to i32
297+
%sub = sub i32 %x, %y
298+
%res = sub i32 %sub, %carry
299+
ret i32 %res
300+
}
301+
302+
; Negative: inner SUB shared with @use + shifted cmp imm — swap costs a mov.
303+
define i32 @test_ugt_huge_imm_shifted(i32 %a, i32 %x, i32 %y) {
304+
; CHECK-SD-LABEL: test_ugt_huge_imm_shifted:
305+
; CHECK-SD: // %bb.0:
306+
; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
307+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
308+
; CHECK-SD-NEXT: .cfi_offset w19, -8
309+
; CHECK-SD-NEXT: .cfi_offset w30, -16
310+
; CHECK-SD-NEXT: cmp w0, #291, lsl #12 // =1191936
311+
; CHECK-SD-NEXT: sub w0, w1, w2
312+
; CHECK-SD-NEXT: cset w8, hi
313+
; CHECK-SD-NEXT: sub w19, w0, w8
314+
; CHECK-SD-NEXT: bl use
315+
; CHECK-SD-NEXT: mov w0, w19
316+
; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
317+
; CHECK-SD-NEXT: ret
318+
;
319+
; CHECK-GI-LABEL: test_ugt_huge_imm_shifted:
320+
; CHECK-GI: // %bb.0:
321+
; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
322+
; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
323+
; CHECK-GI-NEXT: .cfi_def_cfa_offset 32
324+
; CHECK-GI-NEXT: .cfi_offset w19, -8
325+
; CHECK-GI-NEXT: .cfi_offset w20, -16
326+
; CHECK-GI-NEXT: .cfi_offset w30, -32
327+
; CHECK-GI-NEXT: sub w19, w1, w2
328+
; CHECK-GI-NEXT: cmp w0, #291, lsl #12 // =1191936
329+
; CHECK-GI-NEXT: mov w0, w19
330+
; CHECK-GI-NEXT: cset w20, hi
331+
; CHECK-GI-NEXT: bl use
332+
; CHECK-GI-NEXT: sub w0, w19, w20
333+
; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
334+
; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
335+
; CHECK-GI-NEXT: ret
336+
%cc = icmp ugt i32 %a, u0x123000
337+
%carry = zext i1 %cc to i32
338+
%sub = sub i32 %x, %y
339+
%res = sub i32 %sub, %carry
340+
call void @use(i32 %sub)
153341
ret i32 %res
154342
}
155343

0 commit comments

Comments
 (0)