-
Notifications
You must be signed in to change notification settings - Fork 15.7k
[ISel] Implement operand widening for VECTOR_FIND_LAST_ACTIVE. #174389
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-backend-x86 Author: Florian Hahn (fhahn) Changes: Implement WidenVecOp_VECTOR_FIND_LAST_ACTIVE to properly widen the mask operand of VECTOR_FIND_LAST_ACTIVE nodes. Currently, lowering crashes when widening of the operand is needed: "WidenVectorOperand op #0: t15: i64 = find_last_active t14". Depends on github.com/llvm/llvm-project/pull/174384 to avoid an infinite loop during legalization. Full diff: https://github.com/llvm/llvm-project/pull/174389.diff — 5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cd58c8ab1c3e4..5f247b25d1486 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1157,6 +1157,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecOp_VP_REDUCE(SDNode *N);
SDValue WidenVecOp_ExpOp(SDNode *N);
SDValue WidenVecOp_VP_CttzElements(SDNode *N);
+ SDValue WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
/// Helper function to generate a set of operations to perform
/// a vector operation for a wider type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index af685191d82d8..cea3c9171f0d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7200,6 +7200,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
Res = WidenVecOp_VP_CttzElements(N);
break;
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = WidenVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -8125,6 +8128,26 @@ SDValue DAGTypeLegalizer::WidenVecOp_VP_CttzElements(SDNode *N) {
{Source, Mask, N->getOperand(2)}, N->getFlags());
}
+SDValue DAGTypeLegalizer::WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(0);
+ EVT OrigMaskVT = Mask.getValueType();
+ SDValue WideMask = GetWidenedVector(Mask);
+ EVT WideMaskVT = WideMask.getValueType();
+
+ // Pad the mask with zeros to ensure inactive lanes don't affect the result.
+ unsigned OrigElts = OrigMaskVT.getVectorNumElements();
+ unsigned WideElts = WideMaskVT.getVectorNumElements();
+ if (OrigElts != WideElts) {
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ WideMask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask,
+ Mask, DAG.getVectorIdxConstant(0, DL));
+ }
+
+ return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, N->getValueType(0),
+ WideMask);
+}
+
//===----------------------------------------------------------------------===//
// Vector Widening Utilities
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 69c3455573918..b37a5e4144aea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9668,20 +9668,44 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
EVT StepVT = MVT::getIntegerVT(EltWidth);
EVT StepVecVT = MaskVT.changeVectorElementType(*DAG.getContext(), StepVT);
- // If promotion is required to make the type legal, do it here; promotion
- // of integers within LegalizeVectorOps is looking for types of the same
- // size but with a smaller number of larger elements, not the usual larger
- // size with the same number of larger elements.
- if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
- TargetLowering::TypePromoteInteger) {
+ // If promotion or widening is required to make the type legal, do it here.
+ // Promotion of integers within LegalizeVectorOps is looking for types of
+ // the same size but with a smaller number of larger elements, not the usual
+ // larger size with the same number of larger elements.
+ TargetLowering::LegalizeTypeAction TypeAction =
+ TLI.getTypeAction(StepVecVT.getSimpleVT());
+ SDValue StepVec;
+ if (TypeAction == TargetLowering::TypePromoteInteger) {
StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
StepVT = StepVecVT.getVectorElementType();
+ StepVec = DAG.getStepVector(DL, StepVecVT);
+ } else if (TypeAction == TargetLowering::TypeWidenVector) {
+ // For widening, the element count changes. Create a step vector with only
+ // the original elements valid and zeros for padding. Also widen the mask.
+ EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ unsigned WideNumElts = WideVecVT.getVectorNumElements();
+
+ // Build widened step vector: <0, 1, ..., OrigNumElts-1, 0, 0, ...>
+ SDValue OrigStepVec = DAG.getStepVector(DL, StepVecVT);
+ StepVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVecVT,
+ DAG.getConstant(0, DL, WideVecVT), OrigStepVec,
+ DAG.getIntPtrConstant(0, DL));
+
+ // Widen mask: pad with zeros.
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), BoolVT, WideNumElts);
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask, Mask,
+ DAG.getIntPtrConstant(0, DL));
+
+ StepVecVT = WideVecVT;
+ StepVT = WideVecVT.getVectorElementType();
+ } else {
+ StepVec = DAG.getStepVector(DL, StepVecVT);
}
// Zero out lanes with inactive elements, then find the highest remaining
// value from the stepvector.
SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
- SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0));
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index bb9a09ca3cc80..7ee8f6fda93f5 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -490,6 +490,65 @@ define i1 @extract_last_i1_scalable(<vscale x 16 x i1> %data, <vscale x 16 x i1>
ret i1 %res
}
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; NEON-FIXED-LABEL: extract_last_active_v3i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: movi v1.2d, #0000000000000000
+; NEON-FIXED-NEXT: adrp x9, .LCPI18_0
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d2, [x9, :lo12:.LCPI18_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: mov v1.h[0], w0
+; NEON-FIXED-NEXT: mov v1.h[1], w1
+; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: mov v1.h[2], w2
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x9, d1
+; NEON-FIXED-NEXT: umaxv h2, v2.4h
+; NEON-FIXED-NEXT: lsr x9, x9, #32
+; NEON-FIXED-NEXT: orr w9, w8, w9
+; NEON-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s2
+; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr w9, [x11]
+; NEON-FIXED-NEXT: csinv w0, w9, wzr, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_active_v3i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: movi v1.2d, #0000000000000000
+; SVE-FIXED-NEXT: index z2.h, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: mov v1.h[0], w0
+; SVE-FIXED-NEXT: mov v1.h[1], w1
+; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: mov v1.h[2], w2
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x9, d1
+; SVE-FIXED-NEXT: umaxv h2, v2.4h
+; SVE-FIXED-NEXT: lsr x9, x9, #32
+; SVE-FIXED-NEXT: orr w9, w8, w9
+; SVE-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s2
+; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr w9, [x11]
+; SVE-FIXED-NEXT: csinv w0, w9, wzr, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
new file mode 100644
index 0000000000000..09d305eaaeb77
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
+
+; This test verifies that the experimental.vector.extract.last.active intrinsic
+; doesn't cause an infinite loop during legalization when the step vector type
+; needs widening (e.g., v4i8 -> v16i8 on X86).
+
+define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm2, %edx
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: leal (%rdi,%rdi,2), %r8d
+; CHECK-NEXT: xorl %r9d, %r9d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %r9b
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: addl %edi, %edi
+; CHECK-NEXT: cmpb %dil, %r9b
+; CHECK-NEXT: cmoval %r9d, %edi
+; CHECK-NEXT: cmpb %r8b, %dil
+; CHECK-NEXT: cmovbel %r8d, %edi
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -24(%rsp,%rdi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+define i32 @extract_last_active_v4i32_no_default(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32_no_default:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: leal (%rcx,%rcx,2), %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmoval %edx, %esi
+; CHECK-NEXT: cmpb %al, %sil
+; CHECK-NEXT: cmovbel %eax, %esi
+; CHECK-NEXT: movl -24(%rsp,%rsi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
+ ret i32 %res
+}
+
+; Test v2i32 - smaller vector.
+define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm2, %rcx
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: andb $1, %dl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: orl -24(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> %a, <2 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v3i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: orl %edx, %edi
+; CHECK-NEXT: andb $1, %dil
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dil
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %cl
+; CHECK-NEXT: cmoval %ecx, %esi
+; CHECK-NEXT: movzbl %sil, %ecx
+; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v8i32 - larger vector where step vector type doesn't need widening.
+define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movd %xmm2, %edi
+; CHECK-NEXT: pextrw $7, %xmm2, %eax
+; CHECK-NEXT: pextrw $6, %xmm2, %edx
+; CHECK-NEXT: pextrw $5, %xmm2, %r8d
+; CHECK-NEXT: pextrw $4, %xmm2, %ecx
+; CHECK-NEXT: pextrw $2, %xmm2, %esi
+; CHECK-NEXT: pextrw $1, %xmm2, %r10d
+; CHECK-NEXT: pextrw $3, %xmm2, %r9d
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %r9d, %r9d
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: leal (%r11,%r11,2), %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r10d, %r10d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %bpl
+; CHECK-NEXT: addl %ebp, %ebp
+; CHECK-NEXT: cmpb %bpl, %bl
+; CHECK-NEXT: cmoval %ebx, %ebp
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmovbel %r11d, %ebp
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: shll $2, %r11d
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmoval %ebp, %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r8d, %r8d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: leal (%rbx,%rbx,4), %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmovbel %ebx, %r11d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: movl $6, %ebx
+; CHECK-NEXT: cmovel %edx, %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmoval %r11d, %ebx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: movl $7, %r11d
+; CHECK-NEXT: cmovel %eax, %r11d
+; CHECK-NEXT: cmpb %r11b, %bl
+; CHECK-NEXT: cmoval %ebx, %r11d
+; CHECK-NEXT: andl $7, %r11d
+; CHECK-NEXT: orl %r10d, %edi
+; CHECK-NEXT: orl %r9d, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: orl %r8d, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -40(%rsp,%r11,4), %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> %a, <8 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v16i32 - even larger vector.
+define i32 @extract_last_active_v16i32(<16 x i32> %a, <16 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %dl, %al
+; CHECK-NEXT: cmoval %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmovbel %esi, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: andl $15, %edx
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -72(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> %a, <16 x i1> %c, i32 -1)
+ ret i32 %res
+}
|
|
@llvm/pr-subscribers-backend-aarch64 Author: Florian Hahn (fhahn) Changes: Implement WidenVecOp_VECTOR_FIND_LAST_ACTIVE to properly widen the mask operand of VECTOR_FIND_LAST_ACTIVE nodes. Currently, lowering crashes when widening of the operand is needed: "WidenVectorOperand op #0: t15: i64 = find_last_active t14". Depends on github.com/llvm/llvm-project/pull/174384 to avoid an infinite loop during legalization. Full diff: https://github.com/llvm/llvm-project/pull/174389.diff — 5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cd58c8ab1c3e4..5f247b25d1486 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1157,6 +1157,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecOp_VP_REDUCE(SDNode *N);
SDValue WidenVecOp_ExpOp(SDNode *N);
SDValue WidenVecOp_VP_CttzElements(SDNode *N);
+ SDValue WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
/// Helper function to generate a set of operations to perform
/// a vector operation for a wider type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index af685191d82d8..cea3c9171f0d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7200,6 +7200,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
Res = WidenVecOp_VP_CttzElements(N);
break;
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = WidenVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -8125,6 +8128,26 @@ SDValue DAGTypeLegalizer::WidenVecOp_VP_CttzElements(SDNode *N) {
{Source, Mask, N->getOperand(2)}, N->getFlags());
}
+SDValue DAGTypeLegalizer::WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(0);
+ EVT OrigMaskVT = Mask.getValueType();
+ SDValue WideMask = GetWidenedVector(Mask);
+ EVT WideMaskVT = WideMask.getValueType();
+
+ // Pad the mask with zeros to ensure inactive lanes don't affect the result.
+ unsigned OrigElts = OrigMaskVT.getVectorNumElements();
+ unsigned WideElts = WideMaskVT.getVectorNumElements();
+ if (OrigElts != WideElts) {
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ WideMask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask,
+ Mask, DAG.getVectorIdxConstant(0, DL));
+ }
+
+ return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, N->getValueType(0),
+ WideMask);
+}
+
//===----------------------------------------------------------------------===//
// Vector Widening Utilities
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 69c3455573918..b37a5e4144aea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9668,20 +9668,44 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
EVT StepVT = MVT::getIntegerVT(EltWidth);
EVT StepVecVT = MaskVT.changeVectorElementType(*DAG.getContext(), StepVT);
- // If promotion is required to make the type legal, do it here; promotion
- // of integers within LegalizeVectorOps is looking for types of the same
- // size but with a smaller number of larger elements, not the usual larger
- // size with the same number of larger elements.
- if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
- TargetLowering::TypePromoteInteger) {
+ // If promotion or widening is required to make the type legal, do it here.
+ // Promotion of integers within LegalizeVectorOps is looking for types of
+ // the same size but with a smaller number of larger elements, not the usual
+ // larger size with the same number of larger elements.
+ TargetLowering::LegalizeTypeAction TypeAction =
+ TLI.getTypeAction(StepVecVT.getSimpleVT());
+ SDValue StepVec;
+ if (TypeAction == TargetLowering::TypePromoteInteger) {
StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
StepVT = StepVecVT.getVectorElementType();
+ StepVec = DAG.getStepVector(DL, StepVecVT);
+ } else if (TypeAction == TargetLowering::TypeWidenVector) {
+ // For widening, the element count changes. Create a step vector with only
+ // the original elements valid and zeros for padding. Also widen the mask.
+ EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ unsigned WideNumElts = WideVecVT.getVectorNumElements();
+
+ // Build widened step vector: <0, 1, ..., OrigNumElts-1, 0, 0, ...>
+ SDValue OrigStepVec = DAG.getStepVector(DL, StepVecVT);
+ StepVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVecVT,
+ DAG.getConstant(0, DL, WideVecVT), OrigStepVec,
+ DAG.getIntPtrConstant(0, DL));
+
+ // Widen mask: pad with zeros.
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), BoolVT, WideNumElts);
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask, Mask,
+ DAG.getIntPtrConstant(0, DL));
+
+ StepVecVT = WideVecVT;
+ StepVT = WideVecVT.getVectorElementType();
+ } else {
+ StepVec = DAG.getStepVector(DL, StepVecVT);
}
// Zero out lanes with inactive elements, then find the highest remaining
// value from the stepvector.
SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
- SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0));
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index bb9a09ca3cc80..7ee8f6fda93f5 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -490,6 +490,65 @@ define i1 @extract_last_i1_scalable(<vscale x 16 x i1> %data, <vscale x 16 x i1>
ret i1 %res
}
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; NEON-FIXED-LABEL: extract_last_active_v3i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: movi v1.2d, #0000000000000000
+; NEON-FIXED-NEXT: adrp x9, .LCPI18_0
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d2, [x9, :lo12:.LCPI18_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: mov v1.h[0], w0
+; NEON-FIXED-NEXT: mov v1.h[1], w1
+; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: mov v1.h[2], w2
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x9, d1
+; NEON-FIXED-NEXT: umaxv h2, v2.4h
+; NEON-FIXED-NEXT: lsr x9, x9, #32
+; NEON-FIXED-NEXT: orr w9, w8, w9
+; NEON-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s2
+; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr w9, [x11]
+; NEON-FIXED-NEXT: csinv w0, w9, wzr, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_active_v3i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: movi v1.2d, #0000000000000000
+; SVE-FIXED-NEXT: index z2.h, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: mov v1.h[0], w0
+; SVE-FIXED-NEXT: mov v1.h[1], w1
+; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: mov v1.h[2], w2
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x9, d1
+; SVE-FIXED-NEXT: umaxv h2, v2.4h
+; SVE-FIXED-NEXT: lsr x9, x9, #32
+; SVE-FIXED-NEXT: orr w9, w8, w9
+; SVE-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s2
+; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr w9, [x11]
+; SVE-FIXED-NEXT: csinv w0, w9, wzr, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
new file mode 100644
index 0000000000000..09d305eaaeb77
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
+
+; This test verifies that the experimental.vector.extract.last.active intrinsic
+; doesn't cause an infinite loop during legalization when the step vector type
+; needs widening (e.g., v4i8 -> v16i8 on X86).
+
+define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm2, %edx
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: leal (%rdi,%rdi,2), %r8d
+; CHECK-NEXT: xorl %r9d, %r9d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %r9b
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: addl %edi, %edi
+; CHECK-NEXT: cmpb %dil, %r9b
+; CHECK-NEXT: cmoval %r9d, %edi
+; CHECK-NEXT: cmpb %r8b, %dil
+; CHECK-NEXT: cmovbel %r8d, %edi
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -24(%rsp,%rdi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+define i32 @extract_last_active_v4i32_no_default(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32_no_default:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: leal (%rcx,%rcx,2), %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmoval %edx, %esi
+; CHECK-NEXT: cmpb %al, %sil
+; CHECK-NEXT: cmovbel %eax, %esi
+; CHECK-NEXT: movl -24(%rsp,%rsi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
+ ret i32 %res
+}
+
+; Test v2i32 - smaller vector.
+define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm2, %rcx
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: andb $1, %dl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: orl -24(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> %a, <2 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v3i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: orl %edx, %edi
+; CHECK-NEXT: andb $1, %dil
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dil
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %cl
+; CHECK-NEXT: cmoval %ecx, %esi
+; CHECK-NEXT: movzbl %sil, %ecx
+; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v8i32 - larger vector where step vector type doesn't need widening.
+define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movd %xmm2, %edi
+; CHECK-NEXT: pextrw $7, %xmm2, %eax
+; CHECK-NEXT: pextrw $6, %xmm2, %edx
+; CHECK-NEXT: pextrw $5, %xmm2, %r8d
+; CHECK-NEXT: pextrw $4, %xmm2, %ecx
+; CHECK-NEXT: pextrw $2, %xmm2, %esi
+; CHECK-NEXT: pextrw $1, %xmm2, %r10d
+; CHECK-NEXT: pextrw $3, %xmm2, %r9d
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %r9d, %r9d
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: leal (%r11,%r11,2), %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r10d, %r10d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %bpl
+; CHECK-NEXT: addl %ebp, %ebp
+; CHECK-NEXT: cmpb %bpl, %bl
+; CHECK-NEXT: cmoval %ebx, %ebp
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmovbel %r11d, %ebp
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: shll $2, %r11d
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmoval %ebp, %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r8d, %r8d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: leal (%rbx,%rbx,4), %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmovbel %ebx, %r11d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: movl $6, %ebx
+; CHECK-NEXT: cmovel %edx, %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmoval %r11d, %ebx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: movl $7, %r11d
+; CHECK-NEXT: cmovel %eax, %r11d
+; CHECK-NEXT: cmpb %r11b, %bl
+; CHECK-NEXT: cmoval %ebx, %r11d
+; CHECK-NEXT: andl $7, %r11d
+; CHECK-NEXT: orl %r10d, %edi
+; CHECK-NEXT: orl %r9d, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: orl %r8d, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -40(%rsp,%r11,4), %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> %a, <8 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v16i32 - even larger vector.
+define i32 @extract_last_active_v16i32(<16 x i32> %a, <16 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %dl, %al
+; CHECK-NEXT: cmoval %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmovbel %esi, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: andl $15, %edx
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -72(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> %a, <16 x i1> %c, i32 -1)
+ ret i32 %res
+}
|
|
@llvm/pr-subscribers-llvm-selectiondag
Author: Florian Hahn (fhahn)
Changes
Implement WidenVecOp_VECTOR_FIND_LAST_ACTIVE to properly widen the vector operand for @llvm.experimental.vector.extract.last.active.
Currently lowering crashes when widening of the operand is needed:
WidenVectorOperand op #0: t15: i64 = find_last_active t14
LLVM ERROR: Do not know how to widen this operator's operand!
Depends on llvm/llvm-project#174384 to avoid infinite cycles in the X86 tests.
Full diff: https://github.com/llvm/llvm-project/pull/174389.diff 5 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cd58c8ab1c3e4..5f247b25d1486 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1157,6 +1157,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue WidenVecOp_VP_REDUCE(SDNode *N);
SDValue WidenVecOp_ExpOp(SDNode *N);
SDValue WidenVecOp_VP_CttzElements(SDNode *N);
+ SDValue WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N);
/// Helper function to generate a set of operations to perform
/// a vector operation for a wider type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index af685191d82d8..cea3c9171f0d6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7200,6 +7200,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
Res = WidenVecOp_VP_CttzElements(N);
break;
+ case ISD::VECTOR_FIND_LAST_ACTIVE:
+ Res = WidenVecOp_VECTOR_FIND_LAST_ACTIVE(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -8125,6 +8128,26 @@ SDValue DAGTypeLegalizer::WidenVecOp_VP_CttzElements(SDNode *N) {
{Source, Mask, N->getOperand(2)}, N->getFlags());
}
+SDValue DAGTypeLegalizer::WidenVecOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Mask = N->getOperand(0);
+ EVT OrigMaskVT = Mask.getValueType();
+ SDValue WideMask = GetWidenedVector(Mask);
+ EVT WideMaskVT = WideMask.getValueType();
+
+ // Pad the mask with zeros to ensure inactive lanes don't affect the result.
+ unsigned OrigElts = OrigMaskVT.getVectorNumElements();
+ unsigned WideElts = WideMaskVT.getVectorNumElements();
+ if (OrigElts != WideElts) {
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ WideMask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask,
+ Mask, DAG.getVectorIdxConstant(0, DL));
+ }
+
+ return DAG.getNode(ISD::VECTOR_FIND_LAST_ACTIVE, DL, N->getValueType(0),
+ WideMask);
+}
+
//===----------------------------------------------------------------------===//
// Vector Widening Utilities
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 69c3455573918..b37a5e4144aea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9668,20 +9668,44 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N,
EVT StepVT = MVT::getIntegerVT(EltWidth);
EVT StepVecVT = MaskVT.changeVectorElementType(*DAG.getContext(), StepVT);
- // If promotion is required to make the type legal, do it here; promotion
- // of integers within LegalizeVectorOps is looking for types of the same
- // size but with a smaller number of larger elements, not the usual larger
- // size with the same number of larger elements.
- if (TLI.getTypeAction(StepVecVT.getSimpleVT()) ==
- TargetLowering::TypePromoteInteger) {
+ // If promotion or widening is required to make the type legal, do it here.
+ // Promotion of integers within LegalizeVectorOps is looking for types of
+ // the same size but with a smaller number of larger elements, not the usual
+ // larger size with the same number of larger elements.
+ TargetLowering::LegalizeTypeAction TypeAction =
+ TLI.getTypeAction(StepVecVT.getSimpleVT());
+ SDValue StepVec;
+ if (TypeAction == TargetLowering::TypePromoteInteger) {
StepVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
StepVT = StepVecVT.getVectorElementType();
+ StepVec = DAG.getStepVector(DL, StepVecVT);
+ } else if (TypeAction == TargetLowering::TypeWidenVector) {
+ // For widening, the element count changes. Create a step vector with only
+ // the original elements valid and zeros for padding. Also widen the mask.
+ EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), StepVecVT);
+ unsigned WideNumElts = WideVecVT.getVectorNumElements();
+
+ // Build widened step vector: <0, 1, ..., OrigNumElts-1, 0, 0, ...>
+ SDValue OrigStepVec = DAG.getStepVector(DL, StepVecVT);
+ StepVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVecVT,
+ DAG.getConstant(0, DL, WideVecVT), OrigStepVec,
+ DAG.getIntPtrConstant(0, DL));
+
+ // Widen mask: pad with zeros.
+ EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), BoolVT, WideNumElts);
+ SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT);
+ Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask, Mask,
+ DAG.getIntPtrConstant(0, DL));
+
+ StepVecVT = WideVecVT;
+ StepVT = WideVecVT.getVectorElementType();
+ } else {
+ StepVec = DAG.getStepVector(DL, StepVecVT);
}
// Zero out lanes with inactive elements, then find the highest remaining
// value from the stepvector.
SDValue Zeroes = DAG.getConstant(0, DL, StepVecVT);
- SDValue StepVec = DAG.getStepVector(DL, StepVecVT);
SDValue ActiveElts = DAG.getSelect(DL, StepVecVT, Mask, StepVec, Zeroes);
SDValue HighestIdx = DAG.getNode(ISD::VECREDUCE_UMAX, DL, StepVT, ActiveElts);
return DAG.getZExtOrTrunc(HighestIdx, DL, N->getValueType(0));
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index bb9a09ca3cc80..7ee8f6fda93f5 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -490,6 +490,65 @@ define i1 @extract_last_i1_scalable(<vscale x 16 x i1> %data, <vscale x 16 x i1>
ret i1 %res
}
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; NEON-FIXED-LABEL: extract_last_active_v3i32:
+; NEON-FIXED: // %bb.0:
+; NEON-FIXED-NEXT: sub sp, sp, #16
+; NEON-FIXED-NEXT: .cfi_def_cfa_offset 16
+; NEON-FIXED-NEXT: movi v1.2d, #0000000000000000
+; NEON-FIXED-NEXT: adrp x9, .LCPI18_0
+; NEON-FIXED-NEXT: mov x11, sp
+; NEON-FIXED-NEXT: ldr d2, [x9, :lo12:.LCPI18_0]
+; NEON-FIXED-NEXT: str q0, [sp]
+; NEON-FIXED-NEXT: mov v1.h[0], w0
+; NEON-FIXED-NEXT: mov v1.h[1], w1
+; NEON-FIXED-NEXT: fmov x8, d1
+; NEON-FIXED-NEXT: mov v1.h[2], w2
+; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; NEON-FIXED-NEXT: fmov x9, d1
+; NEON-FIXED-NEXT: umaxv h2, v2.4h
+; NEON-FIXED-NEXT: lsr x9, x9, #32
+; NEON-FIXED-NEXT: orr w9, w8, w9
+; NEON-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; NEON-FIXED-NEXT: fmov w10, s2
+; NEON-FIXED-NEXT: tst w8, #0x1
+; NEON-FIXED-NEXT: bfi x11, x10, #2, #2
+; NEON-FIXED-NEXT: ldr w9, [x11]
+; NEON-FIXED-NEXT: csinv w0, w9, wzr, ne
+; NEON-FIXED-NEXT: add sp, sp, #16
+; NEON-FIXED-NEXT: ret
+;
+; SVE-FIXED-LABEL: extract_last_active_v3i32:
+; SVE-FIXED: // %bb.0:
+; SVE-FIXED-NEXT: sub sp, sp, #16
+; SVE-FIXED-NEXT: .cfi_def_cfa_offset 16
+; SVE-FIXED-NEXT: movi v1.2d, #0000000000000000
+; SVE-FIXED-NEXT: index z2.h, #0, #1
+; SVE-FIXED-NEXT: mov x11, sp
+; SVE-FIXED-NEXT: str q0, [sp]
+; SVE-FIXED-NEXT: mov v1.h[0], w0
+; SVE-FIXED-NEXT: mov v1.h[1], w1
+; SVE-FIXED-NEXT: fmov x8, d1
+; SVE-FIXED-NEXT: mov v1.h[2], w2
+; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
+; SVE-FIXED-NEXT: fmov x9, d1
+; SVE-FIXED-NEXT: umaxv h2, v2.4h
+; SVE-FIXED-NEXT: lsr x9, x9, #32
+; SVE-FIXED-NEXT: orr w9, w8, w9
+; SVE-FIXED-NEXT: orr w8, w9, w8, lsr #16
+; SVE-FIXED-NEXT: fmov w10, s2
+; SVE-FIXED-NEXT: tst w8, #0x1
+; SVE-FIXED-NEXT: bfi x11, x10, #2, #2
+; SVE-FIXED-NEXT: ldr w9, [x11]
+; SVE-FIXED-NEXT: csinv w0, w9, wzr, ne
+; SVE-FIXED-NEXT: add sp, sp, #16
+; SVE-FIXED-NEXT: ret
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
new file mode 100644
index 0000000000000..09d305eaaeb77
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
+
+; This test verifies that the experimental.vector.extract.last.active intrinsic
+; doesn't cause an infinite loop during legalization when the step vector type
+; needs widening (e.g., v4i8 -> v16i8 on X86).
+
+define i32 @extract_last_active_v4i32(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm2, %edx
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: leal (%rdi,%rdi,2), %r8d
+; CHECK-NEXT: xorl %r9d, %r9d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %r9b
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dil
+; CHECK-NEXT: addl %edi, %edi
+; CHECK-NEXT: cmpb %dil, %r9b
+; CHECK-NEXT: cmoval %r9d, %edi
+; CHECK-NEXT: cmpb %r8b, %dil
+; CHECK-NEXT: cmovbel %r8d, %edi
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -24(%rsp,%rdi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+define i32 @extract_last_active_v4i32_no_default(<4 x i32> %a, <4 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v4i32_no_default:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: leal (%rcx,%rcx,2), %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmoval %edx, %esi
+; CHECK-NEXT: cmpb %al, %sil
+; CHECK-NEXT: cmovbel %eax, %esi
+; CHECK-NEXT: movl -24(%rsp,%rsi,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %a, <4 x i1> %c, i32 poison)
+ ret i32 %res
+}
+
+; Test v2i32 - smaller vector.
+define i32 @extract_last_active_v2i32(<2 x i32> %a, <2 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; CHECK-NEXT: movq %xmm2, %rcx
+; CHECK-NEXT: movq %xmm1, %rax
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: orl %eax, %edx
+; CHECK-NEXT: andb $1, %dl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: testq %rcx, %rcx
+; CHECK-NEXT: setne %dl
+; CHECK-NEXT: orl -24(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v2i32(<2 x i32> %a, <2 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v3i32 - non-power-of-2 element count that requires mask widening
+; (v3i1 -> v4i1) via WidenVecOp_VECTOR_FIND_LAST_ACTIVE.
+define i32 @extract_last_active_v3i32(<3 x i32> %a, <3 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v3i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: orl %edx, %edi
+; CHECK-NEXT: andb $1, %dil
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %dil
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %cl
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: setne %sil
+; CHECK-NEXT: addl %esi, %esi
+; CHECK-NEXT: cmpb %sil, %cl
+; CHECK-NEXT: cmoval %ecx, %esi
+; CHECK-NEXT: movzbl %sil, %ecx
+; CHECK-NEXT: orl -24(%rsp,%rcx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v3i32(<3 x i32> %a, <3 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v8i32 - larger vector where step vector type doesn't need widening.
+define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movd %xmm2, %edi
+; CHECK-NEXT: pextrw $7, %xmm2, %eax
+; CHECK-NEXT: pextrw $6, %xmm2, %edx
+; CHECK-NEXT: pextrw $5, %xmm2, %r8d
+; CHECK-NEXT: pextrw $4, %xmm2, %ecx
+; CHECK-NEXT: pextrw $2, %xmm2, %esi
+; CHECK-NEXT: pextrw $1, %xmm2, %r10d
+; CHECK-NEXT: pextrw $3, %xmm2, %r9d
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %r9d, %r9d
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: leal (%r11,%r11,2), %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r10d, %r10d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: testl %esi, %esi
+; CHECK-NEXT: setne %bpl
+; CHECK-NEXT: addl %ebp, %ebp
+; CHECK-NEXT: cmpb %bpl, %bl
+; CHECK-NEXT: cmoval %ebx, %ebp
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmovbel %r11d, %ebp
+; CHECK-NEXT: xorl %r11d, %r11d
+; CHECK-NEXT: testl %ecx, %ecx
+; CHECK-NEXT: setne %r11b
+; CHECK-NEXT: shll $2, %r11d
+; CHECK-NEXT: cmpb %r11b, %bpl
+; CHECK-NEXT: cmoval %ebp, %r11d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: testl %r8d, %r8d
+; CHECK-NEXT: setne %bl
+; CHECK-NEXT: leal (%rbx,%rbx,4), %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmovbel %ebx, %r11d
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: movl $6, %ebx
+; CHECK-NEXT: cmovel %edx, %ebx
+; CHECK-NEXT: cmpb %bl, %r11b
+; CHECK-NEXT: cmoval %r11d, %ebx
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: movl $7, %r11d
+; CHECK-NEXT: cmovel %eax, %r11d
+; CHECK-NEXT: cmpb %r11b, %bl
+; CHECK-NEXT: cmoval %ebx, %r11d
+; CHECK-NEXT: andl $7, %r11d
+; CHECK-NEXT: orl %r10d, %edi
+; CHECK-NEXT: orl %r9d, %esi
+; CHECK-NEXT: orl %edi, %esi
+; CHECK-NEXT: orl %r8d, %ecx
+; CHECK-NEXT: orl %edx, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -40(%rsp,%r11,4), %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v8i32(<8 x i32> %a, <8 x i1> %c, i32 -1)
+ ret i32 %res
+}
+
+; Test v16i32 - even larger vector.
+define i32 @extract_last_active_v16i32(<16 x i32> %a, <16 x i1> %c) {
+; CHECK-LABEL: extract_last_active_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %dl, %al
+; CHECK-NEXT: cmoval %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: cmpb %sil, %dl
+; CHECK-NEXT: cmovbel %esi, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: cmpb %al, %dl
+; CHECK-NEXT: cmovbel %eax, %edx
+; CHECK-NEXT: andl $15, %edx
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: orb -{{[0-9]+}}(%rsp), %cl
+; CHECK-NEXT: andb $1, %cl
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpb $1, %cl
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: orl -72(%rsp,%rdx,4), %eax
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.experimental.vector.extract.last.active.v16i32(<16 x i32> %a, <16 x i1> %c, i32 -1)
+ ret i32 %res
+}
|
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
| EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), BoolVT, WideNumElts); | ||
| SDValue ZeroMask = DAG.getConstant(0, DL, WideMaskVT); | ||
| Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideMaskVT, ZeroMask, Mask, | ||
| DAG.getIntPtrConstant(0, DL)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wrong type for the index (also can reuse the index from above)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this was fixed in the parent PR, thanks
Implement WidenVecOp_VECTOR_FIND_LAST_ACTIVE to properly widen the vector operand for @llvm.experimental.vector.extract.last.active.
Currently lowering crashes when widening of the operand is needed:
WidenVectorOperand op #0: t15: i64 = find_last_active t14
LLVM ERROR: Do not know how to widen this operator's operand!
Depends on llvm/llvm-project#174384 to avoid infinite cycles on X86.
8da2be4 to
4218564
Compare
Implement WidenVecOp_VECTOR_FIND_LAST_ACTIVE to properly widen the
vector operand for @llvm.experimental.vector.extract.last.active.
Currently lowering crashes when widening of the operand is needed:
WidenVectorOperand op #0: t15: i64 = find_last_active t14
LLVM ERROR: Do not know how to widen this operator's operand!
Depends on llvm/llvm-project#174384 to avoid infinite
cycles in the X86 tests.