Skip to content

[AArch64] Add tablegen patterns for i8 and i16 vector insert/extract pairs #136091

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions llvm/include/llvm/Target/TargetSelectionDAG.td
Original file line number Diff line number Diff line change
Expand Up @@ -818,8 +818,11 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1,
// SCALAR_TO_VECTOR node. The type profile declares one result and one operand
// and places no type constraints on either.
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
[]>;

// vector_extract/vector_insert are deprecated. extractelt/insertelt
// are preferred.
// vector_extract/vector_insert are similar to extractelt/insertelt but allow
// element types that require promotion (for example, an extract from v16i8
// where i8 is not a legal type, so the result is i32). extractelt/insertelt
// are preferred where the element type and the extracted type match, because
// of the extra type checking they perform.
// EXTRACT_VECTOR_ELT with a deliberately loose profile: one result, two
// operands, and the only constraint is that value 2 (the lane index) has
// pointer type. The result type is not tied to the vector's element type.
def vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>, []>;
def vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT",
Expand Down
35 changes: 35 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -7307,6 +7307,41 @@ def : Pat<(v2i32 (vector_insert v2i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))),
// Insert an f64 that lives in an FPR64 (viewed as i64 through a bitconvert)
// into lane $Immd of a v2i64. The scalar is placed in the dsub part of an
// otherwise undefined 128-bit register and its lane 0 is copied across with
// INSvi64lane.
def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), (i64 imm:$Immd))),
(INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;

// Patterns for i8/i16 -> v2i32/v4i16 lane moves via insert and extract that go via i32.
//
// Each pattern matches an i8/i16 element that is extracted as a promoted i32
// and immediately re-inserted (vector_insert) or broadcast into lane 0
// (scalar_to_vector) of a 64-bit vector with wider elements, and selects one
// lane-to-lane INS on the source element size.
//
// Parameters:
//   VT128        - 128-bit vector type the element is extracted from.
//   VT64         - 64-bit vector type the element is extracted from.
//   OutVT        - 64-bit result vector type (wider elements than VT64/VT128).
//   INS          - lane-insert instruction operating on the source element size.
//   VecIndexMult - SDNodeXForm applied to the destination lane immediate.
//                  NOTE(review): defined elsewhere; presumably rescales an
//                  OutVT lane number into an INS lane number - confirm.
//
// INS works on 128-bit registers, so every 64-bit value is widened first and
// the result is narrowed again with EXTRACT_SUBREG dsub. Because the upper
// half of the result is discarded, the widening uses INSERT_SUBREG into an
// IMPLICIT_DEF everywhere; SUBREG_TO_REG would additionally claim that the
// upper bits of the widened register are zero, which nothing here guarantees
// for an arbitrary V64 operand and which is not needed.
multiclass Neon_INS_elt_ext_pattern<ValueType VT128, ValueType VT64, ValueType OutVT,
                                    Instruction INS, SDNodeXForm VecIndexMult> {
  // VT64->OutVT
  // insert(src, extract(Rn:VT64, Immn), Immd): widen both 64-bit operands.
  def : Pat<(OutVT (vector_insert (OutVT V64:$src),
                                  (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))),
                                  (i64 imm:$Immd))),
            (EXTRACT_SUBREG
              (INS (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$src, dsub), (VecIndexMult imm:$Immd),
                   (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
              dsub)>;
  // scalar_to_vector(extract(Rn:VT64, Immn)): only lane 0 is defined, so the
  // destination starts out as IMPLICIT_DEF.
  def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT64 V64:$Rn), (i64 imm:$Immn))))),
            (EXTRACT_SUBREG
              (INS (IMPLICIT_DEF), 0,
                   (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$Rn, dsub), imm:$Immn),
              dsub)>;

  // VT128->OutVT
  // insert(src, extract(Rn:VT128, Immn), Immd): only $src needs widening.
  def : Pat<(OutVT (vector_insert (OutVT V64:$src),
                                  (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
                                  (i64 imm:$Immd))),
            (EXTRACT_SUBREG
              (INS (INSERT_SUBREG (VT128 (IMPLICIT_DEF)), V64:$src, dsub), (VecIndexMult imm:$Immd),
                   V128:$Rn, imm:$Immn),
              dsub)>;
  // scalar_to_vector(extract(Rn:VT128, Immn)).
  def : Pat<(OutVT (scalar_to_vector (i32 (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))))),
            (EXTRACT_SUBREG
              (INS (IMPLICIT_DEF), 0, V128:$Rn, imm:$Immn),
              dsub)>;
}

// Instantiations, one per (source element, result vector) pairing.
// NOTE(review): VecIndex_x2/VecIndex_x4 are defined elsewhere; they presumably
// scale the destination lane by the result/source element-size ratio - confirm.
// i8 elements moved into v4i16 lanes.
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v4i16, INSvi8lane, VecIndex_x2>;
// i8 elements moved into v2i32 lanes.
defm : Neon_INS_elt_ext_pattern<v16i8, v8i8, v2i32, INSvi8lane, VecIndex_x4>;
// i16 elements moved into v2i32 lanes.
defm : Neon_INS_elt_ext_pattern<v8i16, v4i16, v2i32, INSvi16lane, VecIndex_x2>;

// bitcast of an extract
// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/AArch64/arm64-extract-insert-varidx.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,11 @@ define <4 x i8> @test_varidx_extract_v8s8(<8 x i8> %x, i32 %idx) {
; CHECK-SDAG-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SDAG-NEXT: str d0, [sp, #8]
; CHECK-SDAG-NEXT: umov w9, v0.b[1]
; CHECK-SDAG-NEXT: bfxil x8, x0, #0, #3
; CHECK-SDAG-NEXT: ld1 { v1.b }[0], [x8]
; CHECK-SDAG-NEXT: umov w8, v0.b[2]
; CHECK-SDAG-NEXT: mov v1.h[1], w9
; CHECK-SDAG-NEXT: umov w9, v0.b[3]
; CHECK-SDAG-NEXT: mov v1.h[2], w8
; CHECK-SDAG-NEXT: mov v1.h[3], w9
; CHECK-SDAG-NEXT: mov v1.b[2], v0.b[1]
; CHECK-SDAG-NEXT: mov v1.b[4], v0.b[2]
; CHECK-SDAG-NEXT: mov v1.b[6], v0.b[3]
; CHECK-SDAG-NEXT: fmov d0, d1
; CHECK-SDAG-NEXT: add sp, sp, #16
; CHECK-SDAG-NEXT: ret
Expand Down Expand Up @@ -168,11 +165,10 @@ define <2 x i16> @test_varidx_extract_v4s16(<4 x i16> %x, i32 %idx) {
; CHECK-SDAG-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SDAG-NEXT: str d0, [sp, #8]
; CHECK-SDAG-NEXT: umov w9, v0.h[1]
; CHECK-SDAG-NEXT: bfi x8, x0, #1, #2
; CHECK-SDAG-NEXT: ld1 { v0.h }[0], [x8]
; CHECK-SDAG-NEXT: mov v0.s[1], w9
; CHECK-SDAG-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SDAG-NEXT: ld1 { v1.h }[0], [x8]
; CHECK-SDAG-NEXT: mov v1.h[2], v0.h[1]
; CHECK-SDAG-NEXT: fmov d0, d1
; CHECK-SDAG-NEXT: add sp, sp, #16
; CHECK-SDAG-NEXT: ret
;
Expand Down
32 changes: 12 additions & 20 deletions llvm/test/CodeGen/AArch64/bitcast-extend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,12 @@ define <4 x i64> @z_i32_v4i64(i32 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-SD-NEXT: umov w8, v0.b[2]
; CHECK-SD-NEXT: umov w9, v0.b[0]
; CHECK-SD-NEXT: umov w10, v0.b[3]
; CHECK-SD-NEXT: umov w11, v0.b[1]
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: fmov s2, w8
; CHECK-SD-NEXT: mov v0.s[1], w11
; CHECK-SD-NEXT: mov v2.s[1], w10
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-SD-NEXT: mov v2.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v3.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v3.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v2.2s, #0
; CHECK-SD-NEXT: ushll v2.2d, v3.2s, #0
; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: and v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: ret
Expand Down Expand Up @@ -176,16 +172,12 @@ define <4 x i64> @s_i32_v4i64(i32 %x) {
; CHECK-SD-LABEL: s_i32_v4i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fmov s0, w0
; CHECK-SD-NEXT: umov w8, v0.b[2]
; CHECK-SD-NEXT: umov w9, v0.b[0]
; CHECK-SD-NEXT: umov w10, v0.b[3]
; CHECK-SD-NEXT: umov w11, v0.b[1]
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: fmov s1, w8
; CHECK-SD-NEXT: mov v0.s[1], w11
; CHECK-SD-NEXT: mov v1.s[1], w10
; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-NEXT: mov v1.b[0], v0.b[0]
; CHECK-SD-NEXT: mov v2.b[0], v0.b[2]
; CHECK-SD-NEXT: mov v1.b[4], v0.b[1]
; CHECK-SD-NEXT: mov v2.b[4], v0.b[3]
; CHECK-SD-NEXT: ushll v0.2d, v1.2s, #0
; CHECK-SD-NEXT: ushll v1.2d, v2.2s, #0
; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56
Expand Down
32 changes: 12 additions & 20 deletions llvm/test/CodeGen/AArch64/fix-shuffle-vector-be-rev.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,12 @@
define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKLE-LABEL: test_reconstructshuffle:
; CHECKLE: // %bb.0:
; CHECKLE-NEXT: umov w8, v0.b[3]
; CHECKLE-NEXT: umov w9, v0.b[2]
; CHECKLE-NEXT: fmov s2, w8
; CHECKLE-NEXT: umov w8, v0.b[1]
; CHECKLE-NEXT: mov v2.h[1], w9
; CHECKLE-NEXT: mov v2.h[2], w8
; CHECKLE-NEXT: umov w8, v0.b[0]
; CHECKLE-NEXT: ext v0.16b, v1.16b, v1.16b, #8
; CHECKLE-NEXT: mov v2.h[3], w8
; CHECKLE-NEXT: zip2 v0.8b, v0.8b, v0.8b
; CHECKLE-NEXT: mov v2.b[0], v0.b[3]
; CHECKLE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKLE-NEXT: mov v2.b[2], v0.b[2]
; CHECKLE-NEXT: mov v2.b[4], v0.b[1]
; CHECKLE-NEXT: mov v2.b[6], v0.b[0]
; CHECKLE-NEXT: zip2 v0.8b, v1.8b, v0.8b
; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
; CHECKLE-NEXT: bic v0.4h, #255, lsl #8
; CHECKLE-NEXT: ret
Expand All @@ -25,16 +21,12 @@ define <4 x i16> @test_reconstructshuffle(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECKBE-NEXT: rev64 v1.16b, v1.16b
; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: umov w8, v0.b[3]
; CHECKBE-NEXT: umov w9, v0.b[2]
; CHECKBE-NEXT: fmov s2, w8
; CHECKBE-NEXT: umov w8, v0.b[1]
; CHECKBE-NEXT: mov v2.h[1], w9
; CHECKBE-NEXT: mov v2.h[2], w8
; CHECKBE-NEXT: umov w8, v0.b[0]
; CHECKBE-NEXT: ext v0.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: mov v2.h[3], w8
; CHECKBE-NEXT: zip2 v0.8b, v0.8b, v0.8b
; CHECKBE-NEXT: mov v2.b[0], v0.b[3]
; CHECKBE-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECKBE-NEXT: mov v2.b[2], v0.b[2]
; CHECKBE-NEXT: mov v2.b[4], v0.b[1]
; CHECKBE-NEXT: mov v2.b[6], v0.b[0]
; CHECKBE-NEXT: zip2 v0.8b, v1.8b, v0.8b
; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
; CHECKBE-NEXT: bic v0.4h, #255, lsl #8
; CHECKBE-NEXT: rev64 v0.4h, v0.4h
Expand Down
Loading
Loading