[LoongArch] Pre-commit for broadcast load #136070

Merged: 2 commits merged into llvm:main on Apr 18, 2025

Conversation

@tangaac (Contributor) commented on Apr 17, 2025

No description provided.

@llvmbot (Member) commented on Apr 17, 2025

@llvm/pr-subscribers-backend-loongarch

Author: None (tangaac)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/136070.diff

2 Files Affected:

  • (added) llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll (+172)
  • (added) llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll (+170)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
new file mode 100644
index 0000000000000..7fec52a340768
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to xvldrepl
+
+; A load that has more than one user shouldn't be lowered to xvldrepl
+define <32 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT:    st.b $a0, $a1, 0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  store i8 %tmp, ptr %dst
+  %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+  ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+  ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 33
+; CHECK-NEXT:    xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i8, ptr %ptr, i64 33
+  %tmp = load i8, ptr %p
+  %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+  ret <32 x i8> %tmp2
+}
+
+
+define <16 x i16> @xvldrepl_h(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.h $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i16, ptr %ptr
+  %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp2
+}
+
+define <16 x i16> @xvldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.h $a0, $a0, 66
+; CHECK-NEXT:    xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i16, ptr %ptr, i64 33
+  %tmp = load i16, ptr %p
+  %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i32, ptr %ptr
+  %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+  ret <8 x i32> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 132
+; CHECK-NEXT:    xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i32, ptr %ptr, i64 33
+  %tmp = load i32, ptr %p
+  %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+  %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+  ret <8 x i32> %tmp2
+}
+
+
+define <4 x i64> @xvldrepl_d(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.d $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i64, ptr %ptr
+  %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+  ret <4 x i64> %tmp2
+}
+
+define <4 x i64> @xvldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.d $a0, $a0, 264
+; CHECK-NEXT:    xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load i64, ptr %p
+  %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+  ret <4 x i64> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.s $fa0, $a0, 0
+; CHECK-NEXT:    xvreplve0.w $xr0, $xr0
+; CHECK-NEXT:    ret
+  %tmp = load float, ptr %ptr
+  %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+  ret <8 x float> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.s $fa0, $a0, 264
+; CHECK-NEXT:    xvreplve0.w $xr0, $xr0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load float, ptr %p
+  %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+  %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+  ret <8 x float> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.d $fa0, $a0, 0
+; CHECK-NEXT:    xvreplve0.d $xr0, $xr0
+; CHECK-NEXT:    ret
+  %tmp = load double, ptr %ptr
+  %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+  %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+  ret <4 x double> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.d $fa0, $a0, 264
+; CHECK-NEXT:    xvreplve0.d $xr0, $xr0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load double, ptr %p
+  %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+  %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+  ret <4 x double> %tmp2
+}
+
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
new file mode 100644
index 0000000000000..09edb33a49ed9
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to vldrepl
+
+; A load that has more than one user shouldn't be lowered to vldrepl
+define <16 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst){
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT:    st.b $a0, $a1, 0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  store i8 %tmp, ptr %dst
+  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+  ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+  ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 33
+; CHECK-NEXT:    vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i8, ptr %ptr, i64 33
+  %tmp = load i8, ptr %p
+  %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+  ret <16 x i8> %tmp2
+}
+
+
+define <8 x i16> @vldrepl_h(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.h $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i16, ptr %ptr
+  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @vldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.h $a0, $a0, 66
+; CHECK-NEXT:    vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i16, ptr %ptr, i64 33
+  %tmp = load i16, ptr %p
+  %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vldrepl_w(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i32, ptr %ptr
+  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.w $a0, $a0, 132
+; CHECK-NEXT:    vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i32, ptr %ptr, i64 33
+  %tmp = load i32, ptr %p
+  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vldrepl_d(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.d $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i64, ptr %ptr
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+  ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @vldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.d $a0, $a0, 264
+; CHECK-NEXT:    vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load i64, ptr %p
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+  ret <2 x i64> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.s $fa0, $a0, 0
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+  %tmp = load float, ptr %ptr
+  %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.s $fa0, $a0, 264
+; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load float, ptr %p
+  %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+  ret <4 x float> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.d $fa0, $a0, 0
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+  %tmp = load double, ptr %ptr
+  %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+  %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+  ret <2 x double> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    fld.d $fa0, $a0, 264
+; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT:    ret
+  %p = getelementptr i64, ptr %ptr, i64 33
+  %tmp = load double, ptr %p
+  %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+  %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+  ret <2 x double> %tmp2
+}

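Since this PR only adds tests, the quickest local check is to run the two new files through lit. A minimal sketch, assuming an in-tree build directory named build (adjust to your own build path):

build/bin/llvm-lit -v \
  llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll \
  llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
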
@tangaac requested review from heiher and SixWeining on April 17, 2025 at 01:30
@heiher (Member) left a comment

Would it make sense to add some test cases like vldrepl_d_dbl_unaligned_offset to ensure unaligned immediate offsets are correctly encoded?

define <2 x double> @vldrepl_d_dbl_unaligned_offset(ptr %ptr) {
  %p = getelementptr i32, ptr %ptr, i32 1
  %tmp = load double, ptr %p
  %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
  %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
  ret <2 x double> %tmp2
}

@tangaac (Contributor, Author) commented on Apr 18, 2025

Would it make sense to add some test cases like vldrepl_d_dbl_unaligned_offset to ensure unaligned immediate offsets are correctly encoded?

define <2 x double> @vldrepl_d_dbl_unaligned_offset(ptr %ptr) {
  %p = getelementptr i32, ptr %ptr, i32 1
  %tmp = load double, ptr %p
  %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
  %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
  ret <2 x double> %tmp2
}

After being optimized by another PR, the asm shows that it is correct.
Before:

fld.d	$fa0, $a0, 4
vreplvei.d	$vr0, $vr0, 0
ret

After:

addi.d	$a0, $a0, 4
vldrepl.d	$vr0, $a0, 0
ret
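
Once that follow-up lands, the CHECK lines in these autogenerated tests can be refreshed with update_llc_test_checks.py (the tool named in the NOTE lines). A minimal sketch, again assuming a build directory named build:

llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
  llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll \
  llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll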

@heiher (Member) left a comment

LGTM

@tangaac merged commit 594bfad into llvm:main on Apr 18, 2025
11 checks passed