-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[LoongArch] Pre-commit for broadcast load #136070
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-loongarch Author: None (tangaac) ChangesFull diff: https://github.com/llvm/llvm-project/pull/136070.diff 2 Files Affected:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
new file mode 100644
index 0000000000000..7fec52a340768
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to xvldrepl
+
+; A load that has more than one user shouldn't be lowered to xvldrepl
+define <32 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst) {
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ store i8 %tmp, ptr %dst
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+define <32 x i8> @xvldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_b_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: xvreplgr2vr.b $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i8, ptr %ptr, i64 33
+ %tmp = load i8, ptr %p
+ %tmp1 = insertelement <32 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> poison, <32 x i32> zeroinitializer
+ ret <32 x i8> %tmp2
+}
+
+
+define <16 x i16> @xvldrepl_h(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i16, ptr %ptr
+ %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp2
+}
+
+define <16 x i16> @xvldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_h_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i16, ptr %ptr, i64 33
+ %tmp = load i16, ptr %p
+ %tmp1 = insertelement <16 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> poison, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i32, ptr %ptr
+ %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+ ret <8 x i32> %tmp2
+}
+
+define <8 x i32> @xvldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_w_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: xvreplgr2vr.w $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i64 33
+ %tmp = load i32, ptr %p
+ %tmp1 = insertelement <8 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> poison, <8 x i32> zeroinitializer
+ ret <8 x i32> %tmp2
+}
+
+
+define <4 x i64> @xvldrepl_d(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i64, ptr %ptr
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
+}
+
+define <4 x i64> @xvldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: xvldrepl_d_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 264
+; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <4 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> poison, <4 x i32> zeroinitializer
+ ret <4 x i64> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
+; CHECK-NEXT: ret
+ %tmp = load float, ptr %ptr
+ %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+ ret <8 x float> %tmp2
+}
+
+define <8 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 264
+; CHECK-NEXT: xvreplve0.w $xr0, $xr0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load float, ptr %p
+ %tmp1 = insertelement <8 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> poison, <8 x i32> zeroinitializer
+ ret <8 x float> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
+; CHECK-NEXT: ret
+ %tmp = load double, ptr %ptr
+ %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+ ret <4 x double> %tmp2
+}
+
+define <4 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 264
+; CHECK-NEXT: xvreplve0.d $xr0, $xr0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load double, ptr %p
+ %tmp1 = insertelement <4 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <4 x double> %tmp1, <4 x double> poison, <4 x i32> zeroinitializer
+ ret <4 x double> %tmp2
+}
+
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
new file mode 100644
index 0000000000000..09edb33a49ed9
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s
+
+; TODO: Loading an element and splatting it to a vector could be lowered to vldrepl
+
+; A load that has more than one user shouldn't be lowered to vldrepl
+define <16 x i8> @should_not_be_optimized(ptr %ptr, ptr %dst){
+; CHECK-LABEL: should_not_be_optimized:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: st.b $a0, $a1, 0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ store i8 %tmp, ptr %dst
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i8, ptr %ptr
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+define <16 x i8> @vldrepl_b_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_b_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.b $a0, $a0, 33
+; CHECK-NEXT: vreplgr2vr.b $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i8, ptr %ptr, i64 33
+ %tmp = load i8, ptr %p
+ %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %tmp, i32 0
+ %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> poison, <16 x i32> zeroinitializer
+ ret <16 x i8> %tmp2
+}
+
+
+define <8 x i16> @vldrepl_h(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i16, ptr %ptr
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i16> @vldrepl_h_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_h_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.h $a0, $a0, 66
+; CHECK-NEXT: vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i16, ptr %ptr, i64 33
+ %tmp = load i16, ptr %p
+ %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %tmp, i32 0
+ %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp2
+}
+
+define <4 x i32> @vldrepl_w(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i32, ptr %ptr
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i32> @vldrepl_w_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.w $a0, $a0, 132
+; CHECK-NEXT: vreplgr2vr.w $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i32, ptr %ptr, i64 33
+ %tmp = load i32, ptr %p
+ %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
+ %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %tmp2
+}
+
+define <2 x i64> @vldrepl_d(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 0
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: ret
+ %tmp = load i64, ptr %ptr
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i64> @vldrepl_d_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: ld.d $a0, $a0, 264
+; CHECK-NEXT: vreplgr2vr.d $vr0, $a0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load i64, ptr %p
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = shufflevector <2 x i64> %tmp1, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 0
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %tmp = load float, ptr %ptr
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <4 x float> @vldrepl_w_flt_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_w_flt_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.s $fa0, $a0, 264
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load float, ptr %p
+ %tmp1 = insertelement <4 x float> zeroinitializer, float %tmp, i32 0
+ %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> poison, <4 x i32> zeroinitializer
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 0
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %tmp = load double, ptr %ptr
+ %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+ ret <2 x double> %tmp2
+}
+
+define <2 x double> @vldrepl_d_dbl_offset(ptr %ptr) {
+; CHECK-LABEL: vldrepl_d_dbl_offset:
+; CHECK: # %bb.0:
+; CHECK-NEXT: fld.d $fa0, $a0, 264
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: ret
+ %p = getelementptr i64, ptr %ptr, i64 33
+ %tmp = load double, ptr %p
+ %tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+ %tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
+ ret <2 x double> %tmp2
+}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to add some test cases like vldrepl_d_dbl_unaligned_offset
to ensure unaligned immediate offsets are correctly encoded?
define <2 x double> @vldrepl_d_dbl_unaligned_offset(ptr %ptr) {
%p = getelementptr i32, ptr %ptr, i32 1
%tmp = load double, ptr %p
%tmp1 = insertelement <2 x double> zeroinitializer, double %tmp, i32 0
%tmp2 = shufflevector <2 x double> %tmp1, <2 x double> poison, <2 x i32> zeroinitializer
ret <2 x double> %tmp2
}
After being optimized by another PR, the generated asm shows the unaligned offset is encoded correctly.
Before:
    fld.d $fa0, $a0, 4
    vreplvei.d $vr0, $vr0, 0
    ret
After:
    addi.d $a0, $a0, 4
    vldrepl.d $vr0, $a0, 0
    ret |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
No description provided.