Skip to content

Commit 191b9cd

Browse files
authored
[X86] Add test showing failure to make use of existing concatenated operands for profitable fadd concatenation (llvm#174383)
Since the fsqrt operands have already been concatenated, the fadd can reuse that concatenation, so only the other fadd operand still needs to be concatenated.
1 parent 5698d05 commit 191b9cd

File tree

1 file changed

+67
-2
lines changed

1 file changed

+67
-2
lines changed

llvm/test/CodeGen/X86/combine-fsqrt.ll

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
33
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
4-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
5-
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
4+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
5+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
66
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
77

88
define <4 x double> @concat_sqrt_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
@@ -172,3 +172,68 @@ define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1)
172172
%res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
173173
ret <16 x float> %res
174174
}
175+
176+
; FIXME: The sqrt operands (%x0, %x1) are already being concatenated, so the fadd could be cheaply concatenated as well by reusing them and concatenating only %y0 and %y1.
177+
; Stores the concatenation of (%x0 + %y0) and (%x1 + %y1) to %p0, and the
; concatenation of sqrt(%x0) and sqrt(%x1) 32 bytes past %p0.
; The AVX check lines below show one 256-bit vsqrtps fed by a vinsertf128, but
; still two separate 128-bit vaddps - the missed fold this test records.
define void @concat_sqrt_fadd_v8f32_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1, ptr %p0){
178+
; SSE-LABEL: concat_sqrt_fadd_v8f32_v4f32:
179+
; SSE: # %bb.0:
180+
; SSE-NEXT: addps %xmm0, %xmm2
181+
; SSE-NEXT: addps %xmm1, %xmm3
182+
; SSE-NEXT: movaps %xmm3, 16(%rdi)
183+
; SSE-NEXT: movaps %xmm2, (%rdi)
184+
; SSE-NEXT: sqrtps %xmm0, %xmm0
185+
; SSE-NEXT: sqrtps %xmm1, %xmm1
186+
; SSE-NEXT: movaps %xmm1, 48(%rdi)
187+
; SSE-NEXT: movaps %xmm0, 32(%rdi)
188+
; SSE-NEXT: retq
189+
;
190+
; AVX1-LABEL: concat_sqrt_fadd_v8f32_v4f32:
191+
; AVX1: # %bb.0:
192+
; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
193+
; AVX1-NEXT: vaddps %xmm2, %xmm0, %xmm2
194+
; AVX1-NEXT: vaddps %xmm3, %xmm1, %xmm3
195+
; AVX1-NEXT: vmovaps %xmm3, 16(%rdi)
196+
; AVX1-NEXT: vmovaps %xmm2, (%rdi)
197+
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
198+
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
199+
; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
200+
; AVX1-NEXT: vzeroupper
201+
; AVX1-NEXT: retq
202+
;
203+
; AVX2-LABEL: concat_sqrt_fadd_v8f32_v4f32:
204+
; AVX2: # %bb.0:
205+
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
206+
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm2
207+
; AVX2-NEXT: vaddps %xmm3, %xmm1, %xmm3
208+
; AVX2-NEXT: vmovaps %xmm3, 16(%rdi)
209+
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
210+
; AVX2-NEXT: vsqrtps %ymm0, %ymm0
211+
; AVX2-NEXT: vmovaps %xmm2, (%rdi)
212+
; AVX2-NEXT: vmovaps %ymm0, 32(%rdi)
213+
; AVX2-NEXT: vzeroupper
214+
; AVX2-NEXT: retq
215+
;
216+
; AVX512-LABEL: concat_sqrt_fadd_v8f32_v4f32:
217+
; AVX512: # %bb.0:
218+
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
219+
; AVX512-NEXT: vaddps %xmm2, %xmm0, %xmm2
220+
; AVX512-NEXT: vaddps %xmm3, %xmm1, %xmm3
221+
; AVX512-NEXT: vmovaps %xmm3, 16(%rdi)
222+
; AVX512-NEXT: vmovaps %xmm2, (%rdi)
223+
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
224+
; AVX512-NEXT: vsqrtps %ymm0, %ymm0
225+
; AVX512-NEXT: vmovaps %ymm0, 32(%rdi)
226+
; AVX512-NEXT: vzeroupper
227+
; AVX512-NEXT: retq
228+
; Two <4 x float> fadds; the identity shuffle mask 0..7 concatenates them into
; one <8 x float> that is stored at %p0.
%add0 = fadd <4 x float> %x0, %y0
229+
%add1 = fadd <4 x float> %x1, %y1
230+
%add = shufflevector <4 x float> %add0, <4 x float> %add1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
231+
store <8 x float> %add, ptr %p0, align 32
232+
233+
; %p1 is 32 bytes past %p0, directly after the <8 x float> stored above.
%p1 = getelementptr inbounds nuw i8, ptr %p0, i64 32
234+
; Two <4 x float> sqrts of the same %x0/%x1 inputs used by the fadds, concatenated
; with the same identity mask and stored at %p1.
%sqrt0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x0)
235+
%sqrt1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x1)
236+
%sqrt = shufflevector <4 x float> %sqrt0, <4 x float> %sqrt1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
237+
store <8 x float> %sqrt, ptr %p1, align 32
238+
ret void
239+
}

0 commit comments

Comments
 (0)