|
1 | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
2 | 2 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE |
3 | 3 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE |
4 | | -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 |
5 | | -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 |
| 4 | +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 |
| 5 | +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 |
6 | 6 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 |
7 | 7 |
|
8 | 8 | define <4 x double> @concat_sqrt_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { |
@@ -172,3 +172,68 @@ define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) |
172 | 172 | %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
173 | 173 | ret <16 x float> %res |
174 | 174 | } |
| 175 | + |
| 176 | +; FIXME: We are concatenating the sqrt operands, so the fadd could be cheaply concatenated as well. |
| 177 | +define void @concat_sqrt_fadd_v8f32_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1, ptr %p0){ |
| 178 | +; SSE-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 179 | +; SSE: # %bb.0: |
| 180 | +; SSE-NEXT: addps %xmm0, %xmm2 |
| 181 | +; SSE-NEXT: addps %xmm1, %xmm3 |
| 182 | +; SSE-NEXT: movaps %xmm3, 16(%rdi) |
| 183 | +; SSE-NEXT: movaps %xmm2, (%rdi) |
| 184 | +; SSE-NEXT: sqrtps %xmm0, %xmm0 |
| 185 | +; SSE-NEXT: sqrtps %xmm1, %xmm1 |
| 186 | +; SSE-NEXT: movaps %xmm1, 48(%rdi) |
| 187 | +; SSE-NEXT: movaps %xmm0, 32(%rdi) |
| 188 | +; SSE-NEXT: retq |
| 189 | +; |
| 190 | +; AVX1-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 191 | +; AVX1: # %bb.0: |
| 192 | +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 193 | +; AVX1-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 194 | +; AVX1-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 195 | +; AVX1-NEXT: vmovaps %xmm3, 16(%rdi) |
| 196 | +; AVX1-NEXT: vmovaps %xmm2, (%rdi) |
| 197 | +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 198 | +; AVX1-NEXT: vsqrtps %ymm0, %ymm0 |
| 199 | +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) |
| 200 | +; AVX1-NEXT: vzeroupper |
| 201 | +; AVX1-NEXT: retq |
| 202 | +; |
| 203 | +; AVX2-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 204 | +; AVX2: # %bb.0: |
| 205 | +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 206 | +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 207 | +; AVX2-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 208 | +; AVX2-NEXT: vmovaps %xmm3, 16(%rdi) |
| 209 | +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 210 | +; AVX2-NEXT: vsqrtps %ymm0, %ymm0 |
| 211 | +; AVX2-NEXT: vmovaps %xmm2, (%rdi) |
| 212 | +; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) |
| 213 | +; AVX2-NEXT: vzeroupper |
| 214 | +; AVX2-NEXT: retq |
| 215 | +; |
| 216 | +; AVX512-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 217 | +; AVX512: # %bb.0: |
| 218 | +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 219 | +; AVX512-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 220 | +; AVX512-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 221 | +; AVX512-NEXT: vmovaps %xmm3, 16(%rdi) |
| 222 | +; AVX512-NEXT: vmovaps %xmm2, (%rdi) |
| 223 | +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 224 | +; AVX512-NEXT: vsqrtps %ymm0, %ymm0 |
| 225 | +; AVX512-NEXT: vmovaps %ymm0, 32(%rdi) |
| 226 | +; AVX512-NEXT: vzeroupper |
| 227 | +; AVX512-NEXT: retq |
| 228 | + %add0 = fadd <4 x float> %x0, %y0 ; low half of the stored add result; shares operand %x0 with %sqrt0 below |
| 229 | + %add1 = fadd <4 x float> %x1, %y1 ; high half; shares operand %x1 with %sqrt1 below |
| 230 | + %add = shufflevector <4 x float> %add0, <4 x float> %add1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; concat the two v4f32 adds into one v8f32 |
| 231 | + store <8 x float> %add, ptr %p0, align 32 ; bytes [0,32) of %p0 |
| 232 | + |
| 233 | + %p1 = getelementptr inbounds nuw i8, ptr %p0, i64 32 ; second 32-byte slot, directly after the fadd store |
| 234 | + %sqrt0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x0) ; sqrt of the same %x0 the fadd consumed |
| 235 | + %sqrt1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x1) ; sqrt of the same %x1 the fadd consumed |
| 236 | + %sqrt = shufflevector <4 x float> %sqrt0, <4 x float> %sqrt1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; concat the two v4f32 sqrts into one v8f32 |
| 237 | + store <8 x float> %sqrt, ptr %p1, align 32 ; bytes [32,64) of %p0 |
| 238 | + ret void |
| 239 | +} |
0 commit comments