|
1 | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
2 | 2 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE |
3 | 3 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE |
4 | | -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2 |
5 | | -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2 |
| 4 | +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 |
| 5 | +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2 |
6 | 6 | ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 |
7 | 7 |
|
8 | 8 | define <4 x double> @concat_sqrt_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) { |
@@ -172,3 +172,68 @@ define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) |
172 | 172 | %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
173 | 173 | ret <16 x float> %res |
174 | 174 | } |
| 175 | + |
| 176 | +; FIXME: We are concatenating the sqrt operands, so the fadd could be cheaply concatenated as well. |
| 177 | +define void @concat_sqrt_fadd_v8f32_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1, ptr %p0){ |
| 178 | +; SSE-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 179 | +; SSE: # %bb.0: |
| 180 | +; SSE-NEXT: addps %xmm0, %xmm2 |
| 181 | +; SSE-NEXT: addps %xmm1, %xmm3 |
| 182 | +; SSE-NEXT: movaps %xmm3, 16(%rdi) |
| 183 | +; SSE-NEXT: movaps %xmm2, (%rdi) |
| 184 | +; SSE-NEXT: sqrtps %xmm0, %xmm0 |
| 185 | +; SSE-NEXT: sqrtps %xmm1, %xmm1 |
| 186 | +; SSE-NEXT: movaps %xmm1, 48(%rdi) |
| 187 | +; SSE-NEXT: movaps %xmm0, 32(%rdi) |
| 188 | +; SSE-NEXT: retq |
| 189 | +; |
| 190 | +; AVX1-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 191 | +; AVX1: # %bb.0: |
| 192 | +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 193 | +; AVX1-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 194 | +; AVX1-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 195 | +; AVX1-NEXT: vmovaps %xmm3, 16(%rdi) |
| 196 | +; AVX1-NEXT: vmovaps %xmm2, (%rdi) |
| 197 | +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 198 | +; AVX1-NEXT: vsqrtps %ymm0, %ymm0 |
| 199 | +; AVX1-NEXT: vmovaps %ymm0, 32(%rdi) |
| 200 | +; AVX1-NEXT: vzeroupper |
| 201 | +; AVX1-NEXT: retq |
| 202 | +; |
| 203 | +; AVX2-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 204 | +; AVX2: # %bb.0: |
| 205 | +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 206 | +; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 207 | +; AVX2-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 208 | +; AVX2-NEXT: vmovaps %xmm3, 16(%rdi) |
| 209 | +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 210 | +; AVX2-NEXT: vsqrtps %ymm0, %ymm0 |
| 211 | +; AVX2-NEXT: vmovaps %xmm2, (%rdi) |
| 212 | +; AVX2-NEXT: vmovaps %ymm0, 32(%rdi) |
| 213 | +; AVX2-NEXT: vzeroupper |
| 214 | +; AVX2-NEXT: retq |
| 215 | +; |
| 216 | +; AVX512-LABEL: concat_sqrt_fadd_v8f32_v4f32: |
| 217 | +; AVX512: # %bb.0: |
| 218 | +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 |
| 219 | +; AVX512-NEXT: vaddps %xmm2, %xmm0, %xmm2 |
| 220 | +; AVX512-NEXT: vaddps %xmm3, %xmm1, %xmm3 |
| 221 | +; AVX512-NEXT: vmovaps %xmm3, 16(%rdi) |
| 222 | +; AVX512-NEXT: vmovaps %xmm2, (%rdi) |
| 223 | +; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 |
| 224 | +; AVX512-NEXT: vsqrtps %ymm0, %ymm0 |
| 225 | +; AVX512-NEXT: vmovaps %ymm0, 32(%rdi) |
| 226 | +; AVX512-NEXT: vzeroupper |
| 227 | +; AVX512-NEXT: retq |
| 228 | + %add0 = fadd <4 x float> %x0, %y0 ; low half of the stored add result; shares operand %x0 with %sqrt0 below |
| 229 | + %add1 = fadd <4 x float> %x1, %y1 ; high half; shares operand %x1 with %sqrt1 below |
| 230 | + %add = shufflevector <4 x float> %add0, <4 x float> %add1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; concat the two v4f32 adds into one v8f32 |
| 231 | + store <8 x float> %add, ptr %p0, align 32 ; bytes [0,32) of %p0 |
| 232 | + |
| 233 | + %p1 = getelementptr inbounds nuw i8, ptr %p0, i64 32 ; second 32-byte slot, directly after the fadd store |
| 234 | + %sqrt0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x0) ; sqrt of the same %x0 the fadd consumed |
| 235 | + %sqrt1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x1) ; sqrt of the same %x1 the fadd consumed |
| 236 | + %sqrt = shufflevector <4 x float> %sqrt0, <4 x float> %sqrt1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; concat the two v4f32 sqrts into one v8f32 |
| 237 | + store <8 x float> %sqrt, ptr %p1, align 32 ; bytes [32,64) of %p0 |
| 238 | + ret void |
| 239 | +} |
0 commit comments