@@ -73,36 +73,31 @@ namespace lsp
73
73
__ASM_EMIT (" add $0x80, %[off]" )
74
74
__ASM_EMIT (" sub $32, %[count]" )
75
75
__ASM_EMIT (" 4:" )
76
- __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
77
- __ASM_EMIT (" vextractf64x4 $1, %%zmm1, %%ymm3" )
78
- __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
79
- __ASM_EMIT (" vaddps %%ymm3, %%ymm1, %%ymm1" )
76
+ __ASM_EMIT (" vaddps %%zmm1, %%zmm0, %%zmm0" )
80
77
/* x16 block */
81
78
__ASM_EMIT (" add $16, %[count]" )
82
79
__ASM_EMIT (" jl 6f" )
83
- __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%ymm2" )
84
- __ASM_EMIT (" vmovups 0x20(%[a], %[off]), %%ymm3" )
85
- __ASM_EMIT (" vmulps 0x00(%[b], %[off]), %%ymm2, %%ymm2" )
86
- __ASM_EMIT (" vmulps 0x20(%[b], %[off]), %%ymm3, %%ymm3" )
87
- __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
88
- __ASM_EMIT (" vaddps %%ymm3, %%ymm1, %%ymm1" )
80
+ __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%zmm2" )
81
+ __ASM_EMIT (" vmulps 0x00(%[b], %[off]), %%zmm2, %%zmm2" )
82
+ __ASM_EMIT (" vaddps %%zmm2, %%zmm0, %%zmm0" )
89
83
__ASM_EMIT (" add $0x40, %[off]" )
90
84
__ASM_EMIT (" sub $16, %[count]" )
91
85
__ASM_EMIT (" 6:" )
86
+ __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
87
+ __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
92
88
/* x8 block */
93
89
__ASM_EMIT (" add $8, %[count]" )
94
- __ASM_EMIT (" vaddps %%ymm1, %%ymm0, %%ymm0" )
95
90
__ASM_EMIT (" jl 8f" )
96
91
__ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%ymm2" )
97
92
__ASM_EMIT (" vmulps 0x00(%[b], %[off]), %%ymm2, %%ymm2" )
98
93
__ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
99
94
__ASM_EMIT (" add $0x20, %[off]" )
100
95
__ASM_EMIT (" sub $8, %[count]" )
101
96
__ASM_EMIT (" 8:" )
97
+ __ASM_EMIT (" vextractf128 $1, %%ymm0, %%xmm2" )
98
+ __ASM_EMIT (" vaddps %%xmm2, %%xmm0, %%xmm0" )
102
99
/* x4 block */
103
- __ASM_EMIT (" vextractf128 $0x01, %%ymm0, %%xmm1" )
104
100
__ASM_EMIT (" add $4, %[count]" )
105
- __ASM_EMIT (" vaddps %%xmm1, %%xmm0, %%xmm0" )
106
101
__ASM_EMIT (" jl 10f" )
107
102
__ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%xmm2" )
108
103
__ASM_EMIT (" vmulps 0x00(%[b], %[off]), %%xmm2, %%xmm2" )
@@ -164,47 +159,33 @@ namespace lsp
164
159
__ASM_EMIT (" sub $32, %[count]" )
165
160
__ASM_EMIT (" jae 1b" )
166
161
__ASM_EMIT (" 2:" )
167
- __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
168
- __ASM_EMIT (" vextractf64x4 $1, %%zmm1, %%ymm3" )
169
- __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
170
- __ASM_EMIT (" vaddps %%ymm3, %%ymm1, %%ymm1" )
162
+ __ASM_EMIT (" vaddps %%zmm1, %%zmm0, %%zmm0" )
171
163
/* x16 block */
172
164
__ASM_EMIT (" add $16, %[count]" )
173
165
__ASM_EMIT (" jl 4f" )
174
- __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%ymm2" )
175
- __ASM_EMIT (" vmovups 0x20(%[a], %[off]), %%ymm3" )
176
- __ASM_EMIT (" vmovups 0x00(%[b], %[off]), %%ymm4" )
177
- __ASM_EMIT (" vmovups 0x20(%[b], %[off]), %%ymm5" )
178
- __ASM_EMIT (" vmulps %%ymm2, %%ymm2, %%ymm2" )
179
- __ASM_EMIT (" vmulps %%ymm3, %%ymm3, %%ymm3" )
180
- __ASM_EMIT (" vmulps %%ymm4, %%ymm4, %%ymm4" )
181
- __ASM_EMIT (" vmulps %%ymm5, %%ymm5, %%ymm5" )
182
- __ASM_EMIT (" vfmadd231ps %%ymm4, %%ymm2, %%ymm0" )
183
- __ASM_EMIT (" vfmadd231ps %%ymm5, %%ymm3, %%ymm1" )
166
+ __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%zmm2" )
167
+ __ASM_EMIT (" vmovups 0x00(%[b], %[off]), %%zmm4" )
168
+ __ASM_EMIT (" vmulps %%zmm2, %%zmm2, %%zmm2" )
169
+ __ASM_EMIT (" vmulps %%zmm4, %%zmm4, %%zmm4" )
170
+ __ASM_EMIT (" vfmadd231ps %%zmm4, %%zmm2, %%zmm0" )
184
171
__ASM_EMIT (" add $0x40, %[off]" )
185
172
__ASM_EMIT (" sub $16, %[count]" )
186
173
__ASM_EMIT (" 4:" )
187
- __ASM_EMIT (" vextractf128 $0x01, %%ymm0, %%xmm2" )
188
- __ASM_EMIT (" vextractf128 $0x01, %%ymm1, %%xmm3" )
189
- __ASM_EMIT (" vaddps %%xmm2, %%xmm0, %%xmm0" )
190
- __ASM_EMIT (" vaddps %%xmm3, %%xmm1, %%xmm1" )
174
+ __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
175
+ __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
191
176
/* x8 block */
192
177
__ASM_EMIT (" add $8, %[count]" )
193
178
__ASM_EMIT (" jl 6f" )
194
- __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%xmm2" )
195
- __ASM_EMIT (" vmovups 0x10(%[a], %[off]), %%xmm3" )
196
- __ASM_EMIT (" vmovups 0x00(%[b], %[off]), %%xmm4" )
197
- __ASM_EMIT (" vmovups 0x10(%[b], %[off]), %%xmm5" )
198
- __ASM_EMIT (" vmulps %%xmm2, %%xmm2, %%xmm2" )
199
- __ASM_EMIT (" vmulps %%xmm3, %%xmm3, %%xmm3" )
200
- __ASM_EMIT (" vmulps %%xmm4, %%xmm4, %%xmm4" )
201
- __ASM_EMIT (" vmulps %%xmm5, %%xmm5, %%xmm5" )
202
- __ASM_EMIT (" vfmadd231ps %%xmm4, %%xmm2, %%xmm0" )
203
- __ASM_EMIT (" vfmadd231ps %%xmm5, %%xmm3, %%xmm1" )
179
+ __ASM_EMIT (" vmovups 0x00(%[a], %[off]), %%ymm2" )
180
+ __ASM_EMIT (" vmovups 0x00(%[b], %[off]), %%ymm4" )
181
+ __ASM_EMIT (" vmulps %%ymm2, %%ymm2, %%ymm2" )
182
+ __ASM_EMIT (" vmulps %%ymm4, %%ymm4, %%ymm4" )
183
+ __ASM_EMIT (" vfmadd231ps %%ymm4, %%ymm2, %%ymm0" )
204
184
__ASM_EMIT (" add $0x20, %[off]" )
205
185
__ASM_EMIT (" sub $8, %[count]" )
206
186
__ASM_EMIT (" 6:" )
207
- __ASM_EMIT (" vaddps %%xmm1, %%xmm0, %%xmm0" )
187
+ __ASM_EMIT (" vextractf128 $1, %%ymm0, %%xmm2" )
188
+ __ASM_EMIT (" vaddps %%xmm2, %%xmm0, %%xmm0" )
208
189
/* x4 block */
209
190
__ASM_EMIT (" add $4, %[count]" )
210
191
__ASM_EMIT (" jl 8f" )
@@ -277,39 +258,29 @@ namespace lsp
277
258
__ASM_EMIT (" sub $32, %[count]" )
278
259
__ASM_EMIT (" jae 1b" )
279
260
__ASM_EMIT (" 2:" )
280
- __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
281
- __ASM_EMIT (" vextractf64x4 $1, %%zmm1, %%ymm3" )
282
- __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
283
- __ASM_EMIT (" vaddps %%ymm3, %%ymm1, %%ymm1" )
261
+ __ASM_EMIT (" vaddps %%zmm1, %%zmm0, %%zmm0" )
284
262
/* x16 block */
285
263
__ASM_EMIT (" add $16, %[count]" )
286
264
__ASM_EMIT (" jl 4f" )
287
- __ASM_EMIT (" vandps 0x00(%[a], %[off]), %%ymm6, %%ymm2" )
288
- __ASM_EMIT (" vandps 0x20(%[a], %[off]), %%ymm7, %%ymm3" )
289
- __ASM_EMIT (" vandps 0x00(%[b], %[off]), %%ymm6, %%ymm4" )
290
- __ASM_EMIT (" vandps 0x20(%[b], %[off]), %%ymm7, %%ymm5" )
291
- __ASM_EMIT (" vfmadd231ps %%ymm4, %%ymm2, %%ymm0" )
292
- __ASM_EMIT (" vfmadd231ps %%ymm5, %%ymm3, %%ymm1" )
265
+ __ASM_EMIT (" vandps 0x00(%[a], %[off]), %%zmm6, %%zmm2" )
266
+ __ASM_EMIT (" vandps 0x00(%[b], %[off]), %%zmm6, %%zmm4" )
267
+ __ASM_EMIT (" vfmadd231ps %%zmm4, %%zmm2, %%zmm0" )
293
268
__ASM_EMIT (" add $0x40, %[off]" )
294
269
__ASM_EMIT (" sub $16, %[count]" )
295
270
__ASM_EMIT (" 4:" )
296
- __ASM_EMIT (" vextractf128 $0x01, %%ymm0, %%xmm2" )
297
- __ASM_EMIT (" vextractf128 $0x01, %%ymm1, %%xmm3" )
298
- __ASM_EMIT (" vaddps %%xmm2, %%xmm0, %%xmm0" )
299
- __ASM_EMIT (" vaddps %%xmm3, %%xmm1, %%xmm1" )
271
+ __ASM_EMIT (" vextractf64x4 $1, %%zmm0, %%ymm2" )
272
+ __ASM_EMIT (" vaddps %%ymm2, %%ymm0, %%ymm0" )
300
273
/* x8 block */
301
274
__ASM_EMIT (" add $8, %[count]" )
302
275
__ASM_EMIT (" jl 6f" )
303
- __ASM_EMIT (" vandps 0x00(%[a], %[off]), %%xmm6, %%xmm2" )
304
- __ASM_EMIT (" vandps 0x10(%[a], %[off]), %%xmm7, %%xmm3" )
305
- __ASM_EMIT (" vandps 0x00(%[b], %[off]), %%xmm6, %%xmm4" )
306
- __ASM_EMIT (" vandps 0x10(%[b], %[off]), %%xmm7, %%xmm5" )
307
- __ASM_EMIT (" vfmadd231ps %%xmm4, %%xmm2, %%xmm0" )
308
- __ASM_EMIT (" vfmadd231ps %%xmm5, %%xmm3, %%xmm1" )
276
+ __ASM_EMIT (" vandps 0x00(%[a], %[off]), %%ymm6, %%ymm2" )
277
+ __ASM_EMIT (" vandps 0x00(%[b], %[off]), %%ymm6, %%ymm4" )
278
+ __ASM_EMIT (" vfmadd231ps %%ymm4, %%ymm2, %%ymm0" )
309
279
__ASM_EMIT (" add $0x20, %[off]" )
310
280
__ASM_EMIT (" sub $8, %[count]" )
311
281
__ASM_EMIT (" 6:" )
312
- __ASM_EMIT (" vaddps %%xmm1, %%xmm0, %%xmm0" )
282
+ __ASM_EMIT (" vextractf128 $1, %%ymm0, %%xmm2" )
283
+ __ASM_EMIT (" vaddps %%xmm2, %%xmm0, %%xmm0" )
313
284
/* x4 block */
314
285
__ASM_EMIT (" add $4, %[count]" )
315
286
__ASM_EMIT (" jl 8f" )