#define CONSTBASE R16
#define BLOCKS R17

+ // for VPERMXOR
+ #define MASK R18
+
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
DATA consts<>+0x10(SB)/8, $0x0000000000000001
@@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
DATA consts<>+0x90(SB)/8, $0x0000000100000000
DATA consts<>+0x98(SB)/8, $0x0000000300000002
- GLOBL consts<>(SB), RODATA, $0xa0
+ DATA consts<>+0xa0(SB)/8, $0x5566774411223300
+ DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
+ DATA consts<>+0xb0(SB)/8, $0x6677445522330011
+ DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+ GLOBL consts<>(SB), RODATA, $0xc0

//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
@@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
MOVD $48, R10
MOVD $64, R11
SRD $6, LEN, BLOCKS
+ // for VPERMXOR
+ MOVD $consts<>+0xa0(SB), MASK
+ MOVD $16, R20
// V16
LXVW4X (CONSTBASE)(R0), VS48
ADD $80,CONSTBASE
@@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
// V28
LXVW4X (CONSTBASE)(R11), VS60

+ // Load mask constants for VPERMXOR
+ LXVW4X (MASK)(R0), V20
+ LXVW4X (MASK)(R20), V21
+
// splat slot from V19 -> V26
VSPLTW $0, V19, V26

@@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40

MOVD $10, R14
MOVD R14, CTR
-
+ PCALIGN $16
loop_outer_vsx:
// V0, V1, V2, V3
LXVW4X (R0)(CONSTBASE), VS32
@@ -128,22 +142,17 @@ loop_outer_vsx:
VSPLTISW $12, V28
VSPLTISW $8, V29
VSPLTISW $7, V30
-
+ PCALIGN $16
loop_vsx:
VADDUWM V0, V4, V0
VADDUWM V1, V5, V1
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3

- VXOR V12, V0, V12
- VXOR V13, V1, V13
- VXOR V14, V2, V14
- VXOR V15, V3, V15
-
- VRLW V12, V27, V12
- VRLW V13, V27, V13
- VRLW V14, V27, V14
- VRLW V15, V27, V15
+ VPERMXOR V12, V0, V21, V12
+ VPERMXOR V13, V1, V21, V13
+ VPERMXOR V14, V2, V21, V14
+ VPERMXOR V15, V3, V21, V15

VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
@@ -165,15 +174,10 @@ loop_vsx:
VADDUWM V2, V6, V2
VADDUWM V3, V7, V3

- VXOR V12, V0, V12
- VXOR V13, V1, V13
- VXOR V14, V2, V14
- VXOR V15, V3, V15
-
- VRLW V12, V29, V12
- VRLW V13, V29, V13
- VRLW V14, V29, V14
- VRLW V15, V29, V15
+ VPERMXOR V12, V0, V20, V12
+ VPERMXOR V13, V1, V20, V13
+ VPERMXOR V14, V2, V20, V14
+ VPERMXOR V15, V3, V20, V15

VADDUWM V8, V12, V8
VADDUWM V9, V13, V9
@@ -195,15 +199,10 @@ loop_vsx:
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3

- VXOR V15, V0, V15
- VXOR V12, V1, V12
- VXOR V13, V2, V13
- VXOR V14, V3, V14
-
- VRLW V15, V27, V15
- VRLW V12, V27, V12
- VRLW V13, V27, V13
- VRLW V14, V27, V14
+ VPERMXOR V15, V0, V21, V15
+ VPERMXOR V12, V1, V21, V12
+ VPERMXOR V13, V2, V21, V13
+ VPERMXOR V14, V3, V21, V14

VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
@@ -225,15 +224,10 @@ loop_vsx:
VADDUWM V2, V7, V2
VADDUWM V3, V4, V3

- VXOR V15, V0, V15
- VXOR V12, V1, V12
- VXOR V13, V2, V13
- VXOR V14, V3, V14
-
- VRLW V15, V29, V15
- VRLW V12, V29, V12
- VRLW V13, V29, V13
- VRLW V14, V29, V14
+ VPERMXOR V15, V0, V20, V15
+ VPERMXOR V12, V1, V20, V12
+ VPERMXOR V13, V2, V20, V13
+ VPERMXOR V14, V3, V20, V14

VADDUWM V10, V15, V10
VADDUWM V11, V12, V11
@@ -249,48 +243,48 @@ loop_vsx:
VRLW V6, V30, V6
VRLW V7, V30, V7
VRLW V4, V30, V4
- BC 16, LT, loop_vsx
+ BDNZ loop_vsx

VADDUWM V12, V26, V12

- WORD $0x13600F8C // VMRGEW V0, V1, V27
- WORD $0x13821F8C // VMRGEW V2, V3, V28
+ VMRGEW V0, V1, V27
+ VMRGEW V2, V3, V28

- WORD $0x10000E8C // VMRGOW V0, V1, V0
- WORD $0x10421E8C // VMRGOW V2, V3, V2
+ VMRGOW V0, V1, V0
+ VMRGOW V2, V3, V2

- WORD $0x13A42F8C // VMRGEW V4, V5, V29
- WORD $0x13C63F8C // VMRGEW V6, V7, V30
+ VMRGEW V4, V5, V29
+ VMRGEW V6, V7, V30

XXPERMDI VS32, VS34, $0, VS33
XXPERMDI VS32, VS34, $3, VS35
XXPERMDI VS59, VS60, $0, VS32
XXPERMDI VS59, VS60, $3, VS34

- WORD $0x10842E8C // VMRGOW V4, V5, V4
- WORD $0x10C63E8C // VMRGOW V6, V7, V6
+ VMRGOW V4, V5, V4
+ VMRGOW V6, V7, V6

- WORD $0x13684F8C // VMRGEW V8, V9, V27
- WORD $0x138A5F8C // VMRGEW V10, V11, V28
+ VMRGEW V8, V9, V27
+ VMRGEW V10, V11, V28

XXPERMDI VS36, VS38, $0, VS37
XXPERMDI VS36, VS38, $3, VS39
XXPERMDI VS61, VS62, $0, VS36
XXPERMDI VS61, VS62, $3, VS38

- WORD $0x11084E8C // VMRGOW V8, V9, V8
- WORD $0x114A5E8C // VMRGOW V10, V11, V10
+ VMRGOW V8, V9, V8
+ VMRGOW V10, V11, V10

- WORD $0x13AC6F8C // VMRGEW V12, V13, V29
- WORD $0x13CE7F8C // VMRGEW V14, V15, V30
+ VMRGEW V12, V13, V29
+ VMRGEW V14, V15, V30

XXPERMDI VS40, VS42, $0, VS41
XXPERMDI VS40, VS42, $3, VS43
XXPERMDI VS59, VS60, $0, VS40
XXPERMDI VS59, VS60, $3, VS42

- WORD $0x118C6E8C // VMRGOW V12, V13, V12
- WORD $0x11CE7E8C // VMRGOW V14, V15, V14
+ VMRGOW V12, V13, V12
+ VMRGOW V14, V15, V14

VSPLTISW $4, V27
VADDUWM V26, V27, V26
@@ -431,15 +425,15 @@ tail_vsx:
ADD $-1, R11, R12
ADD $-1, INP
ADD $-1, OUT
-
+ PCALIGN $16
looptail_vsx:
// Copying the result to OUT
// in bytes.
MOVBZU 1(R12), KEY
MOVBZU 1(INP), TMP
XOR KEY, TMP, KEY
MOVBU KEY, 1(OUT)
- BC 16, LT, looptail_vsx
+ BDNZ looptail_vsx

// Clear the stack values
STXVW4X VS48, (R11)(R0)
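
For reference, the VXOR/VRLW pairs removed above implement the xor-then-rotate steps of the ChaCha20 quarter round; the VPERMXOR instructions that replace them use the new byte-permutation mask constants (V21 where the 16-bit rotation was, V20 where the 8-bit rotation was) to do the xor and the byte-granular rotate in one instruction. Below is a scalar Go sketch of the quarter round these vector lanes compute; it is not part of the patch, and the package and function names are illustrative only.

package chacharef

// rotl is the 32-bit left rotation used throughout ChaCha20.
func rotl(x uint32, n uint) uint32 { return x<<n | x>>(32-n) }

// quarterRound is the scalar ChaCha20 quarter round. The vector loop above
// runs four of these in parallel per column/diagonal pass; the rotl-by-16 and
// rotl-by-8 steps are the ones the VPERMXOR masks fuse with the preceding xor.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d ^= a
	d = rotl(d, 16) // fused into a single VPERMXOR in the vector code
	c += d
	b ^= c
	b = rotl(b, 12)
	a += b
	d ^= a
	d = rotl(d, 8) // fused into a single VPERMXOR in the vector code
	c += d
	b ^= c
	b = rotl(b, 7)
	return a, b, c, d
}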