Skip to content

Commit 8d0d405

Browse files
jkrishmyslaboger
authored and committed
x/crypto/chacha20: cleanup chacha_ppc64le.s
- Adding PCALIGN before the loops
- Replacing WORD directives with the corresponding Vector Merge Even/Odd Word (VMRGEW/VMRGOW) instructions
- Replacing Branch Conditional (BC) with its extended mnemonic form BDNZ
- Using the VPERMXOR instruction in place of VXOR instructions followed by VRLW (rotate left) for cases of rotating by multiples of 8

These replacements give a performance improvement in both time (sec/op) and throughput (B/s) of around 7%-8%, as listed below using the benchstat tool.

goos: linux
goarch: ppc64le
pkg: golang.org/x/crypto/chacha20
cpu: POWER10
 | chacha20.prev.out | chacha20.new.out |
 | sec/op | sec/op vs base |
ChaCha20/64 171.9n ± 0% 156.6n ± 1% -8.90% (p=0.002 n=6)
ChaCha20/256 165.5n ± 0% 152.4n ± 0% -7.92% (p=0.002 n=6)
ChaCha20/10x25 505.8n ± 0% 504.3n ± 2% -0.32% (p=0.589 n=6)
ChaCha20/4096 2.265µ ± 0% 2.052µ ± 0% -9.40% (p=0.002 n=6)
ChaCha20/100x40 5.359µ ± 3% 5.018µ ± 2% -6.37% (p=0.002 n=6)
ChaCha20/65536 35.71µ ± 0% 32.29µ ± 0% -9.57% (p=0.002 n=6)
ChaCha20/1000x65 44.63µ ± 0% 41.05µ ± 0% -8.02% (p=0.002 n=6)
geomean 2.235µ 2.073µ -7.26%

 | chacha20.prev.out | chacha20.new.out |
 | B/s | B/s vs base |
ChaCha20/64 355.1Mi ± 0% 389.8Mi ± 1% +9.78% (p=0.002 n=6)
ChaCha20/256 1.440Gi ± 0% 1.565Gi ± 0% +8.62% (p=0.002 n=6)
ChaCha20/10x25 471.3Mi ± 0% 472.8Mi ± 2% +0.31% (p=0.589 n=6)
ChaCha20/4096 1.684Gi ± 0% 1.859Gi ± 0% +10.38% (p=0.002 n=6)
ChaCha20/100x40 711.8Mi ± 3% 760.3Mi ± 2% +6.80% (p=0.002 n=6)
ChaCha20/65536 1.709Gi ± 0% 1.890Gi ± 0% +10.59% (p=0.002 n=6)
ChaCha20/1000x65 1.356Gi ± 0% 1.475Gi ± 0% +8.72% (p=0.002 n=6)
geomean 957.3Mi 1.008Gi +7.83%

Change-Id: Ib31cb10a2a11eacdacf0272fbfd887eb5ccd8bcb
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/564797
Reviewed-by: Lynn Boger <[email protected]>
Run-TryBot: Paul Murphy <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: David Chase <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Run-TryBot: Lynn Boger <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
1 parent b91329d commit 8d0d405

File tree

1 file changed

+52
-58
lines changed

1 file changed

+52
-58
lines changed

chacha20/chacha_ppc64le.s

+52-58
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333
#define CONSTBASE R16
3434
#define BLOCKS R17
3535

36+
// for VPERMXOR
37+
#define MASK R18
38+
3639
DATA consts<>+0x00(SB)/8, $0x3320646e61707865
3740
DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
3841
DATA consts<>+0x10(SB)/8, $0x0000000000000001
@@ -53,7 +56,11 @@ DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
5356
DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
5457
DATA consts<>+0x90(SB)/8, $0x0000000100000000
5558
DATA consts<>+0x98(SB)/8, $0x0000000300000002
56-
GLOBL consts<>(SB), RODATA, $0xa0
59+
DATA consts<>+0xa0(SB)/8, $0x5566774411223300
60+
DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
61+
DATA consts<>+0xb0(SB)/8, $0x6677445522330011
62+
DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
63+
GLOBL consts<>(SB), RODATA, $0xc0
5764

5865
//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
5966
TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
@@ -70,6 +77,9 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
7077
MOVD $48, R10
7178
MOVD $64, R11
7279
SRD $6, LEN, BLOCKS
80+
// for VPERMXOR
81+
MOVD $consts<>+0xa0(SB), MASK
82+
MOVD $16, R20
7383
// V16
7484
LXVW4X (CONSTBASE)(R0), VS48
7585
ADD $80,CONSTBASE
@@ -87,6 +97,10 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
8797
// V28
8898
LXVW4X (CONSTBASE)(R11), VS60
8999

100+
// Load mask constants for VPERMXOR
101+
LXVW4X (MASK)(R0), V20
102+
LXVW4X (MASK)(R20), V21
103+
90104
// splat slot from V19 -> V26
91105
VSPLTW $0, V19, V26
92106

@@ -97,7 +111,7 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
97111

98112
MOVD $10, R14
99113
MOVD R14, CTR
100-
114+
PCALIGN $16
101115
loop_outer_vsx:
102116
// V0, V1, V2, V3
103117
LXVW4X (R0)(CONSTBASE), VS32
@@ -128,22 +142,17 @@ loop_outer_vsx:
128142
VSPLTISW $12, V28
129143
VSPLTISW $8, V29
130144
VSPLTISW $7, V30
131-
145+
PCALIGN $16
132146
loop_vsx:
133147
VADDUWM V0, V4, V0
134148
VADDUWM V1, V5, V1
135149
VADDUWM V2, V6, V2
136150
VADDUWM V3, V7, V3
137151

138-
VXOR V12, V0, V12
139-
VXOR V13, V1, V13
140-
VXOR V14, V2, V14
141-
VXOR V15, V3, V15
142-
143-
VRLW V12, V27, V12
144-
VRLW V13, V27, V13
145-
VRLW V14, V27, V14
146-
VRLW V15, V27, V15
152+
VPERMXOR V12, V0, V21, V12
153+
VPERMXOR V13, V1, V21, V13
154+
VPERMXOR V14, V2, V21, V14
155+
VPERMXOR V15, V3, V21, V15
147156

148157
VADDUWM V8, V12, V8
149158
VADDUWM V9, V13, V9
@@ -165,15 +174,10 @@ loop_vsx:
165174
VADDUWM V2, V6, V2
166175
VADDUWM V3, V7, V3
167176

168-
VXOR V12, V0, V12
169-
VXOR V13, V1, V13
170-
VXOR V14, V2, V14
171-
VXOR V15, V3, V15
172-
173-
VRLW V12, V29, V12
174-
VRLW V13, V29, V13
175-
VRLW V14, V29, V14
176-
VRLW V15, V29, V15
177+
VPERMXOR V12, V0, V20, V12
178+
VPERMXOR V13, V1, V20, V13
179+
VPERMXOR V14, V2, V20, V14
180+
VPERMXOR V15, V3, V20, V15
177181

178182
VADDUWM V8, V12, V8
179183
VADDUWM V9, V13, V9
@@ -195,15 +199,10 @@ loop_vsx:
195199
VADDUWM V2, V7, V2
196200
VADDUWM V3, V4, V3
197201

198-
VXOR V15, V0, V15
199-
VXOR V12, V1, V12
200-
VXOR V13, V2, V13
201-
VXOR V14, V3, V14
202-
203-
VRLW V15, V27, V15
204-
VRLW V12, V27, V12
205-
VRLW V13, V27, V13
206-
VRLW V14, V27, V14
202+
VPERMXOR V15, V0, V21, V15
203+
VPERMXOR V12, V1, V21, V12
204+
VPERMXOR V13, V2, V21, V13
205+
VPERMXOR V14, V3, V21, V14
207206

208207
VADDUWM V10, V15, V10
209208
VADDUWM V11, V12, V11
@@ -225,15 +224,10 @@ loop_vsx:
225224
VADDUWM V2, V7, V2
226225
VADDUWM V3, V4, V3
227226

228-
VXOR V15, V0, V15
229-
VXOR V12, V1, V12
230-
VXOR V13, V2, V13
231-
VXOR V14, V3, V14
232-
233-
VRLW V15, V29, V15
234-
VRLW V12, V29, V12
235-
VRLW V13, V29, V13
236-
VRLW V14, V29, V14
227+
VPERMXOR V15, V0, V20, V15
228+
VPERMXOR V12, V1, V20, V12
229+
VPERMXOR V13, V2, V20, V13
230+
VPERMXOR V14, V3, V20, V14
237231

238232
VADDUWM V10, V15, V10
239233
VADDUWM V11, V12, V11
@@ -249,48 +243,48 @@ loop_vsx:
249243
VRLW V6, V30, V6
250244
VRLW V7, V30, V7
251245
VRLW V4, V30, V4
252-
BC 16, LT, loop_vsx
246+
BDNZ loop_vsx
253247

254248
VADDUWM V12, V26, V12
255249

256-
WORD $0x13600F8C // VMRGEW V0, V1, V27
257-
WORD $0x13821F8C // VMRGEW V2, V3, V28
250+
VMRGEW V0, V1, V27
251+
VMRGEW V2, V3, V28
258252

259-
WORD $0x10000E8C // VMRGOW V0, V1, V0
260-
WORD $0x10421E8C // VMRGOW V2, V3, V2
253+
VMRGOW V0, V1, V0
254+
VMRGOW V2, V3, V2
261255

262-
WORD $0x13A42F8C // VMRGEW V4, V5, V29
263-
WORD $0x13C63F8C // VMRGEW V6, V7, V30
256+
VMRGEW V4, V5, V29
257+
VMRGEW V6, V7, V30
264258

265259
XXPERMDI VS32, VS34, $0, VS33
266260
XXPERMDI VS32, VS34, $3, VS35
267261
XXPERMDI VS59, VS60, $0, VS32
268262
XXPERMDI VS59, VS60, $3, VS34
269263

270-
WORD $0x10842E8C // VMRGOW V4, V5, V4
271-
WORD $0x10C63E8C // VMRGOW V6, V7, V6
264+
VMRGOW V4, V5, V4
265+
VMRGOW V6, V7, V6
272266

273-
WORD $0x13684F8C // VMRGEW V8, V9, V27
274-
WORD $0x138A5F8C // VMRGEW V10, V11, V28
267+
VMRGEW V8, V9, V27
268+
VMRGEW V10, V11, V28
275269

276270
XXPERMDI VS36, VS38, $0, VS37
277271
XXPERMDI VS36, VS38, $3, VS39
278272
XXPERMDI VS61, VS62, $0, VS36
279273
XXPERMDI VS61, VS62, $3, VS38
280274

281-
WORD $0x11084E8C // VMRGOW V8, V9, V8
282-
WORD $0x114A5E8C // VMRGOW V10, V11, V10
275+
VMRGOW V8, V9, V8
276+
VMRGOW V10, V11, V10
283277

284-
WORD $0x13AC6F8C // VMRGEW V12, V13, V29
285-
WORD $0x13CE7F8C // VMRGEW V14, V15, V30
278+
VMRGEW V12, V13, V29
279+
VMRGEW V14, V15, V30
286280

287281
XXPERMDI VS40, VS42, $0, VS41
288282
XXPERMDI VS40, VS42, $3, VS43
289283
XXPERMDI VS59, VS60, $0, VS40
290284
XXPERMDI VS59, VS60, $3, VS42
291285

292-
WORD $0x118C6E8C // VMRGOW V12, V13, V12
293-
WORD $0x11CE7E8C // VMRGOW V14, V15, V14
286+
VMRGOW V12, V13, V12
287+
VMRGOW V14, V15, V14
294288

295289
VSPLTISW $4, V27
296290
VADDUWM V26, V27, V26
@@ -431,15 +425,15 @@ tail_vsx:
431425
ADD $-1, R11, R12
432426
ADD $-1, INP
433427
ADD $-1, OUT
434-
428+
PCALIGN $16
435429
looptail_vsx:
436430
// Copying the result to OUT
437431
// in bytes.
438432
MOVBZU 1(R12), KEY
439433
MOVBZU 1(INP), TMP
440434
XOR KEY, TMP, KEY
441435
MOVBU KEY, 1(OUT)
442-
BC 16, LT, looptail_vsx
436+
BDNZ looptail_vsx
443437

444438
// Clear the stack values
445439
STXVW4X VS48, (R11)(R0)

0 commit comments

Comments
 (0)