66
77#include "textflag.h"
88
9- // func keccakF1600(a *[200]byte)
10- TEXT ·keccakF1600(SB), $200 -8
9+ // func keccakF1600Sha3(a *[200]byte, buf *byte)
10+ // When buf != nil, XORs one rate block (136 bytes = 17 lanes) read from buf into the state before permuting.
11+ // When buf == nil, skips the XOR and just permutes the state.
12+ TEXT ·keccakF1600Sha3(SB), $200 -16
1113 MOVD a+0 (FP), R0
14+ MOVD buf+8 (FP), R3
1215 MOVD $round_consts<>(SB), R1
1316 MOVD $24 , R2 // counter for loop
1417
15- VLD1.P 16 (R0), [V0.D1, V1.D1]
16- VLD1.P 16 (R0), [V2.D1, V3.D1]
17- VLD1.P 16 (R0), [V4.D1, V5.D1]
18- VLD1.P 16 (R0), [V6.D1, V7.D1]
19- VLD1.P 16 (R0), [V8.D1, V9.D1]
20- VLD1.P 16 (R0), [V10.D1, V11.D1]
21- VLD1.P 16 (R0), [V12.D1, V13.D1]
22- VLD1.P 16 (R0), [V14.D1, V15.D1]
23- VLD1.P 16 (R0), [V16.D1, V17.D1]
24- VLD1.P 16 (R0), [V18.D1, V19.D1]
25- VLD1.P 16 (R0), [V20.D1, V21.D1]
26- VLD1.P 16 (R0), [V22.D1, V23.D1]
27- VLD1 (R0), [V24.D1]
28-
29- SUB $192 , R0, R0
30-
31- loop:
32- // theta
33- VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
34- VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
35- VEOR3 V22.B16, V17.B16, V12.B16, V27.B16
36- VEOR3 V23.B16, V18.B16, V13.B16, V28.B16
37- VEOR3 V24.B16, V19.B16, V14.B16, V29.B16
38- VEOR3 V25.B16, V5.B16, V0.B16, V25.B16
39- VEOR3 V26.B16, V6.B16, V1.B16, V26.B16
40- VEOR3 V27.B16, V7.B16, V2.B16, V27.B16
41- VEOR3 V28.B16, V8.B16, V3.B16, V28.B16
42- VEOR3 V29.B16, V9.B16, V4.B16, V29.B16
43-
44- VRAX1 V27.D2, V25.D2, V30.D2
45- VRAX1 V28.D2, V26.D2, V31.D2
46- VRAX1 V29.D2, V27.D2, V27.D2
47- VRAX1 V25.D2, V28.D2, V28.D2
48- VRAX1 V26.D2, V29.D2, V29.D2
49-
50- // theta and rho and Pi
51- VEOR V29.B16, V0.B16, V0.B16
52-
53- VXAR $63 , V30.D2, V1.D2, V25.D2
54-
55- VXAR $20 , V30.D2, V6.D2, V1.D2
56- VXAR $44 , V28.D2, V9.D2, V6.D2
57- VXAR $3 , V31.D2, V22.D2, V9.D2
58- VXAR $25 , V28.D2, V14.D2, V22.D2
59- VXAR $46 , V29.D2, V20.D2, V14.D2
60-
61- VXAR $2 , V31.D2, V2.D2, V26.D2
62-
63- VXAR $21 , V31.D2, V12.D2, V2.D2
64- VXAR $39 , V27.D2, V13.D2, V12.D2
65- VXAR $56 , V28.D2, V19.D2, V13.D2
66- VXAR $8 , V27.D2, V23.D2, V19.D2
67- VXAR $23 , V29.D2, V15.D2, V23.D2
68-
69- VXAR $37 , V28.D2, V4.D2, V15.D2
70-
71- VXAR $50 , V28.D2, V24.D2, V28.D2
72- VXAR $62 , V30.D2, V21.D2, V24.D2
73- VXAR $9 , V27.D2, V8.D2, V8.D2
74- VXAR $19 , V30.D2, V16.D2, V4.D2
75- VXAR $28 , V29.D2, V5.D2, V16.D2
76-
77- VXAR $36 , V27.D2, V3.D2, V5.D2
18+ CBZ R3, load_state
7819
79- VXAR $43 , V27.D2, V18.D2, V27.D2
80- VXAR $49 , V31.D2, V17.D2, V3.D2
81- VXAR $54 , V30.D2, V11.D2, V30.D2
82- VXAR $58 , V31.D2, V7.D2, V31.D2
83- VXAR $61 , V29.D2, V10.D2, V29.D2
84-
85- // chi and iota
86- VBCAX V8.B16, V22.B16, V26.B16, V20.B16
87- VBCAX V22.B16, V23.B16, V8.B16, V21.B16
88- VBCAX V23.B16, V24.B16, V22.B16, V22.B16
89- VBCAX V24.B16, V26.B16, V23.B16, V23.B16
90- VBCAX V26.B16, V8.B16, V24.B16, V24.B16
91-
92- VLD1R.P 8 (R1), [V26.D2]
93-
94- VBCAX V3.B16, V19.B16, V30.B16, V17.B16
95- VBCAX V19.B16, V15.B16, V3.B16, V18.B16
96- VBCAX V15.B16, V16.B16, V19.B16, V19.B16
97- VBCAX V16.B16, V30.B16, V15.B16, V15.B16
98- VBCAX V30.B16, V3.B16, V16.B16, V16.B16
99-
100- VBCAX V31.B16, V12.B16, V25.B16, V10.B16
101- VBCAX V12.B16, V13.B16, V31.B16, V11.B16
102- VBCAX V13.B16, V14.B16, V12.B16, V12.B16
103- VBCAX V14.B16, V25.B16, V13.B16, V13.B16
104- VBCAX V25.B16, V31.B16, V14.B16, V14.B16
105-
106- VBCAX V4.B16, V9.B16, V29.B16, V7.B16
107- VBCAX V9.B16, V5.B16, V4.B16, V8.B16
108- VBCAX V5.B16, V6.B16, V9.B16, V9.B16
109- VBCAX V6.B16, V29.B16, V5.B16, V5.B16
110- VBCAX V29.B16, V4.B16, V6.B16, V6.B16
111-
112- VBCAX V28.B16, V0.B16, V27.B16, V3.B16
113- VBCAX V0.B16, V1.B16, V28.B16, V4.B16
114-
115- VBCAX V1.B16, V2.B16, V0.B16, V0.B16 // iota (chi part)
116-
117- VBCAX V2.B16, V27.B16, V1.B16, V1.B16
118- VBCAX V27.B16, V28.B16, V2.B16, V2.B16
119-
120- VEOR V26.B16, V0.B16, V0.B16 // iota
121-
122- SUB $1 , R2, R2
123- CBNZ R2, loop
124-
125- VST1.P [V0.D1, V1.D1], 16 (R0)
126- VST1.P [V2.D1, V3.D1], 16 (R0)
127- VST1.P [V4.D1, V5.D1], 16 (R0)
128- VST1.P [V6.D1, V7.D1], 16 (R0)
129- VST1.P [V8.D1, V9.D1], 16 (R0)
130- VST1.P [V10.D1, V11.D1], 16 (R0)
131- VST1.P [V12.D1, V13.D1], 16 (R0)
132- VST1.P [V14.D1, V15.D1], 16 (R0)
133- VST1.P [V16.D1, V17.D1], 16 (R0)
134- VST1.P [V18.D1, V19.D1], 16 (R0)
135- VST1.P [V20.D1, V21.D1], 16 (R0)
136- VST1.P [V22.D1, V23.D1], 16 (R0)
137- VST1 [V24.D1], (R0)
138-
139- RET
140-
141- // func xorAndPermute(state *[200]byte, buf *byte)
142- // Loads state, XORs a full rate (136 bytes = 17 lanes) of data, then runs keccakF1600.
143- // Eliminates one state store+load cycle per block vs separate xorIn + keccakF1600.
144- TEXT ·xorAndPermute(SB), $200 -16
145- MOVD state+0 (FP), R0
146- MOVD buf+8 (FP), R3
147- MOVD $round_consts<>(SB), R1
148- MOVD $24 , R2
149-
150- // Load state and XOR data for lanes 0-15 (8 pairs × 16 bytes = 128 bytes)
20+ // XOR path: load state and XOR with buf (17 lanes = 136 bytes)
15121 VLD1.P 16 (R0), [V0.D1, V1.D1]
15222 VLD1.P 16 (R3), [V25.D1, V26.D1]
15323 VEOR V25.B16, V0.B16, V0.B16
@@ -188,7 +58,7 @@ TEXT ·xorAndPermute(SB), $200-16
18858 VEOR V25.B16, V14.B16, V14.B16
18959 VEOR V26.B16, V15.B16, V15.B16
19060
191- // Lane 16-17: XOR only lane 16 ( last data lane, 8 bytes at data offset 128)
61+ // Lane 16: last data lane (8 bytes at buf offset 128)
19262 VLD1.P 16 (R0), [V16.D1, V17.D1]
19363 VLD1 (R3), [V25.D1]
19464 VEOR V25.B16, V16.B16, V16.B16
@@ -200,8 +70,26 @@ TEXT ·xorAndPermute(SB), $200-16
20070 VLD1 (R0), [V24.D1]
20171
20272 SUB $192 , R0, R0
73+ B rounds
74+
75+ load_state:
76+ VLD1.P 16 (R0), [V0.D1, V1.D1]
77+ VLD1.P 16 (R0), [V2.D1, V3.D1]
78+ VLD1.P 16 (R0), [V4.D1, V5.D1]
79+ VLD1.P 16 (R0), [V6.D1, V7.D1]
80+ VLD1.P 16 (R0), [V8.D1, V9.D1]
81+ VLD1.P 16 (R0), [V10.D1, V11.D1]
82+ VLD1.P 16 (R0), [V12.D1, V13.D1]
83+ VLD1.P 16 (R0), [V14.D1, V15.D1]
84+ VLD1.P 16 (R0), [V16.D1, V17.D1]
85+ VLD1.P 16 (R0), [V18.D1, V19.D1]
86+ VLD1.P 16 (R0), [V20.D1, V21.D1]
87+ VLD1.P 16 (R0), [V22.D1, V23.D1]
88+ VLD1 (R0), [V24.D1]
89+
90+ SUB $192 , R0, R0
20391
204- loop_xp :
92+ rounds :
20593 // theta
20694 VEOR3 V20.B16, V15.B16, V10.B16, V25.B16
20795 VEOR3 V21.B16, V16.B16, V11.B16, V26.B16
@@ -293,7 +181,7 @@ loop_xp:
293181 VEOR V26.B16, V0.B16, V0.B16 // iota
294182
295183 SUB $1 , R2, R2
296- CBNZ R2, loop_xp
184+ CBNZ R2, rounds
297185
298186 VST1.P [V0.D1, V1.D1], 16 (R0)
299187 VST1.P [V2.D1, V3.D1], 16 (R0)
0 commit comments