2222
2323// FUNC3 f = (b & c) | (b & d) | (c & d)
// The result f is left in R15. It is computed as ((b | c) & d) | (b & c),
// which is the same bitwise-majority function as the formula above.
// Diff note: the temporary that holds (b | c) & d moves from R8 ('-' lines)
// to R27 ('+' lines); the R15 half of the computation is unchanged.
2424#define FUNC3(b, c, d) \
25- MOVW b, R8 ; \
26- ORR c, R8, R8 ; \
27- ANDW d, R8, R8 ; \
25+ MOVW b, R27 ; \
26+ ORR c, R27, R27 ; \
27+ ANDW d, R27, R27 ; \
2828 MOVW b, R15; \
2929 ANDW c, R15, R15; \
30- ORR R8 , R15, R15
30+ ORR R27 , R15, R15
3131
// FUNC4 is an alias: it expands to FUNC2 with the same three arguments.
// (FUNC2 is defined outside this excerpt.)
3232#define FUNC4(b, c, d) FUNC2(b, c, d)
3333
// MIX accumulates one step into e and rotates b:
//   b = b ROR 2
//   e = e + R15 + k + R9 + (a ROR 27)
// k is loaded through R19. R15 and R9 must already be set by the macros that
// run before MIX in each ROUNDn (FUNC3 writes R15; LOAD and SHUFFLE load R9;
// FUNC1/FUNC2 are not visible here and presumably also write R15).
// c and d are accepted but not referenced in the body.
// Diff note: the vreg parameter, which the old body never referenced, is
// removed, and the temporary for the rotated a moves from R8 to R27.
34- #define MIX(a, b, c, d, e, k, vreg ) \
34+ #define MIX(a, b, c, d, e, k) \
3535 RORW $2 , b, b; \
3636 ADDW R15, e, e; \
37- MOVW a, R8 ; \
38- RORW $27 , R8, R8 ; \
37+ MOVW a, R27 ; \
38+ RORW $27 , R27, R27 ; \
3939 MOVW k, R19; \
4040 ADDW R19, e, e; \
4141 ADDW R9, e, e; \
42- ADDW R8 , e, e
42+ ADDW R27 , e, e
4343
// LOAD reads the 32-bit word at byte offset index*4 from R16 into R9,
// byte-reverses it with REVW, and stores the result at byte offset index*4
// from RSP. R9 is left holding the byte-reversed word.
// NOTE(review): R16 presumably points at the current input block; confirm
// against the function prologue, which is outside this excerpt.
4444#define LOAD(index) \
4545 MOVWU (index*4 )(R16), R9; \
4646 REVW R9, R9; \
4747 MOVW R9, (index*4 )(RSP)
4848
// LOADCS loads the cs_base argument pointer from the frame and stores the five
// 32-bit values a, b, c, d, e at byte offsets index*20 through index*20+16
// from it, i.e. one 20-byte record per index.
// Diff note: the pointer is now held in R27 instead of R8 ('-' lines are the
// old form, '+' lines the new one).
4949#define LOADCS(a, b, c, d, e, index) \
50- MOVD cs_base+56 (FP), R8 ; \
51- MOVW a, ((index*20 ))(R8 ); \
52- MOVW b, ((index*20 )+4 )(R8 ); \
53- MOVW c, ((index*20 )+8 )(R8 ); \
54- MOVW d, ((index*20 )+12 )(R8 ); \
55- MOVW e, ((index*20 )+16 )(R8 )
50+ MOVD cs_base+56 (FP), R27 ; \
51+ MOVW a, ((index*20 ))(R27 ); \
52+ MOVW b, ((index*20 )+4 )(R27 ); \
53+ MOVW c, ((index*20 )+8 )(R27 ); \
54+ MOVW d, ((index*20 )+12 )(R27 ); \
55+ MOVW e, ((index*20 )+16 )(R27 )
5656
5757#define SHUFFLE(index) \
5858 MOVW ((index&0xf )*4 )(RSP), R9; \
6767
6868// LOADM1 stores message word to m1 array.
// It loads the m1_base argument pointer, reads the word at byte offset
// (index & 0xf)*4 from RSP into R9, and writes it at byte offset index*4 from
// the m1 pointer. R9 is overwritten.
// Diff note: the m1 pointer is now held in R27 instead of R8.
6969#define LOADM1(index) \
70- MOVD m1_base+32 (FP), R8 ; \
70+ MOVD m1_base+32 (FP), R27 ; \
7171 MOVW ((index&0xf )*4 )(RSP), R9; \
72- MOVW R9, (index*4 )(R8 )
72+ MOVW R9, (index*4 )(R27 )
7373
// ROUND1 performs one step that takes its word from the input: LOAD(index),
// then FUNC1(b, c, d), then MIX with RoundConst0, then LOADM1(index).
// Diff note: the trailing vreg parameter is removed from the macro and from
// the MIX invocation.
74- #define ROUND1(a, b, c, d, e, index, vreg ) \
74+ #define ROUND1(a, b, c, d, e, index) \
7575 LOAD(index); \
7676 FUNC1(b, c, d); \
77- MIX(a, b, c, d, e, RoundConst0, vreg ); \
77+ MIX(a, b, c, d, e, RoundConst0); \
7878 LOADM1(index)
7979
// ROUND1x is ROUND1 with SHUFFLE(index) in place of LOAD(index): SHUFFLE,
// then FUNC1(b, c, d), then MIX with RoundConst0, then LOADM1(index).
// Diff note: the trailing vreg parameter is removed from the macro and from
// the MIX invocation.
80- #define ROUND1x(a, b, c, d, e, index, vreg ) \
80+ #define ROUND1x(a, b, c, d, e, index) \
8181 SHUFFLE(index); \
8282 FUNC1(b, c, d); \
83- MIX(a, b, c, d, e, RoundConst0, vreg ); \
83+ MIX(a, b, c, d, e, RoundConst0); \
8484 LOADM1(index)
8585
// ROUND2 performs one step as SHUFFLE(index), FUNC2(b, c, d), MIX with
// RoundConst1, then LOADM1(index).
// Diff note: the trailing vreg parameter is removed from the macro and from
// the MIX invocation.
86- #define ROUND2(a, b, c, d, e, index, vreg ) \
86+ #define ROUND2(a, b, c, d, e, index) \
8787 SHUFFLE(index); \
8888 FUNC2(b, c, d); \
89- MIX(a, b, c, d, e, RoundConst1, vreg ); \
89+ MIX(a, b, c, d, e, RoundConst1); \
9090 LOADM1(index)
9191
// ROUND3 performs one step as SHUFFLE(index), FUNC3(b, c, d), MIX with
// RoundConst2, then LOADM1(index).
// Diff note: the trailing vreg parameter is removed from the macro and from
// the MIX invocation.
92- #define ROUND3(a, b, c, d, e, index, vreg ) \
92+ #define ROUND3(a, b, c, d, e, index) \
9393 SHUFFLE(index); \
9494 FUNC3(b, c, d); \
95- MIX(a, b, c, d, e, RoundConst2, vreg ); \
95+ MIX(a, b, c, d, e, RoundConst2); \
9696 LOADM1(index)
9797
// ROUND4 performs one step as SHUFFLE(index), FUNC4(b, c, d), MIX with
// RoundConst3, then LOADM1(index).
// Diff note: the trailing vreg parameter is removed from the macro and from
// the MIX invocation.
98- #define ROUND4(a, b, c, d, e, index, vreg ) \
98+ #define ROUND4(a, b, c, d, e, index) \
9999 SHUFFLE(index); \
100100 FUNC4(b, c, d); \
101- MIX(a, b, c, d, e, RoundConst3, vreg ); \
101+ MIX(a, b, c, d, e, RoundConst3); \
102102 LOADM1(index)
103103
104104// func blockARM64(dig *digest, p []byte, m1 []uint32, cs [][5]uint32)
@@ -132,98 +132,98 @@ loop:
132132
133133 // ROUND1 (steps 0-15)
134134 LOADCS(R10, R11, R12, R13, R14, 0 )
135- ROUND1(R10, R11, R12, R13, R14, 0 , V31)
136- ROUND1(R14, R10, R11, R12, R13, 1 , V30 )
137- ROUND1(R13, R14, R10, R11, R12, 2 , V29 )
138- ROUND1(R12, R13, R14, R10, R11, 3 , V28 )
139- ROUND1(R11, R12, R13, R14, R10, 4 , V27 )
140- ROUND1(R10, R11, R12, R13, R14, 5 , V26 )
141- ROUND1(R14, R10, R11, R12, R13, 6 , V25 )
142- ROUND1(R13, R14, R10, R11, R12, 7 , V24 )
143- ROUND1(R12, R13, R14, R10, R11, 8 , V23 )
144- ROUND1(R11, R12, R13, R14, R10, 9 , V22 )
145- ROUND1(R10, R11, R12, R13, R14, 10 , V21 )
146- ROUND1(R14, R10, R11, R12, R13, 11 , V20 )
147- ROUND1(R13, R14, R10, R11, R12, 12 , V19 )
148- ROUND1(R12, R13, R14, R10, R11, 13 , V18 )
149- ROUND1(R11, R12, R13, R14, R10, 14 , V17 )
150- ROUND1(R10, R11, R12, R13, R14, 15 , V16 )
135+ ROUND1(R10, R11, R12, R13, R14, 0 )
136+ ROUND1(R14, R10, R11, R12, R13, 1 )
137+ ROUND1(R13, R14, R10, R11, R12, 2 )
138+ ROUND1(R12, R13, R14, R10, R11, 3 )
139+ ROUND1(R11, R12, R13, R14, R10, 4 )
140+ ROUND1(R10, R11, R12, R13, R14, 5 )
141+ ROUND1(R14, R10, R11, R12, R13, 6 )
142+ ROUND1(R13, R14, R10, R11, R12, 7 )
143+ ROUND1(R12, R13, R14, R10, R11, 8 )
144+ ROUND1(R11, R12, R13, R14, R10, 9 )
145+ ROUND1(R10, R11, R12, R13, R14, 10 )
146+ ROUND1(R14, R10, R11, R12, R13, 11 )
147+ ROUND1(R13, R14, R10, R11, R12, 12 )
148+ ROUND1(R12, R13, R14, R10, R11, 13 )
149+ ROUND1(R11, R12, R13, R14, R10, 14 )
150+ ROUND1(R10, R11, R12, R13, R14, 15 )
151151
152152 // ROUND1x (steps 16-19) - same as ROUND1 but with no data load.
153- ROUND1x(R14, R10, R11, R12, R13, 16 , V15 )
154- ROUND1x(R13, R14, R10, R11, R12, 17 , V14 )
155- ROUND1x(R12, R13, R14, R10, R11, 18 , V13 )
156- ROUND1x(R11, R12, R13, R14, R10, 19 , V12 )
153+ ROUND1x(R14, R10, R11, R12, R13, 16 )
154+ ROUND1x(R13, R14, R10, R11, R12, 17 )
155+ ROUND1x(R12, R13, R14, R10, R11, 18 )
156+ ROUND1x(R11, R12, R13, R14, R10, 19 )
157157
158158 // ROUND2 (steps 20-39)
159- ROUND2(R10, R11, R12, R13, R14, 20 , V11 )
160- ROUND2(R14, R10, R11, R12, R13, 21 , V10 )
161- ROUND2(R13, R14, R10, R11, R12, 22 , V9 )
162- ROUND2(R12, R13, R14, R10, R11, 23 , V8 )
163- ROUND2(R11, R12, R13, R14, R10, 24 , V7 )
164- ROUND2(R10, R11, R12, R13, R14, 25 , V6 )
165- ROUND2(R14, R10, R11, R12, R13, 26 , V5 )
166- ROUND2(R13, R14, R10, R11, R12, 27 , V4 )
167- ROUND2(R12, R13, R14, R10, R11, 28 , V3 )
168- ROUND2(R11, R12, R13, R14, R10, 29 , V2 )
169- ROUND2(R10, R11, R12, R13, R14, 30 , V1 )
170- ROUND2(R14, R10, R11, R12, R13, 31 , V0 )
171- ROUND2(R13, R14, R10, R11, R12, 32 , V31 )
172- ROUND2(R12, R13, R14, R10, R11, 33 , V30 )
173- ROUND2(R11, R12, R13, R14, R10, 34 , V29 )
174- ROUND2(R10, R11, R12, R13, R14, 35 , V28 )
175- ROUND2(R14, R10, R11, R12, R13, 36 , V27 )
176- ROUND2(R13, R14, R10, R11, R12, 37 , V26 )
177- ROUND2(R12, R13, R14, R10, R11, 38 , V25 )
178- ROUND2(R11, R12, R13, R14, R10, 39 , V24 )
159+ ROUND2(R10, R11, R12, R13, R14, 20 )
160+ ROUND2(R14, R10, R11, R12, R13, 21 )
161+ ROUND2(R13, R14, R10, R11, R12, 22 )
162+ ROUND2(R12, R13, R14, R10, R11, 23 )
163+ ROUND2(R11, R12, R13, R14, R10, 24 )
164+ ROUND2(R10, R11, R12, R13, R14, 25 )
165+ ROUND2(R14, R10, R11, R12, R13, 26 )
166+ ROUND2(R13, R14, R10, R11, R12, 27 )
167+ ROUND2(R12, R13, R14, R10, R11, 28 )
168+ ROUND2(R11, R12, R13, R14, R10, 29 )
169+ ROUND2(R10, R11, R12, R13, R14, 30 )
170+ ROUND2(R14, R10, R11, R12, R13, 31 )
171+ ROUND2(R13, R14, R10, R11, R12, 32 )
172+ ROUND2(R12, R13, R14, R10, R11, 33 )
173+ ROUND2(R11, R12, R13, R14, R10, 34 )
174+ ROUND2(R10, R11, R12, R13, R14, 35 )
175+ ROUND2(R14, R10, R11, R12, R13, 36 )
176+ ROUND2(R13, R14, R10, R11, R12, 37 )
177+ ROUND2(R12, R13, R14, R10, R11, 38 )
178+ ROUND2(R11, R12, R13, R14, R10, 39 )
179179
180180 // ROUND3 (steps 40-59)
181- ROUND3(R10, R11, R12, R13, R14, 40 , V23 )
182- ROUND3(R14, R10, R11, R12, R13, 41 , V22 )
183- ROUND3(R13, R14, R10, R11, R12, 42 , V21 )
184- ROUND3(R12, R13, R14, R10, R11, 43 , V20 )
185- ROUND3(R11, R12, R13, R14, R10, 44 , V19 )
186- ROUND3(R10, R11, R12, R13, R14, 45 , V18 )
187- ROUND3(R14, R10, R11, R12, R13, 46 , V17 )
188- ROUND3(R13, R14, R10, R11, R12, 47 , V16 )
189- ROUND3(R12, R13, R14, R10, R11, 48 , V15 )
190- ROUND3(R11, R12, R13, R14, R10, 49 , V14 )
191- ROUND3(R10, R11, R12, R13, R14, 50 , V13 )
192- ROUND3(R14, R10, R11, R12, R13, 51 , V12 )
193- ROUND3(R13, R14, R10, R11, R12, 52 , V11 )
194- ROUND3(R12, R13, R14, R10, R11, 53 , V10 )
195- ROUND3(R11, R12, R13, R14, R10, 54 , V9 )
196- ROUND3(R10, R11, R12, R13, R14, 55 , V8 )
197- ROUND3(R14, R10, R11, R12, R13, 56 , V7 )
198- ROUND3(R13, R14, R10, R11, R12, 57 , V6 )
181+ ROUND3(R10, R11, R12, R13, R14, 40 )
182+ ROUND3(R14, R10, R11, R12, R13, 41 )
183+ ROUND3(R13, R14, R10, R11, R12, 42 )
184+ ROUND3(R12, R13, R14, R10, R11, 43 )
185+ ROUND3(R11, R12, R13, R14, R10, 44 )
186+ ROUND3(R10, R11, R12, R13, R14, 45 )
187+ ROUND3(R14, R10, R11, R12, R13, 46 )
188+ ROUND3(R13, R14, R10, R11, R12, 47 )
189+ ROUND3(R12, R13, R14, R10, R11, 48 )
190+ ROUND3(R11, R12, R13, R14, R10, 49 )
191+ ROUND3(R10, R11, R12, R13, R14, 50 )
192+ ROUND3(R14, R10, R11, R12, R13, 51 )
193+ ROUND3(R13, R14, R10, R11, R12, 52 )
194+ ROUND3(R12, R13, R14, R10, R11, 53 )
195+ ROUND3(R11, R12, R13, R14, R10, 54 )
196+ ROUND3(R10, R11, R12, R13, R14, 55 )
197+ ROUND3(R14, R10, R11, R12, R13, 56 )
198+ ROUND3(R13, R14, R10, R11, R12, 57 )
199199
200200 LOADCS(R12, R13, R14, R10, R11, 1 )
201- ROUND3(R12, R13, R14, R10, R11, 58 , V5 )
202- ROUND3(R11, R12, R13, R14, R10, 59 , V4 )
201+ ROUND3(R12, R13, R14, R10, R11, 58 )
202+ ROUND3(R11, R12, R13, R14, R10, 59 )
203203
204204 // ROUND4 (steps 60-79)
205- ROUND4(R10, R11, R12, R13, R14, 60 , V3 )
206- ROUND4(R14, R10, R11, R12, R13, 61 , V2 )
207- ROUND4(R13, R14, R10, R11, R12, 62 , V1 )
208- ROUND4(R12, R13, R14, R10, R11, 63 , V0 )
209- ROUND4(R11, R12, R13, R14, R10, 64 , V31 )
205+ ROUND4(R10, R11, R12, R13, R14, 60 )
206+ ROUND4(R14, R10, R11, R12, R13, 61 )
207+ ROUND4(R13, R14, R10, R11, R12, 62 )
208+ ROUND4(R12, R13, R14, R10, R11, 63 )
209+ ROUND4(R11, R12, R13, R14, R10, 64 )
210210
211211 LOADCS(R10, R11, R12, R13, R14, 2 )
212- ROUND4(R10, R11, R12, R13, R14, 65 , V30 )
213- ROUND4(R14, R10, R11, R12, R13, 66 , V29 )
214- ROUND4(R13, R14, R10, R11, R12, 67 , V28 )
215- ROUND4(R12, R13, R14, R10, R11, 68 , V27 )
216- ROUND4(R11, R12, R13, R14, R10, 69 , V26 )
217- ROUND4(R10, R11, R12, R13, R14, 70 , V25 )
218- ROUND4(R14, R10, R11, R12, R13, 71 , V24 )
219- ROUND4(R13, R14, R10, R11, R12, 72 , V23 )
220- ROUND4(R12, R13, R14, R10, R11, 73 , V22 )
221- ROUND4(R11, R12, R13, R14, R10, 74 , V21 )
222- ROUND4(R10, R11, R12, R13, R14, 75 , V20 )
223- ROUND4(R14, R10, R11, R12, R13, 76 , V19 )
224- ROUND4(R13, R14, R10, R11, R12, 77 , V18 )
225- ROUND4(R12, R13, R14, R10, R11, 78 , V17 )
226- ROUND4(R11, R12, R13, R14, R10, 79 , V16 )
212+ ROUND4(R10, R11, R12, R13, R14, 65 )
213+ ROUND4(R14, R10, R11, R12, R13, 66 )
214+ ROUND4(R13, R14, R10, R11, R12, 67 )
215+ ROUND4(R12, R13, R14, R10, R11, 68 )
216+ ROUND4(R11, R12, R13, R14, R10, 69 )
217+ ROUND4(R10, R11, R12, R13, R14, 70 )
218+ ROUND4(R14, R10, R11, R12, R13, 71 )
219+ ROUND4(R13, R14, R10, R11, R12, 72 )
220+ ROUND4(R12, R13, R14, R10, R11, 73 )
221+ ROUND4(R11, R12, R13, R14, R10, 74 )
222+ ROUND4(R10, R11, R12, R13, R14, 75 )
223+ ROUND4(R14, R10, R11, R12, R13, 76 )
224+ ROUND4(R13, R14, R10, R11, R12, 77 )
225+ ROUND4(R12, R13, R14, R10, R11, 78 )
226+ ROUND4(R11, R12, R13, R14, R10, 79 )
227227
228228 // Add registers to temp hash.
229229 ADDW R10, R1, R1
@@ -236,7 +236,6 @@ loop:
236236 B loop
237237
238238end:
239- MOVD dig+0 (FP), R8
240239 MOVW R1, (R8)
241240 MOVW R2, 4 (R8)
242241 MOVW R3, 8 (R8)
0 commit comments