-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathMontgomeryMult_64.s
More file actions
196 lines (179 loc) · 8.42 KB
/
MontgomeryMult_64.s
File metadata and controls
196 lines (179 loc) · 8.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/*
This file is part of Alpertron Calculators.
Copyright 2025 Dario Alejandro Alpern
Alpertron Calculators is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Alpertron Calculators is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Alpertron Calculators. If not, see <http://www.gnu.org/licenses/>.
*/
.arch armv8-a
.global MontgomeryMult
.type MontgomeryMult, %function
.text
.align 4
/*
On input:
x0: Pointer to factor1 in Montgomery notation.
x1: Pointer to factor2 in Montgomery notation.
x2: Pointer to product in Montgomery notation.
Other registers used in the code:
w3: Value of MontgomeryMultN.
x4: Pointer to TestNbr.
w5: Value of NumberLength.
w6: Index for multiplicand (0 to NumberLength - 1).
w7: Index for inner loop (0 to NumberLength - 1).
w9: Limb from multiplicand (index w6).
x10: Accumulator for 32x32 bit multiplications.
Multiply two numbers in Montgomery notation.
For large numbers the REDC algorithm is:
m <- ((T mod R)N') mod R
t <- (T - mN) / R
if t < 0 then
return t + N
else
return t
end if
Multilimb version when multiplying S*T:
1) P <- 0
2) for i from 0 to n-1
3) t <- (S[i] * T[0]) mod R
4) u <- (t * N') mod R (one limb)
5) P <- (P + S[i] * T - u * N) / R
6) if P < 0 then
7) return P - N
8) else
9) return P
10)end if
*/
.set MAX_LIMBS_MONTGOMERY, 14
.set STACK_SIZE_RESULT_BUFFER, (MAX_LIMBS_MONTGOMERY+3)/4 * 16
MontgomeryMult:
/* Get external values and pointers to buffers. */
adrp x3, :got:MontgomeryMultN
ldr x3, [x3, #:got_lo12:MontgomeryMultN]
ldr w3, [x3] /* x3 <- MontgomeryMultN. */
adrp x4, :got:TestNbr
ldr x4, [x4, #:got_lo12:TestNbr] /* x4 <- Pointer to TestNbr. */
adrp x5, :got:NumberLength@GOTPAGE
ldr x5, [x5, #:got_lo12:NumberLength]
ldr w5, [x5] /* x5 <- NumberLength */
cmp w5, #2
bne more_than_two_limbs
/* Montgomery multiplication with two limbs */
/* Optimizations: Do not use loops. Use registers. */
ldr w5, [x4, #4]
ldr w4, [x4] /* R5:R4 = TestNbr */
ldr w6, [x1, #4]
ldr w1, [x1] /* R6:R1 = Multiplier */
/* Processing least significant limb of multiplicand */
ldr w9, [x0] /* Get limb from multiplicand. */
smull x10, w9, w1 /* x10 <- multiplicand[0] * multiplier[0]. */
mul w8, w10, w3 /* w8 <- MontDig = Pr * MontgomeryMultN mod R */
bic w8, w8, #0x80000000 /* Clear bit 31. */
smsubl x10, w8, w4, x10 /* x10 = Pr <- Pr - MontDig * TestNbr[0] */
asr x10, x10, #31 /* Shift right Pr (x10) by 31 bits. */
smaddl x10, w9, w6, x10 /* Pr <- Pr + multiplicand[0] * multiplier[1]. */
smsubl x10, w8, w5, x10 /* Pr <- Pr - MontDig * TestNbr[1]. */
asr x7, x10, #31 /* Shift right Pr (x10) by 31 bits. */
/* and store most significant limb of temp result. */
bic w10, w10, #0x80000000 /* R7:R10 = Temporary result. */
/* Processing most significant limb of multiplicand */
ldr w9, [x0, #4] /* Get limb from multiplicand. */
smaddl x10, w9, w1, x10 /* x10 <- Pr + multiplicand[1] * multiplier[0]. */
mul w8, w10, w3 /* w8 <- MontDig = Pr * MontgomeryMultN mod R */
bic w8, w8, #0x80000000 /* Clear bit 31. */
smsubl x10, w8, w4, x10 /* x10 = Pr <- Pr - MontDig * TestNbr[0] */
asr x10, x10, #31 /* Shift right Pr (x10) by 31 bits. */
smaddl x10, w9, w6, x10 /* Pr <- Pr + multiplicand[1] * multiplier[1]. */
smsubl x10, w8, w5, x10 /* Pr <- Pr - MontDig * TestNbr[1]. */
sxtw x12, w7 /* Get MS limb from temp result (sign extended). */
add x10, x10, x12 /* Pr <- Pr + result[j+1]. */
bic w6, w10, #0x80000000 /* Clear bit 31 and store limb to temp result[0]. */
asr x7, x10, #31 /* Shift right Pr (x10) by 31 bits */
/* and store most significant limb of temp result. */
/* Test whether the result is positive. */
tst w7, #0x80000000 /* Is most significant limb of result positive? */
beq copy_temp_to_result_2L /* If so, copy temporary result to actual result. */
/* Add TestNbr to result. */
add w8, w4, w6 /* TestNbr[0] + temporary result[0]. */
bic w6, w8, #0x80000000 /* Clear bit 31 and store into temp result. */
add w8, w5, w8, lsr #31 /* Add TestNbr[1] + temp result[1] with carry. */
add w8, w8, w7
bic w7, w8, #0x80000000 /* Clear bit 31 and store into temp result. */
copy_temp_to_result_2L:
str w6, [x2]
str w7, [x2, #4]
ret
more_than_two_limbs:
/* Adjust stack to store temporary result there */
/* so the arguments are not overwritten with the result. */
sub sp, sp, #STACK_SIZE_RESULT_BUFFER
/* Clear result. */
mov w7, w5 /* Clear NumberLength limbs. */
mov w6, #0 /* Set all limbs to zero. */
clear_result_loop:
subs w7, w7, #1
str w6, [sp, w7, uxtw #2] /* Set limb of temporary result to zero. */
bne clear_result_loop
/* Index for multiplicand (w6) is already zero. */
outer_loop:
mov w7, #1 /* Initialize index for result. */
ldr w9, [x0, w6, uxtw #2] /* Get limb from multiplicand. */
ldr w10, [sp] /* Get the least significant limb of temp result. */
ldr w12, [x1] /* Get the least significant limb of multiplier. */
smaddl x10, w9, w12, x10 /* x10 <- Pr + multiplicand[i] * multiplier[0]. */
mul w8, w10, w3 /* w8 <- MontDig = Pr * MontgomeryMultN mod R */
bic w8, w8, #0x80000000 /* Clear bit 31. */
ldr w12, [x4] /* Get least significant of TestNbr. */
smsubl x10, w8, w12, x10 /* x10 = Pr <- Pr - MontDig * TestNbr[0] */
inner_loop:
asr x10, x10, #31 /* Shift right Pr (x10) by 31 bits. */
ldr w12, [x1, w7, uxtw #2] /* Get limb from multiplier */
smaddl x10, w9, w12, x10 /* Pr <- Pr + multiplicand[i] * multiplier[j+1]. */
ldr w12, [x4, w7, uxtw #2] /* Get limb from TestNbr */
smsubl x10, w8, w12, x10 /* Pr <- Pr - MontDig * TestNbr[j+1]. */
ldrsw x12, [sp, w7, uxtw #2] /* Get limb from temp result (sign extended). */
add x10, x10, x12 /* Pr <- Pr + result[j+1]. */
sub w7, w7, #1 /* Decrement index. */
bic w12, w10, #0x80000000 /* Clear bit 31. */
str w12, [sp, w7, uxtw #2] /* Store limb to temporary result[j]. */
add w7, w7, #2
cmp w7, w5
bne inner_loop
sub w7, w7, #1
asr x10, x10, #31 /* Shift right Pr (x10) by 31 bits. */
str w10, [sp, w7, uxtw #2] /* Store most significant limb of temp result. */
add w6, w6, #1 /* Increment index to multiplicand. */
cmp w6, w5
bne outer_loop
/* Test whether the result is positive. */
tst w10, #0x80000000 /* Is most significant limb of result positive? */
beq copy_temp_to_result /* If so, copy temporary result to actual result. */
/* Add TestNbr to result. */
mov w6, #0 /* Initialize index. */
mov w7, #0 /* Initialize result of previous addition. */
add_TestNbr_cycle:
ldr w8, [sp, w6, uxtw #2] /* Get limb from temporary result. */
ldr w9, [x4, w6, uxtw #2] /* Get limb from TestNbr. */
add w7, w8, w7, lsr #31 /* Add with carry. */
add w7, w7, w9
bic w8, w7, #0x80000000 /* Clear bit 31. */
str w8, [sp, w6, uxtw #2] /* Store limb into temporary result. */
add w6, w6, #1
cmp w6, w5
bne add_TestNbr_cycle
copy_temp_to_result:
copy_temp_to_result_loop:
subs w5, w5, #1
ldr w8, [sp, w5, uxtw #2] /* Get limb from temporary result. */
str w8, [x2, w5, uxtw #2] /* Set limb to result. */
bne copy_temp_to_result_loop
add sp, sp, #STACK_SIZE_RESULT_BUFFER
ret
.end