|
228 | 228 | # sbox operation for one single word |
229 | 229 | sub sbox_1word () { |
230 | 230 | #my $word = shift; |
231 | | - my ($tmp0, $tmp1) = ("v14", "v15"); # Temporaries |
| 231 | + my ($tmp0, $tmp1, $tmp2, $tmp3, $tmp4) = ("v12", "v14", "v15", "v24", "v25"); # Temporaries |
232 | 232 |
|
233 | 233 | $code.=<<___; |
234 | 234 | //mov @vtmp[3].s[0],$word |
235 | 235 | // optimize sbox using AESE instruction |
236 | | - tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b |
| 236 | + //tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b |
| 237 | + tbl @vtmp[0].16b, {$vtmp5.16b}, $MaskV.16b |
237 | 238 | ___ |
238 | 239 | &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); |
239 | 240 | $code.=<<___; |
|
244 | 245 | $code.=<<___; |
245 | 246 |
|
246 | 247 | //mov $wtmp0,@vtmp[0].s[0] |
247 | | - dup @vtmp[0].4s,@vtmp[0].s[0] |
248 | | - sli $tmp0.4s, @vtmp[0].4s, #2 |
249 | | - eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b |
250 | | - sli $tmp0.4s, @vtmp[0].4s, #10 |
251 | | - eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b |
252 | | - sli $tmp0.4s, @vtmp[0].4s, #18 |
253 | | - eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b |
254 | | - sli $tmp0.4s, @vtmp[0].4s, #24 |
255 | | - eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b |
| 248 | + //dup @vtmp[0].4s,@vtmp[0].s[0] |
| 249 | +
|
| 250 | + ushr $tmp0.4s,@vtmp[0].4s,32-2 |
| 251 | + ushr $tmp1.4s,@vtmp[0].4s,32-10 |
| 252 | + ushr $tmp2.4s,@vtmp[0].4s,32-18 |
| 253 | + ushr $tmp3.4s,@vtmp[0].4s,32-24 |
| 254 | + sli $tmp0.4s,@vtmp[0].4s,#2 |
| 255 | + sli $tmp1.4s,@vtmp[0].4s,#10 |
| 256 | + sli $tmp2.4s,@vtmp[0].4s,#18 |
| 257 | + sli $tmp3.4s,@vtmp[0].4s,#24 |
| 258 | + eor $tmp0.16b,$tmp0.16b,@vtmp[0].16b |
| 259 | + eor $tmp1.16b,$tmp1.16b,$tmp2.16b |
| 260 | + eor @vtmp[0].16b,$tmp0.16b,$tmp1.16b |
| 261 | + eor @vtmp[0].16b,@vtmp[0].16b,$tmp3.16b |
| 262 | +
|
| 263 | + # --- rotl(B, 2) --- |
| 264 | + //sli $tmp0.4s,@vtmp[0].4s,#2 |
| 265 | + //ushr $tmp1.4s,@vtmp[0].4s,#30 |
| 266 | + //orr $tmp0.16b,$tmp0.16b,$tmp1.16b |
| 267 | + //eor $tmp4.16b,@vtmp[0].16b,$tmp0.16b |
| 268 | +
|
| 269 | + # --- rotl(B, 10) --- |
| 270 | + //sli $tmp2.4s,@vtmp[0].4s,#10 |
| 271 | + //ushr $tmp3.4s,@vtmp[0].4s,#22 |
| 272 | + //orr $tmp2.16b,$tmp2.16b,$tmp3.16b |
| 273 | + //eor $tmp4.16b,$tmp4.16b,$tmp2.16b |
| 274 | +
|
| 275 | + # --- rotl(B, 18) --- |
| 276 | + //sli $tmp0.4s,@vtmp[0].4s,#18 |
| 277 | + //ushr $tmp1.4s,@vtmp[0].4s,#14 |
| 278 | + //orr $tmp0.16b,$tmp0.16b,$tmp1.16b |
| 279 | + //eor $tmp4.16b,$tmp4.16b,$tmp0.16b |
| 280 | +
|
| 281 | + # --- rotl(B, 24) --- |
| 282 | + //sli $tmp2.4s,@vtmp[0].4s,#24 |
| 283 | + //ushr $tmp3.4s,@vtmp[0].4s,#8 |
| 284 | + //orr $tmp2.16b,$tmp2.16b,$tmp3.16b |
| 285 | + //eor @vtmp[0].16b,$tmp4.16b,$tmp2.16b |
| 286 | +
|
256 | 287 | //eor $word,$wtmp0,$wtmp0,ror #32-2 |
257 | 288 | //eor $word,$word,$wtmp0,ror #32-10 |
258 | 289 | //eor $word,$word,$wtmp0,ror #32-18 |
|
280 | 311 | eor $sbox_inputs.16b,$sbox_inputs.16b,$tmp2.16b // sbox_inputs -> (B0^B1^B2^RK3, B1^B2^B3^RK0, B2^B3^B0^RK1, B3^B0^B1^RK2) |
281 | 312 |
|
282 | 313 | // SBOX(B1 ^ B2 ^ B3 ^ RK0) |
283 | | - mov @vtmp[3].s[0],$sbox_inputs.s[1] |
| 314 | + //mov $vtmp5.s[0],$sbox_inputs.s[1] |
| 315 | + dup $vtmp5.4s,$sbox_inputs.s[1] |
284 | 316 | ___ |
285 | 317 | &sbox_1word(); |
286 | 318 | $code.=<<___; |
| 319 | + dup $tmp2.4s,$state.s[0] |
287 | 320 | eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) |
288 | | - eor $state.s[0],$state.s[0],@vtmp[0].s[0] // B0' = B0 ^ SBOX(B1 ^ B2 ^ B3 ^ RK0) |
| 321 | + eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b |
| 322 | + mov $state.s[0],$tmp2.s[0] |
| 323 | + //eor $state.s[0],$state.s[0],@vtmp[0].s[0] // B0' = B0 ^ SBOX(B1 ^ B2 ^ B3 ^ RK0) |
289 | 324 |
|
290 | 325 | // SBOX(B0' ^ B2 ^ B3 ^ RK1) |
291 | | - mov @vtmp[3].s[0],$sbox_inputs.s[2] |
| 326 | + //mov $vtmp5.s[0],$sbox_inputs.s[2] |
| 327 | + dup $vtmp5.4s,$sbox_inputs.s[2] |
292 | 328 | ___ |
293 | 329 | &sbox_1word(); |
294 | 330 | $code.=<<___; |
| 331 | + dup $tmp2.4s,$state.s[1] |
295 | 332 | eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B0' ^ B2 ^ B3 ^ RK1) |
296 | | - eor $state.s[1],$state.s[1],@vtmp[0].s[0] // B1' = B1 ^ SBOX(B0' ^ B2 ^ B3 ^ RK1) |
| 333 | + eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b |
| 334 | + mov $state.s[1],$tmp2.s[0] |
| 335 | + //eor $state.s[1],$state.s[1],@vtmp[0].s[0] // B1' = B1 ^ SBOX(B0' ^ B2 ^ B3 ^ RK1) |
297 | 336 |
|
298 | 337 | // SBOX(B0' ^ B1' ^ B3 ^ RK2) |
299 | | - mov @vtmp[3].s[0],$sbox_inputs.s[3] |
| 338 | + //mov $vtmp5.s[0],$sbox_inputs.s[3] |
| 339 | + dup $vtmp5.4s,$sbox_inputs.s[3] |
300 | 340 | ___ |
301 | 341 | &sbox_1word(); |
302 | 342 | $code.=<<___; |
| 343 | + dup $tmp2.4s,$state.s[2] |
303 | 344 | eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B0' ^ B1' ^ B3 ^ RK2) |
304 | | - eor $state.s[2],$state.s[2],@vtmp[0].s[0] // B2' = B2 ^ SBOX(B0' ^ B1' ^ B3 ^ RK2) |
| 345 | + eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b |
| 346 | + mov $state.s[2],$tmp2.s[0] |
| 347 | + //eor $state.s[2],$state.s[2],@vtmp[0].s[0] // B2' = B2 ^ SBOX(B0' ^ B1' ^ B3 ^ RK2) |
305 | 348 |
|
306 | 349 | // SBOX(B0' ^ B1' ^ B2' ^ RK3) |
307 | | - mov @vtmp[3].s[0],$sbox_inputs.s[0] |
| 350 | + //mov $vtmp5.s[0],$sbox_inputs.s[0] |
| 351 | + dup $vtmp5.4s,$sbox_inputs.s[0] |
308 | 352 | ___ |
309 | 353 | &sbox_1word(); |
310 | 354 | $code.=<<___; |
311 | | - eor $state.s[3],$state.s[3],@vtmp[0].s[0] // B3' = B3 ^ SBOX(B0' ^ B1' ^ B2' ^ RK3) |
| 355 | + dup $tmp2.4s,$state.s[3] |
| 356 | + eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b |
| 357 | + mov $state.s[3],$tmp2.s[0] |
| 358 | + //eor $state.s[3],$state.s[3],@vtmp[0].s[0] // B3' = B3 ^ SBOX(B0' ^ B1' ^ B2' ^ RK3) |
312 | 359 | ___ |
313 | 360 | } |
314 | 361 |
|
|
0 commit comments