|
120 | 120 | # Lower 64 bits of v_x contain the result. |
121 | 121 | # |
122 | 122 | sub f_aese(){ |
123 | | - my ($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
| 123 | + my ($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
124 | 124 | $code.=<<___; |
125 | 125 |
|
126 | 126 | /* |
127 | 127 | * S-function with AES subbytes |
128 | 128 | */ |
129 | 129 |
|
130 | | - /* Apply input rotation for sbox4 */ |
131 | | - and $v_t0.16b,$v_ab.16b,$sbox4mask.16b |
132 | | - bic $v_x.16b,$v_ab.16b,$sbox4mask.16b |
133 | | - add $v_t1.16b,$v_t0.16b,$v_t0.16b |
134 | | - ushr $v_t0.16b,$v_t0.16b,#7 |
135 | | - orr $v_t0.16b,$v_t0.16b,$v_t1.16b |
136 | | - and $v_t0.16b,$v_t0.16b,$sbox4mask.16b |
137 | | - orr $v_x.16b,$v_x.16b,$v_t0.16b |
138 | | -
|
139 | 130 | /* Prefilter sboxes */ |
140 | 131 | ___ |
141 | | - &filter_8bit_neon($v_x, $pre_s1lo_mask, $pre_s1hi_mask, $_0f0f0f0fmask, $v_t2); |
| 132 | + &filter_8bit_neon_3op($v_t4, $v_ab, $pre_s1lo_mask, $pre_s1hi_mask, $_0f0f0f0fmask, $v_t2); |
| 133 | + &filter_8bit_neon_3op($v_x, $v_ab, $pre_s4lo_mask, $pre_s4hi_mask, $_0f0f0f0fmask, $v_t2); |
142 | 134 | $code.=<<___; |
143 | 135 |
|
144 | 136 | /* AES subbytes + AES shift rows */ |
| 137 | + aese $v_t4.16b,$v_zero.16b |
145 | 138 | aese $v_x.16b,$v_zero.16b |
146 | 139 |
|
147 | 140 | /* Postfilter sboxes */ |
148 | 141 | ___ |
| 142 | + &filter_8bit_neon($v_t4, $post_s1lo_mask, $post_s1hi_mask, $_0f0f0f0fmask, $v_t2); |
149 | 143 | &filter_8bit_neon($v_x, $post_s1lo_mask, $post_s1hi_mask, $_0f0f0f0fmask, $v_t2); |
150 | 144 | $code.=<<___; |
151 | 145 |
|
152 | 146 | /* P-function */ |
153 | | - tbl $v_t1.16b,{$v_x.16b},$inv_shift_row.16b |
154 | | - tbl $v_t4.16b,{$v_x.16b},$sp0044.16b |
155 | | - tbl $v_x.16b,{$v_x.16b},$sp1110.16b |
| 147 | + tbl $v_t1.16b,{$v_t4.16b},$inv_shift_row.16b |
| 148 | + tbl $v_x.16b,{$v_x.16b},$sp0044.16b |
| 149 | + tbl $v_t4.16b,{$v_t4.16b},$sp1110.16b |
156 | 150 | add $v_t2.16b,$v_t1.16b,$v_t1.16b |
157 | 151 | ushr $v_t0.16b,$v_t1.16b,#7 |
158 | 152 | shl $v_t3.16b,$v_t1.16b,#7 |
@@ -195,8 +189,8 @@ sub xor2ror16 { |
195 | 189 | } |
196 | 190 |
|
197 | 191 | sub roundsm_aese(){ |
198 | | - my ($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
199 | | - &f_aese($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 192 | + my ($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
| 193 | + &f_aese($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
200 | 194 | $code.=<<___; |
201 | 195 | eor $v_cd.16b,$v_cd.16b,$v_x.16b |
202 | 196 | ___ |
|
348 | 342 | } |
349 | 343 |
|
# roundsm_aese_ab_to_cd: emit one AESE-based round in the ab -> cd direction.
#   $subkey_idx  index handed to load_key together with $key.
#   remaining    register-name strings, forwarded positionally to roundsm_aese
#                with ($v_ab, $v_cd) in that order.
# As far as the visible part of roundsm_aese shows, it runs f_aese on its
# first argument and XORs $v_x into its second, so $v_cd is the updated half.
# This hunk replaces the single $sbox4mask argument with the pair
# $pre_s4lo_mask/$pre_s4hi_mask and moves them ahead of $inv_shift_row; the
# list is positional, so every caller has to use the new order.
# The empty prototype "()" is not enforced because callers use the &name(...)
# call form.
350 | 344 | sub roundsm_aese_ab_to_cd(){ |
351 | | - my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
| 345 | + my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
352 | 346 | &load_key($subkey_idx, $key); |
353 | | - &roundsm_aese($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) |
| 347 | + &roundsm_aese($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) |
354 | 348 | } |
355 | 349 |
|
# roundsm_aese_cd_to_ab: emit one AESE-based round in the cd -> ab direction.
# Takes the same positional argument list as roundsm_aese_ab_to_cd, calls
# load_key($subkey_idx, $key), then calls roundsm_aese with the two state
# registers swapped ($v_cd first, $v_ab second); every other argument is
# forwarded unchanged.
# This hunk replaces $sbox4mask with $pre_s4lo_mask/$pre_s4hi_mask, placed
# before $inv_shift_row, in both the parameter list and the forwarded call.
356 | 350 | sub roundsm_aese_cd_to_ab(){ |
357 | | - my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
| 351 | + my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
358 | 352 | &load_key($subkey_idx, $key); |
359 | | - &roundsm_aese($v_cd, $v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) |
| 353 | + &roundsm_aese($v_cd, $v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) |
360 | 354 | } |
361 | 355 |
|
362 | 356 | sub roundsm_ab_to_cd(){ |
|
372 | 366 | } |
373 | 367 |
|
# enc_rounds_aese: emit six consecutive AESE-based rounds.
#   $i         base index; the rounds use subkey indices $i+2 .. $i+7.
#   remaining  register-name strings, forwarded unchanged to every round macro.
# The rounds alternate roundsm_aese_ab_to_cd (indices $i+2, $i+4, $i+6) and
# roundsm_aese_cd_to_ab (indices $i+3, $i+5, $i+7).
# This hunk drops $sbox4mask and inserts $pre_s4lo_mask/$pre_s4hi_mask directly
# after $v_zero in the parameter list and in all six forwarded calls. The list
# is positional, so call sites must pass the two pre-filter s4 table registers
# in that slot.
374 | 368 | sub enc_rounds_aese(){ |
375 | | - my ($i, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
376 | | - &roundsm_aese_ab_to_cd($i+2, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
377 | | - &roundsm_aese_cd_to_ab($i+3, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
378 | | - &roundsm_aese_ab_to_cd($i+4, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
379 | | - &roundsm_aese_cd_to_ab($i+5, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
380 | | - &roundsm_aese_ab_to_cd($i+6, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
381 | | - &roundsm_aese_cd_to_ab($i+7, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 369 | + my ($i, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_; |
| 370 | + &roundsm_aese_ab_to_cd($i+2, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 371 | + &roundsm_aese_cd_to_ab($i+3, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 372 | + &roundsm_aese_ab_to_cd($i+4, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 373 | + &roundsm_aese_cd_to_ab($i+5, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 374 | + &roundsm_aese_ab_to_cd($i+6, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
| 375 | + &roundsm_aese_cd_to_ab($i+7, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key); |
382 | 376 | } |
383 | 377 |
|
384 | 378 | sub enc_rounds(){ |
|
465 | 459 | // Load constants needed for camellia_f into v17-v27 + v16(bswap) |
466 | 460 | adrp x10,camellia_neon_consts |
467 | 461 | add x10,x10,:lo12:camellia_neon_consts |
468 | | - ldp q20,q21,[x10],#64 // pre_tf_lo/hi_s1 |
| 462 | + ldp q20,q21,[x10],#32 // pre_tf_lo/hi_s1 |
| 463 | + ldp q14,q15,[x10],#32 // pre_tf_lo/hi_s4 |
469 | 464 | ldp q22,q23,[x10],#112 // post_tf_lo/hi_s1 |
470 | 465 | ldr q19,[x10],#48 //mask_0f |
471 | 466 | ldr q16,[x10],#104 //bswap128 - then big jump |
|
479 | 474 | $code.=<<___; |
480 | 475 | eor v11.16b,v11.16b,v11.16b |
481 | 476 | ___ |
482 | | - &enc_rounds_aese(0,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
| 477 | + &enc_rounds_aese(0,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
483 | 478 | &fls_neon("v0","v1","v2","v3","x0",8,9); |
484 | | - &enc_rounds_aese(8,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
| 479 | + &enc_rounds_aese(8,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
485 | 480 | &fls_neon("v0","v1","v2","v3","x0",16,17); |
486 | | - &enc_rounds_aese(16,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
| 481 | + &enc_rounds_aese(16,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
487 | 482 | $code.=<<___; |
488 | 483 | mov w30,#24 // MAX = 24 |
489 | 484 |
|
|
492 | 487 | b.eq __enc_done_aese |
493 | 488 | ___ |
494 | 489 | &fls_neon("v0","v1","v2","v3","x0", 24, 25); |
495 | | - &enc_rounds_aese(24,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
| 490 | + &enc_rounds_aese(24,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8"); |
496 | 491 | $code.=<<___; |
497 | 492 | mov w30,#32 // MAX = 32 |
498 | 493 |
|
@@ -1433,17 +1428,32 @@ () |
1433 | 1428 | # General macros |
1434 | 1429 | # |
1435 | 1430 |
|
# filter_8bit_neon_3op: append a five-instruction byte filter to $code.
#   $out   destination vector register name (written by ushr/tbl/eor).
#   $in    source vector register name (read by the and and the ushr).
#   $lo_t  table register used by the first tbl, indexed by ($in AND $mask).
#   $hi_t  table register used by the second tbl, indexed by ($in >> 4) per byte.
#   $mask  AND mask for the first index; the visible callers pass
#          $_0f0f0f0fmask (presumably 0x0f in every byte -- TODO confirm).
#   $tmp   scratch register; overwritten.
# Result: $out = tbl($hi_t, $in >> 4) XOR tbl($lo_t, $in AND $mask).
# $out may be the same register as $in, because the and reads $in before the
# ushr writes $out. $tmp must differ from $in, because the and overwrites $tmp
# before the ushr reads $in.
# NOTE(review): this hunk turns the old in-place filter_8bit_neon into the
# three-operand form and keeps the old sub as a commented-out copy; that copy
# is dead text and a candidate for removal.
1436 | | -sub filter_8bit_neon(){ |
1437 | | - my ($x,$lo_t,$hi_t,$mask,$tmp) = @_; |
1438 | | -$code.=<<___; |
1439 | | - and $tmp.16b,$x.16b,$mask.16b |
1440 | | - ushr $x.16b,$x.16b,#4 |
| 1431 | +#sub filter_8bit_neon(){ |
| 1432 | +# my ($x,$lo_t,$hi_t,$mask,$tmp) = @_; |
| 1433 | +#$code.=<<___; |
| 1434 | +# and $tmp.16b,$x.16b,$mask.16b |
| 1435 | +# ushr $x.16b,$x.16b,#4 |
| 1436 | +# tbl $tmp.16b,{$lo_t.16b},$tmp.16b |
| 1437 | +# tbl $x.16b,{$hi_t.16b},$x.16b |
| 1438 | +# eor $x.16b,$x.16b,$tmp.16b |
| 1439 | +#___ |
| 1440 | +#} |
| 1441 | +sub filter_8bit_neon_3op(){ |
| 1442 | + my ($out,$in,$lo_t,$hi_t,$mask,$tmp) = @_; |
| 1443 | +$code.=<<___; |
| 1444 | + and $tmp.16b,$in.16b,$mask.16b |
| 1445 | + ushr $out.16b,$in.16b,#4 |
1441 | 1446 | tbl $tmp.16b,{$lo_t.16b},$tmp.16b |
1442 | | - tbl $x.16b,{$hi_t.16b},$x.16b |
1443 | | - eor $x.16b,$x.16b,$tmp.16b |
| 1447 | + tbl $out.16b,{$hi_t.16b},$out.16b |
| 1448 | + eor $out.16b,$out.16b,$tmp.16b |
1444 | 1449 | ___ |
1445 | 1450 | } |
1446 | 1451 |
|
# filter_8bit_neon: in-place form of the byte filter, kept so existing
# five-argument call sites continue to work.
#   $x     register filtered in place (both source and destination).
#   $lo_t, $hi_t, $mask, $tmp   forwarded unchanged.
# Forwards to filter_8bit_neon_3op with $x passed as both $out and $in.
| 1452 | +sub filter_8bit_neon(){ |
| 1453 | + my ($x,$lo_t,$hi_t,$mask,$tmp) = @_; |
| 1454 | + &filter_8bit_neon_3op($x,$x,$lo_t,$hi_t,$mask,$tmp); |
| 1455 | +} |
| 1456 | +
|
1447 | 1457 | # |
1448 | 1458 | # 16-block encryption/decryption macros |
1449 | 1459 | # |
|
0 commit comments