Skip to content

Commit 2f84c11

Browse files
committed
Potential optimisation for f_aese()
1 parent 427ef9f commit 2f84c11

1 file changed

Lines changed: 49 additions & 39 deletions

File tree

crypto/camellia/asm/cmll-armv8.pl

Lines changed: 49 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -120,39 +120,33 @@
120120
# Lower 64 bits of v_x contain the result.
121121
#
122122
sub f_aese(){
123-
my ($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
123+
my ($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
124124
$code.=<<___;
125125
126126
/*
127127
* S-function with AES subbytes
128128
*/
129129
130-
/* Apply input rotation for sbox4 */
131-
and $v_t0.16b,$v_ab.16b,$sbox4mask.16b
132-
bic $v_x.16b,$v_ab.16b,$sbox4mask.16b
133-
add $v_t1.16b,$v_t0.16b,$v_t0.16b
134-
ushr $v_t0.16b,$v_t0.16b,#7
135-
orr $v_t0.16b,$v_t0.16b,$v_t1.16b
136-
and $v_t0.16b,$v_t0.16b,$sbox4mask.16b
137-
orr $v_x.16b,$v_x.16b,$v_t0.16b
138-
139130
/* Prefilter sboxes */
140131
___
141-
&filter_8bit_neon($v_x, $pre_s1lo_mask, $pre_s1hi_mask, $_0f0f0f0fmask, $v_t2);
132+
&filter_8bit_neon_3op($v_t4, $v_ab, $pre_s1lo_mask, $pre_s1hi_mask, $_0f0f0f0fmask, $v_t2);
133+
&filter_8bit_neon_3op($v_x, $v_ab, $pre_s4lo_mask, $pre_s4hi_mask, $_0f0f0f0fmask, $v_t2);
142134
$code.=<<___;
143135
144136
/* AES subbytes + AES shift rows */
137+
aese $v_t4.16b,$v_zero.16b
145138
aese $v_x.16b,$v_zero.16b
146139
147140
/* Postfilter sboxes */
148141
___
142+
&filter_8bit_neon($v_t4, $post_s1lo_mask, $post_s1hi_mask, $_0f0f0f0fmask, $v_t2);
149143
&filter_8bit_neon($v_x, $post_s1lo_mask, $post_s1hi_mask, $_0f0f0f0fmask, $v_t2);
150144
$code.=<<___;
151145
152146
/* P-function */
153-
tbl $v_t1.16b,{$v_x.16b},$inv_shift_row.16b
154-
tbl $v_t4.16b,{$v_x.16b},$sp0044.16b
155-
tbl $v_x.16b,{$v_x.16b},$sp1110.16b
147+
tbl $v_t1.16b,{$v_t4.16b},$inv_shift_row.16b
148+
tbl $v_x.16b,{$v_x.16b},$sp0044.16b
149+
tbl $v_t4.16b,{$v_t4.16b},$sp1110.16b
156150
add $v_t2.16b,$v_t1.16b,$v_t1.16b
157151
ushr $v_t0.16b,$v_t1.16b,#7
158152
shl $v_t3.16b,$v_t1.16b,#7
@@ -195,8 +189,8 @@ sub xor2ror16 {
195189
}
196190

197191
sub roundsm_aese(){
198-
my ($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
199-
&f_aese($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
192+
my ($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
193+
&f_aese($v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
200194
$code.=<<___;
201195
eor $v_cd.16b,$v_cd.16b,$v_x.16b
202196
___
@@ -348,15 +342,15 @@ ()
348342
}
349343
350344
sub roundsm_aese_ab_to_cd(){
351-
my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
345+
my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
352346
&load_key($subkey_idx, $key);
353-
&roundsm_aese($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key)
347+
&roundsm_aese($v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key)
354348
}
355349
356350
sub roundsm_aese_cd_to_ab(){
357-
my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
351+
my ($subkey_idx, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
358352
&load_key($subkey_idx, $key);
359-
&roundsm_aese($v_cd, $v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key)
353+
&roundsm_aese($v_cd, $v_ab, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key)
360354
}
361355
362356
sub roundsm_ab_to_cd(){
@@ -372,13 +366,13 @@ ()
372366
}
373367
374368
sub enc_rounds_aese(){
375-
my ($i, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
376-
&roundsm_aese_ab_to_cd($i+2, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
377-
&roundsm_aese_cd_to_ab($i+3, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
378-
&roundsm_aese_ab_to_cd($i+4, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
379-
&roundsm_aese_cd_to_ab($i+5, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
380-
&roundsm_aese_ab_to_cd($i+6, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
381-
&roundsm_aese_cd_to_ab($i+7, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $inv_shift_row, $sbox4mask, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
369+
my ($i, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key) = @_;
370+
&roundsm_aese_ab_to_cd($i+2, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
371+
&roundsm_aese_cd_to_ab($i+3, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
372+
&roundsm_aese_ab_to_cd($i+4, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
373+
&roundsm_aese_cd_to_ab($i+5, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
374+
&roundsm_aese_ab_to_cd($i+6, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
375+
&roundsm_aese_cd_to_ab($i+7, $v_ab, $v_cd, $v_x, $v_t0, $v_t1, $v_t2, $v_t3, $v_t4, $v_zero, $pre_s4lo_mask, $pre_s4hi_mask, $inv_shift_row, $_0f0f0f0fmask, $pre_s1lo_mask, $pre_s1hi_mask, $post_s1lo_mask, $post_s1hi_mask, $sp0044, $sp1110, $sp0222, $sp3033, $key);
382376
}
383377
384378
sub enc_rounds(){
@@ -465,7 +459,8 @@ ()
465459
// Load constants needed for camellia_f into v14-v15 (pre_tf_lo/hi_s4), v17-v27 + v16(bswap)
466460
adrp x10,camellia_neon_consts
467461
add x10,x10,:lo12:camellia_neon_consts
468-
ldp q20,q21,[x10],#64 // pre_tf_lo/hi_s1
462+
ldp q20,q21,[x10],#32 // pre_tf_lo/hi_s1
463+
ldp q14,q15,[x10],#32 // pre_tf_lo/hi_s4
469464
ldp q22,q23,[x10],#112 // post_tf_lo/hi_s1
470465
ldr q19,[x10],#48 //mask_0f
471466
ldr q16,[x10],#104 //bswap128 - then big jump
@@ -479,11 +474,11 @@ ()
479474
$code.=<<___;
480475
eor v11.16b,v11.16b,v11.16b
481476
___
482-
&enc_rounds_aese(0,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
477+
&enc_rounds_aese(0,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
483478
&fls_neon("v0","v1","v2","v3","x0",8,9);
484-
&enc_rounds_aese(8,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
479+
&enc_rounds_aese(8,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
485480
&fls_neon("v0","v1","v2","v3","x0",16,17);
486-
&enc_rounds_aese(16,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
481+
&enc_rounds_aese(16,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
487482
$code.=<<___;
488483
mov w30,#24 // MAX = 24
489484
@@ -492,7 +487,7 @@ ()
492487
b.eq __enc_done_aese
493488
___
494489
&fls_neon("v0","v1","v2","v3","x0", 24, 25);
495-
&enc_rounds_aese(24,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
490+
&enc_rounds_aese(24,"v0","v1","v2","v6","v7","v8","v9","v10","v11","v14","v15","v17","v19","v20","v21","v22","v23","v24","v25","v26","v27","x8");
496491
$code.=<<___;
497492
mov w30,#32 // MAX = 32
498493
@@ -1433,17 +1428,32 @@ ()
14331428
# General macros
14341429
#
14351430
1436-
sub filter_8bit_neon(){
1437-
my ($x,$lo_t,$hi_t,$mask,$tmp) = @_;
1438-
$code.=<<___;
1439-
and $tmp.16b,$x.16b,$mask.16b
1440-
ushr $x.16b,$x.16b,#4
1431+
#sub filter_8bit_neon(){
1432+
# my ($x,$lo_t,$hi_t,$mask,$tmp) = @_;
1433+
#$code.=<<___;
1434+
# and $tmp.16b,$x.16b,$mask.16b
1435+
# ushr $x.16b,$x.16b,#4
1436+
# tbl $tmp.16b,{$lo_t.16b},$tmp.16b
1437+
# tbl $x.16b,{$hi_t.16b},$x.16b
1438+
# eor $x.16b,$x.16b,$tmp.16b
1439+
#___
1440+
#}
1441+
sub filter_8bit_neon_3op(){
1442+
my ($out,$in,$lo_t,$hi_t,$mask,$tmp) = @_;
1443+
$code.=<<___;
1444+
and $tmp.16b,$in.16b,$mask.16b
1445+
ushr $out.16b,$in.16b,#4
14411446
tbl $tmp.16b,{$lo_t.16b},$tmp.16b
1442-
tbl $x.16b,{$hi_t.16b},$x.16b
1443-
eor $x.16b,$x.16b,$tmp.16b
1447+
tbl $out.16b,{$hi_t.16b},$out.16b
1448+
eor $out.16b,$out.16b,$tmp.16b
14441449
___
14451450
}
14461451
1452+
sub filter_8bit_neon(){
1453+
my ($x,$lo_t,$hi_t,$mask,$tmp) = @_;
1454+
&filter_8bit_neon_3op($x,$x,$lo_t,$hi_t,$mask,$tmp);
1455+
}
1456+
14471457
#
14481458
# 16-block encryption/decryption macros
14491459
#

0 commit comments

Comments
 (0)