Skip to content

Commit 5be3093

Browse files
committed
Vectorised version of sm4_1blk passes test.
1 parent b9b7f01 commit 5be3093

1 file changed

Lines changed: 66 additions & 19 deletions

File tree

crypto/sm4/asm/vpsm4_ex-armv8.pl

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -228,12 +228,13 @@ ()
228228
# sbox operation for a single word: the input is read from $vtmp5 and the result is left in @vtmp[0]
229229
sub sbox_1word () {
230230
#my $word = shift;
231-
my ($tmp0, $tmp1) = ("v14", "v15"); # Temporaries
231+
my ($tmp0, $tmp1, $tmp2, $tmp3, $tmp4) = ("v12", "v14", "v15", "v24", "v25"); # Temporaries
232232

233233
$code.=<<___;
234234
//mov @vtmp[3].s[0],$word
235235
// optimize sbox using AESE instruction
236-
tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
236+
//tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
237+
tbl @vtmp[0].16b, {$vtmp5.16b}, $MaskV.16b
237238
___
238239
&mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
239240
$code.=<<___;
@@ -244,15 +245,45 @@ ()
244245
$code.=<<___;
245246
246247
//mov $wtmp0,@vtmp[0].s[0]
247-
dup @vtmp[0].4s,@vtmp[0].s[0]
248-
sli $tmp0.4s, @vtmp[0].4s, #2
249-
eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b
250-
sli $tmp0.4s, @vtmp[0].4s, #10
251-
eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b
252-
sli $tmp0.4s, @vtmp[0].4s, #18
253-
eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b
254-
sli $tmp0.4s, @vtmp[0].4s, #24
255-
eor $vtmp[0].16b,$vtmp[0].16b,@vtmp[0].16b
248+
//dup @vtmp[0].4s,@vtmp[0].s[0]
249+
250+
ushr $tmp0.4s,@vtmp[0].4s,32-2
251+
ushr $tmp1.4s,@vtmp[0].4s,32-10
252+
ushr $tmp2.4s,@vtmp[0].4s,32-18
253+
ushr $tmp3.4s,@vtmp[0].4s,32-24
254+
sli $tmp0.4s,@vtmp[0].4s,#2
255+
sli $tmp1.4s,@vtmp[0].4s,#10
256+
sli $tmp2.4s,@vtmp[0].4s,#18
257+
sli $tmp3.4s,@vtmp[0].4s,#24
258+
eor $tmp0.16b,$tmp0.16b,@vtmp[0].16b
259+
eor $tmp1.16b,$tmp1.16b,$tmp2.16b
260+
eor @vtmp[0].16b,$tmp0.16b,$tmp1.16b
261+
eor @vtmp[0].16b,@vtmp[0].16b,$tmp3.16b
262+
263+
# --- rotl(B, 2) ---
264+
//sli $tmp0.4s,@vtmp[0].4s,#2
265+
//ushr $tmp1.4s,@vtmp[0].4s,#30
266+
//orr $tmp0.16b,$tmp0.16b,$tmp1.16b
267+
//eor $tmp4.16b,@vtmp[0].16b,$tmp0.16b
268+
269+
# --- rotl(B, 10) ---
270+
//sli $tmp2.4s,@vtmp[0].4s,#10
271+
//ushr $tmp3.4s,@vtmp[0].4s,#22
272+
//orr $tmp2.16b,$tmp2.16b,$tmp3.16b
273+
//eor $tmp4.16b,$tmp4.16b,$tmp2.16b
274+
275+
# --- rotl(B, 18) ---
276+
//sli $tmp0.4s,@vtmp[0].4s,#18
277+
//ushr $tmp1.4s,@vtmp[0].4s,#14
278+
//orr $tmp0.16b,$tmp0.16b,$tmp1.16b
279+
//eor $tmp4.16b,$tmp4.16b,$tmp0.16b
280+
281+
# --- rotl(B, 24) ---
282+
//sli $tmp2.4s,@vtmp[0].4s,#24
283+
//ushr $tmp3.4s,@vtmp[0].4s,#8
284+
//orr $tmp2.16b,$tmp2.16b,$tmp3.16b
285+
//eor @vtmp[0].16b,$tmp4.16b,$tmp2.16b
286+
256287
//eor $word,$wtmp0,$wtmp0,ror #32-2
257288
//eor $word,$word,$wtmp0,ror #32-10
258289
//eor $word,$word,$wtmp0,ror #32-18
@@ -280,35 +311,51 @@ ()
280311
eor $sbox_inputs.16b,$sbox_inputs.16b,$tmp2.16b // sbox_inputs -> (B0^B1^B2^RK3, B1^B2^B3^RK0, B2^B3^B0^RK1, B3^B0^B1^RK2)
281312
282313
// SBOX(B1 ^ B2 ^ B3 ^ RK0)
283-
mov @vtmp[3].s[0],$sbox_inputs.s[1]
314+
//mov $vtmp5.s[0],$sbox_inputs.s[1]
315+
dup $vtmp5.4s,$sbox_inputs.s[1]
284316
___
285317
&sbox_1word();
286318
$code.=<<___;
319+
dup $tmp2.4s,$state.s[0]
287320
eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
288-
eor $state.s[0],$state.s[0],@vtmp[0].s[0] // B0' = B0 ^ SBOX(B1 ^ B2 ^ B3 ^ RK0)
321+
eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b
322+
mov $state.s[0],$tmp2.s[0]
323+
//eor $state.s[0],$state.s[0],@vtmp[0].s[0] // B0' = B0 ^ SBOX(B1 ^ B2 ^ B3 ^ RK0)
289324
290325
// SBOX(B0' ^ B2 ^ B3 ^ RK1)
291-
mov @vtmp[3].s[0],$sbox_inputs.s[2]
326+
//mov $vtmp5.s[0],$sbox_inputs.s[2]
327+
dup $vtmp5.4s,$sbox_inputs.s[2]
292328
___
293329
&sbox_1word();
294330
$code.=<<___;
331+
dup $tmp2.4s,$state.s[1]
295332
eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B0' ^ B2 ^ B3 ^ RK1)
296-
eor $state.s[1],$state.s[1],@vtmp[0].s[0] // B1' = B1 ^ SBOX(B0' ^ B2 ^ B3 ^ RK1)
333+
eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b
334+
mov $state.s[1],$tmp2.s[0]
335+
//eor $state.s[1],$state.s[1],@vtmp[0].s[0] // B1' = B1 ^ SBOX(B0' ^ B2 ^ B3 ^ RK1)
297336
298337
// SBOX(B0' ^ B1' ^ B3 ^ RK2)
299-
mov @vtmp[3].s[0],$sbox_inputs.s[3]
338+
//mov $vtmp5.s[0],$sbox_inputs.s[3]
339+
dup $vtmp5.4s,$sbox_inputs.s[3]
300340
___
301341
&sbox_1word();
302342
$code.=<<___;
343+
dup $tmp2.4s,$state.s[2]
303344
eor $sbox_inputs.16b,$sbox_inputs.16b,@vtmp[0].16b // V ^= SBOX(B0' ^ B1' ^ B3 ^ RK2)
304-
eor $state.s[2],$state.s[2],@vtmp[0].s[0] // B2' = B2 ^ SBOX(B0' ^ B1' ^ B3 ^ RK2)
345+
eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b
346+
mov $state.s[2],$tmp2.s[0]
347+
//eor $state.s[2],$state.s[2],@vtmp[0].s[0] // B2' = B2 ^ SBOX(B0' ^ B1' ^ B3 ^ RK2)
305348
306349
// SBOX(B0' ^ B1' ^ B2' ^ RK3)
307-
mov @vtmp[3].s[0],$sbox_inputs.s[0]
350+
//mov $vtmp5.s[0],$sbox_inputs.s[0]
351+
dup $vtmp5.4s,$sbox_inputs.s[0]
308352
___
309353
&sbox_1word();
310354
$code.=<<___;
311-
eor $state.s[3],$state.s[3],@vtmp[0].s[0] // B3' = B3 ^ SBOX(B0' ^ B1' ^ B2' ^ RK3)
355+
dup $tmp2.4s,$state.s[3]
356+
eor $tmp2.16b,$tmp2.16b,@vtmp[0].16b
357+
mov $state.s[3],$tmp2.s[0]
358+
//eor $state.s[3],$state.s[3],@vtmp[0].s[0] // B3' = B3 ^ SBOX(B0' ^ B1' ^ B2' ^ RK3)
312359
___
313360
}
314361

0 commit comments

Comments
 (0)