Skip to content

Commit 6e0f4e9

Browse files
Shark64 authored and pablodelara committed
Use VPTERNLOG for 3-operand boolean functions.
Signed-off-by: Nicola Torracca <[email protected]>
1 parent 38d4b13 commit 6e0f4e9

File tree

2 files changed

+17
-30
lines changed

2 files changed

+17
-30
lines changed

Release_notes.txt

+2
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,8 @@ Unreleased
8383

8484
* Optimized AES-GCM for AVX512-VAES x86 implementation.
8585

86+
* Optimized SM3 for AVX512 x86 implementation.
87+
8688
* Optimized MD5 and SM3 for aarch64.
8789

8890
* New optimized version of AES-CBC and AES-XTS for aarch64.

sm3_mb/sm3_mb_x16_avx512.asm

+15-30
Original file line number | Diff line number | Diff line change
@@ -338,10 +338,8 @@ FIELD _rsp, 8, 8
338338
%define %%Z %3
339339
; I > 16 return (x & y) | (x & z) | (y & z)
340340
; Same as (x & y) | (z & (x | y))
341-
vporq TMP0,%%X,%%Y
342-
vpandq TMP0,%%Z
343-
vpandq TMP1,%%X,%%Y
344-
vporq TMP0,TMP1
341+
vmovdqa32 TMP0,%%X
342+
vpternlogd TMP0, %%Y, %%Z, 0xE8 ; Majority function
345343
%endmacro
346344

347345

@@ -364,9 +362,8 @@ FIELD _rsp, 8, 8
364362
%define %%Z %3
365363

366364
; I > 16 return (x & y) | ((~x) & z)
367-
vpandq TMP0,%%X,%%Y
368-
vpandnd TMP1,%%X,%%Z
369-
vporq TMP0,TMP1
365+
vmovdqa32 TMP0,%%X
366+
vpternlogd TMP0, %%Y, %%Z, 0xCA ; X? Y : Z
370367
%endmacro
371368

372369
;; void sm3_mb_x16_avx512(ISAL_SM3_MB_ARGS_X8, uint32_t size)
@@ -517,13 +514,11 @@ lloop:
517514
; F = E
518515
; E = P(TT2)
519516
vmovups D,C
520-
vprold B,9
521-
vmovups C,B
517+
vprold C,B,9
522518
vmovups B,A
523519
vmovups A,TMP3
524520
vmovups H,G
525-
vprold F,19
526-
vmovups G,F
521+
vprold G,F,19
527522
vmovups F,E
528523
P TMP2
529524
vmovups E,TMP0
@@ -552,14 +547,12 @@ lloop:
552547

553548
; calc WB(I+4)
554549
vprold APPEND(WB,J),APPEND(WB,J_3),15
555-
vpxord APPEND(WB,J),APPEND(WB,J_16)
556-
vpxord APPEND(WB,J),APPEND(WB,J_9)
550+
vpternlogd APPEND(WB,J),APPEND(WB,J_9), APPEND(WB,J_16), 0x96; 3-way XOR
557551

558552
P1 APPEND(WB,J)
559553

560554
vprold APPEND(WB,J),APPEND(WB,J_13),7
561-
vpxord APPEND(WB,J),TMP0
562-
vpxord APPEND(WB,J),APPEND(WB,J_6)
555+
vpternlogd APPEND(WB,J),APPEND(WB,J_6), TMP0, 0x96; 3-way XOR
563556

564557
; (A <<< 12)
565558
; store in TMP0
@@ -602,13 +595,11 @@ lloop:
602595
; F = E
603596
; E = P(TT2)
604597
vmovups D,C
605-
vprold B,9
606-
vmovups C,B
598+
vprold C,B,9
607599
vmovups B,A
608600
vmovups A,TMP3
609601
vmovups H,G
610-
vprold F,19
611-
vmovups G,F
602+
vprold G,F,19
612603
vmovups F,E
613604
P TMP2
614605
vmovups E,TMP0
@@ -629,14 +620,12 @@ lloop:
629620
%assign J (((I+4) % 20))
630621

631622
vprold APPEND(WB,J),APPEND(WB,J_3),15
632-
vpxord APPEND(WB,J),APPEND(WB,J_16)
633-
vpxord APPEND(WB,J),APPEND(WB,J_9)
623+
vpternlogd APPEND(WB,J),APPEND(WB,J_9),APPEND(WB,J_16), 0x96; 3-way XOR
634624

635625
P1 APPEND(WB,J)
636626

637627
vprold APPEND(WB,J),APPEND(WB,J_13),7
638-
vpxord APPEND(WB,J),TMP0
639-
vpxord APPEND(WB,J),APPEND(WB,J_6)
628+
vpternlogd APPEND(WB,J),APPEND(WB,J_6),TMP0, 0x96; 3-way XOR
640629

641630
; (A <<< 12)
642631
; store in TMP0
@@ -679,13 +668,11 @@ lloop:
679668
; F = E
680669
; E = P(TT2)
681670
vmovups D,C
682-
vprold B,9
683-
vmovups C,B
671+
vprold C,B,9
684672
vmovups B,A
685673
vmovups A,TMP3
686674
vmovups H,G
687-
vprold F,19
688-
vmovups G,F
675+
vprold G,F,19
689676
vmovups F,E
690677
P TMP2
691678
vmovups E,TMP0
@@ -704,9 +691,7 @@ lloop:
704691

705692
%assign cur_loop cur_loop+1
706693
sub SIZE, 1
707-
je last_loop
708-
709-
jmp lloop
694+
jnz lloop
710695

711696

712697
last_loop:

0 commit comments

Comments
 (0)