Skip to content

Commit c12d97e

Browse files
committed
aes-gcm-avx512: remove AES round-key preloading
Drop the preload optimization added earlier (and the build-time toggle and ADDBE-aliasing guards that accompanied it). Round keys 0-3 are reloaded from memory in the stitched 16-/32-block macros again, the ADDBE counter constants are used directly from their registers (no more aliasing onto AES key registers), and the now-dead "restore ADDBE" sequences after the loops are removed.

Rationale (measured, paired ON/OFF, drift-cancelled):

- Small/medium packets (<= ~4 KB, the target regime): preloading gains ~0% on both AMD Zen 5 (EPYC 9655) and Intel Ice Lake (Xeon 8380). The published ~1.13-1.28x speedup over master is fully retained without preloading.
- Large buffers (>= 8 KB): preloading gains a flat ~1.5% on Zen 5, plateauing through 256 KB (the reload it avoids is a constant per-16-block cost, so the gain neither amortizes away nor grows); ~0% on Ice Lake.

A Zen 5-only ~1.5% uplift confined to large buffers outside the target regime does not justify the register-aliasing complexity or the keystream/nonce-reuse hazard (CWE-323) that the preload introduces, so it is removed.
1 parent 7e2cbb8 commit c12d97e

1 file changed

Lines changed: 24 additions & 160 deletions

File tree

crypto/modes/asm/aes-gcm-avx512.pl

Lines changed: 24 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -125,17 +125,6 @@
125125
# ; Currently disabled, as this check is handled outside.
126126
my $CHECK_FUNCTION_ARGUMENTS = 0;
127127

128-
# ; Preload AES round keys 0-3 into dedicated ZMM registers for the >=704-byte
129-
# ; 16-/32-block hot loops, instead of reloading them from memory each
130-
# ; iteration. This is the optimization currently under evaluation: on some
131-
# ; microarchitectures its benefit is negligible, and it forces aliasing the
132-
# ; ADDBE counter constants onto AES-key registers (see the counter-setup
133-
# ; invariants in GHASH_16_ENCRYPT_16_PARALLEL / _N_GHASH_N / INITIAL_BLOCKS_16).
134-
# ; Set to 0 to disable it (round keys are then reloaded from memory). The
135-
# ; OPENSSL_AESGCM_NO_KEYPRELOAD build-time environment variable overrides this
136-
# ; to 0, so both variants can be built from a single source for A/B evaluation.
137-
my $PRELOAD_AES_ROUND_KEYS = $ENV{OPENSSL_AESGCM_NO_KEYPRELOAD} ? 0 : 1;
138-
139128
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
140129
# ;;; Global constants
141130
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2362,38 +2351,9 @@ sub GHASH_16_ENCRYPT_N_GHASH_N {
23622351
jae .L_16_blocks_overflow_${label_suffix}
23632352
___
23642353
2365-
# ;; INVARIANT (do not break): the counter increments below MUST use the
2366-
# ;; ddq_addbe_*(%rip) memory constants and MUST NOT reference the
2367-
# ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 34/35).
2368-
# ;;
2369-
# ;; Rationale: the large-message callers in GCM_ENC_DEC (the 16-block hot
2370-
# ;; loop and the 32-block big loop) deliberately alias those two ZMM
2371-
# ;; registers (%zmm27/%zmm28) to preloaded AES round keys and may pass them
2372-
# ;; in here only as dead placeholders. If a counter addend were emitted from
2373-
# ;; $ADDBE_4x4 / $ADDBE_1234 it would add AES key bytes to the CTR block,
2374-
# ;; corrupting the keystream and causing catastrophic keystream/nonce reuse
2375-
# ;; (CWE-323). Whole-ZMM adds are used unconditionally (the unused upper
2376-
# ;; lanes are harmless), mirroring the overflow branch below.
2377-
my $ctr_setup_code = " vpaddd ddq_addbe_1234(%rip),$CTR_BE,$B00_03\n";
2378-
if ($NUM_BLOCKS > 4) {
2379-
$ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07\n";
2380-
}
2381-
if ($NUM_BLOCKS > 8) {
2382-
$ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11\n";
2383-
}
2384-
if ($NUM_BLOCKS > 12) {
2385-
$ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15\n";
2386-
}
2387-
2388-
# ;; Build-time enforcement of the invariant documented above.
2389-
if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) {
2390-
die "GHASH_16_ENCRYPT_N_GHASH_N: counter setup must not reference the "
2391-
. "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). "
2392-
. "Callers alias these registers to preloaded AES round keys, so this "
2393-
. "would corrupt the CTR keystream (keystream/nonce reuse).\n";
2394-
}
2395-
2396-
$code .= $ctr_setup_code;
2354+
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2355+
$NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
2356+
$B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
23972357
$code .= <<___;
23982358
jmp .L_16_blocks_ok_${label_suffix}
23992359
@@ -3006,10 +2966,6 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30062966
my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
30072967
my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
30082968
my $IA0 = $_[42]; # [clobbered] temporary GPR
3009-
my $PRELOADED_KEY0 = defined($_[43]) ? $_[43] : ""; # [in] optional preloaded AES round key 0
3010-
my $PRELOADED_KEY1 = defined($_[44]) ? $_[44] : ""; # [in] optional preloaded AES round key 1
3011-
my $PRELOADED_KEY2 = defined($_[45]) ? $_[45] : ""; # [in] optional preloaded AES round key 2
3012-
my $PRELOADED_KEY3 = defined($_[46]) ? $_[46] : ""; # [in] optional preloaded AES round key 3
30132969
30142970
my $B00_03 = $ZT1;
30152971
my $B04_07 = $ZT2;
@@ -3055,27 +3011,13 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30553011
30563012
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30573013
# ;; prepare counter blocks
3058-
# ;;
3059-
# ;; INVARIANT (do not break): the counter increments below MUST use the
3060-
# ;; ddq_addbe_*(%rip) memory constants and MUST NOT reference the
3061-
# ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 34/35).
3062-
# ;;
3063-
# ;; Rationale: the large-message callers in GCM_ENC_DEC (the 16-block hot
3064-
# ;; loop and the 32-block big loop) deliberately alias those two ZMM
3065-
# ;; registers (%zmm27/%zmm28) to preloaded AES round keys 2/3 and pass them
3066-
# ;; in here only as dead placeholders. If a counter addend were emitted from
3067-
# ;; $ADDBE_4x4 / $ADDBE_1234 it would add AES key bytes to the CTR block,
3068-
# ;; corrupting the keystream and causing catastrophic keystream/nonce reuse
3069-
# ;; (CWE-323). The $ADDBE_* register args are therefore intentionally unused
3070-
# ;; in this macro; the build-time check below enforces that.
3071-
3072-
my $ctr_setup_code = <<___;
3014+
$code .= <<___;
30733015
cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
30743016
jae .L_16_blocks_overflow_${label_suffix}
3075-
vpaddd ddq_addbe_1234(%rip),$CTR_BE,$B00_03
3076-
vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07
3077-
vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11
3078-
vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15
3017+
vpaddd $ADDBE_1234,$CTR_BE,$B00_03
3018+
vpaddd $ADDBE_4x4,$B00_03,$B04_07
3019+
vpaddd $ADDBE_4x4,$B04_07,$B08_11
3020+
vpaddd $ADDBE_4x4,$B08_11,$B12_15
30793021
jmp .L_16_blocks_ok_${label_suffix}
30803022
.L_16_blocks_overflow_${label_suffix}:
30813023
vpshufb $SHFMSK,$CTR_BE,$CTR_BE
@@ -3091,27 +3033,13 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30913033
.L_16_blocks_ok_${label_suffix}:
30923034
___
30933035
3094-
# ;; Build-time enforcement of the invariant documented above. Scoped to the
3095-
# ;; counter-setup snippet only, so the legitimate use of the same physical
3096-
# ;; registers as preloaded AES keys later in this macro is not flagged.
3097-
if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) {
3098-
die "GHASH_16_ENCRYPT_16_PARALLEL: counter setup must not reference the "
3099-
. "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). "
3100-
. "Callers alias these registers to preloaded AES round keys, so this "
3101-
. "would corrupt the CTR keystream (keystream/nonce reuse).\n";
3102-
}
3103-
3104-
$code .= $ctr_setup_code;
3105-
31063036
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31073037
# ;; pre-load constants
3108-
my $ARK_KEY = ($PRELOADED_KEY0 ne "") ? $PRELOADED_KEY0 : $AESKEY1;
3109-
my $R1_KEY = ($PRELOADED_KEY1 ne "") ? $PRELOADED_KEY1 : $AESKEY2;
3110-
my $R2_KEY = ($PRELOADED_KEY2 ne "") ? $PRELOADED_KEY2 : $AESKEY1;
3111-
my $R3_KEY = ($PRELOADED_KEY3 ne "") ? $PRELOADED_KEY3 : $AESKEY2;
3112-
if ($PRELOADED_KEY0 eq "") {
3113-
$code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
3114-
}
3038+
my $ARK_KEY = $AESKEY1;
3039+
my $R1_KEY = $AESKEY2;
3040+
my $R2_KEY = $AESKEY1;
3041+
my $R3_KEY = $AESKEY2;
3042+
$code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
31153043
if ($GHASH_IN ne "no_ghash_in") {
31163044
$code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
31173045
} else {
@@ -3127,9 +3055,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31273055
vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
31283056
addb \$16,@{[BYTE($CTR_CHECK)]}
31293057
___
3130-
if ($PRELOADED_KEY1 eq "") {
3131-
$code .= " vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2\n";
3132-
}
3058+
$code .= " vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2\n";
31333059
$code .= <<___;
31343060
vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
31353061
vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
@@ -3145,9 +3071,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31453071
vpxorq $ARK_KEY,$B08_11,$B08_11
31463072
vpxorq $ARK_KEY,$B12_15,$B12_15
31473073
___
3148-
if ($PRELOADED_KEY2 eq "") {
3149-
$code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
3150-
}
3074+
$code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
31513075
$code .= <<___;
31523076
31533077
# ;;==================================================
@@ -3166,9 +3090,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31663090
vaesenc $R1_KEY,$B08_11,$B08_11
31673091
vaesenc $R1_KEY,$B12_15,$B12_15
31683092
___
3169-
if ($PRELOADED_KEY3 eq "") {
3170-
$code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
3171-
}
3093+
$code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
31723094
$code .= <<___;
31733095
31743096
# ;; =================================================
@@ -4159,12 +4081,6 @@ sub GCM_ENC_DEC {
41594081
# ; Unused in the small packet path
41604082
my $ADDBE_4x4 = "%zmm27";
41614083
my $ADDBE_1234 = "%zmm28";
4162-
# ; Empty when key preloading is disabled => the stitched macros reload the
4163-
# ; round keys from memory (see $PRELOAD_AES_ROUND_KEYS).
4164-
my $PRELOADED_AES_KEY0 = $PRELOAD_AES_ROUND_KEYS ? "%zmm9" : "";
4165-
my $PRELOADED_AES_KEY1 = $PRELOAD_AES_ROUND_KEYS ? "%zmm23" : "";
4166-
my $PRELOADED_AES_KEY2 = $PRELOAD_AES_ROUND_KEYS ? "%zmm27" : "";
4167-
my $PRELOADED_AES_KEY3 = $PRELOAD_AES_ROUND_KEYS ? "%zmm28" : "";
41684084
41694085
my $MASKREG = "%k1";
41704086
@@ -4266,14 +4182,6 @@ sub GCM_ENC_DEC {
42664182
jb .L_4block_early_${label_suffix}
42674183
___
42684184
4269-
# ;; preload AES round keys 0 and 1 (only msg >= 704; 4/8-block path reloads from $AES_KEYS)
4270-
if ($PRELOAD_AES_ROUND_KEYS) {
4271-
$code .= <<___;
4272-
vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$PRELOADED_AES_KEY0
4273-
vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$PRELOADED_AES_KEY1
4274-
___
4275-
}
4276-
42774185
# ;; ==== AES-CTR - first 16 blocks
42784186
my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
42794187
my $data_in_out_offset = 0;
@@ -4304,12 +4212,6 @@ sub GCM_ENC_DEC {
43044212
43054213
$code .= "mov \$1,$HKEYS_READY\n";
43064214
4307-
# ;; Preload extra AES round keys (overwrites ADDBE registers)
4308-
if ($PRELOAD_AES_ROUND_KEYS) {
4309-
$code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n";
4310-
$code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n";
4311-
}
4312-
43134215
$code .= <<___;
43144216
add \$`(16 * 16)`,$DATA_OFFSET
43154217
sub \$`(16 * 16)`,$LENGTH
@@ -4341,7 +4243,7 @@ sub GCM_ENC_DEC {
43414243
$GL,
43424244
$GH, $GM, "first_time_reduction", $ENC_DEC, $data_in_out_offset,
43434245
$AAD_HASHz,
4344-
$IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3);
4246+
$IA0);
43454247
43464248
$code .= <<___;
43474249
vmovdqa64 $ZTMP4,$AAD_HASHz
@@ -4354,10 +4256,6 @@ sub GCM_ENC_DEC {
43544256
.L_small_loop_tail_${label_suffix}:
43554257
___
43564258
4357-
# ;; Restore ADDBE constants (overwritten by preloaded AES keys)
4358-
$code .= " vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4\n";
4359-
$code .= " vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234\n";
4360-
43614259
# ;; GHASH the 16 blocks still on stack and reduce
43624260
&GHASH_16(
43634261
"start_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, 0,
@@ -4715,12 +4613,6 @@ sub GCM_ENC_DEC {
47154613
"mid16");
47164614
$code .= "mov \$1,$HKEYS_READY\n";
47174615
4718-
# ;; Overwrite ADDBE registers with preloaded AES round keys 2 and 3
4719-
if ($PRELOAD_AES_ROUND_KEYS) {
4720-
$code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n";
4721-
$code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n";
4722-
}
4723-
47244616
$code .= <<___;
47254617
add \$`(32 * 16)`,$DATA_OFFSET
47264618
sub \$`(32 * 16)`,$LENGTH
@@ -4755,7 +4647,7 @@ sub GCM_ENC_DEC {
47554647
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
47564648
$GL,
47574649
$GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4758-
$IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3);
4650+
$IA0);
47594651
47604652
# ;; ==== AES-CTR + GHASH - 16 blocks, slot 1 (in-place), reduction
47614653
$aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
@@ -4775,7 +4667,7 @@ sub GCM_ENC_DEC {
47754667
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
47764668
$GL,
47774669
$GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
4778-
$IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3);
4670+
$IA0);
47794671
47804672
# ;; === xor cipher block 0 with GHASH (ZT4)
47814673
$code .= <<___;
@@ -4787,10 +4679,6 @@ sub GCM_ENC_DEC {
47874679
jae .L_encrypt_big_nblocks_${label_suffix}
47884680
47894681
.L_no_more_big_nblocks_${label_suffix}:
4790-
# ;; Restore ADDBE constants (overwritten by preloaded AES keys)
4791-
vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
4792-
vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
4793-
47944682
___
47954683
47964684
# ;; =====================================================
@@ -4853,7 +4741,7 @@ sub GCM_ENC_DEC {
48534741
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
48544742
$GL,
48554743
$GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4856-
$IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1);
4744+
$IA0);
48574745
48584746
# ;; Finish the 32-block pending chain: hash slot1 (H^16..H^1) with reduction.
48594747
&GHASH_16(
@@ -4965,40 +4853,16 @@ sub INITIAL_BLOCKS_16 {
49654853
my $label_suffix = $label_count++;
49664854
49674855
my $stack_offset = $BLK_OFFSET;
4968-
4969-
# ;; INVARIANT (do not break): the counter increments in the non-overflow path
4970-
# ;; MUST use the ddq_addbe_*(%rip) memory constants and MUST NOT reference the
4971-
# ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 8/7).
4972-
# ;;
4973-
# ;; Rationale: the large-message callers in GCM_ENC_DEC deliberately alias
4974-
# ;; those two ZMM registers (%zmm27/%zmm28) to preloaded AES round keys. Even
4975-
# ;; though this macro currently runs before that aliasing, emitting a counter
4976-
# ;; addend from $ADDBE_4x4 / $ADDBE_1234 would add AES key bytes to the CTR
4977-
# ;; block if the call order ever changed, corrupting the keystream and causing
4978-
# ;; catastrophic keystream/nonce reuse (CWE-323). The register args are
4979-
# ;; therefore intentionally unused here; the build-time check below enforces it.
4980-
my $ctr_setup_code = <<___;
4981-
vpaddd ddq_addbe_1234(%rip),$CTR,$B00_03
4982-
vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07
4983-
vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11
4984-
vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15
4985-
___
4986-
if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) {
4987-
die "INITIAL_BLOCKS_16: counter setup must not reference the "
4988-
. "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). "
4989-
. "Callers alias these registers to preloaded AES round keys, so this "
4990-
. "would corrupt the CTR keystream (keystream/nonce reuse).\n";
4991-
}
4992-
49934856
$code .= <<___;
49944857
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49954858
# ;; prepare counter blocks
49964859
49974860
cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
49984861
jae .L_next_16_overflow_${label_suffix}
4999-
___
5000-
$code .= $ctr_setup_code;
5001-
$code .= <<___;
4862+
vpaddd $ADDBE_1234,$CTR,$B00_03
4863+
vpaddd $ADDBE_4x4,$B00_03,$B04_07
4864+
vpaddd $ADDBE_4x4,$B04_07,$B08_11
4865+
vpaddd $ADDBE_4x4,$B08_11,$B12_15
50024866
jmp .L_next_16_ok_${label_suffix}
50034867
.L_next_16_overflow_${label_suffix}:
50044868
vpshufb $SHUF_MASK,$CTR,$CTR

0 commit comments

Comments
 (0)