|
125 | 125 | # ; Currently disabled, as this check is handled outside. |
126 | 126 | my $CHECK_FUNCTION_ARGUMENTS = 0; |
127 | 127 |
|
| 128 | +# ; Preload AES round keys 0-3 into dedicated ZMM registers for the >=704-byte |
| 129 | +# ; 16-/32-block hot loops, instead of reloading them from memory each |
| 130 | +# ; iteration. This is the optimization currently under evaluation: on some |
| 131 | +# ; microarchitectures its benefit is negligible, and it forces aliasing the |
| 132 | +# ; ADDBE counter constants onto AES-key registers (see the counter-setup |
| 133 | +# ; invariants in GHASH_16_ENCRYPT_16_PARALLEL / _N_GHASH_N / INITIAL_BLOCKS_16). |
| 134 | +# ; Set to 0 to disable it (round keys are then reloaded from memory). Setting |
| 135 | +# ; the OPENSSL_AESGCM_NO_KEYPRELOAD build-time environment variable to any true |
| 136 | +# ; value (not empty, not "0") forces 0, so both variants build from one source. |
| 137 | +my $PRELOAD_AES_ROUND_KEYS = $ENV{OPENSSL_AESGCM_NO_KEYPRELOAD} ? 0 : 1; |
| 138 | + |
128 | 139 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
129 | 140 | # ;;; Global constants |
130 | 141 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
@@ -2351,9 +2362,38 @@ sub GHASH_16_ENCRYPT_N_GHASH_N { |
2351 | 2362 | jae .L_16_blocks_overflow_${label_suffix} |
2352 | 2363 | ___ |
2353 | 2364 |
|
2354 | | - &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16( |
2355 | | - $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE, |
2356 | | - $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4); |
| 2365 | + # ;; INVARIANT (do not break): the counter increments below MUST use the |
| 2366 | + # ;; ddq_addbe_*(%rip) memory constants and MUST NOT reference the |
| 2367 | + # ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 34/35). |
| 2368 | + # ;; |
| 2369 | + # ;; Rationale: the large-message callers in GCM_ENC_DEC (the 16-block hot |
| 2370 | + # ;; loop and the 32-block big loop) deliberately alias those two ZMM |
| 2371 | + # ;; registers (%zmm27/%zmm28) to preloaded AES round keys and may pass them |
| 2372 | + # ;; in here only as dead placeholders. If a counter addend were emitted from |
| 2373 | + # ;; $ADDBE_4x4 / $ADDBE_1234 it would add AES key bytes to the CTR block, |
| 2374 | + # ;; corrupting the keystream and causing catastrophic keystream/nonce reuse |
| 2375 | + # ;; (CWE-323). Whole-ZMM adds are used unconditionally (the unused upper |
| 2376 | + # ;; lanes are harmless), mirroring the overflow branch below. |
| 2377 | + my $ctr_setup_code = " vpaddd ddq_addbe_1234(%rip),$CTR_BE,$B00_03\n"; |
| 2378 | + if ($NUM_BLOCKS > 4) { |
| 2379 | + $ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07\n"; |
| 2380 | + } |
| 2381 | + if ($NUM_BLOCKS > 8) { |
| 2382 | + $ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11\n"; |
| 2383 | + } |
| 2384 | + if ($NUM_BLOCKS > 12) { |
| 2385 | + $ctr_setup_code .= " vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15\n"; |
| 2386 | + } |
| 2387 | + |
| 2388 | + # ;; Build-time enforcement of the invariant documented above. |
| 2389 | + if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) { |
| 2390 | + die "GHASH_16_ENCRYPT_N_GHASH_N: counter setup must not reference the " |
| 2391 | + . "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). " |
| 2392 | + . "Callers alias these registers to preloaded AES round keys, so this " |
| 2393 | + . "would corrupt the CTR keystream (keystream/nonce reuse).\n"; |
| 2394 | + } |
| 2395 | + |
| 2396 | + $code .= $ctr_setup_code; |
2357 | 2397 | $code .= <<___; |
2358 | 2398 | jmp .L_16_blocks_ok_${label_suffix} |
2359 | 2399 |
|
@@ -4119,10 +4159,12 @@ sub GCM_ENC_DEC { |
4119 | 4159 | # ; Unused in the small packet path |
4120 | 4160 | my $ADDBE_4x4 = "%zmm27"; |
4121 | 4161 | my $ADDBE_1234 = "%zmm28"; |
4122 | | - my $PRELOADED_AES_KEY0 = "%zmm9"; |
4123 | | - my $PRELOADED_AES_KEY1 = "%zmm23"; |
4124 | | - my $PRELOADED_AES_KEY2 = "%zmm27"; |
4125 | | - my $PRELOADED_AES_KEY3 = "%zmm28"; |
| 4162 | + # ; Empty when key preloading is disabled => the stitched macros reload the |
| 4163 | + # ; round keys from memory (see $PRELOAD_AES_ROUND_KEYS). |
| 4164 | + my $PRELOADED_AES_KEY0 = $PRELOAD_AES_ROUND_KEYS ? "%zmm9" : ""; |
| 4165 | + my $PRELOADED_AES_KEY1 = $PRELOAD_AES_ROUND_KEYS ? "%zmm23" : ""; |
| 4166 | + my $PRELOADED_AES_KEY2 = $PRELOAD_AES_ROUND_KEYS ? "%zmm27" : ""; |
| 4167 | + my $PRELOADED_AES_KEY3 = $PRELOAD_AES_ROUND_KEYS ? "%zmm28" : ""; |
4126 | 4168 |
|
4127 | 4169 | my $MASKREG = "%k1"; |
4128 | 4170 |
|
@@ -4225,10 +4267,12 @@ sub GCM_ENC_DEC { |
4225 | 4267 | ___ |
4226 | 4268 |
|
4227 | 4269 | # ;; preload AES round keys 0 and 1 (only msg >= 704; 4/8-block path reloads from $AES_KEYS) |
4228 | | - $code .= <<___; |
| 4270 | + if ($PRELOAD_AES_ROUND_KEYS) { |
| 4271 | + $code .= <<___; |
4229 | 4272 | vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$PRELOADED_AES_KEY0 |
4230 | 4273 | vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$PRELOADED_AES_KEY1 |
4231 | 4274 | ___ |
| 4275 | + } |
4232 | 4276 |
|
4233 | 4277 | # ;; ==== AES-CTR - first 16 blocks |
4234 | 4278 | my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16)); |
@@ -4261,8 +4305,10 @@ sub GCM_ENC_DEC { |
4261 | 4305 | $code .= "mov \$1,$HKEYS_READY\n"; |
4262 | 4306 |
|
4263 | 4307 | # ;; Preload extra AES round keys (overwrites ADDBE registers) |
| 4308 | + if ($PRELOAD_AES_ROUND_KEYS) { |
4264 | 4309 | $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n"; |
4265 | 4310 | $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n"; |
| 4311 | + } |
4266 | 4312 |
|
4267 | 4313 | $code .= <<___; |
4268 | 4314 | add \$`(16 * 16)`,$DATA_OFFSET |
@@ -4670,8 +4716,10 @@ sub GCM_ENC_DEC { |
4670 | 4716 | $code .= "mov \$1,$HKEYS_READY\n"; |
4671 | 4717 |
|
4672 | 4718 | # ;; Overwrite ADDBE registers with preloaded AES round keys 2 and 3 |
4673 | | - $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n"; |
4674 | | - $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n"; |
| 4719 | + if ($PRELOAD_AES_ROUND_KEYS) { |
| 4720 | + $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n"; |
| 4721 | + $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n"; |
| 4722 | + } |
4675 | 4723 |
|
4676 | 4724 | $code .= <<___; |
4677 | 4725 | add \$`(32 * 16)`,$DATA_OFFSET |
@@ -4917,16 +4965,40 @@ sub INITIAL_BLOCKS_16 { |
4917 | 4965 | my $label_suffix = $label_count++; |
4918 | 4966 |
|
4919 | 4967 | my $stack_offset = $BLK_OFFSET; |
| 4968 | +
|
| 4969 | + # ;; INVARIANT (do not break): the counter increments in the non-overflow path |
| 4970 | + # ;; MUST use the ddq_addbe_*(%rip) memory constants and MUST NOT reference the |
| 4971 | + # ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 8/7). |
| 4972 | + # ;; |
| 4973 | + # ;; Rationale: the large-message callers in GCM_ENC_DEC deliberately alias |
| 4974 | + # ;; those two ZMM registers (%zmm27/%zmm28) to preloaded AES round keys. Even |
| 4975 | + # ;; though this macro currently runs before that aliasing, emitting a counter |
| 4976 | + # ;; addend from $ADDBE_4x4 / $ADDBE_1234 would add AES key bytes to the CTR |
| 4977 | + # ;; block if the call order ever changed, corrupting the keystream and causing |
| 4978 | + # ;; catastrophic keystream/nonce reuse (CWE-323). The register args are |
| 4979 | + # ;; therefore intentionally unused here; the build-time check below enforces it. |
| 4980 | + my $ctr_setup_code = <<___; |
| 4981 | + vpaddd ddq_addbe_1234(%rip),$CTR,$B00_03 |
| 4982 | + vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07 |
| 4983 | + vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11 |
| 4984 | + vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15 |
| 4985 | +___ |
| 4986 | + if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) { |
| 4987 | + die "INITIAL_BLOCKS_16: counter setup must not reference the " |
| 4988 | + . "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). " |
| 4989 | + . "Callers alias these registers to preloaded AES round keys, so this " |
| 4990 | + . "would corrupt the CTR keystream (keystream/nonce reuse).\n"; |
| 4991 | + } |
| 4992 | +
|
4920 | 4993 | $code .= <<___; |
4921 | 4994 | # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
4922 | 4995 | # ;; prepare counter blocks |
4923 | 4996 |
|
4924 | 4997 | cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]} |
4925 | 4998 | jae .L_next_16_overflow_${label_suffix} |
4926 | | - vpaddd $ADDBE_1234,$CTR,$B00_03 |
4927 | | - vpaddd $ADDBE_4x4,$B00_03,$B04_07 |
4928 | | - vpaddd $ADDBE_4x4,$B04_07,$B08_11 |
4929 | | - vpaddd $ADDBE_4x4,$B08_11,$B12_15 |
| 4999 | +___ |
| 5000 | + $code .= $ctr_setup_code; |
| 5001 | + $code .= <<___; |
4930 | 5002 | jmp .L_next_16_ok_${label_suffix} |
4931 | 5003 | .L_next_16_overflow_${label_suffix}: |
4932 | 5004 | vpshufb $SHUF_MASK,$CTR,$CTR |
|
0 commit comments