125125# ; Currently disabled, as this check is handled outside.
126126my $ CHECK_FUNCTION_ARGUMENTS = 0 ;
127127
128- # ; Preload AES round keys 0-3 into dedicated ZMM registers for the >=704-byte
129- # ; 16-/32-block hot loops, instead of reloading them from memory each
130- # ; iteration. This is the optimization currently under evaluation: on some
131- # ; microarchitectures its benefit is negligible, and it forces aliasing the
132- # ; ADDBE counter constants onto AES-key registers (see the counter-setup
133- # ; invariants in GHASH_16_ENCRYPT_16_PARALLEL / _N_GHASH_N / INITIAL_BLOCKS_16).
134- # ; Set to 0 to disable it (round keys are then reloaded from memory). The
135- # ; OPENSSL_AESGCM_NO_KEYPRELOAD build-time environment variable overrides this
136- # ; to 0, so both variants can be built from a single source for A/B evaluation.
137- my $ PRELOAD_AES_ROUND_KEYS = $ ENV {OPENSSL_AESGCM_NO_KEYPRELOAD} ? 0 : 1 ;
138-
139128# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
140129# ;;; Global constants
141130# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -2362,38 +2351,9 @@ sub GHASH_16_ENCRYPT_N_GHASH_N {
23622351 jae .L_16_blocks_overflow_${label_suffix}
23632352___
23642353
2365- # ;; INVARIANT (do not break): the counter increments below MUST use the
2366- # ;; ddq_addbe_*(%rip) memory constants and MUST NOT reference the
2367- # ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 34/35).
2368- # ;;
2369- # ;; Rationale: the large-message callers in GCM_ENC_DEC (the 16-block hot
2370- # ;; loop and the 32-block big loop) deliberately alias those two ZMM
2371- # ;; registers (%zmm27/%zmm28) to preloaded AES round keys and may pass them
2372- # ;; in here only as dead placeholders. If a counter addend were emitted from
2373- # ;; $ADDBE_4x4 / $ADDBE_1234 it would add AES key bytes to the CTR block,
2374- # ;; corrupting the keystream and causing catastrophic keystream/nonce reuse
2375- # ;; (CWE-323). Whole-ZMM adds are used unconditionally (the unused upper
2376- # ;; lanes are harmless), mirroring the overflow branch below.
2377- my $ctr_setup_code = " vpaddd ddq_addbe_1234(%rip),$CTR_BE,$B00_03\n";
2378- if ($NUM_BLOCKS > 4 ) {
2379- $ ctr_setup_code .= " vpaddd ddq_addbe_4444(% rip ),$ B00_03 ,$ B04_07\n " ;
2380- }
2381- if ($ NUM_BLOCKS > 8 ) {
2382- $ ctr_setup_code .= " vpaddd ddq_addbe_4444(% rip ),$ B04_07 ,$ B08_11\n " ;
2383- }
2384- if ($ NUM_BLOCKS > 12 ) {
2385- $ ctr_setup_code .= " vpaddd ddq_addbe_4444(% rip ),$ B08_11 ,$ B12_15\n " ;
2386- }
2387-
2388- # ;; Build-time enforcement of the invariant documented above.
2389- if ($ ctr_setup_code =~ / \Q$ADDBE_4x4\E\b/ || $ ctr_setup_code =~ / \Q$ADDBE_1234\E\b/ ) {
2390- die " GHASH_16_ENCRYPT_N_GHASH_N: counter setup must not reference the "
2391- . " ADDBE register args ($ ADDBE_4x4 /$ ADDBE_1234 ); use ddq_addbe_*(% rip ). "
2392- . " Callers alias these registers to preloaded AES round keys, so this "
2393- . " would corrupt the CTR keystream (keystream/nonce reuse).\n " ;
2394- }
2395-
2396- $ code .= $ ctr_setup_code ;
2354+ &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2355+ $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
2356+ $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
23972357 $code .= <<___;
23982358 jmp .L_16_blocks_ok_${label_suffix}
23992359
@@ -3006,10 +2966,6 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30062966 my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
30072967 my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
30082968 my $IA0 = $_[42]; # [clobbered] temporary GPR
3009- my $PRELOADED_KEY0 = defined($_[43]) ? $_[43] : ""; # [in] optional preloaded AES round key 0
3010- my $PRELOADED_KEY1 = defined($_[44]) ? $_[44] : ""; # [in] optional preloaded AES round key 1
3011- my $PRELOADED_KEY2 = defined($_[45]) ? $_[45] : ""; # [in] optional preloaded AES round key 2
3012- my $PRELOADED_KEY3 = defined($_[46]) ? $_[46] : ""; # [in] optional preloaded AES round key 3
30132969
30142970 my $B00_03 = $ZT1;
30152971 my $B04_07 = $ZT2;
@@ -3055,27 +3011,13 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30553011
30563012 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30573013 # ;; prepare counter blocks
3058- # ;;
3059- # ;; INVARIANT (do not break): the counter increments below MUST use the
3060- # ;; ddq_addbe_*(%rip) memory constants and MUST NOT reference the
3061- # ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 34/35).
3062- # ;;
3063- # ;; Rationale: the large-message callers in GCM_ENC_DEC (the 16-block hot
3064- # ;; loop and the 32-block big loop) deliberately alias those two ZMM
3065- # ;; registers (%zmm27/%zmm28) to preloaded AES round keys 2/3 and pass them
3066- # ;; in here only as dead placeholders. If a counter addend were emitted from
3067- # ;; $ADDBE_4x4 / $ADDBE_1234 it would add AES key bytes to the CTR block,
3068- # ;; corrupting the keystream and causing catastrophic keystream/nonce reuse
3069- # ;; (CWE-323). The $ADDBE_* register args are therefore intentionally unused
3070- # ;; in this macro; the build-time check below enforces that.
3071-
3072- my $ctr_setup_code = <<___;
3014+ $code .= <<___;
30733015 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
30743016 jae .L_16_blocks_overflow_${label_suffix}
3075- vpaddd ddq_addbe_1234(%rip) ,$CTR_BE,$B00_03
3076- vpaddd ddq_addbe_4444(%rip) ,$B00_03,$B04_07
3077- vpaddd ddq_addbe_4444(%rip) ,$B04_07,$B08_11
3078- vpaddd ddq_addbe_4444(%rip) ,$B08_11,$B12_15
3017+ vpaddd $ADDBE_1234 ,$CTR_BE,$B00_03
3018+ vpaddd $ADDBE_4x4 ,$B00_03,$B04_07
3019+ vpaddd $ADDBE_4x4 ,$B04_07,$B08_11
3020+ vpaddd $ADDBE_4x4 ,$B08_11,$B12_15
30793021 jmp .L_16_blocks_ok_${label_suffix}
30803022.L_16_blocks_overflow_${label_suffix}:
30813023 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
@@ -3091,27 +3033,13 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
30913033.L_16_blocks_ok_${label_suffix}:
30923034___
30933035
3094- # ;; Build-time enforcement of the invariant documented above. Scoped to the
3095- # ;; counter-setup snippet only, so the legitimate use of the same physical
3096- # ;; registers as preloaded AES keys later in this macro is not flagged.
3097- if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) {
3098- die "GHASH_16_ENCRYPT_16_PARALLEL: counter setup must not reference the "
3099- . "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). "
3100- . "Callers alias these registers to preloaded AES round keys, so this "
3101- . "would corrupt the CTR keystream (keystream/nonce reuse).\n";
3102- }
3103-
3104- $code .= $ctr_setup_code;
3105-
31063036 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31073037 # ;; pre-load constants
3108- my $ARK_KEY = ($PRELOADED_KEY0 ne "") ? $PRELOADED_KEY0 : $AESKEY1;
3109- my $R1_KEY = ($PRELOADED_KEY1 ne "") ? $PRELOADED_KEY1 : $AESKEY2;
3110- my $R2_KEY = ($PRELOADED_KEY2 ne "") ? $PRELOADED_KEY2 : $AESKEY1;
3111- my $R3_KEY = ($PRELOADED_KEY3 ne "") ? $PRELOADED_KEY3 : $AESKEY2;
3112- if ($PRELOADED_KEY0 eq "") {
3113- $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
3114- }
3038+ my $ARK_KEY = $AESKEY1;
3039+ my $R1_KEY = $AESKEY2;
3040+ my $R2_KEY = $AESKEY1;
3041+ my $R3_KEY = $AESKEY2;
3042+ $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
31153043 if ($GHASH_IN ne "no_ghash_in") {
31163044 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
31173045 } else {
@@ -3127,9 +3055,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31273055 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
31283056 addb \$16,@{[BYTE($CTR_CHECK)]}
31293057___
3130- if ($PRELOADED_KEY1 eq "") {
3131- $code .= " vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2\n";
3132- }
3058+ $code .= " vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2\n";
31333059 $code .= <<___;
31343060 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
31353061 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
@@ -3145,9 +3071,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31453071 vpxorq $ARK_KEY,$B08_11,$B08_11
31463072 vpxorq $ARK_KEY,$B12_15,$B12_15
31473073___
3148- if ($PRELOADED_KEY2 eq "") {
3149- $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
3150- }
3074+ $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
31513075 $code .= <<___;
31523076
31533077 # ;;==================================================
@@ -3166,9 +3090,7 @@ sub GHASH_16_ENCRYPT_16_PARALLEL {
31663090 vaesenc $R1_KEY,$B08_11,$B08_11
31673091 vaesenc $R1_KEY,$B12_15,$B12_15
31683092___
3169- if ($PRELOADED_KEY3 eq "") {
3170- $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
3171- }
3093+ $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
31723094 $code .= <<___;
31733095
31743096 # ;; =================================================
@@ -4159,12 +4081,6 @@ sub GCM_ENC_DEC {
41594081 # ; Unused in the small packet path
41604082 my $ADDBE_4x4 = "%zmm27";
41614083 my $ADDBE_1234 = "%zmm28";
4162- # ; Empty when key preloading is disabled => the stitched macros reload the
4163- # ; round keys from memory (see $PRELOAD_AES_ROUND_KEYS).
4164- my $ PRELOADED_AES_KEY0 = $ PRELOAD_AES_ROUND_KEYS ? " % zmm9" : " " ;
4165- my $ PRELOADED_AES_KEY1 = $ PRELOAD_AES_ROUND_KEYS ? " % zmm23" : " " ;
4166- my $ PRELOADED_AES_KEY2 = $ PRELOAD_AES_ROUND_KEYS ? " % zmm27" : " " ;
4167- my $ PRELOADED_AES_KEY3 = $ PRELOAD_AES_ROUND_KEYS ? " % zmm28" : " " ;
41684084
41694085 my $MASKREG = "%k1";
41704086
@@ -4266,14 +4182,6 @@ sub GCM_ENC_DEC {
42664182 jb .L_4block_early_${label_suffix}
42674183___
42684184
4269- # ;; preload AES round keys 0 and 1 (only msg >= 704; 4/8-block path reloads from $AES_KEYS)
4270- if ($ PRELOAD_AES_ROUND_KEYS ) {
4271- $ code .= <<___;
4272- vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$PRELOADED_AES_KEY0
4273- vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$PRELOADED_AES_KEY1
4274- ___
4275- }
4276-
42774185 # ;; ==== AES-CTR - first 16 blocks
42784186 my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
42794187 my $data_in_out_offset = 0;
@@ -4304,12 +4212,6 @@ sub GCM_ENC_DEC {
43044212
43054213 $code .= "mov \$1,$HKEYS_READY\n";
43064214
4307- # ;; Preload extra AES round keys (overwrites ADDBE registers)
4308- if ($PRELOAD_AES_ROUND_KEYS) {
4309- $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n";
4310- $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n";
4311- }
4312-
43134215 $code .= <<___;
43144216 add \$`(16 * 16)`,$DATA_OFFSET
43154217 sub \$`(16 * 16)`,$LENGTH
@@ -4341,7 +4243,7 @@ sub GCM_ENC_DEC {
43414243 $GL,
43424244 $GH, $GM, "first_time_reduction", $ENC_DEC, $data_in_out_offset,
43434245 $AAD_HASHz,
4344- $IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3 );
4246+ $IA0);
43454247
43464248 $code .= <<___;
43474249 vmovdqa64 $ZTMP4,$AAD_HASHz
@@ -4354,10 +4256,6 @@ sub GCM_ENC_DEC {
43544256.L_small_loop_tail_${label_suffix}:
43554257___
43564258
4357- # ;; Restore ADDBE constants (overwritten by preloaded AES keys)
4358- $code .= " vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4\n";
4359- $code .= " vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234\n";
4360-
43614259 # ;; GHASH the 16 blocks still on stack and reduce
43624260 &GHASH_16(
43634261 "start_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, 0,
@@ -4715,12 +4613,6 @@ sub GCM_ENC_DEC {
47154613 "mid16");
47164614 $code .= "mov \$1,$HKEYS_READY\n";
47174615
4718- # ;; Overwrite ADDBE registers with preloaded AES round keys 2 and 3
4719- if ($PRELOAD_AES_ROUND_KEYS) {
4720- $code .= " vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$PRELOADED_AES_KEY2\n";
4721- $code .= " vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$PRELOADED_AES_KEY3\n";
4722- }
4723-
47244616 $code .= <<___;
47254617 add \$`(32 * 16)`,$DATA_OFFSET
47264618 sub \$`(32 * 16)`,$LENGTH
@@ -4755,7 +4647,7 @@ sub GCM_ENC_DEC {
47554647 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
47564648 $GL,
47574649 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4758- $IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3 );
4650+ $IA0);
47594651
47604652 # ;; ==== AES-CTR + GHASH - 16 blocks, slot 1 (in-place), reduction
47614653 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
@@ -4775,7 +4667,7 @@ sub GCM_ENC_DEC {
47754667 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
47764668 $GL,
47774669 $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
4778- $IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1, $PRELOADED_AES_KEY2, $PRELOADED_AES_KEY3 );
4670+ $IA0);
47794671
47804672 # ;; === xor cipher block 0 with GHASH (ZT4)
47814673 $code .= <<___;
@@ -4787,10 +4679,6 @@ sub GCM_ENC_DEC {
47874679 jae .L_encrypt_big_nblocks_${label_suffix}
47884680
47894681.L_no_more_big_nblocks_${label_suffix}:
4790- # ;; Restore ADDBE constants (overwritten by preloaded AES keys)
4791- vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
4792- vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
4793-
47944682___
47954683
47964684 # ;; =====================================================
@@ -4853,7 +4741,7 @@ sub GCM_ENC_DEC {
48534741 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
48544742 $GL,
48554743 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4856- $IA0, $PRELOADED_AES_KEY0, $PRELOADED_AES_KEY1 );
4744+ $IA0);
48574745
48584746 # ;; Finish the 32-block pending chain: hash slot1 (H^16..H^1) with reduction.
48594747 &GHASH_16(
@@ -4965,40 +4853,16 @@ sub INITIAL_BLOCKS_16 {
49654853 my $label_suffix = $label_count++;
49664854
49674855 my $stack_offset = $BLK_OFFSET;
4968-
4969- # ;; INVARIANT (do not break): the counter increments in the non-overflow path
4970- # ;; MUST use the ddq_addbe_*(%rip) memory constants and MUST NOT reference the
4971- # ;; $ADDBE_4x4 / $ADDBE_1234 register arguments (args 8/7).
4972- # ;;
4973- # ;; Rationale: the large-message callers in GCM_ENC_DEC deliberately alias
4974- # ;; those two ZMM registers (%zmm27/%zmm28) to preloaded AES round keys. Even
4975- # ;; though this macro currently runs before that aliasing, emitting a counter
4976- # ;; addend from $ADDBE_4x4 / $ADDBE_1234 would add AES key bytes to the CTR
4977- # ;; block if the call order ever changed, corrupting the keystream and causing
4978- # ;; catastrophic keystream/nonce reuse (CWE-323). The register args are
4979- # ;; therefore intentionally unused here; the build-time check below enforces it.
4980- my $ctr_setup_code = <<___;
4981- vpaddd ddq_addbe_1234(%rip),$CTR,$B00_03
4982- vpaddd ddq_addbe_4444(%rip),$B00_03,$B04_07
4983- vpaddd ddq_addbe_4444(%rip),$B04_07,$B08_11
4984- vpaddd ddq_addbe_4444(%rip),$B08_11,$B12_15
4985- ___
4986- if ($ctr_setup_code =~ /\Q$ADDBE_4x4\E\b/ || $ctr_setup_code =~ /\Q$ADDBE_1234\E\b/) {
4987- die "INITIAL_BLOCKS_16: counter setup must not reference the "
4988- . "ADDBE register args ($ADDBE_4x4/$ADDBE_1234); use ddq_addbe_*(%rip). "
4989- . "Callers alias these registers to preloaded AES round keys, so this "
4990- . "would corrupt the CTR keystream (keystream/nonce reuse).\n";
4991- }
4992-
49934856 $code .= <<___;
49944857 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49954858 # ;; prepare counter blocks
49964859
49974860 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
49984861 jae .L_next_16_overflow_${label_suffix}
4999- ___
5000- $code .= $ctr_setup_code;
5001- $code .= <<___;
4862+ vpaddd $ADDBE_1234,$CTR,$B00_03
4863+ vpaddd $ADDBE_4x4,$B00_03,$B04_07
4864+ vpaddd $ADDBE_4x4,$B04_07,$B08_11
4865+ vpaddd $ADDBE_4x4,$B08_11,$B12_15
50024866 jmp .L_next_16_ok_${label_suffix}
50034867.L_next_16_overflow_${label_suffix}:
50044868 vpshufb $SHUF_MASK,$CTR,$CTR
0 commit comments