Fix missing vzeroupper in gcm_ghash_vpclmulqdq_avx2() for len=16
[Adapted by Brian Smith to *ring*'s prior modifications.]
gcm_ghash_vpclmulqdq_avx2() executed vzeroupper only when len >= 32,
since it was supposed to use only xmm registers for len == 16. But it
was actually writing two ymm registers unconditionally, for $BSWAP_MASK
and $GFPOLY. The dirty upper ymm halves could therefore cause an
AVX-to-SSE transition penalty (a slow-down) in later code using legacy
SSE instructions, in the (probably rare) case where
gcm_ghash_vpclmulqdq_avx2() was called with len=16 *and* wasn't followed
by a function that does vzeroupper, e.g. aes_gcm_enc_update_vaes_avx2().
(The Windows xmm register restore epilogue of
gcm_ghash_vpclmulqdq_avx2() itself uses legacy SSE instructions, so it
was probably slowed down by this, but that's just 8 instructions.)
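As a rough illustration only: the sketch below is a hand-written
C-intrinsics mirror of the problematic control flow, not the actual
perlasm. The function name, constants, and stubbed-out arithmetic are
all hypothetical; only the shape (unconditional ymm writes, conditional
vzeroupper) reflects the bug.

    /* Hypothetical sketch of the buggy shape; compile with -mavx2.
     * The GHASH arithmetic itself is stubbed out. */
    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    void ghash_update_sketch(uint8_t Xi[16], const uint8_t *in, size_t len) {
        /* Illustrative stand-ins for $BSWAP_MASK and $GFPOLY; these
         * broadcasts write full ymm registers regardless of len. */
        __m256i bswap_mask = _mm256_broadcastsi128_si256(
            _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
        __m256i gfpoly = _mm256_broadcastsi128_si256(
            _mm_set_epi64x((int64_t)0xc200000000000000u, 1));

        if (len >= 32) {
            /* ... two-blocks-at-a-time main loop using ymm registers,
             * consuming bswap_mask and gfpoly ... */
            (void)bswap_mask;
            (void)gfpoly;
            _mm256_zeroupper();  /* the only place vzeroupper was executed */
            return;
        }

        /* len == 16: the math here uses only xmm registers, but the ymm
         * upper halves written above are still dirty, so later legacy-SSE
         * code can pay an AVX-to-SSE transition penalty. */
        __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)Xi),
                                  _mm_loadu_si128((const __m128i *)in));
        _mm_storeu_si128((__m128i *)Xi, x);
    }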
Fix this by updating gcm_ghash_vpclmulqdq_avx2() to use only xmm
registers when len=16. This also brings it closer to the similar code in
aes-gcm-avx10-x86_64.pl, which already handles this case correctly.
Also, make both functions execute vzeroupper unconditionally, so that it
can't be missed again. vzeroupper costs only about 1 cycle on the CPUs
this code runs on, so executing it conditionally no longer seems
worthwhile.
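Sketched the same way as above (hypothetical C intrinsics, arithmetic
stubbed out, not the actual perlasm), the fixed shape keeps the len=16
path in xmm registers and runs vzeroupper unconditionally on the way
out:

    /* Hypothetical sketch of the fixed shape; compile with -mavx2. */
    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    void ghash_update_fixed_sketch(uint8_t Xi[16], const uint8_t *in, size_t len) {
        if (len >= 32) {
            /* Only the multi-block path broadcasts constants into ymm
             * registers and runs the two-blocks-at-a-time loop. */
            __m256i bswap_mask = _mm256_broadcastsi128_si256(
                _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
            /* ... ymm main loop ... */
            (void)bswap_mask;
        } else {
            /* len == 16: everything on this path stays in xmm registers
             * (the real GHASH arithmetic, which would also load its
             * constants as 128-bit values, is stubbed out here). */
            __m128i x = _mm_xor_si128(_mm_loadu_si128((const __m128i *)Xi),
                                      _mm_loadu_si128((const __m128i *)in));
            _mm_storeu_si128((__m128i *)Xi, x);
        }
        /* Unconditional, so no future code path can miss it. */
        _mm256_zeroupper();
    }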
Change-Id: I975cd5b526e5cdae1a567f4085c2484552bf6bea