
Commit f9ead65

aes-gcm: Enable AVX-512 implementation.

1 parent: 2a3ed40

File tree

9 files changed: +236 additions, -82 deletions

build.rs

Lines changed: 6 additions & 0 deletions

@@ -76,6 +76,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
     (&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
+    (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
     (&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),

@@ -888,8 +889,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
     "LIMB_shr",
     "OPENSSL_cpuid_setup",
     "aes_gcm_dec_kernel",
+    "aes_gcm_dec_update_vaes_avx10_512",
     "aes_gcm_dec_update_vaes_avx2",
     "aes_gcm_enc_kernel",
+    "aes_gcm_enc_update_vaes_avx10_512",
     "aes_gcm_enc_update_vaes_avx2",
     "aes_hw_ctr32_encrypt_blocks",
     "aes_hw_set_encrypt_key",

@@ -949,12 +952,15 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
     "gcm_ghash_avx",
     "gcm_ghash_clmul",
     "gcm_ghash_neon",
+    "gcm_ghash_vpclmulqdq_avx10_512_1",
     "gcm_ghash_vpclmulqdq_avx2_1",
     "gcm_gmult_clmul",
     "gcm_gmult_neon",
+    "gcm_gmult_vpclmulqdq_avx2",
     "gcm_init_avx",
     "gcm_init_clmul",
     "gcm_init_neon",
+    "gcm_init_vpclmulqdq_avx10_512",
     "gcm_init_vpclmulqdq_avx2",
     "k25519Precomp",
     "limbs_mul_add_limb",

crypto/fipsmodule/aes/asm/aes-gcm-avx10-x86_64.pl

Lines changed: 1 addition & 82 deletions

@@ -737,59 +737,6 @@ sub _ghash_update {
     vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC_XMM
     vpshufb         $BSWAP_MASK_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM

-    # Optimize for AADLEN < VL by checking for AADLEN < VL before AADLEN < 4*VL.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_blockbyblock$local_label_suffix
-
-    # AADLEN >= VL, so we'll operate on full vectors. Broadcast bswap_mask and
-    # gfpoly to all 128-bit lanes.
-    vshufi64x2      \$0, $BSWAP_MASK, $BSWAP_MASK, $BSWAP_MASK
-    vshufi64x2      \$0, $GFPOLY, $GFPOLY, $GFPOLY
-
-    # Load the lowest set of key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-1*$VL($H_POWERS), $H_POW1
-
-    cmp             \$4*$VL-1, $AADLEN
-    jbe             .Laad_loop_1x$local_label_suffix
-
-    # AADLEN >= 4*VL. Load the higher key powers.
-    vmovdqu8        $OFFSETOFEND_H_POWERS-4*$VL($H_POWERS), $H_POW4
-    vmovdqu8        $OFFSETOFEND_H_POWERS-3*$VL($H_POWERS), $H_POW3
-    vmovdqu8        $OFFSETOFEND_H_POWERS-2*$VL($H_POWERS), $H_POW2
-
-    # Update GHASH with 4*VL bytes of AAD at a time.
-.Laad_loop_4x$local_label_suffix:
-    vmovdqu8        0*$VL($AAD), $GHASHDATA0
-    vmovdqu8        1*$VL($AAD), $GHASHDATA1
-    vmovdqu8        2*$VL($AAD), $GHASHDATA2
-    vmovdqu8        3*$VL($AAD), $GHASHDATA3
-    @{[ _ghash_4x ]}
-    sub             \$-4*$VL, $AAD  # shorter than 'add 4*VL' when VL=32
-    add             \$-4*$VL, $AADLEN
-    cmp             \$4*$VL-1, $AADLEN
-    ja              .Laad_loop_4x$local_label_suffix
-
-    # Update GHASH with VL bytes of AAD at a time.
-    cmp             \$$VL, $AADLEN
-    jb              .Laad_large_done$local_label_suffix
-.Laad_loop_1x$local_label_suffix:
-    vmovdqu8        ($AAD), $GHASHDATA0
-    vpshufb         $BSWAP_MASK, $GHASHDATA0, $GHASHDATA0
-    vpxord          $GHASHDATA0, $GHASH_ACC, $GHASH_ACC
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY,
-                    $GHASHDATA0, $GHASHDATA1, $GHASHDATA2 ]}
-    @{[ _horizontal_xor $GHASH_ACC, $GHASH_ACC_XMM, $GHASH_ACC_XMM,
-                    $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$$VL, $AAD
-    sub             \$$VL, $AADLEN
-    cmp             \$$VL, $AADLEN
-    jae             .Laad_loop_1x$local_label_suffix
-
-.Laad_large_done$local_label_suffix:
-    # Issue the vzeroupper that is needed after using ymm or zmm registers.
-    # Do it here instead of at the end, to minimize overhead for small AADLEN.
-    vzeroupper
-
     # GHASH the remaining data 16 bytes at a time, using xmm registers only.
 .Laad_blockbyblock$local_label_suffix:
     test            $AADLEN, $AADLEN

@@ -801,9 +748,6 @@ sub _ghash_update {
     vpxor           $GHASHDATA0_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM
     @{[ _ghash_mul  $H_POW1_XMM, $GHASH_ACC_XMM, $GHASH_ACC_XMM, $GFPOLY_XMM,
                     $GHASHDATA0_XMM, $GHASHDATA1_XMM, $GHASHDATA2_XMM ]}
-    add             \$16, $AAD
-    sub             \$16, $AADLEN
-    jnz             .Laad_loop_blockbyblock$local_label_suffix

 .Laad_done$local_label_suffix:
     # Store the updated GHASH accumulator back to memory.

@@ -1303,31 +1247,6 @@ sub _aes_gcm_update {
     return $code;
 }

-# void gcm_gmult_vpclmulqdq_avx10(uint8_t Xi[16], const u128 Htable[16]);
-$code .= _begin_func "gcm_gmult_vpclmulqdq_avx10", 1;
-{
-    my ( $GHASH_ACC_PTR, $H_POWERS ) = @argregs[ 0 .. 1 ];
-    my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
-      map( "%xmm$_", ( 0 .. 6 ) );
-
-    $code .= <<___;
-    @{[ _save_xmmregs (6) ]}
-    .seh_endprologue
-
-    vmovdqu         ($GHASH_ACC_PTR), $GHASH_ACC
-    vmovdqu         .Lbswap_mask(%rip), $BSWAP_MASK
-    vmovdqu         $OFFSETOFEND_H_POWERS-16($H_POWERS), $H_POW1
-    vmovdqu         .Lgfpoly(%rip), $GFPOLY
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-
-    @{[ _ghash_mul  $H_POW1, $GHASH_ACC, $GHASH_ACC, $GFPOLY, $T0, $T1, $T2 ]}
-
-    vpshufb         $BSWAP_MASK, $GHASH_ACC, $GHASH_ACC
-    vmovdqu         $GHASH_ACC, ($GHASH_ACC_PTR)
-___
-}
-$code .= _end_func;
-
 # Disabled until significant deployment of AVX10/256 is seen. The separate
 # *_vaes_avx2 implementation provides the only 256-bit support for now.
 #

@@ -1353,7 +1272,7 @@ sub _aes_gcm_update {
 $code .= _aes_gcm_init;
 $code .= _end_func;

-$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512", 1;
+$code .= _begin_func "gcm_ghash_vpclmulqdq_avx10_512_1", 1;
 $code .= _ghash_update;
 $code .= _end_func;
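Net effect of these hunks: `_ghash_update` loses its bulk-AAD path (the 4*VL-byte loop, the VL-byte loop, and the `add`/`sub`/`jnz` that kept the 16-byte tail looping), leaving a routine that hashes at most one 16-byte block per call. That matches both the `_1` suffix in the renamed export and the single-block `update_block` caller added below. For orientation, a control-flow sketch of the deleted bulk path in illustrative Rust (not ring's code; `vl` is the vector length in bytes, 64 for the 512-bit build):

    // Illustrative only: the tiered loop structure the removed assembly
    // implemented for the additional authenticated data (AAD).
    fn ghash_aad_bulk(mut aad: &[u8], vl: usize) {
        while aad.len() >= 4 * vl {
            // _ghash_4x: fold four vector registers of AAD into the accumulator.
            aad = &aad[4 * vl..];
        }
        while aad.len() >= vl {
            // One-vector GHASH multiply (_ghash_mul + _horizontal_xor).
            aad = &aad[vl..];
        }
        for _block in aad.chunks(16) {
            // xmm-only 16-byte tail: the only part that survives this commit.
        }
    }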

src/aead/aes_gcm.rs

Lines changed: 23 additions & 0 deletions

@@ -36,6 +36,7 @@ use cpu::GetFeature as _;
 mod aarch64;
 mod aeshwclmulmovbe;
 mod vaesclmulavx2;
+mod vaesclmulavx512;

 #[derive(Clone)]
 pub(super) struct Key(DynKey);

@@ -51,6 +52,9 @@ impl Key {

 #[derive(Clone)]
 enum DynKey {
+    #[cfg(target_arch = "x86_64")]
+    VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),
+
     #[cfg(target_arch = "x86_64")]
     VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),

@@ -86,6 +90,9 @@ impl DynKey {
         let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
         let gcm_key_value = derive_gcm_key_value(&aes_key);
         let combo = if let Some(cpu) = cpu.get_feature() {
+            let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
+            Self::VAesClMulAvx512(Combo { aes_key, gcm_key })
+        } else if let Some(cpu) = cpu.get_feature() {
             let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
             Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
         } else if let Some(cpu) = cpu.get_feature() {

@@ -190,6 +197,11 @@ pub(super) fn seal(
             seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
         }

+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => {
+            seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
+        }
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => seal_whole_partial(
             c,

@@ -317,6 +329,17 @@ pub(super) fn open(
             open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
         }

+        #[cfg(target_arch = "x86_64")]
+        DynKey::VAesClMulAvx512(c) => open_whole_partial(
+            c,
+            aad,
+            in_out_slice,
+            src,
+            ctr,
+            tag_iv,
+            vaesclmulavx512::open_whole,
+        ),
+
         #[cfg(target_arch = "x86_64")]
         DynKey::VAesClMulAvx2(c) => open_whole_partial(
             c,
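One subtlety in the `DynKey::new` hunk: the chained `cpu.get_feature()` calls look identical, but each is resolved by type inference (via the `GetFeature` trait imported at the top of this file) against the feature tuple that branch's `Key::new` requires. The AVX-512 arm is therefore taken only when the full `(Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul)` tuple is detected; otherwise the chain falls through to AVX2 and beyond. A self-contained sketch of the pattern with stand-in types (not ring's actual `cpu` API):

    // Stand-in types; illustrative only. The point is that `get_feature()`'s
    // return type is chosen by inference, so ordering the branches from the
    // widest feature set to the narrowest picks the best available backend.
    struct Cpu { avx2: bool, avx512: bool }

    trait GetFeature<T> { fn get_feature(&self) -> Option<T>; }

    struct Avx2Token;
    struct Avx512Token;

    impl GetFeature<Avx2Token> for Cpu {
        fn get_feature(&self) -> Option<Avx2Token> {
            self.avx2.then_some(Avx2Token)
        }
    }
    impl GetFeature<Avx512Token> for Cpu {
        fn get_feature(&self) -> Option<Avx512Token> {
            self.avx512.then_some(Avx512Token)
        }
    }

    fn pick(cpu: &Cpu) -> &'static str {
        // Each branch's pattern fixes the type parameter of `get_feature`.
        if let Some(Avx512Token) = cpu.get_feature() {
            "avx512"
        } else if let Some(Avx2Token) = cpu.get_feature() {
            "avx2"
        } else {
            "fallback"
        }
    }

    fn main() {
        assert_eq!(pick(&Cpu { avx2: true, avx512: true }), "avx512");
        assert_eq!(pick(&Cpu { avx2: true, avx512: false }), "avx2");
    }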

src/aead/aes_gcm/vaesclmulavx512.rs (new file)

Lines changed: 93 additions & 0 deletions

// Copyright 2015-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{aes, gcm, Counter, BLOCK_LEN};
use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
use core::num::{NonZeroU32, NonZeroUsize};

pub(super) fn seal_whole(
    aes_key: &aes::hw::Key,
    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
    ctr: &mut Counter,
    mut in_out: AsChunksMut<u8, BLOCK_LEN>,
) {
    prefixed_extern! {
        fn aes_gcm_enc_update_vaes_avx10_512(
            input: *const u8,
            output: *mut u8,
            len: c::NonZero_size_t, // TODO? zero OK?
            key: &aes::AES_KEY,
            ivec: &Counter,
            Htable: &gcm::HTable,
            Xi: &mut gcm::Xi);
    }

    let in_out = in_out.as_flattened_mut();

    // Precondition: Since we have a `gcm::Context` then the number of blocks
    // must fit in `u32`.
    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

    if let Some(len) = NonZeroUsize::new(in_out.len()) {
        let aes_key = aes_key.inner_less_safe();
        let (htable, xi) = auth.inner();
        let input = in_out.as_ptr();
        let output = in_out.as_mut_ptr();
        unsafe { aes_gcm_enc_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi) };
        let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
            unreachable!() // Due to previous checks.
        });
        ctr.increment_by_less_safe(blocks);
    }
}

pub(super) fn open_whole(
    aes_key: &aes::hw::Key,
    auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
    in_out: Overlapping,
    ctr: &mut Counter,
) {
    prefixed_extern! {
        fn aes_gcm_dec_update_vaes_avx10_512(
            input: *const u8,
            output: *mut u8,
            len: c::NonZero_size_t, // TODO? zero OK?
            key: &aes::AES_KEY,
            ivec: &mut Counter,
            Htable: &gcm::HTable,
            Xi: &mut gcm::Xi);
    }

    // Precondition. TODO: Create an overlapping::AsChunks for this.
    assert_eq!(in_out.len() % BLOCK_LEN, 0);
    // Precondition: Since we have a `gcm::Context` then the number of blocks
    // must fit in `u32`.
    let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();

    in_out.with_input_output_len(|input, output, len| {
        if let Some(len) = NonZeroUsize::new(len) {
            let aes_key = aes_key.inner_less_safe();
            let (htable, xi) = auth.inner();
            unsafe {
                aes_gcm_dec_update_vaes_avx10_512(input, output, len, aes_key, ctr, htable, xi)
            };
            let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
                unreachable!() // Due to previous checks.
            });
            ctr.increment_by_less_safe(blocks);
        }
    })
}
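The `u32::try_from(in_out.len() / BLOCK_LEN).unwrap()` in both wrappers leans on GCM's own message-size cap: NIST SP 800-38D limits a single message to 2^39 - 256 bits of plaintext, i.e. 2^32 - 2 blocks, so once a `gcm::Context` exists the block count necessarily fits in `u32` and can feed `increment_by_less_safe` to keep the counter in sync with what the assembly consumed. A quick check of that arithmetic:

    fn main() {
        const BLOCK_LEN: u64 = 16;
        // NIST SP 800-38D: at most 2^39 - 256 bits of plaintext per message...
        const MAX_PLAINTEXT_BITS: u64 = (1 << 39) - 256;
        // ...which is 2^36 - 32 bytes, or exactly 2^32 - 2 full blocks.
        let max_blocks = (MAX_PLAINTEXT_BITS / 8) / BLOCK_LEN;
        assert_eq!(max_blocks, (1u64 << 32) - 2);
        assert!(max_blocks <= u64::from(u32::MAX)); // so the u32 conversion can't fail
    }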

src/aead/gcm.rs

Lines changed: 10 additions & 0 deletions

@@ -39,6 +39,7 @@ pub(super) mod clmulavxmovbe;
 pub(super) mod fallback;
 pub(super) mod neon;
 pub(super) mod vclmulavx2;
+pub(super) mod vclmulavx512;

 pub(super) struct Context<'key, K> {
     Xi: Xi,

@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
     }
 }

+#[cfg(target_arch = "x86_64")]
+impl Context<'_, vclmulavx512::Key> {
+    /// Access to `inner` for the integrated AES-GCM implementations only.
+    #[inline]
+    pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
+        (self.key.inner(), &mut self.Xi)
+    }
+}
+
 impl<K: UpdateBlocks> Context<'_, K> {
     #[inline(always)]
     pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {

src/aead/gcm/vclmulavx2.rs

Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ pub struct Key {
 }

 impl Key {
+    #[inline(never)]
     pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
         Self {
             h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },

src/aead/gcm/vclmulavx512.rs (new file)

Lines changed: 49 additions & 0 deletions

// Copyright 2018-2025 Brian Smith.
//
// Permission to use, copy, modify, and/or distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

#![cfg(target_arch = "x86_64")]

use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
use crate::{
    aead::gcm::ffi::BLOCK_LEN,
    cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
    polyfill::slice::AsChunks,
};

#[derive(Clone)]
pub struct Key {
    h_table: HTable,
}

impl Key {
    pub(in super::super) fn new(
        value: KeyValue,
        _cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
    ) -> Self {
        Self {
            h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx10_512, value) },
        }
    }

    pub(super) fn inner(&self) -> &HTable {
        &self.h_table
    }
}

impl UpdateBlock for Key {
    fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
        let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
        unsafe { ghash!(gcm_ghash_vpclmulqdq_avx10_512_1, xi, &self.h_table, input) }
    }
}
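`update_block` is the single-block path served by the renamed `gcm_ghash_vpclmulqdq_avx10_512_1` routine. The operation being accelerated is the standard GHASH step, Xi ← (Xi ⊕ block) · H in GF(2^128). For reference, a plain bit-at-a-time rendition of that step per NIST SP 800-38D (illustrative only: not constant-time, and not ring's code):

    // Reference GHASH multiply in GF(2^128), per NIST SP 800-38D (Algorithm 1).
    // Blocks are interpreted big-endian; bit 0 of a block is the u128 MSB.
    fn gf128_mul(x: u128, h: u128) -> u128 {
        const R: u128 = 0xE100_0000_0000_0000_0000_0000_0000_0000;
        let mut z = 0u128;
        let mut v = h;
        for i in 0..128 {
            // Add V into the result wherever X has a 1 bit (MSB first).
            if (x >> (127 - i)) & 1 == 1 {
                z ^= v;
            }
            // Multiply V by the field generator, reducing by R on wraparound.
            v = if v & 1 == 1 { (v >> 1) ^ R } else { v >> 1 };
        }
        z
    }

    // One GHASH update step: Xi <- (Xi ^ block) * H.
    fn ghash_update_block(xi: &mut u128, h: u128, block: [u8; 16]) {
        *xi = gf128_mul(*xi ^ u128::from_be_bytes(block), h);
    }

    fn main() {
        // With Xi = 0, hashing an all-zero block leaves the accumulator at 0.
        let mut xi = 0u128;
        ghash_update_block(&mut xi, 0x0123_4567_89AB_CDEF_0123_4567_89AB_CDEF, [0u8; 16]);
        assert_eq!(xi, 0);
    }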
