Skip to content

Commit 7ca1e37

Browse files
committed
aes-gcm: Enable AVX-512 implementation.
1 parent bcf68dd commit 7ca1e37

File tree

10 files changed

+242
-3
lines changed

10 files changed

+242
-3
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ include = [
5151
"crypto/curve25519/internal.h",
5252
"crypto/fipsmodule/aes/aes_nohw.c",
5353
"crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl",
54+
"crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl",
5455
"crypto/fipsmodule/aes/asm/aesni-x86.pl",
5556
"crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl",
5657
"crypto/fipsmodule/aes/asm/aesni-x86_64.pl",

build.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
7777
(&[X86_64], "crypto/chacha/asm/chacha-x86_64.pl"),
7878
(&[X86_64], "crypto/curve25519/curve25519_64_adx.c"),
7979
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx2-x86_64.pl"),
80+
(&[X86_64], "crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl"),
8081
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-gcm-x86_64.pl"),
8182
(&[X86_64], "crypto/fipsmodule/aes/asm/aesni-x86_64.pl"),
8283
(&[X86_64], "crypto/fipsmodule/aes/asm/ghash-x86_64.pl"),
@@ -889,8 +890,10 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
889890
"OPENSSL_cpuid_setup",
890891
"aes_gcm_dec_kernel",
891892
"aes_gcm_dec_update_vaes_avx2",
893+
"aes_gcm_dec_update_vaes_avx512",
892894
"aes_gcm_enc_kernel",
893895
"aes_gcm_enc_update_vaes_avx2",
896+
"aes_gcm_enc_update_vaes_avx512",
894897
"aes_hw_ctr32_encrypt_blocks",
895898
"aes_hw_set_encrypt_key",
896899
"aes_hw_set_encrypt_key_alt",
@@ -950,12 +953,14 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
950953
"gcm_ghash_clmul",
951954
"gcm_ghash_neon",
952955
"gcm_ghash_vpclmulqdq_avx2_16",
956+
"gcm_ghash_vpclmulqdq_avx512_16",
953957
"gcm_gmult_clmul",
954958
"gcm_gmult_neon",
955959
"gcm_init_avx",
956960
"gcm_init_clmul",
957961
"gcm_init_neon",
958962
"gcm_init_vpclmulqdq_avx2",
963+
"gcm_init_vpclmulqdq_avx512",
959964
"k25519Precomp",
960965
"limbs_mul_add_limb",
961966
"little_endian_bytes_from_scalar",

crypto/fipsmodule/aes/asm/aes-gcm-avx512-x86_64.pl

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -588,18 +588,24 @@ sub _ghash_4x {
588588
return $code;
589589
}
590590

591-
# void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], const u128 Htable[16]);
592-
$code .= _begin_func "gcm_gmult_vpclmulqdq_avx512", 1;
591+
# void gcm_ghash_vpclmulqdq_avx512_16(uint8_t Xi[16], const u128 Htable[16],
592+
# const uint8_t aad[16], size_t aad_len_16);
593+
$code .= _begin_func "gcm_ghash_vpclmulqdq_avx512_16", 1;
593594
{
594-
my ( $GHASH_ACC_PTR, $HTABLE ) = @argregs[ 0 .. 1 ];
595+
my ( $GHASH_ACC_PTR, $HTABLE, $AAD, $AAD_LEN_16 ) = @argregs[ 0 .. 3 ];
595596
my ( $GHASH_ACC, $BSWAP_MASK, $H_POW1, $GFPOLY, $T0, $T1, $T2 ) =
596597
map( "%xmm$_", ( 0 .. 6 ) );
597598

598599
$code .= <<___;
599600
@{[ _save_xmmregs (6) ]}
600601
.seh_endprologue
601602
603+
# Load the GHASH accumulator.
602604
vmovdqu ($GHASH_ACC_PTR), $GHASH_ACC
605+
606+
# XOR the AAD into the accumulator.
607+
vpxor ($AAD), $GHASH_ACC, $GHASH_ACC
608+
603609
vmovdqu .Lbswap_mask(%rip), $BSWAP_MASK
604610
vmovdqu $OFFSETOFEND_H_POWERS-16($HTABLE), $H_POW1
605611
vmovdqu .Lgfpoly(%rip), $GFPOLY

src/aead/aes_gcm.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ use cpu::GetFeature as _;
3636
mod aarch64;
3737
mod aeshwclmulmovbe;
3838
mod vaesclmulavx2;
39+
mod vaesclmulavx512;
3940

4041
#[derive(Clone)]
4142
pub(super) struct Key(DynKey);
@@ -51,6 +52,9 @@ impl Key {
5152

5253
#[derive(Clone)]
5354
enum DynKey {
55+
#[cfg(target_arch = "x86_64")]
56+
VAesClMulAvx512(Combo<aes::hw::Key, gcm::vclmulavx512::Key>),
57+
5458
#[cfg(target_arch = "x86_64")]
5559
VAesClMulAvx2(Combo<aes::hw::Key, gcm::vclmulavx2::Key>),
5660

@@ -85,6 +89,9 @@ impl DynKey {
8589
let aes_key = aes::hw::Key::new(key, aes, cpu.get_feature())?;
8690
let gcm_key_value = derive_gcm_key_value(&aes_key);
8791
let combo = if let Some(cpu) = cpu.get_feature() {
92+
let gcm_key = gcm::vclmulavx512::Key::new(gcm_key_value, cpu);
93+
Self::VAesClMulAvx512(Combo { aes_key, gcm_key })
94+
} else if let Some(cpu) = cpu.get_feature() {
8895
let gcm_key = gcm::vclmulavx2::Key::new(gcm_key_value, cpu);
8996
Self::VAesClMulAvx2(Combo { aes_key, gcm_key })
9097
} else if let Some(cpu) = cpu.get_feature() {
@@ -189,6 +196,11 @@ pub(super) fn seal(
189196
seal_whole_partial(c, aad, in_out, ctr, tag_iv, aarch64::seal_whole)
190197
}
191198

199+
#[cfg(target_arch = "x86_64")]
200+
DynKey::VAesClMulAvx512(c) => {
201+
seal_whole_partial(c, aad, in_out, ctr, tag_iv, vaesclmulavx512::seal_whole)
202+
}
203+
192204
#[cfg(target_arch = "x86_64")]
193205
DynKey::VAesClMulAvx2(c) => seal_whole_partial(
194206
c,
@@ -316,6 +328,17 @@ pub(super) fn open(
316328
open_whole_partial(c, aad, in_out_slice, src, ctr, tag_iv, aarch64::open_whole)
317329
}
318330

331+
#[cfg(target_arch = "x86_64")]
332+
DynKey::VAesClMulAvx512(c) => open_whole_partial(
333+
c,
334+
aad,
335+
in_out_slice,
336+
src,
337+
ctr,
338+
tag_iv,
339+
vaesclmulavx512::open_whole,
340+
),
341+
319342
#[cfg(target_arch = "x86_64")]
320343
DynKey::VAesClMulAvx2(c) => open_whole_partial(
321344
c,

src/aead/aes_gcm/vaesclmulavx512.rs

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright 2015-2025 Brian Smith.
2+
//
3+
// Permission to use, copy, modify, and/or distribute this software for any
4+
// purpose with or without fee is hereby granted, provided that the above
5+
// copyright notice and this permission notice appear in all copies.
6+
//
7+
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8+
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9+
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10+
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11+
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12+
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13+
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14+
15+
#![cfg(target_arch = "x86_64")]
16+
17+
use super::{aes, gcm, Counter, BLOCK_LEN};
18+
use crate::{aead::aes::Overlapping, c, polyfill::slice::AsChunksMut};
19+
use core::num::{NonZeroU32, NonZeroUsize};
20+
21+
pub(super) fn seal_whole(
22+
aes_key: &aes::hw::Key,
23+
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
24+
ctr: &mut Counter,
25+
mut in_out: AsChunksMut<u8, BLOCK_LEN>,
26+
) {
27+
prefixed_extern! {
28+
fn aes_gcm_enc_update_vaes_avx512(
29+
input: *const u8,
30+
output: *mut u8,
31+
len: c::NonZero_size_t, // TODO? zero OK?
32+
key: &aes::AES_KEY,
33+
ivec: &Counter,
34+
Htable: &gcm::HTable,
35+
Xi: &mut gcm::Xi);
36+
}
37+
38+
let in_out = in_out.as_flattened_mut();
39+
40+
// Precondition: Since we have a `gcm::Context` then the number of blocks
41+
// must fit in `u32`.
42+
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
43+
44+
if let Some(len) = NonZeroUsize::new(in_out.len()) {
45+
let aes_key = aes_key.inner_less_safe();
46+
let (htable, xi) = auth.inner();
47+
let input = in_out.as_ptr();
48+
let output = in_out.as_mut_ptr();
49+
unsafe { aes_gcm_enc_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
50+
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
51+
unreachable!() // Due to previous checks.
52+
});
53+
ctr.increment_by_less_safe(blocks);
54+
}
55+
}
56+
57+
pub(super) fn open_whole(
58+
aes_key: &aes::hw::Key,
59+
auth: &mut gcm::Context<gcm::vclmulavx512::Key>,
60+
in_out: Overlapping,
61+
ctr: &mut Counter,
62+
) {
63+
prefixed_extern! {
64+
fn aes_gcm_dec_update_vaes_avx512(
65+
input: *const u8,
66+
output: *mut u8,
67+
len: c::NonZero_size_t, // TODO? zero OK?
68+
key: &aes::AES_KEY,
69+
ivec: &mut Counter,
70+
Htable: &gcm::HTable,
71+
Xi: &mut gcm::Xi);
72+
}
73+
74+
// Precondition. TODO: Create an overlapping::AsChunks for this.
75+
assert_eq!(in_out.len() % BLOCK_LEN, 0);
76+
// Precondition: Since we have a `gcm::Context` then the number of blocks
77+
// must fit in `u32`.
78+
let blocks = u32::try_from(in_out.len() / BLOCK_LEN).unwrap();
79+
80+
in_out.with_input_output_len(|input, output, len| {
81+
if let Some(len) = NonZeroUsize::new(len) {
82+
let aes_key = aes_key.inner_less_safe();
83+
let (htable, xi) = auth.inner();
84+
unsafe { aes_gcm_dec_update_vaes_avx512(input, output, len, aes_key, ctr, htable, xi) };
85+
let blocks = NonZeroU32::new(blocks).unwrap_or_else(|| {
86+
unreachable!() // Due to previous checks.
87+
});
88+
ctr.increment_by_less_safe(blocks);
89+
}
90+
})
91+
}

src/aead/gcm.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ pub(super) mod clmulavxmovbe;
3939
pub(super) mod fallback;
4040
pub(super) mod neon;
4141
pub(super) mod vclmulavx2;
42+
pub(super) mod vclmulavx512;
4243

4344
pub(super) struct Context<'key, K> {
4445
Xi: Xi,
@@ -128,6 +129,15 @@ impl Context<'_, vclmulavx2::Key> {
128129
}
129130
}
130131

132+
#[cfg(target_arch = "x86_64")]
133+
impl Context<'_, vclmulavx512::Key> {
134+
/// Access to `inner` for the integrated AES-GCM implementations only.
135+
#[inline]
136+
pub(super) fn inner(&mut self) -> (&HTable, &mut Xi) {
137+
(self.key.inner(), &mut self.Xi)
138+
}
139+
}
140+
131141
impl<K: UpdateBlocks> Context<'_, K> {
132142
#[inline(always)]
133143
pub fn update_blocks(&mut self, input: AsChunks<u8, BLOCK_LEN>) {

src/aead/gcm/vclmulavx2.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ pub struct Key {
2727
}
2828

2929
impl Key {
30+
#[inline(never)]
3031
pub(in super::super) fn new(value: KeyValue, _cpu: (Avx2, VAesClmul)) -> Self {
3132
Self {
3233
h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx2, value) },

src/aead/gcm/vclmulavx512.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// Copyright 2018-2025 Brian Smith.
2+
//
3+
// Permission to use, copy, modify, and/or distribute this software for any
4+
// purpose with or without fee is hereby granted, provided that the above
5+
// copyright notice and this permission notice appear in all copies.
6+
//
7+
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8+
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9+
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10+
// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11+
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12+
// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13+
// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14+
15+
#![cfg(target_arch = "x86_64")]
16+
17+
use super::{ffi::KeyValue, HTable, UpdateBlock, Xi};
18+
use crate::{
19+
aead::gcm::ffi::BLOCK_LEN,
20+
cpu::intel::{Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul},
21+
polyfill::slice::AsChunks,
22+
};
23+
24+
#[derive(Clone)]
25+
pub struct Key {
26+
h_table: HTable,
27+
}
28+
29+
impl Key {
30+
pub(in super::super) fn new(
31+
value: KeyValue,
32+
_cpu: (Avx2, Avx512_BW_VL_ZMM, Bmi2, VAesClmul),
33+
) -> Self {
34+
Self {
35+
h_table: unsafe { htable_new!(gcm_init_vpclmulqdq_avx512, value) },
36+
}
37+
}
38+
39+
pub(super) fn inner(&self) -> &HTable {
40+
&self.h_table
41+
}
42+
}
43+
44+
impl UpdateBlock for Key {
45+
fn update_block(&self, xi: &mut Xi, a: [u8; BLOCK_LEN]) {
46+
let input: AsChunks<u8, BLOCK_LEN> = (&a).into();
47+
unsafe { ghash!(gcm_ghash_vpclmulqdq_avx512_16, xi, &self.h_table, input) }
48+
}
49+
}

src/cpu.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,20 @@ where
113113
}
114114
}
115115

116+
impl<A, B, C, D> GetFeature<(A, B, C, D)> for features::Values
117+
where
118+
features::Values: GetFeature<(A, B)>,
119+
features::Values: GetFeature<(C, D)>,
120+
{
121+
#[inline(always)]
122+
fn get_feature(&self) -> Option<(A, B, C, D)> {
123+
match (self.get_feature(), self.get_feature()) {
124+
(Some((a, b)), Some((c, d))) => Some((a, b, c, d)),
125+
_ => None,
126+
}
127+
}
128+
}
129+
116130
impl<F> GetFeature<F> for Features
117131
where
118132
features::Values: GetFeature<F>,

src/cpu/intel.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
139139
#[cfg(target_arch = "x86_64")]
140140
let (extended_features_ebx, extended_features_ecx) = (cpuid[2], cpuid[3]);
141141

142+
// `OPENSSL_cpuid_setup` synthesizes this bit when it detects an Intel
143+
// CPU family that is known to downclock when ZMM registers are used.
144+
#[cfg(target_arch = "x86_64")]
145+
let avoid_zmm = check(cpuid[2], 14);
146+
142147
let mut caps = 0;
143148

144149
// AMD: "Collectively the SSE1, [...] are referred to as the legacy SSE
@@ -236,6 +241,35 @@ fn cpuid_to_caps_and_set_c_flags(cpuid: &[u32; 4]) -> u32 {
236241
// calling into the C code.
237242
let flag = unsafe { &avx2_available };
238243
flag.store(1, core::sync::atomic::Ordering::Relaxed);
244+
245+
// AVX-512.
246+
// Initial releases of macOS 12 had a serious bug w.r.t. AVX-512
247+
// support; see https://go-review.googlesource.com/c/sys/+/620256.
248+
// Given that, plus Apple's transition to ARM, AVX-512 isn't worth
249+
// supporting for their targets.
250+
#[cfg(not(target_vendor = "apple"))]
251+
{
252+
// Intel: "15.3 DETECTION OF 512-BIT INSTRUCTION GROUPS OF THE INTEL
253+
// AVX-512 FAMILY".
254+
// `OPENSSL_cpuid_setup` clears these bits when XCR0[7:5] isn't 0b111.
255+
// i.e. when the OS doesn't enable AVX-512 state.
256+
let f = check(extended_features_ebx, 16);
257+
let bw = check(extended_features_ebx, 30);
258+
259+
// Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
260+
// OPERATING AT 256 AND 128-BIT VECTOR LENGTHS"
261+
let vl = check(extended_features_ebx, 31);
262+
263+
// Intel: "15.4 DETECTION OF INTEL AVX-512 INSTRUCTION GROUPS
264+
// OPERATING AT 256 AND 128-BIT VECTOR LENGTHS."
265+
if !avoid_zmm && f {
266+
// Intel: "Table 15-2. Feature Flag Collection Required of
267+
// 256/128 Bit Vector Lengths for Each Instruction Group."
268+
if bw && vl {
269+
set(&mut caps, Shift::Avx512_BW_VL_ZMM)
270+
}
271+
}
272+
}
239273
}
240274

241275
// Intel: "12.13.4 Checking for Intel AES-NI Support"
@@ -348,6 +382,11 @@ impl_get_feature! {
348382
{ ("x86", "x86_64") => Aes },
349383
{ ("x86", "x86_64") => Avx },
350384
{ ("x86_64") => Bmi1 },
385+
386+
// AVX512BW + AVX512VL, AND using ZMM registers isn't expected to cause
387+
// downclocking.
388+
{ ("x86_64") => Avx512_BW_VL_ZMM },
389+
351390
{ ("x86_64") => Avx2 },
352391
{ ("x86_64") => Bmi2 },
353392
{ ("x86_64") => Adx },

0 commit comments

Comments
 (0)