Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions include/os/macos/spl/sys/simd_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,12 @@ zfs_aesv8_available(void)
return ((ftr >> 4) & 0xf);
}

/*
 * Return nonzero if the CPU implements the PMULL/PMULL2 polynomial
 * multiply instructions used by the accelerated GHASH code.
 *
 * Per the Arm ARM, ID_AA64ISAR0_EL1 bits [7:4] are the AES field:
 * 0b0001 means AESE/AESD only, 0b0010 or greater additionally means
 * PMULL/PMULL2.  Bits [11:8] are the SHA1 field, so shifting by 8
 * would test the wrong feature.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	uint64_t ftr;
	get_ftr(ID_AA64ISAR0_EL1, ftr);
	return (((ftr >> 4) & 0xf) >= 2);
}

#endif /* _MACOS_SIMD_AARCH64_H */
1 change: 1 addition & 0 deletions lib/libicp/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ nodist_libicp_la_SOURCES = \
if TARGET_CPU_AARCH64
nodist_libicp_la_SOURCES += \
module/icp/asm-aarch64/aes/aesv8-armx.S \
module/icp/asm-aarch64/aes/ghashv8-armx.S \
module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
module/icp/asm-aarch64/sha2/sha256-armv8.S \
Expand Down
8 changes: 8 additions & 0 deletions lib/libspl/include/sys/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ zfs_sha256_available(void)

#define HWCAP_FP 0x00000001
#define HWCAP_AES 0x00000008
#define HWCAP_PMULL 0x00000010
#define HWCAP_SHA2 0x00000040
#define HWCAP_SHA512 0x00200000

Expand Down Expand Up @@ -581,6 +582,13 @@ zfs_aesv8_available(void)
return (hwcap & HWCAP_AES);
}

/*
 * Report whether the kernel advertises the ARMv8 PMULL (polynomial
 * multiply) extension via the ELF auxiliary vector.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	unsigned long caps = getauxval(AT_HWCAP);

	return (caps & HWCAP_PMULL);
}

#elif defined(__powerpc__)

#define kfpu_allowed() 0
Expand Down
14 changes: 10 additions & 4 deletions module/icp/algs/aes/aes_impl_aesv8.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@
* CDDL HEADER END
*/

#include <aes/aes_impl.h>

/*
* Copyright (c) 2023, Jorgen Lundman <lundman@lundman.net>
*/

#define HAVE_AESV8
#if defined(__aarch64__) && defined(HAVE_AESV8)

#include <sys/simd.h>
Expand Down Expand Up @@ -59,8 +60,6 @@ typedef struct aes_key_st {
unsigned int pad[3];
} AES_KEY;

#include <aes/aes_impl.h>

/*
* Expand the 32-bit AES cipher key array into the encryption and decryption
* key schedules.
Expand Down Expand Up @@ -130,8 +129,15 @@ aes_aesv8_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
/*
 * Decide whether the AESv8 implementation can be used on this CPU.
 *
 * The feature registers are read with msr, which traps below EL1, so
 * userland cannot probe them directly.  Every Apple M1 and later CPU
 * implements AES, so userland simply assumes support; a sysctl-based
 * probe could be added here if that assumption ever needs tightening.
 */
static boolean_t
aes_aesv8_will_work(void)
{
#ifdef _KERNEL
	return (kfpu_allowed() && zfs_aesv8_available());
#else
	return (B_TRUE);
#endif
}

const aes_impl_ops_t aes_aesv8_impl = {
Expand Down
108 changes: 106 additions & 2 deletions module/icp/algs/modes/gcm.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
#include <aes/aes_impl.h>
#endif

/* CAN_USE_GCM_ASM gates only the x86-64 AVX path; the aarch64 code
 * below needs aes_impl.h regardless of that macro. */
#ifdef __aarch64__
#include <aes/aes_impl.h>
#endif

#define GHASH(c, d, t, o) \
xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
Expand Down Expand Up @@ -76,6 +81,87 @@ static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

#if defined(__aarch64__) && defined(HAVE_AESV8)

extern void ASMABI gcm_init_v8(uint64_t *Htable, const uint64_t Xi[2]);
extern void ASMABI gcm_gmult_v8(uint64_t Xi[2], const uint64_t Htable[16*2]);
extern void ASMABI gcm_ghash_v8(uint64_t Xi[2], const uint64_t Htable[16*2],
const uint8_t *input, size_t len);

/*
 * Decide whether the PMULL-based GHASH implementation can be used.
 *
 * The feature registers trap below EL1, so userland cannot read them
 * with msr.  Every Apple M1 and later CPU implements PMULL, so
 * userland assumes support; a sysctl probe could serve userland if a
 * real check is ever required.
 */
static boolean_t
gcm_ghashv8_will_work(void)
{
#ifdef _KERNEL
	return (kfpu_allowed() &&
	    zfs_aesv8_available() &&
	    zfs_pmull_available());
#else
	return (B_TRUE);
#endif
}

/*
 * GCM implementation ops backed by the ARMv8 PMULL assembly routines.
 * needs_htable instructs gcm_init_ctx() to allocate gcm_Htable and to
 * call ghash_init on it before the first ghash invocation.
 */
const gcm_impl_ops_t gcm_ghashv8_impl = {
	.name = "ghashv8",
	.needs_htable = B_TRUE,
	.ghash = &gcm_ghash_v8,
	/* must be non-NULL whenever needs_htable is set */
	.ghash_init = &gcm_init_v8,
	.is_supported = &gcm_ghashv8_will_work
};

/*
 * Accelerated GHASH: when the active implementation provides a
 * ->ghash hook (e.g. gcm_ghash_v8), run it over the single 16-byte
 * block; otherwise fall back to the generic ->mul path, exactly as
 * the original GHASH macro above does.  The intermediate (void *)
 * casts match the original macro's style and avoid cast-alignment
 * warnings on the byte-aligned context buffers.
 */
#undef	GHASH
#define	GHASH(c, d, t, o) do { \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	if ((o)->ghash != NULL) { \
		(o)->ghash((uint64_t *)(void *)(c)->gcm_ghash, \
		    (const uint64_t *)(void *)(c)->gcm_Htable, \
		    (const uint8_t *)(t), 16); \
	} else { \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} \
} while (0)

/*
 * TODO: a possible future optimization is to clone
 * gcm_mode_encrypt_contiguous_blocks() and hoist the GHASH call out of
 * the per-block loop, processing larger runs with gcm_ghash_v8_x4()
 * and mopping up the tail with the regular GHASH.  That would likely
 * be expressed as a gcm_impl_ops_t->ghash_x4 variant, but it is a
 * larger change at the layer that decides to process contiguous
 * blocks.
 */

#endif /* defined (__aarch64__) && defined(HAVE_AESV8) */

/*
* Generic ghash_init function
*/
/*
 * Generic ghash_init: populate a 16-entry (two uint64_t words per
 * entry) hash table from the hash subkey H.
 *
 * Entry 0 is zero, entry 1 is H itself, and each subsequent entry is
 * the previous one shifted right by one bit, folding the bit shifted
 * out of the low word back in with the GCM reduction polynomial
 * constant 0xe1 << 56.
 *
 * NOTE(review): whether this table layout matches what a consuming
 * ghash routine expects depends on that routine; the ghashv8 impl
 * uses gcm_init_v8() instead, so this generic builder is currently
 * unused (__maybe_unused).
 */
__maybe_unused static void
gcm_init_htab(uint64_t *Htable, const uint64_t H[2])
{
	uint64_t hi = H[0];
	uint64_t lo = H[1];

	/* Entry 0 is the zero element; entry 1 is H. */
	Htable[0] = 0;
	Htable[1] = 0;
	Htable[2] = hi;
	Htable[3] = lo;

	for (int i = 2; i < 16; i++) {
		uint64_t carry = lo & 1;

		lo = (lo >> 1) | (hi << 63);
		hi >>= 1;
		if (carry)
			hi ^= 0xe100000000000000ULL;

		Htable[i * 2] = hi;
		Htable[i * 2 + 1] = lo;
	}
}

/*
* Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
* is done in another function.
Expand Down Expand Up @@ -625,6 +711,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
size_t aad_len = gcm_param->ulAADLen;

const gcm_impl_ops_t *gops = gcm_impl_get_ops();

if (gops->needs_htable) {
gcm_ctx->gcm_htab_len = 32 * sizeof (uint64_t);
gcm_ctx->gcm_Htable =
kmem_alloc(gcm_ctx->gcm_htab_len, KM_SLEEP);

/*
* We assume ghash_init is set to at least
* gcm_init_htab() (generic), since this only
* applies to new code with .needs_htable set.
*/
gops->ghash_init(gcm_ctx->gcm_Htable, gcm_ctx->gcm_H);
}

#ifdef CAN_USE_GCM_ASM
boolean_t needs_bswap =
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
Expand Down Expand Up @@ -728,6 +829,9 @@ static const gcm_impl_ops_t *gcm_all_impl[] = {
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
&gcm_pclmulqdq_impl,
#endif
#if defined(__aarch64__) && defined(HAVE_AESV8)
&gcm_ghashv8_impl,
#endif
};

/* Indicate that benchmark has been completed */
Expand Down Expand Up @@ -810,8 +914,8 @@ gcm_impl_init(void)
* hardware accelerated version is the fastest.
*/
#if defined(__aarch64__) && defined(HAVE_ARMV8)
if (gcm_armv8_impl.is_supported()) {
memcpy(&gcm_fastest_impl, &gcm_armv8_impl,
if (gcm_ghashv8_impl.is_supported()) {
memcpy(&gcm_fastest_impl, &gcm_ghashv8_impl,
sizeof (gcm_fastest_impl));
} else
#endif
Expand Down
7 changes: 7 additions & 0 deletions module/icp/algs/modes/modes.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,15 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
ASSERT3P(ctx->gcm_Htable, !=, NULL);
memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
ctx->gcm_Htable = NULL;
}
#endif
if (ctx->gcm_Htable) {
ASSERT3P(ctx->gcm_Htable, !=, NULL);
memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
}

if (ctx->gcm_pt_buf != NULL) {
memset(ctx->gcm_pt_buf, 0, ctx->gcm_pt_buf_len);
vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
Expand Down
Loading
Loading