Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions include/os/macos/spl/sys/simd_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,12 @@ zfs_aesv8_available(void)
return ((ftr >> 4) & 0xf);
}

/*
 * Return nonzero if the CPU implements the PMULL/PMULL2 polynomial
 * multiply instructions used by the accelerated GHASH code.
 *
 * Per the Arm ARM, ID_AA64ISAR0_EL1 bits [7:4] are the AES field:
 * 0b0001 means AESE/AESD only, 0b0010 or greater additionally means
 * PMULL/PMULL2.  Bits [11:8] are the SHA1 field, so shifting by 8
 * would test the wrong feature.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	uint64_t ftr;
	get_ftr(ID_AA64ISAR0_EL1, ftr);
	return (((ftr >> 4) & 0xf) >= 2);
}

#endif /* _MACOS_SIMD_AARCH64_H */
1 change: 1 addition & 0 deletions lib/libicp/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ nodist_libicp_la_SOURCES = \
if TARGET_CPU_AARCH64
nodist_libicp_la_SOURCES += \
module/icp/asm-aarch64/aes/aesv8-armx.S \
module/icp/asm-aarch64/aes/ghashv8-armx.S \
module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
module/icp/asm-aarch64/sha2/sha256-armv8.S \
Expand Down
8 changes: 8 additions & 0 deletions lib/libspl/include/sys/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ zfs_sha256_available(void)

#define HWCAP_FP 0x00000001
#define HWCAP_AES 0x00000008
#define HWCAP_PMULL 0x00000010
#define HWCAP_SHA2 0x00000040
#define HWCAP_SHA512 0x00200000

Expand Down Expand Up @@ -581,6 +582,13 @@ zfs_aesv8_available(void)
return (hwcap & HWCAP_AES);
}

/*
 * Report whether the kernel advertises the ARMv8 PMULL (polynomial
 * multiply) extension via the ELF auxiliary vector.
 */
static inline boolean_t
zfs_pmull_available(void)
{
	unsigned long caps = getauxval(AT_HWCAP);

	return (caps & HWCAP_PMULL);
}

#elif defined(__powerpc__)

#define kfpu_allowed() 0
Expand Down
14 changes: 10 additions & 4 deletions module/icp/algs/aes/aes_impl_aesv8.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@
* CDDL HEADER END
*/

#include <aes/aes_impl.h>

/*
* Copyright (c) 2023, Jorgen Lundman <lundman@lundman.net>
*/

#define HAVE_AESV8
#if defined(__aarch64__) && defined(HAVE_AESV8)

#include <sys/simd.h>
Expand Down Expand Up @@ -59,8 +60,6 @@ typedef struct aes_key_st {
unsigned int pad[3];
} AES_KEY;

#include <aes/aes_impl.h>

/*
* Expand the 32-bit AES cipher key array into the encryption and decryption
* key schedules.
Expand Down Expand Up @@ -130,8 +129,15 @@ aes_aesv8_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
/*
 * Decide whether the AESv8 implementation can be used on this CPU.
 *
 * The feature registers are read with msr, which traps below EL1, so
 * userland cannot probe them directly.  Every Apple M1 and later CPU
 * implements AES, so userland simply assumes support; a sysctl-based
 * probe could be added here if that assumption ever needs tightening.
 */
static boolean_t
aes_aesv8_will_work(void)
{
#ifdef _KERNEL
	return (kfpu_allowed() && zfs_aesv8_available());
#else
	return (B_TRUE);
#endif
}

const aes_impl_ops_t aes_aesv8_impl = {
Expand Down
108 changes: 106 additions & 2 deletions module/icp/algs/modes/gcm.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
#include <aes/aes_impl.h>
#endif

/* CAN_USE_GCM_ASM gates only the x86-64 AVX path; the aarch64 code
 * below needs aes_impl.h regardless of that macro. */
#ifdef __aarch64__
#include <aes/aes_impl.h>
#endif

#define GHASH(c, d, t, o) \
xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
Expand Down Expand Up @@ -76,6 +81,87 @@ static int gcm_init_avx(gcm_ctx_t *, const uint8_t *, size_t, const uint8_t *,
size_t, size_t);
#endif /* ifdef CAN_USE_GCM_ASM */

#if defined(__aarch64__) && defined(HAVE_AESV8)

extern void ASMABI gcm_init_v8(uint64_t *Htable, const uint64_t Xi[2]);
extern void ASMABI gcm_gmult_v8(uint64_t Xi[2], const uint64_t Htable[16*2]);
extern void ASMABI gcm_ghash_v8(uint64_t Xi[2], const uint64_t Htable[16*2],
const uint8_t *input, size_t len);

/*
 * Decide whether the PMULL-based GHASH implementation can be used.
 *
 * The feature registers trap below EL1, so userland cannot read them
 * with msr.  Every Apple M1 and later CPU implements PMULL, so
 * userland assumes support; a sysctl probe could serve userland if a
 * real check is ever required.
 */
static boolean_t
gcm_ghashv8_will_work(void)
{
#ifdef _KERNEL
	return (kfpu_allowed() &&
	    zfs_aesv8_available() &&
	    zfs_pmull_available());
#else
	return (B_TRUE);
#endif
}

/*
 * GCM implementation ops backed by the ARMv8 PMULL assembly routines.
 * needs_htable instructs gcm_init_ctx() to allocate gcm_Htable and to
 * call ghash_init on it before the first ghash invocation.
 */
const gcm_impl_ops_t gcm_ghashv8_impl = {
	.name = "ghashv8",
	.needs_htable = B_TRUE,
	.ghash = &gcm_ghash_v8,
	/* must be non-NULL whenever needs_htable is set */
	.ghash_init = &gcm_init_v8,
	.is_supported = &gcm_ghashv8_will_work
};

/*
 * Accelerated GHASH: when the active implementation provides a
 * ->ghash hook (e.g. gcm_ghash_v8), run it over the single 16-byte
 * block; otherwise fall back to the generic ->mul path, exactly as
 * the original GHASH macro above does.  The intermediate (void *)
 * casts match the original macro's style and avoid cast-alignment
 * warnings on the byte-aligned context buffers.
 */
#undef	GHASH
#define	GHASH(c, d, t, o) do { \
	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
	if ((o)->ghash != NULL) { \
		(o)->ghash((uint64_t *)(void *)(c)->gcm_ghash, \
		    (const uint64_t *)(void *)(c)->gcm_Htable, \
		    (const uint8_t *)(t), 16); \
	} else { \
		(o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
		    (uint64_t *)(void *)(t)); \
	} \
} while (0)

/*
 * TODO: a possible future optimization is to clone
 * gcm_mode_encrypt_contiguous_blocks() and hoist the GHASH call out of
 * the per-block loop, processing larger runs with gcm_ghash_v8_x4()
 * and mopping up the tail with the regular GHASH.  That would likely
 * be expressed as a gcm_impl_ops_t->ghash_x4 variant, but it is a
 * larger change at the layer that decides to process contiguous
 * blocks.
 */

#endif /* defined (__aarch64__) && defined(HAVE_AESV8) */

/*
* Generic ghash_init function
*/
/*
 * Generic ghash_init: populate a 16-entry (two uint64_t words per
 * entry) hash table from the hash subkey H.
 *
 * Entry 0 is zero, entry 1 is H itself, and each subsequent entry is
 * the previous one shifted right by one bit, folding the bit shifted
 * out of the low word back in with the GCM reduction polynomial
 * constant 0xe1 << 56.
 *
 * NOTE(review): whether this table layout matches what a consuming
 * ghash routine expects depends on that routine; the ghashv8 impl
 * uses gcm_init_v8() instead, so this generic builder is currently
 * unused (__maybe_unused).
 */
__maybe_unused static void
gcm_init_htab(uint64_t *Htable, const uint64_t H[2])
{
	uint64_t hi = H[0];
	uint64_t lo = H[1];

	/* Entry 0 is the zero element; entry 1 is H. */
	Htable[0] = 0;
	Htable[1] = 0;
	Htable[2] = hi;
	Htable[3] = lo;

	for (int i = 2; i < 16; i++) {
		uint64_t carry = lo & 1;

		lo = (lo >> 1) | (hi << 63);
		hi >>= 1;
		if (carry)
			hi ^= 0xe100000000000000ULL;

		Htable[i * 2] = hi;
		Htable[i * 2 + 1] = lo;
	}
}

/*
* Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode
* is done in another function.
Expand Down Expand Up @@ -625,6 +711,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param,
const uint8_t *aad = (const uint8_t *)gcm_param->pAAD;
size_t aad_len = gcm_param->ulAADLen;

const gcm_impl_ops_t *gops = gcm_impl_get_ops();

if (gops->needs_htable) {
gcm_ctx->gcm_htab_len = 32 * sizeof (uint64_t);
gcm_ctx->gcm_Htable =
kmem_alloc(gcm_ctx->gcm_htab_len, KM_SLEEP);

/*
* We assume ghash_init is set to at least
* gcm_init_htab() (generic), since this only
* applies to new code with .needs_htable set.
*/
gops->ghash_init(gcm_ctx->gcm_Htable, gcm_ctx->gcm_H);
}

#ifdef CAN_USE_GCM_ASM
boolean_t needs_bswap =
((aes_key_t *)gcm_ctx->gcm_keysched)->ops->needs_byteswap;
Expand Down Expand Up @@ -728,6 +829,9 @@ static const gcm_impl_ops_t *gcm_all_impl[] = {
#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
&gcm_pclmulqdq_impl,
#endif
#if defined(__aarch64__) && defined(HAVE_AESV8)
&gcm_ghashv8_impl,
#endif
};

/* Indicate that benchmark has been completed */
Expand Down Expand Up @@ -810,8 +914,8 @@ gcm_impl_init(void)
* hardware accelerated version is the fastest.
*/
#if defined(__aarch64__) && defined(HAVE_ARMV8)
if (gcm_armv8_impl.is_supported()) {
memcpy(&gcm_fastest_impl, &gcm_armv8_impl,
if (gcm_ghashv8_impl.is_supported()) {
memcpy(&gcm_fastest_impl, &gcm_ghashv8_impl,
sizeof (gcm_fastest_impl));
} else
#endif
Expand Down
7 changes: 7 additions & 0 deletions module/icp/algs/modes/modes.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,15 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
ASSERT3P(ctx->gcm_Htable, !=, NULL);
memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
ctx->gcm_Htable = NULL;
}
#endif
if (ctx->gcm_Htable) {
ASSERT3P(ctx->gcm_Htable, !=, NULL);
memset(ctx->gcm_Htable, 0, ctx->gcm_htab_len);
kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
}

if (ctx->gcm_pt_buf != NULL) {
memset(ctx->gcm_pt_buf, 0, ctx->gcm_pt_buf_len);
vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
Expand Down
Loading
Loading