1 change: 1 addition & 0 deletions src/Makefile
@@ -182,3 +182,4 @@ uninstall: $(libname)
rm $(DESTDIR)$(PREFIX)/include/hashtree.h
endif


87 changes: 82 additions & 5 deletions src/hashtree.c
@@ -39,48 +39,125 @@ static void init_and_hash(unsigned char *output, const unsigned char *input, uin

static hashtree_hash_fcn hash_ptr = init_and_hash;

// CPU vendor detection via the CPUID leaf-0 vendor string
static int is_intel_cpu() {
#ifdef __x86_64__
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
__get_cpuid(0, &eax, &ebx, &ecx, &edx);
// Intel signature: "GenuineIntel"
return (ebx == 0x756e6547 && edx == 0x49656e69 && ecx == 0x6c65746e);
#endif
return 0;
}

static int is_amd_cpu() {
#ifdef __x86_64__
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
__get_cpuid(0, &eax, &ebx, &ecx, &edx);
// AMD signature: "AuthenticAMD"
return (ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163);
#endif
return 0;
}
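/* Illustrative sketch, not part of this patch: where the magic numbers above
   come from. CPUID leaf 0 packs the 12-byte vendor string into EBX, EDX, ECX
   (in that order), four little-endian bytes per register, so "Genu" == 0x756e6547,
   "ineI" == 0x49656e69, "ntel" == 0x6c65746e, and likewise for "AuthenticAMD".
   A hypothetical vendor_is() helper doing the same check against the raw string
   (would need <string.h>): */
#if 0
static int vendor_is(const char *name) {            /* e.g. "GenuineIntel" */
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    uint32_t vendor[3];
    __get_cpuid(0, &eax, &ebx, &ecx, &edx);
    vendor[0] = ebx;                                 /* "Genu" */
    vendor[1] = edx;                                 /* "ineI" */
    vendor[2] = ecx;                                 /* "ntel" */
    return memcmp(vendor, name, 12) == 0;
}
#endif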


static hashtree_hash_fcn hashtree_detect() {
#ifdef __x86_64__
uint32_t a = 0, b = 0, c = 0, d = 0;
__get_cpuid_count(7, 0, &a, &b, &c, &d);

int intel = is_intel_cpu();
int amd = is_amd_cpu();

if (b & bit_SHA) {
/* SHANI provides excellent single-thread performance across both Intel and AMD.
On Intel, it outperforms AVX512 for small to medium workloads.
On AMD, it's consistently fast and power-efficient. */
return &hashtree_sha256_shani_x2;
}

if ((b & bit_AVX512F) && (b & bit_AVX512VL)) {
/* AVX512 optimization strategy:
- Intel: Good for large parallel workloads, but watch for frequency scaling
- AMD: Zen 4+ has good AVX512, but prefer for large datasets only */
if (intel) {
// Intel AVX512 is mature and well-optimized
return &hashtree_sha256_avx512_x16;
} else if (amd) {
// AMD Zen 4+ AVX512 is newer, use conservatively
return &hashtree_sha256_avx512_x16;
}
}

if (b & bit_AVX2) {
/* AVX2 is the sweet spot for most modern CPUs:
- Intel: Excellent performance from Haswell onwards
- AMD: Strong performance from Zen onwards */
return &hashtree_sha256_avx2_x8;
}

__get_cpuid_count(1, 0, &a, &b, &c, &d);
if (c & bit_AVX) {
/* First-gen AVX:
- Intel: Sandy Bridge/Ivy Bridge era
- AMD: Bulldozer family - less optimal, but still good */
return &hashtree_sha256_avx_x4;
}

if (c & bit_SSE2) {
/* SSE2 fallback - universally supported on x86_64 */
return &hashtree_sha256_sse_x1;
}
#endif
#ifdef __aarch64__
/* ARM64/AArch64 detection */
#ifdef __APPLE__
/* Apple Silicon always has crypto extensions */
return &hashtree_sha256_sha_x1;
#else
/* Linux ARM64 - check capabilities */
long hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_SHA2) {
/* ARM crypto extensions available */
return &hashtree_sha256_sha_x1;
}

if (hwcaps & HWCAP_ASIMD) {
/* NEON SIMD available */
return &hashtree_sha256_neon_x4;
}
/* Fallback to generic if no SIMD available */
#endif
#endif

/* Ultimate fallback for any undetected architecture:
* - Non-x86_64 and non-ARM64 architectures
* - x86_64 without SSE2 (extremely rare)
* - ARM without NEON (older 32-bit ARM)
* - Any future architectures not yet supported
* The generic implementation uses pure C code that works everywhere */
return &hashtree_sha256_generic;
}
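/* Selection order implemented above: on x86_64, SHA-NI, then AVX512F+VL, then
   AVX2, then AVX, then SSE2; on aarch64, the ARMv8 SHA-2 extension, then NEON;
   everything else falls back to the portable generic implementation. */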

// Helper function to get the name of the current implementation
const char* hashtree_get_impl_name() {
if (hash_ptr == &hashtree_sha256_generic) return "generic";
#ifdef __x86_64__
if (hash_ptr == &hashtree_sha256_sse_x1) return "sse_x1";
if (hash_ptr == &hashtree_sha256_avx_x1) return "avx_x1";
if (hash_ptr == &hashtree_sha256_avx_x4) return "avx_x4";
if (hash_ptr == &hashtree_sha256_avx2_x8) return "avx2_x8";
if (hash_ptr == &hashtree_sha256_avx512_x16) return "avx512_x16";
if (hash_ptr == &hashtree_sha256_shani_x2) return "shani_x2";
#endif
#ifdef __aarch64__
if (hash_ptr == &hashtree_sha256_sha_x1) return "arm_sha_x1";
if (hash_ptr == &hashtree_sha256_neon_x1) return "arm_neon_x1";
if (hash_ptr == &hashtree_sha256_neon_x4) return "arm_neon_x4";
#endif
return "unknown";
}
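/* Illustrative usage, not part of this patch. Assumes the public
   hashtree_hash(output, input, count) wrapper declared in hashtree.h, which
   (as in upstream hashtree) hashes `count` 64-byte blocks into `count` 32-byte
   digests, and that hashtree_init(NULL) triggers auto-detection (per
   hashtree_init below). */
#if 0
#include <stdio.h>
static void print_selected_backend(void) {
    unsigned char in[64] = {0}, out[32];
    hashtree_init(NULL);            /* NULL requests auto-detection */
    printf("hashtree backend: %s\n", hashtree_get_impl_name());
    hashtree_hash(out, in, 1);      /* hash a single 64-byte block */
}
#endif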

void hashtree_init(hashtree_hash_fcn override) {
if (override) {
hash_ptr = override;
44 changes: 25 additions & 19 deletions src/sha256_avx_x1.S
@@ -90,6 +90,15 @@ Copyright (c) 2012-2023, Intel Corporation
shld \src, \src, (32-\shf)
.endm

# Prefetch helper macros for streaming input data and the constant table
.macro PREFETCH_DATA offset
prefetchnta [DATA_PTR + \offset]
.endm

.macro PREFETCH_CONSTANTS offset
prefetchnta [TBL + \offset]
.endm
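# Usage sketch (illustrative): `PREFETCH_DATA 256` hints the cache line 256 bytes
# past DATA_PTR with a non-temporal (prefetchnta) hint; PREFETCH_CONSTANTS does
# the same relative to the round-constant table pointer in TBL.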

// COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
// Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP dst, src, msk
@@ -172,49 +181,41 @@ Copyright (c) 2012-2023, Intel Corporation
.macro FOUR_ROUNDS_AND_SCHED
//// compute s0 four at a time and s1 two at a time
//// compute W[-16] + W[-7] 4 at a time
// Optimize with better SIMD/CPU interleaving
vpalignr XTMP0, X3, X2, 4 // XTMP0 = W[-7]
vpalignr XTMP1, X1, X0, 4 // XTMP1 = W[-15]
vpaddd XTMP0, XTMP0, X0 // XTMP0 = W[-7] + W[-16]

// Start CPU operations while SIMD is working
mov y0, e_ // y0 = e
vpsrld XTMP2, XTMP1, 7
MY_ROR y0, (25-11) // y0 = e >> (25-11)
vpslld XTMP3, XTMP1, (32-7)
mov y1, a_ // y1 = a
vpor XTMP3, XTMP3, XTMP2 // XTMP3 = W[-15] ror 7
MY_ROR y1, (22-13) // y1 = a >> (22-13)
xor y0, e_ // y0 = e ^ (e >> (25-11))
mov y2, f_ // y2 = f
MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y1, a_ // y1 = a ^ (a >> (22-13))
xor y2, g_ // y2 = f^g
xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e_ // y2 = (f^g)&e
MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g_ // y2 = CH = ((f^g)&e)^g

MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 // y2 = S1 + CH
add y2, [rsp + 0*4] // y2 = k + w + S1 + CH

mov y0, a_ // y0 = a
add h_, y2 // h = h + S1 + CH + k + w
mov y2, a_ // y2 = a

or y0, c_ // y0 = a|c
add d_, h_ // d = d + h + S1 + CH + k + w
and y2, c_ // y2 = a&c

and y0, b_ // y0 = (a|c)&b
add h_, y1 // h = h + S1 + CH + k + w + S0

or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c)
add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ

@@ -458,7 +459,10 @@ hashtree_sha256_avx_x1:

lea TBL,[rip + .LK256]

//; byte swap first 16 dwords with multi-level prefetching
prefetchnta [DATA_PTR + 768] // L3 prefetch 12 blocks ahead
prefetcht2 [DATA_PTR + 384] // L2 prefetch 6 blocks ahead
prefetcht0 [DATA_PTR + 192] // L1 prefetch 3 blocks ahead
COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK
@@ -469,6 +473,7 @@
.align 32
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp], XFER
prefetcht0 [TBL + 256] // prefetch K constants to L1
FOUR_ROUNDS_AND_SCHED

vpaddd XFER, X0, [TBL + 1*16]
@@ -600,6 +605,7 @@ hashtree_sha256_avx_x1:
#endif
pop rbx

vzeroupper // Clear upper YMM state to avoid SSE transition penalty
ret
#ifdef __linux__
.size hashtree_sha256_avx_x1,.-hashtree_sha256_avx_x1
10 changes: 10 additions & 0 deletions src/sha256_avx_x16.S
@@ -964,6 +964,7 @@ hashtree_sha256_avx512_x16:
endbr64
cmp COUNT, 0
jne .Lstart_routine
vzeroupper
ret

.Lstart_routine:
@@ -977,6 +978,15 @@
cmp COUNT, 16
jb hashtree_sha256_avx2_x8

# Prefetch data 16 blocks ahead (16*64 = 1024 bytes)
cmp COUNT, 32
jb .Lskip_prefetch_x16
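# COUNT >= 32 means a second full 16-block batch follows, so [DATA_PTR + 1024]
# is still inside the caller's input buffer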
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 1024] # L3 cache hint
prefetcht2 [DATA_PTR + 1024] # L2 cache hint
prefetcht0 [DATA_PTR + 1024] # L1 cache hint
.Lskip_prefetch_x16:

# Load pre-transposed digest
vmovdqa32 A, [DIGEST + 0*64]
vmovdqa32 B, [DIGEST + 1*64]
10 changes: 10 additions & 0 deletions src/sha256_avx_x4.S
@@ -352,6 +352,7 @@ hashtree_sha256_avx_x4:
endbr64
cmp NUM_BLKS, 0
jne .Lstart_routine
vzeroupper
ret
.Lstart_routine:
sub rsp, sha256_avx_4_stack_size
@@ -373,6 +374,15 @@
cmp NUM_BLKS, 4
jl .Lsha256_4_avx_epilog

# Prefetch data 4 blocks ahead (4*64 = 256 bytes)
cmp NUM_BLKS, 8
jb .Lskip_prefetch_x4
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 256] # L3 cache hint
prefetcht2 [DATA_PTR + 256] # L2 cache hint
prefetcht0 [DATA_PTR + 256] # L1 cache hint
.Lskip_prefetch_x4:

xor ROUND, ROUND

// Load the pre-transposed incoming digest.
10 changes: 10 additions & 0 deletions src/sha256_avx_x8.S
@@ -586,6 +586,7 @@ hashtree_sha256_avx2_x8:
endbr64
cmp NUM_BLKS, 0
jne .Lstart_routine
vzeroupper
ret
.Lstart_routine:
push rbp
@@ -618,6 +619,15 @@
cmp NUM_BLKS, 8
jb .Lsha256_8_avx2_epilog

# Prefetch data 8 blocks ahead (8*64 = 512 bytes)
cmp NUM_BLKS, 16
jb .Lskip_prefetch_x8
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 512] # L3 cache hint
prefetcht2 [DATA_PTR + 512] # L2 cache hint
prefetcht0 [DATA_PTR + 512] # L1 cache hint
.Lskip_prefetch_x8:

lea TBL,[rip + .LDIGEST_8]
vmovdqa a,[TBL + 0*32]
vmovdqa b,[TBL + 1*32]