1 change: 1 addition & 0 deletions src/Makefile
@@ -182,3 +182,4 @@ uninstall: $(libname)
rm $(DESTDIR)$(PREFIX)/include/hashtree.h
endif


87 changes: 82 additions & 5 deletions src/hashtree.c
@@ -39,48 +39,125 @@ static void init_and_hash(unsigned char *output, const unsigned char *input, uin

static hashtree_hash_fcn hash_ptr = init_and_hash;

// CPU vendor detection via the CPUID leaf-0 vendor string
static int is_intel_cpu() {
#ifdef __x86_64__
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
__get_cpuid(0, &eax, &ebx, &ecx, &edx);
// Intel signature: "GenuineIntel"
return (ebx == 0x756e6547 && edx == 0x49656e69 && ecx == 0x6c65746e);
#endif
return 0;
}

static int is_amd_cpu() {
#ifdef __x86_64__
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
__get_cpuid(0, &eax, &ebx, &ecx, &edx);
// AMD signature: "AuthenticAMD"
return (ebx == 0x68747541 && edx == 0x69746e65 && ecx == 0x444d4163);
#endif
return 0;
}
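/* Illustrative sketch, not part of this patch: where the magic numbers above
   come from. CPUID leaf 0 packs the 12-byte vendor string into EBX, EDX, ECX
   (in that order), four little-endian bytes per register, so "Genu" == 0x756e6547,
   "ineI" == 0x49656e69, "ntel" == 0x6c65746e, and likewise for "AuthenticAMD".
   A hypothetical vendor_is() helper doing the same check against the raw string
   (would need <string.h>): */
#if 0
static int vendor_is(const char *name) {            /* e.g. "GenuineIntel" */
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    uint32_t vendor[3];
    __get_cpuid(0, &eax, &ebx, &ecx, &edx);
    vendor[0] = ebx;                                 /* "Genu" */
    vendor[1] = edx;                                 /* "ineI" */
    vendor[2] = ecx;                                 /* "ntel" */
    return memcmp(vendor, name, 12) == 0;
}
#endif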


static hashtree_hash_fcn hashtree_detect() {
#ifdef __x86_64__
uint32_t a = 0, b = 0, c = 0, d = 0;
__get_cpuid_count(7, 0, &a, &b, &c, &d);

int intel = is_intel_cpu();
int amd = is_amd_cpu();

if (b & bit_SHA) {
/* SHANI provides excellent single-thread performance across both Intel and AMD.
On Intel, it outperforms AVX512 for small to medium workloads.
On AMD, it's consistently fast and power-efficient. */
return &hashtree_sha256_shani_x2;
}

if ((b & bit_AVX512F) && (b & bit_AVX512VL)) {
/* AVX512 optimization strategy:
- Intel: Good for large parallel workloads, but watch for frequency scaling
- AMD: Zen 4+ has good AVX512, but prefer for large datasets only */
if (intel) {
// Intel AVX512 is mature and well-optimized
return &hashtree_sha256_avx512_x16;
} else if (amd) {
// AMD Zen 4+ AVX512 is newer, use conservatively
return &hashtree_sha256_avx512_x16;
}
}

if (b & bit_AVX2) {
/* AVX2 is the sweet spot for most modern CPUs:
- Intel: Excellent performance from Haswell onwards
- AMD: Strong performance from Zen onwards */
return &hashtree_sha256_avx2_x8;
}

__get_cpuid_count(1, 0, &a, &b, &c, &d);
if (c & bit_AVX) {
/* First-gen AVX:
- Intel: Sandy Bridge/Ivy Bridge era
- AMD: Bulldozer family - less optimal, but still good */
return &hashtree_sha256_avx_x4;
}

if (c & bit_SSE2) {
/* SSE2 fallback - universally supported on x86_64 */
return &hashtree_sha256_sse_x1;
}
#endif
#ifdef __aarch64__
/* ARM64/AArch64 detection */
#ifdef __APPLE__
/* Apple Silicon always has crypto extensions */
return &hashtree_sha256_sha_x1;
#else
/* Linux ARM64 - check capabilities */
long hwcaps = getauxval(AT_HWCAP);
if (hwcaps & HWCAP_SHA2) {
/* ARM crypto extensions available */
return &hashtree_sha256_sha_x1;
}

if (hwcaps & HWCAP_ASIMD) {
/* NEON SIMD available */
return &hashtree_sha256_neon_x4;
}
/* Fallback to generic if no SIMD available */
#endif
#endif

/* Ultimate fallback for any undetected architecture:
* - Non-x86_64 and non-ARM64 architectures
* - x86_64 without SSE2 (extremely rare)
* - ARM without NEON (older 32-bit ARM)
* - Any future architectures not yet supported
* The generic implementation uses pure C code that works everywhere */
return &hashtree_sha256_generic;
}
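/* Selection order implemented above: on x86_64, SHA-NI, then AVX512F+VL, then
   AVX2, then AVX, then SSE2; on aarch64, the ARMv8 SHA-2 extension, then NEON;
   everything else falls back to the portable generic implementation. */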

// Helper function to get the name of the current implementation
const char* hashtree_get_impl_name() {
if (hash_ptr == &hashtree_sha256_generic) return "generic";
#ifdef __x86_64__
if (hash_ptr == &hashtree_sha256_sse_x1) return "sse_x1";
if (hash_ptr == &hashtree_sha256_avx_x1) return "avx_x1";
if (hash_ptr == &hashtree_sha256_avx_x4) return "avx_x4";
if (hash_ptr == &hashtree_sha256_avx2_x8) return "avx2_x8";
if (hash_ptr == &hashtree_sha256_avx512_x16) return "avx512_x16";
if (hash_ptr == &hashtree_sha256_shani_x2) return "shani_x2";
#endif
#ifdef __aarch64__
if (hash_ptr == &hashtree_sha256_sha_x1) return "arm_sha_x1";
if (hash_ptr == &hashtree_sha256_neon_x1) return "arm_neon_x1";
if (hash_ptr == &hashtree_sha256_neon_x4) return "arm_neon_x4";
#endif
return "unknown";
}
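/* Illustrative usage, not part of this patch. Assumes the public
   hashtree_hash(output, input, count) wrapper declared in hashtree.h, which
   (as in upstream hashtree) hashes `count` 64-byte blocks into `count` 32-byte
   digests, and that hashtree_init(NULL) triggers auto-detection (per
   hashtree_init below). */
#if 0
#include <stdio.h>
static void print_selected_backend(void) {
    unsigned char in[64] = {0}, out[32];
    hashtree_init(NULL);            /* NULL requests auto-detection */
    printf("hashtree backend: %s\n", hashtree_get_impl_name());
    hashtree_hash(out, in, 1);      /* hash a single 64-byte block */
}
#endif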

void hashtree_init(hashtree_hash_fcn override) {
if (override) {
hash_ptr = override;
44 changes: 25 additions & 19 deletions src/sha256_avx_x1.S
@@ -90,6 +90,15 @@ Copyright (c) 2012-2023, Intel Corporation
shld \src, \src, (32-\shf)
.endm

# Prefetch helper macros for streaming input data and the constant table
.macro PREFETCH_DATA offset
prefetchnta [DATA_PTR + \offset]
.endm

.macro PREFETCH_CONSTANTS offset
prefetchnta [TBL + \offset]
.endm
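# Usage sketch (illustrative): `PREFETCH_DATA 256` hints the cache line 256 bytes
# past DATA_PTR with a non-temporal (prefetchnta) hint; PREFETCH_CONSTANTS does
# the same relative to the round-constant table pointer in TBL.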

// COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
// Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP dst, src, msk
@@ -172,49 +181,41 @@ Copyright (c) 2012-2023, Intel Corporation
.macro FOUR_ROUNDS_AND_SCHED
//// compute s0 four at a time and s1 two at a time
//// compute W[-16] + W[-7] 4 at a time
// Optimize with better SIMD/CPU interleaving
vpalignr XTMP0, X3, X2, 4 // XTMP0 = W[-7]
vpalignr XTMP1, X1, X0, 4 // XTMP1 = W[-15]
vpaddd XTMP0, XTMP0, X0 // XTMP0 = W[-7] + W[-16]

// Start CPU operations while SIMD is working
mov y0, e_ // y0 = e
vpsrld XTMP2, XTMP1, 7
MY_ROR y0, (25-11) // y0 = e >> (25-11)
vpslld XTMP3, XTMP1, (32-7)
mov y1, a_ // y1 = a
vpor XTMP3, XTMP3, XTMP2 // XTMP3 = W[-15] ror 7
MY_ROR y1, (22-13) // y1 = a >> (22-13)
xor y0, e_ // y0 = e ^ (e >> (25-11))
mov y2, f_ // y2 = f
MY_ROR y0, (11-6) // y0 = (e >> (11-6)) ^ (e >> (25-6))
xor y1, a_ // y1 = a ^ (a >> (22-13))
xor y2, g_ // y2 = f^g
xor y0, e_ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
and y2, e_ // y2 = (f^g)&e
MY_ROR y1, (13-2) // y1 = (a >> (13-2)) ^ (a >> (22-2))
xor y1, a_ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
MY_ROR y0, 6 // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
xor y2, g_ // y2 = CH = ((f^g)&e)^g

MY_ROR y1, 2 // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
add y2, y0 // y2 = S1 + CH
add y2, [rsp + 0*4] // y2 = k + w + S1 + CH

mov y0, a_ // y0 = a
add h_, y2 // h = h + S1 + CH + k + w
mov y2, a_ // y2 = a

or y0, c_ // y0 = a|c
add d_, h_ // d = d + h + S1 + CH + k + w
and y2, c_ // y2 = a&c

and y0, b_ // y0 = (a|c)&b
add h_, y1 // h = h + S1 + CH + k + w + S0

or y0, y2 // y0 = MAJ = (a|c)&b)|(a&c)
add h_, y0 // h = h + S1 + CH + k + w + S0 + MAJ

@@ -458,7 +459,10 @@ hashtree_sha256_avx_x1:

lea TBL,[rip + .LK256]

//; byte swap first 16 dwords with multi-level prefetching
prefetchnta [DATA_PTR + 768] // L3 prefetch 12 blocks ahead
prefetcht2 [DATA_PTR + 384] // L2 prefetch 6 blocks ahead
prefetcht0 [DATA_PTR + 192] // L1 prefetch 3 blocks ahead
COPY_XMM_AND_BSWAP X0, [DATA_PTR + 0*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X1, [DATA_PTR + 1*16], BYTE_FLIP_MASK
COPY_XMM_AND_BSWAP X2, [DATA_PTR + 2*16], BYTE_FLIP_MASK
@@ -469,6 +473,7 @@
.align 32
vpaddd XFER, X0, [TBL + 0*16]
vmovdqa [rsp], XFER
prefetcht0 [TBL + 256] // prefetch K constants to L1
FOUR_ROUNDS_AND_SCHED

vpaddd XFER, X0, [TBL + 1*16]
@@ -600,6 +605,7 @@ hashtree_sha256_avx_x1:
#endif
pop rbx

vzeroupper // Clear upper YMM state to avoid SSE transition penalty
ret
#ifdef __linux__
.size hashtree_sha256_avx_x1,.-hashtree_sha256_avx_x1
10 changes: 10 additions & 0 deletions src/sha256_avx_x16.S
@@ -964,6 +964,7 @@ hashtree_sha256_avx512_x16:
endbr64
cmp COUNT, 0
jne .Lstart_routine
vzeroupper
ret

.Lstart_routine:
@@ -977,6 +978,15 @@
cmp COUNT, 16
jb hashtree_sha256_avx2_x8

# Prefetch data 16 blocks ahead (16*64 = 1024 bytes)
cmp COUNT, 32
jb .Lskip_prefetch_x16
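# COUNT >= 32 means a second full 16-block batch follows, so [DATA_PTR + 1024]
# is still inside the caller's input buffer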
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 1024] # L3 cache hint
prefetcht2 [DATA_PTR + 1024] # L2 cache hint
prefetcht0 [DATA_PTR + 1024] # L1 cache hint
.Lskip_prefetch_x16:

# Load pre-transposed digest
vmovdqa32 A, [DIGEST + 0*64]
vmovdqa32 B, [DIGEST + 1*64]
10 changes: 10 additions & 0 deletions src/sha256_avx_x4.S
@@ -352,6 +352,7 @@ hashtree_sha256_avx_x4:
endbr64
cmp NUM_BLKS, 0
jne .Lstart_routine
vzeroupper
ret
.Lstart_routine:
sub rsp, sha256_avx_4_stack_size
@@ -373,6 +374,15 @@
cmp NUM_BLKS, 4
jl .Lsha256_4_avx_epilog

# Prefetch data 4 blocks ahead (4*64 = 256 bytes)
cmp NUM_BLKS, 8
jb .Lskip_prefetch_x4
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 256] # L3 cache hint
prefetcht2 [DATA_PTR + 256] # L2 cache hint
prefetcht0 [DATA_PTR + 256] # L1 cache hint
.Lskip_prefetch_x4:

xor ROUND, ROUND

// Load the pre-transposed incoming digest.
10 changes: 10 additions & 0 deletions src/sha256_avx_x8.S
@@ -586,6 +586,7 @@ hashtree_sha256_avx2_x8:
endbr64
cmp NUM_BLKS, 0
jne .Lstart_routine
vzeroupper
ret
.Lstart_routine:
push rbp
@@ -618,6 +619,15 @@
cmp NUM_BLKS, 8
jb .Lsha256_8_avx2_epilog

# Prefetch data 8 blocks ahead (8*64 = 512 bytes)
cmp NUM_BLKS, 16
jb .Lskip_prefetch_x8
# Multi-level prefetching for better cache utilization
prefetchnta [DATA_PTR + 512] # L3 cache hint
prefetcht2 [DATA_PTR + 512] # L2 cache hint
prefetcht0 [DATA_PTR + 512] # L1 cache hint
.Lskip_prefetch_x8:

lea TBL,[rip + .LDIGEST_8]
vmovdqa a,[TBL + 0*32]
vmovdqa b,[TBL + 1*32]