klever-io · fbsobreira · May 7, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/cmd/benchmark/CLI.md b/cmd/benchmark/CLI.md
@@ -17,6 +17,7 @@ Usage of benchmark:
   -skip-kv            Skip KV store benchmark
   -skip-memory        Skip memory bandwidth and latency benchmark
   -skip-bignum        Skip big-number / FPU benchmark
+  -skip-crypto        Skip crypto benchmark (SHA-256/Blake2b/Keccak/Ed25519)
   -output string      Output format: text or json (default: "text")
   -verbose            Enable verbose logging
   -version            Print version and exit
@@ -34,6 +35,7 @@ Usage of benchmark:
 | **KV Store** | In-memory state-access patterns (80/20 read-write) | ops/s |
 | **Memory** | DRAM bandwidth, random latency, allocator speed | GB/s, ns, M allocs/s |
 | **BigNum / FPU** | 2048-bit modexp/modmul and float64 transcendentals | ops/s |
+| **Crypto / Hashing** | SHA-256 / Blake2b / Keccak-256 / Ed25519 throughput + CPU feature flags | MB/s, ops/s |
 
 ---
 
@@ -57,22 +59,49 @@ Skipped categories are excluded from the denominator so the grade stays fair.
 
 | Category | Weight |
 |----------|--------|
-| Goroutine (CPU) | 200 |
 | Disk I/O | 200 |
 | KV Store | 200 |
-| Network | 150 |
-| Memory | 150 |
-| BigNum / FPU | 100 |
+| Crypto / Hashing | 200 |
+| Goroutine (CPU) | 150 |
+| Network | 100 |
+| Memory | 100 |
+| BigNum / FPU | 50 |
 
 | Grade | % of enabled max | Description |
 |-------|-----------------|-------------|
 | **S** | ≥ 90 % | Elite — top-tier validator hardware |
 | **A** | ≥ 75 % | Excellent — production-ready for high-traffic networks |
 | **B** | ≥ 60 % | Good — suitable for standard validator operation |
-| **C** | ≥ 45 % | Acceptable — meets minimum validator requirements |
+| **C** | ≥ 45 % | Acceptable — meets minimum validator requirements; consider a hardware upgrade |
 | **D** | ≥ 30 % | Marginal — several metrics below recommended levels |
 | **F** | < 30 % | Insufficient — does not meet validator requirements |
 
+### Hard veto: SHA-256 throughput floor
+
+Klever's TX hot path hashes SHA-256 per-TX, per-header, and per-state-entry.
+The protocol tolerates some hardware variance — a leader has a 500 ms
+baseline timeout with a 425 ms lower bound below which validators
+attribute leader failure to weak hardware. To prevent operators from
+deploying nodes that cannot consistently process TXs as leader within
+that window, the Crypto category applies a **hard veto** on the measured
+SHA-256 throughput: hosts whose 16 KiB SHA-256 throughput is below
+**500 MB/s** have their overall grade capped at **F** regardless of total
+points.
+
+The veto is grounded in the measurement, not in any specific CPU feature
+flag. SHA-NI absence is the most common cause of low throughput in
+practice (Skylake-X / Cascade Lake / Haswell on amd64), and the report
+calls this out informationally. The text report highlights the reason;
+the JSON report sets `score.grade = "F"` and `score.vetoed = true`,
+populates `score.veto_reason`, and exposes the underlying CPU flags
+under `crypto.cpu_features`.
+
+**Note on validator startup:** the `klever-node` validator binary applies
+a *stricter* CPU preflight at startup, requiring ≥ 800 MB/s on a 200 ms
+self-bench (vs the 500 MB/s benchmark veto). A host can pass this
+benchmark with a non-`F` grade and still be refused by validator
+startup. See `cmd/node/PREFLIGHT.md` for the rationale.
+
 ---
 
 ## Examples

diff --git a/cmd/benchmark/crypto.go b/cmd/benchmark/crypto.go
@@ -0,0 +1,207 @@
+package main
+
+// RunCryptoBenchmark measures the cryptographic throughput a Kleverchain
+// validator depends on for consensus, transaction hashing, and signature
+// verification. The hashing primitives matter most — Klever's TX hot path
+// hashes SHA-256 per-TX, per-header, and per-state-entry, and the difference
+// between SHA-NI-equipped and SHA-NI-deficient hardware is roughly 5–6× on
+// SHA-256 throughput, which translates almost linearly into wall-time on
+// smart-contract transactions.
+//
+// Metrics:
+//
+//   - SHA256MBps      : 1 KiB blocks; sensitive to SHA-NI startup cost.
+//   - SHA256LargeMBps : 16 KiB blocks; matches the openssl-speed reference
+//     used during the original validator-fleet investigation, and reveals
+//     sustained SHA-NI throughput separately from the small-block path.
+//   - Blake2bMBps     : 16 KiB blocks; exercises the AVX2 path in
+//     golang.org/x/crypto/blake2b.
+//   - Keccak256MBps   : 16 KiB blocks; pure-Go reference (no SIMD), used
+//     as a sanity check — Keccak should be roughly identical across CPUs
+//     of the same generation, so a large gap here means non-CPU factors
+//     are at play (frequency cap, thermal throttle, hypervisor masking).
+//   - Ed25519VerifyOpsPerSec : stdlib Ed25519 signature verification,
+//     SHA-512-bound; complements the SHA-256 numbers.
+//
+// CPU feature attestation is reported alongside the throughput numbers so
+// operators can correlate measured perf to the underlying instruction set.
+//
+// BLS-verify (MCL/herumi) is intentionally not measured here — wiring MCL
+// into the benchmark binary would force a CGO dependency on operators that
+// just want to grade their hardware. The AVX-512 IFMA flag is reported
+// instead as a proxy for the BLS-pairing fast path; hosts with that flag
+// see roughly a 1.5× speedup in herumi/MCL pairing throughput.
+
+import (
+	"crypto/ed25519"
+	"crypto/rand"
+	"crypto/sha256"
+	"fmt"
+	"hash"
+	"os"
+	"runtime"
+	"strings"
+	"time"
+
+	"github.com/klauspost/cpuid/v2"
+	"golang.org/x/crypto/blake2b"
+	"golang.org/x/crypto/sha3"
+)
+
+const (
+	cryptoBenchDuration = 2 * time.Second // time budget for each sub-benchmark's timed loop
+	cryptoSmallBlock    = 1024      // 1 KiB — buffer size for the small-block SHA-256 run
+	cryptoLargeBlock    = 16 * 1024 // 16 KiB — matches openssl-speed reference
+)
+
+// CryptoResult holds the throughput figures and CPU feature flags from one RunCryptoBenchmark run.
+type CryptoResult struct {
+	SHA256MBps             float64 // SHA-256 on 1 KiB blocks; MB/s = bytes/(1024*1024) per second
+	SHA256LargeMBps        float64 // SHA-256 throughput on 16 KiB blocks
+	Blake2bMBps            float64 // Blake2b-512 throughput on 16 KiB blocks
+	Keccak256MBps          float64 // Keccak-256 throughput on 16 KiB blocks
+	Ed25519VerifyOpsPerSec float64 // Ed25519 signature verifications per second
+
+	// CPU feature flags — reported but not directly scored. The hard veto
+	// in score.go fires on measured SHA-256 throughput, not on these flags
+	// (see BenchmarkScore.Vetoed). The flags are informational and help
+	// operators correlate measured throughput to the underlying ISA.
+	//
+	// HasSHA_NI is the cross-platform shorthand: true means SHA-256
+	// hardware acceleration is available — Intel/AMD SHA-NI on amd64,
+	// ARMv8 SHA2 on arm64 — and it is always false on other architectures.
+	// HasAVX512IFMA is reported as a proxy for the BLS pairing fast path
+	// (~1.5x speedup vs scalar fallback).
+	HasSHA_NI     bool // from hasSHAAcceleration: cpuid SHA (amd64) or SHA2 (arm64)
+	HasAVX512IFMA bool // from cpuid.AVX512IFMA
+	HasVAES       bool // from cpuid.VAES
+	HasGFNI       bool // from cpuid.GFNI
+}
+
+// RunCryptoBenchmark runs every crypto sub-benchmark in sequence — SHA-256
+// on 1 KiB and 16 KiB blocks, Blake2b and Keccak-256 on 16 KiB blocks, then
+// Ed25519 verification — and records the detected CPU feature flags next
+// to the measured numbers. clearLine is called with a stage label before
+// each measurement, and once all stages finish a run of 60 spaces plus a
+// carriage return is written to stderr.
+//
+// It returns the populated CryptoResult, or a nil result and a wrapped
+// error if the Ed25519 stage fails.
+func RunCryptoBenchmark() (*CryptoResult, error) {
+	res := &CryptoResult{
+		HasSHA_NI:     hasSHAAcceleration(),
+		HasAVX512IFMA: cpuid.CPU.Has(cpuid.AVX512IFMA),
+		HasVAES:       cpuid.CPU.Has(cpuid.VAES),
+		HasGFNI:       cpuid.CPU.Has(cpuid.GFNI),
+	}
+
+	// The hash stages all have the same shape: announce, measure, store.
+	hashStages := []struct {
+		progress  string
+		newHash   func() hash.Hash
+		blockSize int
+		out       *float64
+	}{
+		{"  Crypto: SHA-256 1 KiB blocks (%s)...", sha256.New, cryptoSmallBlock, &res.SHA256MBps},
+		{"  Crypto: SHA-256 16 KiB blocks (%s)...", sha256.New, cryptoLargeBlock, &res.SHA256LargeMBps},
+		{"  Crypto: Blake2b 16 KiB blocks (%s)...", newBlake2b512, cryptoLargeBlock, &res.Blake2bMBps},
+		{"  Crypto: Keccak-256 16 KiB blocks (%s)...", sha3.NewLegacyKeccak256, cryptoLargeBlock, &res.Keccak256MBps},
+	}
+	for _, stage := range hashStages {
+		clearLine(stage.progress, cryptoBenchDuration)
+		*stage.out = benchHashMBps(stage.newHash, stage.blockSize)
+	}
+
+	clearLine("  Crypto: Ed25519 verify (%s)...", cryptoBenchDuration)
+	verifyRate, err := benchEd25519Verify()
+	if err != nil {
+		return nil, fmt.Errorf("ed25519 verify: %w", err)
+	}
+	res.Ed25519VerifyOpsPerSec = verifyRate
+
+	// Blank out whatever the last progress label left on the terminal line.
+	blank := strings.Repeat(" ", 60)
+	fmt.Fprintf(os.Stderr, "  %s\r", blank)
+	return res, nil
+}
+
+// hasSHAAcceleration reports whether cpuid detects SHA-256 hardware
+// acceleration for the architecture this binary was built for: the SHA
+// feature (SHA-NI) on amd64, the SHA2 feature (ARMv8 SHA2) on arm64. On
+// any other architecture it returns false without consulting cpuid.
+func hasSHAAcceleration() bool {
+	arch := runtime.GOARCH
+	if arch == "amd64" {
+		return cpuid.CPU.Has(cpuid.SHA)
+	}
+	if arch == "arm64" {
+		return cpuid.CPU.Has(cpuid.SHA2)
+	}
+	return false
+}
+
+// newBlake2b512 builds an unkeyed Blake2b-512 hash. It exists so the
+// two-value blake2b.New512 constructor can be handed to benchHashMBps,
+// which expects the stdlib-style `func() hash.Hash` shape. New512 only
+// reports an error for a non-nil key of invalid length, so with a nil key
+// the error branch is not expected to run; if it ever does, that is
+// treated as a programming error and the function panics.
+func newBlake2b512() hash.Hash {
+	hasher, err := blake2b.New512(nil)
+	if err == nil {
+		return hasher
+	}
+	panic(fmt.Sprintf("blake2b.New512(nil) unexpectedly errored: %v", err))
+}
+
+// benchHashMBps reports how fast the hash built by newHash digests
+// blockSize-byte messages, sustained over roughly cryptoBenchDuration.
+//
+// One buffer is filled once from crypto/rand (falling back to a byte(i)
+// ramp if that read fails) and re-hashed on every iteration, reusing a
+// single hash instance and a single digest slice.
+//
+// The clock is consulted only once per batch of hashes, so a run can
+// overshoot the deadline by up to one batch; the rate is computed from the
+// time that actually elapsed, not from the nominal duration. The result is
+// bytes / (1024*1024) per second, or 0 if no time was observed to elapse.
+func benchHashMBps(newHash func() hash.Hash, blockSize int) float64 {
+	const batch = 256
+
+	payload := make([]byte, blockSize)
+	if _, err := rand.Read(payload); err != nil {
+		// Deterministic fallback pattern when the random read fails.
+		for i := range payload {
+			payload[i] = byte(i)
+		}
+	}
+
+	hasher := newHash()
+	sum := make([]byte, 0, hasher.Size())
+	perBatch := int64(blockSize) * batch
+
+	stopAt := time.Now().Add(cryptoBenchDuration)
+	began := time.Now()
+	var hashed int64
+	for time.Now().Before(stopAt) {
+		for range batch {
+			hasher.Reset()
+			_, _ = hasher.Write(payload)
+			sum = hasher.Sum(sum[:0])
+		}
+		hashed += perBatch
+	}
+
+	if secs := time.Since(began).Seconds(); secs > 0 {
+		return float64(hashed) / (1024 * 1024) / secs
+	}
+	return 0
+}
+
+// benchEd25519Verify generates a key pair, signs one 64-byte random
+// message, then measures how many verifications of that same signature
+// the host sustains per second over roughly cryptoBenchDuration. Verify is
+// the path that runs on every TX received by the validator, so it is more
+// relevant to fleet behavior than Sign.
+//
+// The deadline is checked once per innerLoop verifications rather than on
+// every call, matching benchHashMBps, so the cost of reading the clock
+// stays out of the per-verify figure. A run can therefore overshoot the
+// deadline by up to one batch; the rate is computed from the time that
+// actually elapsed.
+//
+// It returns an error if key generation or the random read fails, or if
+// the self-generated signature unexpectedly fails to verify. It returns
+// (0, nil) if no time was observed to elapse.
+func benchEd25519Verify() (float64, error) {
+	pub, priv, err := ed25519.GenerateKey(rand.Reader)
+	if err != nil {
+		return 0, fmt.Errorf("generating key pair: %w", err)
+	}
+	msg := make([]byte, 64)
+	if _, err := rand.Read(msg); err != nil {
+		return 0, fmt.Errorf("generating message: %w", err)
+	}
+	sig := ed25519.Sign(priv, msg)
+
+	const innerLoop = 64
+	start := time.Now()
+	deadline := start.Add(cryptoBenchDuration)
+	var count int64
+	for time.Now().Before(deadline) {
+		for range innerLoop {
+			if !ed25519.Verify(pub, msg, sig) {
+				return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly")
+			}
+		}
+		count += innerLoop
+	}
+	elapsed := time.Since(start).Seconds()
+	if elapsed <= 0 {
+		return 0, nil
+	}
+	return float64(count) / elapsed, nil
+}
-	deadline := time.Now().Add(cryptoBenchDuration)
-	start := time.Now()
-	var count int64
-	for time.Now().Before(deadline) {
-		if !ed25519.Verify(pub, msg, sig) {
-			return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly")
-		}
-		count++
-	}
-	elapsed := time.Since(start).Seconds()
-	if elapsed <= 0 {
-		return 0, nil
-	}
-	return float64(count) / elapsed, nil
-}
+	deadline := time.Now().Add(cryptoBenchDuration)
+	start := time.Now()
+	var count int64
+	const innerLoop = 64
+	for time.Now().Before(deadline) {
+		for i := 0; i < innerLoop; i++ {
+			if !ed25519.Verify(pub, msg, sig) {
+				return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly")
+			}
+		}
+		count += innerLoop
+	}
+	elapsed := time.Since(start).Seconds()
+	if elapsed <= 0 {
+		return 0, nil
+	}
+	return float64(count) / elapsed, nil
+}
-	deadline := time.Now().Add(cryptoBenchDuration)
-	start := time.Now()
-	var count int64
-	for time.Now().Before(deadline) {
-		if !ed25519.Verify(pub, msg, sig) {
-			return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly")
-		}
-		count++
-	}
-	elapsed := time.Since(start).Seconds()
-	if elapsed <= 0 {
-		return 0, nil
-	}
-	return float64(count) / elapsed, nil
-}
+	deadline := time.Now().Add(cryptoBenchDuration)
+	start := time.Now()
+	var count int64
+	const innerLoop = 64
+	for time.Now().Before(deadline) {
+		for i := 0; i < innerLoop; i++ {
+			if !ed25519.Verify(pub, msg, sig) {
+				return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly")
+			}
+		}
+		count += innerLoop
+	}
+	elapsed := time.Since(start).Seconds()
+	if elapsed <= 0 {
+		return 0, nil
+	}
+	return float64(count) / elapsed, nil
+}
diff --git a/cmd/benchmark/main.go b/cmd/benchmark/main.go
@@ -31,6 +31,7 @@ func main() {
 		skipKV        = flag.Bool("skip-kv", false, "Skip KV store benchmark")
 		skipMemory    = flag.Bool("skip-memory", false, "Skip memory bandwidth and latency benchmark")
 		skipBigNum    = flag.Bool("skip-bignum", false, "Skip big-number / FPU benchmark")
+		skipCrypto    = flag.Bool("skip-crypto", false, "Skip crypto benchmark (SHA-256/Blake2b/Keccak/Ed25519)")
 		outputFmt     = flag.String("output", "text", "Output format: text or json")
 		verbose       = flag.Bool("verbose", false, "Enable verbose logging")
 		version       = flag.Bool("version", false, "Print version and exit")
@@ -96,6 +97,7 @@ func main() {
 		SkipKV:        *skipKV,
 		SkipMemory:    *skipMemory,
 		SkipBigNum:    *skipBigNum,
+		SkipCrypto:    *skipCrypto,
 		OutputFmt:     *outputFmt,
 	}