From 2a691af493e3fbf680e54c1501aeec912b5b2037 Mon Sep 17 00:00:00 2001 From: Fernando Sobreira Date: Thu, 7 May 2026 19:57:34 -0400 Subject: [PATCH 1/6] [KLC-2387] add validator CPU preflight and benchmark crypto category Adds a startup-time SHA-256 throughput gate for validator nodes plus a crypto benchmark category in klever-benchmark. The preflight refuses to start a validator measuring below 800 MB/s on a 200 ms self-bench (the floor below which leader-mode TX processing exceeds the 425 ms protocol hardware-tolerance window). Operators control enforcement via preferences.enforceCpuPreflight (defaults to true, including for existing configs that omit the key); KLEVER_SKIP_CPU_CHECK=1 is an emergency bypass. The standalone klever-benchmark gains a Crypto category (SHA-256 1K/16K, Blake2b, Keccak-256, Ed25519 verify) with CPU feature attestation and a hard grade-F veto below 500 MB/s SHA-256. JSON output exposes vetoed and veto_reason for fleet tooling. Field validation across the validator fleet confirms the SHA-NI gap: non-SHA-NI guests measure 244-283 MB/s while EPYC hosts measure 1474-1674 MB/s, matching the ~5x ratio that motivated the gate. 
--- cmd/benchmark/CLI.md | 39 +++++- cmd/benchmark/crypto.go | 200 +++++++++++++++++++++++++++++ cmd/benchmark/main.go | 2 + cmd/benchmark/report.go | 182 +++++++++++++++++++++++---- cmd/benchmark/runner.go | 11 ++ cmd/benchmark/score.go | 90 +++++++++++-- cmd/benchmark/score_test.go | 181 ++++++++++++++++++++++++++ cmd/node/PREFLIGHT.md | 108 ++++++++++++++++ cmd/node/preflight.go | 164 ++++++++++++++++++++++++ cmd/node/preflight_test.go | 244 ++++++++++++++++++++++++++++++++++++ cmd/node/startup.go | 11 ++ config/node/config.yaml | 11 ++ config/prefsConfig.go | 21 ++++ 13 files changed, 1225 insertions(+), 39 deletions(-) create mode 100644 cmd/benchmark/crypto.go create mode 100644 cmd/benchmark/score_test.go create mode 100644 cmd/node/PREFLIGHT.md create mode 100644 cmd/node/preflight.go create mode 100644 cmd/node/preflight_test.go diff --git a/cmd/benchmark/CLI.md b/cmd/benchmark/CLI.md index 9eec2114..727c4668 100644 --- a/cmd/benchmark/CLI.md +++ b/cmd/benchmark/CLI.md @@ -17,6 +17,7 @@ Usage of benchmark: -skip-kv Skip KV store benchmark -skip-memory Skip memory bandwidth and latency benchmark -skip-bignum Skip big-number / FPU benchmark + -skip-crypto Skip crypto benchmark (SHA-256/Blake2b/Keccak/Ed25519) -output string Output format: text or json (default: "text") -verbose Enable verbose logging -version Print version and exit @@ -34,6 +35,7 @@ Usage of benchmark: | **KV Store** | In-memory state-access patterns (80/20 read-write) | ops/s | | **Memory** | DRAM bandwidth, random latency, allocator speed | GB/s, ns, M allocs/s | | **BigNum / FPU** | 2048-bit modexp/modmul and float64 transcendentals | ops/s | +| **Crypto / Hashing** | SHA-256 / Blake2b / Keccak-256 / Ed25519 throughput + CPU feature flags | MB/s, ops/s | --- @@ -57,22 +59,49 @@ Skipped categories are excluded from the denominator so the grade stays fair. 
| Category | Weight | |----------|--------| -| Goroutine (CPU) | 200 | | Disk I/O | 200 | | KV Store | 200 | -| Network | 150 | -| Memory | 150 | -| BigNum / FPU | 100 | +| Crypto / Hashing | 200 | +| Goroutine (CPU) | 150 | +| Network | 100 | +| Memory | 100 | +| BigNum / FPU | 50 | | Grade | % of enabled max | Description | |-------|-----------------|-------------| | **S** | ≥ 90 % | Elite — top-tier validator hardware | | **A** | ≥ 75 % | Excellent — production-ready for high-traffic networks | | **B** | ≥ 60 % | Good — suitable for standard validator operation | -| **C** | ≥ 45 % | Acceptable — meets minimum validator requirements | +| **C** | ≥ 45 % | Acceptable — meets minimum validator requirements; consider a hardware upgrade | | **D** | ≥ 30 % | Marginal — several metrics below recommended levels | | **F** | < 30 % | Insufficient — does not meet validator requirements | +### Hard veto: SHA-256 throughput floor + +Klever's TX hot path hashes SHA-256 per-TX, per-header, and per-state-entry. +The protocol tolerates some hardware variance — a leader has a 500 ms +baseline timeout with a 425 ms lower bound below which validators +attribute leader failure to weak hardware. To prevent operators from +deploying nodes that cannot consistently process TXs as leader within +that window, the Crypto category applies a **hard veto** on the measured +SHA-256 throughput: hosts whose 16 KiB SHA-256 throughput is below +**500 MB/s** have their overall grade capped at **F** regardless of total +points. + +The veto is grounded in the measurement, not in any specific CPU feature +flag. SHA-NI absence is the most common cause of low throughput in +practice (Skylake-X / Cascade Lake / Haswell on amd64), and the report +calls this out informationally. The text report highlights the reason; +the JSON report sets `score.grade = "F"`, populates `score.vetoed=true` +and `score.veto_reason`, and exposes the underlying CPU flags under +`crypto.cpu_features`. 
+ +**Note on validator startup:** the `klever-node` validator binary applies +a *stricter* CPU preflight at startup, requiring ≥ 800 MB/s on a 200 ms +self-bench (vs the 500 MB/s benchmark veto). A host can pass this +benchmark with a non-`F` grade and still be refused by validator +startup. See `cmd/node/PREFLIGHT.md` for the rationale. + --- ## Examples diff --git a/cmd/benchmark/crypto.go b/cmd/benchmark/crypto.go new file mode 100644 index 00000000..2217ae10 --- /dev/null +++ b/cmd/benchmark/crypto.go @@ -0,0 +1,200 @@ +package main + +// RunCryptoBenchmark measures the cryptographic throughput a Kleverchain +// validator depends on for consensus, transaction hashing, and signature +// verification. The hashing primitives matter most — Klever's TX hot path +// hashes SHA-256 per-TX, per-header, and per-state-entry, and the difference +// between SHA-NI-equipped and SHA-NI-deficient hardware is roughly 5–6× on +// SHA-256 throughput, which translates almost linearly into wall-time on +// smart-contract transactions. +// +// Metrics: +// +// - SHA256MBps : 1 KiB blocks; sensitive to SHA-NI startup cost. +// - SHA256LargeMBps : 16 KiB blocks; matches the openssl-speed reference +// used during the original validator-fleet investigation, and reveals +// sustained SHA-NI throughput separately from the small-block path. +// - Blake2bMBps : 16 KiB blocks; exercises the AVX2 path in +// golang.org/x/crypto/blake2b. +// - Keccak256MBps : 16 KiB blocks; pure-Go reference (no SIMD), used +// as a sanity check — Keccak should be roughly identical across CPUs +// of the same generation, so a large gap here means non-CPU factors +// are at play (frequency cap, thermal throttle, hypervisor masking). +// - Ed25519VerifyOpsPerSec : stdlib Ed25519 signature verification, +// SHA-512-bound; complements the SHA-256 numbers. +// +// CPU feature attestation is reported alongside the throughput numbers so +// operators can correlate measured perf to the underlying instruction set. 
+// +// BLS-verify (MCL/herumi) is intentionally not measured here — wiring MCL +// into the benchmark binary would force a CGO dependency on operators that +// just want to grade their hardware. The AVX-512 IFMA flag is reported +// instead as a proxy for the BLS-pairing fast path; that flag has a tight +// 1.5× correlation with herumi/MCL pairing throughput. + +import ( + "crypto/ed25519" + "crypto/rand" + "crypto/sha256" + "fmt" + "hash" + "os" + "runtime" + "strings" + "time" + + "github.com/klauspost/cpuid/v2" + "golang.org/x/crypto/blake2b" + "golang.org/x/crypto/sha3" +) + +const ( + cryptoBenchDuration = 2 * time.Second + cryptoSmallBlock = 1024 // 1 KiB + cryptoLargeBlock = 16 * 1024 // 16 KiB — matches openssl-speed reference +) + +// CryptoResult holds the per-primitive throughput and CPU feature flags. +type CryptoResult struct { + SHA256MBps float64 // SHA-256 throughput on 1 KiB blocks + SHA256LargeMBps float64 // SHA-256 throughput on 16 KiB blocks + Blake2bMBps float64 // Blake2b-512 throughput on 16 KiB blocks + Keccak256MBps float64 // Keccak-256 throughput on 16 KiB blocks + Ed25519VerifyOpsPerSec float64 // Ed25519 signature verifications per second + + // CPU feature flags — reported but not directly scored. The hard veto + // in score.go fires on the measured SHA256LargeMBps, not on HasSHA_NI, + // so every flag here is informational; AVX-512 IFMA in particular + // indicates whether the BLS pairing fast path is available. + HasSHA_NI bool + HasAVX512IFMA bool + HasVAES bool + HasGFNI bool +} + +// RunCryptoBenchmark executes all crypto sub-benchmarks and detects CPU +// feature flags. Returns a populated CryptoResult on success. 
+func RunCryptoBenchmark() (*CryptoResult, error) { + r := &CryptoResult{ + HasSHA_NI: hasSHAAcceleration(), + HasAVX512IFMA: cpuid.CPU.Has(cpuid.AVX512IFMA), + HasVAES: cpuid.CPU.Has(cpuid.VAES), + HasGFNI: cpuid.CPU.Has(cpuid.GFNI), + } + + clearLine(" Crypto: SHA-256 1 KiB blocks (%s)...", cryptoBenchDuration) + r.SHA256MBps = benchHashMBps(sha256.New, cryptoSmallBlock) + + clearLine(" Crypto: SHA-256 16 KiB blocks (%s)...", cryptoBenchDuration) + r.SHA256LargeMBps = benchHashMBps(sha256.New, cryptoLargeBlock) + + clearLine(" Crypto: Blake2b 16 KiB blocks (%s)...", cryptoBenchDuration) + r.Blake2bMBps = benchHashMBps(newBlake2b512, cryptoLargeBlock) + + clearLine(" Crypto: Keccak-256 16 KiB blocks (%s)...", cryptoBenchDuration) + r.Keccak256MBps = benchHashMBps(sha3.NewLegacyKeccak256, cryptoLargeBlock) + + clearLine(" Crypto: Ed25519 verify (%s)...", cryptoBenchDuration) + ops, err := benchEd25519Verify() + if err != nil { + return nil, fmt.Errorf("ed25519 verify: %w", err) + } + r.Ed25519VerifyOpsPerSec = ops + + fmt.Fprintf(os.Stderr, " %s\r", strings.Repeat(" ", 60)) + return r, nil +} + +// hasSHAAcceleration reports whether SHA-256 hardware acceleration is +// available on the current architecture: SHA-NI on amd64, ARMv8 SHA2 on +// arm64. Returns false on every other architecture. +func hasSHAAcceleration() bool { + switch runtime.GOARCH { + case "amd64": + return cpuid.CPU.Has(cpuid.SHA) + case "arm64": + return cpuid.CPU.Has(cpuid.SHA2) + default: + return false + } +} + +// newBlake2b512 returns a fresh Blake2b-512 hash. Wrapped to match the +// stdlib `func() hash.Hash` constructor signature so it can be passed to +// benchHashMBps directly. blake2b.New512(nil) only errors on a non-nil key +// of invalid length; passing nil never errors. 
+func newBlake2b512() hash.Hash { + h, err := blake2b.New512(nil) + if err != nil { + panic(fmt.Sprintf("blake2b.New512(nil) unexpectedly errored: %v", err)) + } + return h +} + +// benchHashMBps hashes blockSize-byte buffers for cryptoBenchDuration and +// returns sustained throughput in MB/s. +// +// The buffer is seeded once with crypto/rand so data-dependent hash +// implementations (Blake2b, Keccak) measure realistic throughput rather +// than the all-zero special case. SHA-256's amd64/arm64 fast paths are +// data-independent, so seeding does not alter their numbers. +// +// The hot loop checks the deadline once per innerLoop iterations to +// minimise time.Now() syscall overhead on fast hosts. +func benchHashMBps(newHash func() hash.Hash, blockSize int) float64 { + buf := make([]byte, blockSize) + if _, err := rand.Read(buf); err != nil { + for i := range buf { + buf[i] = byte(i) + } + } + h := newHash() + deadline := time.Now().Add(cryptoBenchDuration) + start := time.Now() + var bytes int64 + const innerLoop = 256 + for time.Now().Before(deadline) { + for range innerLoop { + h.Reset() + _, _ = h.Write(buf) + _ = h.Sum(nil) + } + bytes += int64(blockSize) * innerLoop + } + elapsed := time.Since(start).Seconds() + if elapsed <= 0 { + return 0 + } + return float64(bytes) / (1024 * 1024) / elapsed +} + +// benchEd25519Verify generates a key pair + signs once, then measures how +// many verifications per second the host can sustain. Verify is the path +// that runs on every TX received by the validator, so it is more relevant +// to fleet behavior than Sign. 
+func benchEd25519Verify() (float64, error) { + pub, priv, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return 0, err + } + msg := make([]byte, 64) + if _, err := rand.Read(msg); err != nil { + return 0, err + } + sig := ed25519.Sign(priv, msg) + + deadline := time.Now().Add(cryptoBenchDuration) + start := time.Now() + var count int64 + for time.Now().Before(deadline) { + if !ed25519.Verify(pub, msg, sig) { + return 0, fmt.Errorf("ed25519 self-verify failed unexpectedly") + } + count++ + } + elapsed := time.Since(start).Seconds() + if elapsed <= 0 { + return 0, nil + } + return float64(count) / elapsed, nil +} diff --git a/cmd/benchmark/main.go b/cmd/benchmark/main.go index 1d59d9c3..ff8f6d54 100644 --- a/cmd/benchmark/main.go +++ b/cmd/benchmark/main.go @@ -31,6 +31,7 @@ func main() { skipKV = flag.Bool("skip-kv", false, "Skip KV store benchmark") skipMemory = flag.Bool("skip-memory", false, "Skip memory bandwidth and latency benchmark") skipBigNum = flag.Bool("skip-bignum", false, "Skip big-number / FPU benchmark") + skipCrypto = flag.Bool("skip-crypto", false, "Skip crypto benchmark (SHA-256/Blake2b/Keccak/Ed25519)") outputFmt = flag.String("output", "text", "Output format: text or json") verbose = flag.Bool("verbose", false, "Enable verbose logging") version = flag.Bool("version", false, "Print version and exit") @@ -96,6 +97,7 @@ func main() { SkipKV: *skipKV, SkipMemory: *skipMemory, SkipBigNum: *skipBigNum, + SkipCrypto: *skipCrypto, OutputFmt: *outputFmt, } diff --git a/cmd/benchmark/report.go b/cmd/benchmark/report.go index c3934228..7195c17e 100644 --- a/cmd/benchmark/report.go +++ b/cmd/benchmark/report.go @@ -32,6 +32,7 @@ type BenchmarkResults struct { KVResult *KVResult MemoryResult *MemoryResult BigNumResult *BigNumResult + CryptoResult *CryptoResult } // --------------------------------------------------------------------------- @@ -123,6 +124,25 @@ const ( bigFloat64FailOps = 1_000_000.0 bigIntDivPassOps = 13_000_000.0 
bigIntDivFailOps = 3_000_000.0 + + // Crypto thresholds (calibrated against the validator-fleet investigation: + // AMD EPYC Zen4 with SHA-NI hits ~1740 MB/s at 16 KiB; Intel Skylake-IBRS + // without SHA-NI sits at ~310 MB/s on the same blocks. The fail floor is + // set above the Skylake number so any SHA-NI-deficient amd64 host fails.) + cryptoSHA256SmallPassMBps = 1_200.0 // SHA-256 on 1 KiB blocks + cryptoSHA256SmallFailMBps = 500.0 + cryptoSHA256LargePassMBps = 1_500.0 // SHA-256 on 16 KiB blocks + cryptoSHA256LargeFailMBps = 600.0 + cryptoBlake2bPassMBps = 700.0 // Blake2b-512 on 16 KiB blocks (AVX2) + cryptoBlake2bFailMBps = 300.0 + // Pure-Go Keccak (no SIMD path); calibrated against three healthy AMD + // Zen2/Zen4 chips that landed in the 295–390 MB/s range. NOTE(review): + // the 350 MB/s pass threshold is above the low end of that range, so a + // healthy chip near 295 MB/s still shows WARN — confirm the intent. + cryptoKeccak256PassMBps = 350.0 + cryptoKeccak256FailMBps = 100.0 + cryptoEd25519VerifyPassOps = 12_000.0 // Ed25519.Verify (SHA-512-bound) + cryptoEd25519VerifyFailOps = 5_000.0 ) type verdict int @@ -245,6 +265,27 @@ func memoryVerdict(r *MemoryResult) verdict { return verdictPass } +func cryptoVerdict(r *CryptoResult) verdict { + if r == nil { + return verdictSkip + } + if r.SHA256MBps < cryptoSHA256SmallFailMBps || + r.SHA256LargeMBps < cryptoSHA256LargeFailMBps || + r.Blake2bMBps < cryptoBlake2bFailMBps || + r.Keccak256MBps < cryptoKeccak256FailMBps || + r.Ed25519VerifyOpsPerSec < cryptoEd25519VerifyFailOps { + return verdictFail + } + if r.SHA256MBps < cryptoSHA256SmallPassMBps || + r.SHA256LargeMBps < cryptoSHA256LargePassMBps || + r.Blake2bMBps < cryptoBlake2bPassMBps || + r.Keccak256MBps < cryptoKeccak256PassMBps || + r.Ed25519VerifyOpsPerSec < cryptoEd25519VerifyPassOps { + return verdictWarn + } + return verdictPass +} + func bigNumVerdict(r *BigNumResult) verdict { if r == nil { return verdictSkip @@ -306,6 +347,7 @@ func printText(results 
*BenchmarkResults) { kv := kvVerdict(results.KVResult) mv := memoryVerdict(results.MemoryResult) bv := bigNumVerdict(results.BigNumResult) + cv := cryptoVerdict(results.CryptoResult) sc := ComputeScore(results) fmt.Println() @@ -315,6 +357,10 @@ func printText(results *BenchmarkResults) { fmt.Println(sep) fmt.Printf(" System : %s/%s CPUs: %d Go: %s\n", si.GOOS, si.GOARCH, si.CPUs, si.GoVersion) + if c := results.CryptoResult; c != nil { + fmt.Printf(" CPU : SHA-NI=%s AVX-512 IFMA=%s VAES=%s GFNI=%s\n", + yesNo(c.HasSHA_NI), yesNo(c.HasAVX512IFMA), yesNo(c.HasVAES), yesNo(c.HasGFNI)) + } fmt.Println(sep) if results.GoroutineResult != nil { @@ -335,6 +381,9 @@ func printText(results *BenchmarkResults) { if results.BigNumResult != nil { printBigNumSection(results.BigNumResult, bv, sep) } + if results.CryptoResult != nil { + printCryptoSection(results.CryptoResult, cv, sep) + } printScoreSection(sc, sep) fmt.Println() @@ -509,6 +558,49 @@ func printBigNumSection(r *BigNumResult, v verdict, sep string) { fmt.Println(sep) } +func printCryptoSection(r *CryptoResult, v verdict, sep string) { + fmt.Printf(" CRYPTO / HASHING %s %s\n", v.Icon(), v) + fmt.Println() + + s256V := metricVerdict(r.SHA256MBps, cryptoSHA256SmallPassMBps, cryptoSHA256SmallFailMBps) + s256LV := metricVerdict(r.SHA256LargeMBps, cryptoSHA256LargePassMBps, cryptoSHA256LargeFailMBps) + b2V := metricVerdict(r.Blake2bMBps, cryptoBlake2bPassMBps, cryptoBlake2bFailMBps) + kV := metricVerdict(r.Keccak256MBps, cryptoKeccak256PassMBps, cryptoKeccak256FailMBps) + edV := metricVerdict(r.Ed25519VerifyOpsPerSec, cryptoEd25519VerifyPassOps, cryptoEd25519VerifyFailOps) + + fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + "SHA-256 (1 KiB blocks):", r.SHA256MBps, s256V.Icon(), + cryptoSHA256SmallPassMBps, cryptoSHA256SmallFailMBps) + fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + "SHA-256 (16 KiB blocks):", r.SHA256LargeMBps, s256LV.Icon(), + cryptoSHA256LargePassMBps, 
cryptoSHA256LargeFailMBps) + fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + "Blake2b-512 (16 KiB):", r.Blake2bMBps, b2V.Icon(), + cryptoBlake2bPassMBps, cryptoBlake2bFailMBps) + fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + "Keccak-256 (16 KiB):", r.Keccak256MBps, kV.Icon(), + cryptoKeccak256PassMBps, cryptoKeccak256FailMBps) + fmt.Printf(" %-32s %s %s (pass≥%.0fK, fail<%.0fK ops/s)\n", + "Ed25519 verify:", humanOps(r.Ed25519VerifyOpsPerSec), edV.Icon(), + cryptoEd25519VerifyPassOps/1000, cryptoEd25519VerifyFailOps/1000) + + if runtime.GOARCH == "amd64" && !r.HasSHA_NI { + fmt.Println() + fmt.Println(" ! CPU lacks SHA-NI; this is the most common cause of low SHA-256 throughput.") + fmt.Println(" ! If the throughput numbers above are below the pass thresholds, migrate to") + fmt.Println(" ! AMD Zen, Intel Ice Lake-SP+, or modern ARM (with ARMv8 SHA2).") + } + fmt.Println() + fmt.Println(sep) +} + +func yesNo(b bool) string { + if b { + return "yes" + } + return "no" +} + // --------------------------------------------------------------------------- // Score section // --------------------------------------------------------------------------- @@ -516,6 +608,9 @@ func printBigNumSection(r *BigNumResult, v verdict, sep string) { func printScoreSection(s BenchmarkScore, sep string) { fmt.Printf(" SCORE : %d / %d Grade: %s %s\n", s.Total, s.MaxTotal, s.Grade, scoreGradeSummary(s.Grade)) + if s.Vetoed { + fmt.Printf(" ! 
Hard veto: %s\n", s.VetoedReason) + } fmt.Println() printScoreRow("Goroutine (CPU)", s.Goroutine, weightGoroutine) @@ -524,6 +619,7 @@ func printScoreSection(s BenchmarkScore, sep string) { printScoreRow("KV Store", s.KV, weightKV) printScoreRow("Memory", s.Memory, weightMemory) printScoreRow("BigNum / FPU", s.BigNum, weightBigNum) + printScoreRow("Crypto / Hashing", s.Crypto, weightCrypto) } func printScoreRow(name string, c CategoryScore, weight int) { @@ -599,7 +695,7 @@ func verdictSummary(v verdict) string { case verdictPass: return "This node meets Kleverchain validator requirements." case verdictWarn: - return "Performance is below recommended levels; review individual sections before deploying." + return "Performance meets minimum requirements but is below recommended levels — consider a hardware upgrade and review individual sections before deploying." case verdictFail: return "This node does NOT meet Kleverchain validator requirements." default: @@ -620,6 +716,7 @@ type jsonReport struct { KV *jsonKV `json:"kv,omitempty"` Memory *jsonMemory `json:"memory,omitempty"` BigNum *jsonBigNum `json:"bignum,omitempty"` + Crypto *jsonCrypto `json:"crypto,omitempty"` Score jsonScore `json:"score"` OverallVerdict string `json:"overall_verdict"` } @@ -632,16 +729,19 @@ type jsonCategoryScore struct { } type jsonScore struct { - Total int `json:"total"` - MaxTotal int `json:"max_total"` - Pct float64 `json:"pct"` - Grade string `json:"grade"` - Goroutine jsonCategoryScore `json:"goroutine"` - Disk jsonCategoryScore `json:"disk"` - Network jsonCategoryScore `json:"network"` - KV jsonCategoryScore `json:"kv"` - Memory jsonCategoryScore `json:"memory"` - BigNum jsonCategoryScore `json:"bignum"` + Total int `json:"total"` + MaxTotal int `json:"max_total"` + Pct float64 `json:"pct"` + Grade string `json:"grade"` + Vetoed bool `json:"vetoed"` + VetoedReason string `json:"veto_reason,omitempty"` + Goroutine jsonCategoryScore `json:"goroutine"` + Disk jsonCategoryScore 
`json:"disk"` + Network jsonCategoryScore `json:"network"` + KV jsonCategoryScore `json:"kv"` + Memory jsonCategoryScore `json:"memory"` + BigNum jsonCategoryScore `json:"bignum"` + Crypto jsonCategoryScore `json:"crypto"` } type jsonGoroutineLevel struct { @@ -700,6 +800,23 @@ type jsonBigNum struct { Verdict string `json:"verdict"` } +type jsonCPUFeatures struct { + HasSHA_NI bool `json:"sha_ni"` + HasAVX512IFMA bool `json:"avx512_ifma"` + HasVAES bool `json:"vaes"` + HasGFNI bool `json:"gfni"` +} + +type jsonCrypto struct { + SHA256MBps float64 `json:"sha256_1k_mbps"` + SHA256LargeMBps float64 `json:"sha256_16k_mbps"` + Blake2bMBps float64 `json:"blake2b_16k_mbps"` + Keccak256MBps float64 `json:"keccak256_16k_mbps"` + Ed25519VerifyOpsPerSec float64 `json:"ed25519_verify_ops_per_sec"` + CPUFeatures jsonCPUFeatures `json:"cpu_features"` + Verdict string `json:"verdict"` +} + func printJSON(results *BenchmarkResults) { gv := goroutineVerdict(results.GoroutineResult) dv := diskVerdict(results.DiskResult) @@ -707,8 +824,9 @@ func printJSON(results *BenchmarkResults) { kv := kvVerdict(results.KVResult) mv := memoryVerdict(results.MemoryResult) bv := bigNumVerdict(results.BigNumResult) + cv := cryptoVerdict(results.CryptoResult) sc := ComputeScore(results) - ov := gradeToVerdict(sc.Grade, overallVerdict(gv, dv, nv, kv, mv, bv)) + ov := gradeToVerdict(sc.Grade, overallVerdict(gv, dv, nv, kv, mv, bv, cv)) report := jsonReport{ RunAt: results.RunAt.Format(time.RFC3339), @@ -786,17 +904,37 @@ func printJSON(results *BenchmarkResults) { } } + if r := results.CryptoResult; r != nil { + report.Crypto = &jsonCrypto{ + SHA256MBps: r.SHA256MBps, + SHA256LargeMBps: r.SHA256LargeMBps, + Blake2bMBps: r.Blake2bMBps, + Keccak256MBps: r.Keccak256MBps, + Ed25519VerifyOpsPerSec: r.Ed25519VerifyOpsPerSec, + CPUFeatures: jsonCPUFeatures{ + HasSHA_NI: r.HasSHA_NI, + HasAVX512IFMA: r.HasAVX512IFMA, + HasVAES: r.HasVAES, + HasGFNI: r.HasGFNI, + }, + Verdict: cv.String(), + } + } + 
report.Score = jsonScore{ - Total: sc.Total, - MaxTotal: sc.MaxTotal, - Pct: sc.Pct, - Grade: sc.Grade, - Goroutine: jsonCategoryScore{Points: sc.Goroutine.Points, Max: sc.Goroutine.Max, Pct: sc.Goroutine.Pct(), Skipped: sc.Goroutine.Skipped}, - Disk: jsonCategoryScore{Points: sc.Disk.Points, Max: sc.Disk.Max, Pct: sc.Disk.Pct(), Skipped: sc.Disk.Skipped}, - Network: jsonCategoryScore{Points: sc.Network.Points, Max: sc.Network.Max, Pct: sc.Network.Pct(), Skipped: sc.Network.Skipped}, - KV: jsonCategoryScore{Points: sc.KV.Points, Max: sc.KV.Max, Pct: sc.KV.Pct(), Skipped: sc.KV.Skipped}, - Memory: jsonCategoryScore{Points: sc.Memory.Points, Max: sc.Memory.Max, Pct: sc.Memory.Pct(), Skipped: sc.Memory.Skipped}, - BigNum: jsonCategoryScore{Points: sc.BigNum.Points, Max: sc.BigNum.Max, Pct: sc.BigNum.Pct(), Skipped: sc.BigNum.Skipped}, + Total: sc.Total, + MaxTotal: sc.MaxTotal, + Pct: sc.Pct, + Grade: sc.Grade, + Vetoed: sc.Vetoed, + VetoedReason: sc.VetoedReason, + Goroutine: jsonCategoryScore{Points: sc.Goroutine.Points, Max: sc.Goroutine.Max, Pct: sc.Goroutine.Pct(), Skipped: sc.Goroutine.Skipped}, + Disk: jsonCategoryScore{Points: sc.Disk.Points, Max: sc.Disk.Max, Pct: sc.Disk.Pct(), Skipped: sc.Disk.Skipped}, + Network: jsonCategoryScore{Points: sc.Network.Points, Max: sc.Network.Max, Pct: sc.Network.Pct(), Skipped: sc.Network.Skipped}, + KV: jsonCategoryScore{Points: sc.KV.Points, Max: sc.KV.Max, Pct: sc.KV.Pct(), Skipped: sc.KV.Skipped}, + Memory: jsonCategoryScore{Points: sc.Memory.Points, Max: sc.Memory.Max, Pct: sc.Memory.Pct(), Skipped: sc.Memory.Skipped}, + BigNum: jsonCategoryScore{Points: sc.BigNum.Points, Max: sc.BigNum.Max, Pct: sc.BigNum.Pct(), Skipped: sc.BigNum.Skipped}, + Crypto: jsonCategoryScore{Points: sc.Crypto.Points, Max: sc.Crypto.Max, Pct: sc.Crypto.Pct(), Skipped: sc.Crypto.Skipped}, } enc := json.NewEncoder(os.Stdout) diff --git a/cmd/benchmark/runner.go b/cmd/benchmark/runner.go index b11b7f00..796fac83 100644 --- 
a/cmd/benchmark/runner.go +++ b/cmd/benchmark/runner.go @@ -19,6 +19,7 @@ type Config struct { SkipKV bool SkipMemory bool SkipBigNum bool + SkipCrypto bool OutputFmt string } @@ -102,5 +103,15 @@ func (r *Runner) Run() (*BenchmarkResults, error) { results.BigNumResult = br } + if !r.cfg.SkipCrypto { + fmt.Fprintf(os.Stderr, "Running crypto benchmark (SHA-256/Blake2b/Keccak/Ed25519, %s/primitive)...\n", + cryptoBenchDuration) + cr, err := RunCryptoBenchmark() + if err != nil { + return nil, fmt.Errorf("crypto benchmark: %w", err) + } + results.CryptoResult = cr + } + return results, nil } diff --git a/cmd/benchmark/score.go b/cmd/benchmark/score.go index 40329fe9..41c50737 100644 --- a/cmd/benchmark/score.go +++ b/cmd/benchmark/score.go @@ -13,27 +13,47 @@ package main // // Point weights (total = 1000 when all categories are enabled): // -// Goroutine (CPU scalability) 200 // Disk I/O 200 // KV Store (state access) 200 -// Network (P2P stack) 150 -// Memory (DRAM + allocator) 150 -// BigNum / FPU 100 +// Crypto / Hashing 200 ← consensus + TX hashing +// Goroutine (CPU scalability) 150 +// Network (P2P stack) 100 +// Memory (DRAM + allocator) 100 +// BigNum / FPU 50 +// +// The Crypto category is gated by a hard veto on measured SHA-256 +// throughput: hosts that cannot sustain enough hashing throughput to keep +// leader-mode TX processing within the protocol's hardware-tolerance +// window (lowerBound = 425 ms = 85% of the 500 ms baseTimeout) have their +// overall grade capped at F regardless of total points. SHA-NI absence is +// the most common reason for low throughput in practice but is not +// asserted as the sole cause — the veto is grounded in the measured +// number, not a CPU flag. See ComputeScore for the gating logic. 
// // Grade thresholds (% of enabled max): // // ≥ 90 % → S Elite — top-tier validator hardware // ≥ 75 % → A Excellent — production-ready for high-traffic networks // ≥ 60 % → B Good — suitable for standard validator operation -// ≥ 45 % → C Acceptable — meets minimum validator requirements +// ≥ 45 % → C Acceptable — meets minimum validator requirements; consider a hardware upgrade // ≥ 30 % → D Marginal — several metrics below recommended levels // < 30 % → F Insufficient — does not meet validator requirements import ( + "fmt" "math" "time" ) +// minLeaderSHA256MBps is the SHA-256 throughput floor below which a node +// cannot reliably process TXs as leader within the protocol's hardware- +// tolerance window (lowerBound = 425 ms). Calibrated from field data: a +// validator measured at ~250 MB/s on 16 KiB blocks took ~600 ms to process +// a representative SC TX as leader, well above the 425 ms lowerBound. +// Setting the floor at 500 MB/s gives ~2× margin to the lowerBound and +// matches the 1 KiB SHA-256 fail floor (the 16 KiB fail floor is 600 MB/s). +const minLeaderSHA256MBps = 500.0 + // --------------------------------------------------------------------------- // Excellent thresholds (score = 100) // --------------------------------------------------------------------------- @@ -72,6 +92,14 @@ const ( bigModMulExcellentOps = 2_000_000.0 bigFloat64ExcellentOps = 10_000_000.0 bigIntDivExcellentOps = 30_000_000.0 + + // Crypto excellent ceilings (matches AMD Zen4 with SHA-NI; openssl-speed + // 16 KiB SHA-256 ≈ 1.7 GB/s, Blake2b ≈ 0.9 GB/s; ed25519 stdlib ≈ 30K/s). 
+ cryptoSHA256SmallExcellentMBps = 1_500.0 + cryptoSHA256LargeExcellentMBps = 1_800.0 + cryptoBlake2bExcellentMBps = 900.0 + cryptoKeccak256ExcellentMBps = 600.0 + cryptoEd25519VerifyExcellentOps = 25_000.0 ) // --------------------------------------------------------------------------- @@ -79,12 +107,13 @@ const ( // --------------------------------------------------------------------------- const ( - weightGoroutine = 200 weightDisk = 200 weightKV = 200 - weightNetwork = 150 - weightMemory = 150 - weightBigNum = 100 + weightCrypto = 200 + weightGoroutine = 150 + weightNetwork = 100 + weightMemory = 100 + weightBigNum = 50 ) // --------------------------------------------------------------------------- @@ -115,11 +144,16 @@ type BenchmarkScore struct { KV CategoryScore Memory CategoryScore BigNum CategoryScore + Crypto CategoryScore Total int // sum of enabled category points MaxTotal int // sum of enabled category maxes Pct float64 // Total / MaxTotal (0.0–1.0); 0 if nothing enabled Grade string // S / A / B / C / D / F + // Vetoed is true when a hard requirement failed (e.g., measured SHA-256 + // throughput below the floor). When set, Grade is forced to "F" regardless of point total. + Vetoed bool + VetoedReason string } // ComputeScore builds a BenchmarkScore from all benchmark results. 
@@ -136,8 +170,9 @@ func ComputeScore(r *BenchmarkResults) BenchmarkScore { s.KV = scoreCategory(kvCatScore(r.KVResult), weightKV, r.KVResult == nil) s.Memory = scoreCategory(memoryCatScore(r.MemoryResult), weightMemory, r.MemoryResult == nil) s.BigNum = scoreCategory(bigNumCatScore(r.BigNumResult), weightBigNum, r.BigNumResult == nil) + s.Crypto = scoreCategory(cryptoCatScore(r.CryptoResult), weightCrypto, r.CryptoResult == nil) - for _, c := range []CategoryScore{s.Goroutine, s.Disk, s.Network, s.KV, s.Memory, s.BigNum} { + for _, c := range []CategoryScore{s.Goroutine, s.Disk, s.Network, s.KV, s.Memory, s.BigNum, s.Crypto} { s.Total += c.Points s.MaxTotal += c.Max } @@ -147,6 +182,24 @@ func ComputeScore(r *BenchmarkResults) BenchmarkScore { } else { s.Grade = "N/A" } + + // Hard veto: SHA-256 throughput below the leader-mode floor caps the + // grade at F regardless of other category scores. Field investigation + // of slow validators showed measured SHA-256 throughput correlates + // with leader-mode TX processing time well enough to predict whether + // a node will exceed the protocol's hardware-tolerance window. SHA-NI + // absence is the most common cause of low throughput on amd64 but is + // not asserted as the sole cause — the veto fires on the measurement. + if c := r.CryptoResult; c != nil && c.SHA256LargeMBps < minLeaderSHA256MBps { + s.Vetoed = true + s.VetoedReason = fmt.Sprintf( + "SHA-256 throughput %.0f MB/s < %.0f MB/s minimum — node likely cannot sustain "+ + "leader-mode TX processing within the consensus hardware-tolerance window. 
"+ + "Most common cause: missing SHA-NI on amd64 (Skylake-X / Cascade Lake / Haswell)", + c.SHA256LargeMBps, minLeaderSHA256MBps) + s.Grade = "F" + } + return s } @@ -221,6 +274,19 @@ func bigNumCatScore(r *BigNumResult) float64 { ) } +func cryptoCatScore(r *CryptoResult) float64 { + if r == nil { + return 0 + } + return mean( + normHigh(r.SHA256MBps, cryptoSHA256SmallFailMBps, cryptoSHA256SmallExcellentMBps), + normHigh(r.SHA256LargeMBps, cryptoSHA256LargeFailMBps, cryptoSHA256LargeExcellentMBps), + normHigh(r.Blake2bMBps, cryptoBlake2bFailMBps, cryptoBlake2bExcellentMBps), + normHigh(r.Keccak256MBps, cryptoKeccak256FailMBps, cryptoKeccak256ExcellentMBps), + normHigh(r.Ed25519VerifyOpsPerSec, cryptoEd25519VerifyFailOps, cryptoEd25519VerifyExcellentOps), + ) +} + // --------------------------------------------------------------------------- // Helpers // --------------------------------------------------------------------------- @@ -300,9 +366,9 @@ func scoreGradeSummary(g string) string { case "B": return "Good — suitable for standard validator operation" case "C": - return "Below standard — not recommended for production use" + return "Acceptable — meets minimum validator requirements; consider a hardware upgrade" case "D": - return "Poor — critical subsystems underperform validator requirements" + return "Marginal — several metrics below recommended levels" case "N/A": return "No benchmarks were run; all categories were skipped." default: diff --git a/cmd/benchmark/score_test.go b/cmd/benchmark/score_test.go new file mode 100644 index 00000000..7b0b0ceb --- /dev/null +++ b/cmd/benchmark/score_test.go @@ -0,0 +1,181 @@ +package main + +import ( + "testing" +) + +// excellentResults returns a fully-populated BenchmarkResults whose values +// sit at the excellent ceiling for every category — used to verify the +// max-points totals. 
+func excellentResults() *BenchmarkResults { + return &BenchmarkResults{ + GoroutineResult: &GoroutineResult{CPUEfficiency: cpuEffExcellentPct, NumCPU: 8}, + DiskResult: &DiskResult{ + SeqWriteMBps: seqWriteExcellentMBps, + SeqReadMBps: seqReadExcellentMBps, + RandWriteIOPS: randWriteExcellentIPS, + RandReadIOPS: randReadExcellentIPS, + }, + NetworkResult: &NetworkResult{ + LatP50: 1, // 1 ns → effectively 0 µs (excellent) + LatP99: 1, + ThroughputMBps: netThroughputExcellentMBps, + }, + KVResult: &KVResult{ + WriteOpsPerSec: kvWriteExcellentOps, + ReadOpsPerSec: kvReadExcellentOps, + MixedOpsPerSec: kvMixedExcellentOps, + }, + MemoryResult: &MemoryResult{ + SeqReadGBps: memSeqReadExcellentGBps, + SeqWriteGBps: memSeqWriteExcellentGBps, + RandLatencyNs: 1, // ≈ 0 ns → excellent + AllocMOpsPerS: memAllocExcellentMOps, + }, + BigNumResult: &BigNumResult{ + ModExpOpsPerSec: bigModExpExcellentOps, + ModMulOpsPerSec: bigModMulExcellentOps, + Float64OpsPerSec: bigFloat64ExcellentOps, + IntDivOpsPerSec: bigIntDivExcellentOps, + }, + CryptoResult: &CryptoResult{ + SHA256MBps: cryptoSHA256SmallExcellentMBps, + SHA256LargeMBps: cryptoSHA256LargeExcellentMBps, + Blake2bMBps: cryptoBlake2bExcellentMBps, + Keccak256MBps: cryptoKeccak256ExcellentMBps, + Ed25519VerifyOpsPerSec: cryptoEd25519VerifyExcellentOps, + HasSHA_NI: true, + HasAVX512IFMA: true, + }, + } +} + +func TestComputeScore_TotalMaxIs1000WhenAllEnabled(t *testing.T) { + s := ComputeScore(excellentResults()) + if s.MaxTotal != 1000 { + t.Fatalf("MaxTotal = %d, want 1000 (rebalance must keep weights summing to 1000)", s.MaxTotal) + } + if s.Total != 1000 { + t.Fatalf("Total = %d, want 1000 with excellent inputs across the board", s.Total) + } + if s.Grade != "S" { + t.Fatalf("Grade = %q, want S with 100%% score", s.Grade) + } +} + +func TestComputeScore_CryptoWeightIs200(t *testing.T) { + s := ComputeScore(excellentResults()) + if s.Crypto.Max != 200 { + t.Fatalf("Crypto.Max = %d, want 200", s.Crypto.Max) + } + if 
s.Crypto.Points != 200 { + t.Fatalf("Crypto.Points = %d, want 200 with excellent inputs", s.Crypto.Points) + } +} + +func TestComputeScore_RebalancedWeights(t *testing.T) { + s := ComputeScore(excellentResults()) + cases := []struct { + name string + got int + want int + }{ + {"Disk", s.Disk.Max, 200}, + {"KV", s.KV.Max, 200}, + {"Crypto", s.Crypto.Max, 200}, + {"Goroutine", s.Goroutine.Max, 150}, + {"Network", s.Network.Max, 100}, + {"Memory", s.Memory.Max, 100}, + {"BigNum", s.BigNum.Max, 50}, + } + for _, c := range cases { + if c.got != c.want { + t.Errorf("%s.Max = %d, want %d", c.name, c.got, c.want) + } + } +} + +func TestComputeScore_ThroughputVeto_CapsGradeAtF(t *testing.T) { + r := excellentResults() + // Bench measured below the leader-mode floor (e.g., a Skylake/Haswell + // without SHA-NI typically lands around 250 MB/s). + r.CryptoResult.SHA256LargeMBps = 250 + + s := ComputeScore(r) + if !s.Vetoed { + t.Fatal("expected Vetoed=true when SHA-256 16K throughput below the floor") + } + if s.Grade != "F" { + t.Fatalf("Grade = %q, want F when veto applies", s.Grade) + } + if s.VetoedReason == "" { + t.Fatal("expected VetoedReason to be populated") + } + // The numeric score should still be substantial — the veto is a grade-cap, + // not a silent zero. Operators get to see how the rest of the system performs. + // (The Crypto category itself will score low because of the bad throughput, + // but the other six categories were set to excellent in this fixture.) + if s.Total < 700 { + t.Fatalf("Total = %d, expected non-veto categories to still score normally", s.Total) + } +} + +func TestComputeScore_ThroughputVeto_DoesNotApply_AboveFloor(t *testing.T) { + r := excellentResults() + // Throughput just above the floor — veto must not trigger even though + // the host could in principle be a non-SHA-NI amd64. 
+ r.CryptoResult.SHA256LargeMBps = minLeaderSHA256MBps + 1 + r.CryptoResult.HasSHA_NI = false + + s := ComputeScore(r) + if s.Vetoed { + t.Fatalf("Vetoed must be false when throughput is above floor; reason: %s", s.VetoedReason) + } + if s.Grade == "F" { + t.Fatalf("Grade = F unexpectedly when throughput is above floor") + } +} + +func TestComputeScore_ThroughputVeto_NoCryptoResult_NoVeto(t *testing.T) { + r := excellentResults() + r.CryptoResult = nil // crypto bench was skipped + + s := ComputeScore(r) + if s.Vetoed { + t.Fatal("Vetoed must be false when CryptoResult is nil (bench skipped)") + } +} + +func TestComputeScore_NilResults_GradeF(t *testing.T) { + if got := ComputeScore(nil); got.Grade != "F" { + t.Fatalf("ComputeScore(nil).Grade = %q, want F", got.Grade) + } +} + +func TestComputeScore_AllSkipped_GradeNotApplicable(t *testing.T) { + s := ComputeScore(&BenchmarkResults{}) // all category results nil + if s.Grade != "N/A" { + t.Fatalf("Grade = %q, want N/A when nothing ran", s.Grade) + } +} + +func TestScoreGrade_Boundaries(t *testing.T) { + cases := []struct { + pct float64 + want string + }{ + {0.95, "S"}, + {0.90, "S"}, + {0.80, "A"}, + {0.65, "B"}, + {0.50, "C"}, + {0.35, "D"}, + {0.10, "F"}, + {0.0, "F"}, + } + for _, c := range cases { + if got := scoreGrade(c.pct); got != c.want { + t.Errorf("scoreGrade(%.2f) = %q, want %q", c.pct, got, c.want) + } + } +} diff --git a/cmd/node/PREFLIGHT.md b/cmd/node/PREFLIGHT.md new file mode 100644 index 00000000..4617b0bc --- /dev/null +++ b/cmd/node/PREFLIGHT.md @@ -0,0 +1,108 @@ +# Validator CPU Preflight + +The validator binary runs a CPU preflight check at startup, immediately after +loading the BLS signing key. The preflight verifies that the host has +sufficient SHA-256 hardware acceleration to keep up with consensus and TX +processing on a production network. 
+ +## Why this exists + +A field investigation across the Klever validator fleet found a ~5× spread in +smart-contract TX processing time — ~600 ms on some validators vs ~120 ms on +peers with otherwise comparable specs. The slow nodes uniformly lacked the +**SHA-NI** instruction set (Skylake-X / Cascade Lake Xeon and earlier never +received it), and the SHA-256 throughput delta correlated with the wall-time +disparity. SHA-NI absence is the most likely contributing cause but was not +conclusively proven to be the sole cause — the consensus log confirms the +protocol's own "leader hardware too weak" detection treats this as a +hardware-class issue regardless of the underlying instruction. + +The preflight is grounded in **measured SHA-256 throughput** rather than the +SHA-NI feature flag. A node whose throughput cannot keep leader-mode TX +processing within the protocol's 425 ms hardware-tolerance window is +refused at startup so operators discover the issue before consensus-time +outliers manifest. + +## What it checks + +On `amd64` and `arm64`: + +1. A 200 ms self-bench measures sustained SHA-256 throughput on 16 KiB + blocks. The result must be at least **800 MB/s** for startup to proceed. +2. Missing SHA-NI (amd64) or ARMv8 SHA2 (arm64) is logged as an + informational `Warn` line — it is the most common cause of low SHA-256 + throughput, and noting it makes the resulting log actionable. Missing + the flag never blocks startup on its own; only the bench does. +3. Missing AVX-512 IFMA on `amd64` is logged as a separate `Warn` + (informational) — it indicates that the BLS pairing path is on the + ~1.5× slower scalar fallback. + +Other architectures are skipped — the preflight is a no-op on `riscv64`, +`386`, `ppc64le`, etc. + +## Behavior modes + +The preflight has two layers of failure handling, controlled by the +`preferences.enforceCpuPreflight` flag in the validator config. 
+ +| Flag value | On preflight failure | +|------------|----------------------| +| `true` (default) | Returns a non-zero exit code with a clear error message. The validator does not start. | +| `false` (escape hatch) | Logs the failure as a `Warn` and continues startup. Useful during a coordinated fleet migration when operators need to observe the issue without bricking running nodes. | + +Every preflight run logs a single `Info` line with the measured throughput +(emitted on success, on warn-only failure, and as a precursor to a hard +failure error): + +```text +INFO validator CPU preflight measurement arch=amd64 sha_ni=true avx512_ifma=true sha256_mbps=1742.3 +``` + +## Override + +For emergencies (CI, dev environments, intentional homogeneity tests), +the env var `KLEVER_SKIP_CPU_CHECK=1` bypasses the preflight entirely. +The exact value `1` is required — `true`, `yes`, etc. are not honored +(fail-closed: a typo leaves the preflight active). A loud warning is +logged on every startup so operators see the bypass in their logs: + +```text +WARN validator CPU preflight bypassed via env var env=KLEVER_SKIP_CPU_CHECK risk=consensus latency may exceed peer median; not for production +``` + +Do not use this flag in production. + +## Migration plan for SHA-NI-deficient hardware + +If the preflight refuses to start your validator, migrate to a CPU with +SHA extensions. Note: the validator startup gate is **stricter** than +the standalone `klever-benchmark` tool — startup requires ≥ 800 MB/s +while the benchmark's SHA-256 hard-veto threshold is 500 MB/s. A host +that earns a non-`F` grade from the benchmark can still fail the +startup gate; always run the actual validator binary on a candidate +host before committing to it. + +Recommended CPU classes: + +- **AMD**: any Zen generation — EPYC Naples, Rome, Milan, Genoa, Turin, or + Ryzen / Threadripper equivalents. +- **Intel**: Ice Lake-SP (3rd-gen Xeon Scalable) or newer. 
Skylake-X, + Cascade Lake, Cooper Lake (also branded 3rd-gen Xeon Scalable), and consumer + Skylake / Coffee Lake parts do not have SHA-NI. +- **ARM**: any ARMv8 chip exposing the SHA2 feature flag (i.e., effectively + every datacenter ARM CPU since 2018, including AWS Graviton and Apple + Silicon). + +For Hetzner Cloud specifically: CCX (dedicated AMD EPYC) and CPX (shared +AMD EPYC) instances always satisfy the preflight. The CX series is a mixed +Intel/AMD pool, and Skylake-class instances within it do not. Run +`klever-benchmark --skip-disk --skip-network --skip-kv --skip-memory \ +--skip-goroutine --skip-bignum` on a candidate instance before deploying as +a validator to confirm. + +## Related + +- `cmd/node/preflight.go` — the preflight implementation. +- `cmd/benchmark/CLI.md` — the operator-facing benchmark, which applies a + throughput-based SHA-256 veto (500 MB/s floor, looser than the 800 MB/s startup gate) and produces a more detailed report. +- `config/prefsConfig.go` — `EnforceCPUPreflight` is the runtime flag. diff --git a/cmd/node/preflight.go b/cmd/node/preflight.go new file mode 100644 index 00000000..42bad3c8 --- /dev/null +++ b/cmd/node/preflight.go @@ -0,0 +1,164 @@ +package main + +import ( + "crypto/rand" + "crypto/sha256" + "fmt" + "os" + "runtime" + "time" + + "github.com/klauspost/cpuid/v2" + logger "github.com/klever-io/klever-go-logger" +) + +const ( + envSkipCPUCheck = "KLEVER_SKIP_CPU_CHECK" + minSHA256ThroughputMBps = 800 + preflightBenchDuration = 200 * time.Millisecond + benchBlockSize = 16 * 1024 +) + +// cpuInfo captures the CPU features the preflight cares about. It is its own +// type (rather than reading klauspost/cpuid globals directly inside the +// preflight) so tests can construct synthetic CPUs on any host. +type cpuInfo struct { + arch string + hasSHA bool + hasAVX512IFMA bool +} + +// detectCPU reads the runtime architecture and the relevant feature bits. +// Architectures other than amd64 and arm64 are treated as "skip" — preflight +// is a no-op there. 
+func detectCPU() cpuInfo { + info := cpuInfo{arch: runtime.GOARCH} + switch runtime.GOARCH { + case "amd64": + info.hasSHA = cpuid.CPU.Has(cpuid.SHA) + info.hasAVX512IFMA = cpuid.CPU.Has(cpuid.AVX512IFMA) + case "arm64": + info.hasSHA = cpuid.CPU.Has(cpuid.SHA2) + } + return info +} + +// validatorCPUPreflight verifies the host CPU is capable of validator-grade +// SHA-256 throughput. Returns a non-nil error when the check fails; the call +// site decides whether to block startup or downgrade to a warning depending +// on the EnforceCPUPreflight config flag. +// +// Outcomes: +// - skipped (returns nil) on unsupported architectures or when the +// KLEVER_SKIP_CPU_CHECK=1 env var is set; +// - failed (returns error) when measured SHA-256 throughput is below +// minSHA256ThroughputMBps; +// - passed (returns nil) otherwise, with an informational log line that +// includes the measured throughput. +// +// Missing SHA-NI on amd64 is logged as a Warn but is no longer a hard fail +// on its own — the field investigation that motivated this preflight could +// not conclusively prove SHA-NI absence is the sole cause of the observed +// consensus-time disparity, so the gate is grounded in the measured number +// instead of the CPU feature flag. SHA-NI absence is the most common cause +// of low SHA-256 throughput in practice and is called out in the warn line. +func validatorCPUPreflight(log logger.Logger) error { + return validatorCPUPreflightWithInfo(log, detectCPU(), benchSHA256) +} + +// validatorCPUPreflightWithInfo is the test seam for validatorCPUPreflight. +// Passing the cpuInfo and bench function as parameters keeps the package free +// of mutable globals while still letting tests cover every branch. +// +// The bench is run twice and the maximum is reported, so a single transient +// throttle event (thermal, hypervisor noisy neighbor) does not refuse startup +// on a host that would otherwise pass. 
+func validatorCPUPreflightWithInfo( + log logger.Logger, + info cpuInfo, + bench func(time.Duration) float64, +) error { + if os.Getenv(envSkipCPUCheck) == "1" { + log.Warn("validator CPU preflight bypassed via env var", + "env", envSkipCPUCheck, + "risk", "consensus latency may exceed peer median; not for production") + return nil + } + + if info.arch != "amd64" && info.arch != "arm64" { + log.Info("validator CPU preflight skipped on unsupported arch", "arch", info.arch) + return nil + } + + mbps := bench(preflightBenchDuration) + if second := bench(preflightBenchDuration); second > mbps { + mbps = second + } + log.Info("validator CPU preflight measurement", + "arch", info.arch, + "sha_ni", info.hasSHA, + "avx512_ifma", info.hasAVX512IFMA, + "sha256_mbps", fmt.Sprintf("%.1f", mbps)) + + if info.arch == "amd64" && !info.hasAVX512IFMA { + log.Warn("CPU lacks AVX-512 IFMA; BLS verify ~1.5x slower than Zen4 peers (informational only)") + } + + if !info.hasSHA { + log.Warn("CPU lacks SHA-256 hardware acceleration "+ + "(SHA-NI on amd64 / ARMv8 SHA2 on arm64); "+ + "this is the most common cause of low SHA-256 throughput", + "arch", info.arch) + } + + if mbps < minSHA256ThroughputMBps { + return fmt.Errorf( + "validator CPU preflight failed: measured SHA-256 throughput %.1f MB/s < %d MB/s minimum. "+ + "This typically indicates missing SHA-NI (Skylake-X / Cascade Lake / Haswell on amd64) "+ + "or a degraded host (frequency cap, thermal throttle, hypervisor masking). "+ + "Migrate to AMD Zen, Intel Ice Lake-SP+, or modern ARM with ARMv8 SHA2. "+ + "Override (NOT for production): %s=1", + mbps, minSHA256ThroughputMBps, envSkipCPUCheck) + } + return nil +} + +// benchSHA256 hashes 16 KiB blocks for d and returns sustained throughput in +// megabytes per second. Returns 0 on a non-positive duration. The block size +// matches the openssl-speed reference used during the original investigation +// so operators can compare numbers directly. 
+// // The hot loop checks the deadline once per innerLoop iterations so that +// time.Now() overhead stays negligible even on SHA-NI hosts (~3 GB/s ≈ 180K +// 16 KiB hashes/s). +func benchSHA256(d time.Duration) float64 { + if d <= 0 { + return 0 + } + buf := make([]byte, benchBlockSize) + if _, err := rand.Read(buf); err != nil { + // Deterministic fallback so the bench remains representative of a + // non-zero working set even when the system RNG is unavailable. + for i := range buf { + buf[i] = byte(i) + } + } + h := sha256.New() + var bytes int64 + start := time.Now() + deadline := start.Add(d) + const innerLoop = 256 + for time.Now().Before(deadline) { + for i := 0; i < innerLoop; i++ { + h.Reset() + _, _ = h.Write(buf) + _ = h.Sum(nil) + } + bytes += benchBlockSize * innerLoop + } + elapsed := time.Since(start).Seconds() + if elapsed <= 0 { + return 0 + } + return float64(bytes) / (1024 * 1024) / elapsed +} diff --git a/cmd/node/preflight_test.go b/cmd/node/preflight_test.go new file mode 100644 index 00000000..6ac25bee --- /dev/null +++ b/cmd/node/preflight_test.go @@ -0,0 +1,244 @@ +package main + +import ( + "strings" + "sync" + "testing" + "time" + + logger "github.com/klever-io/klever-go-logger" +) + +// recordingLogger is a Logger that captures messages by level so tests can +// assert which branches of the preflight produced output. 
+type recordingLogger struct { + mu sync.Mutex + infos []string + warns []string +} + +func (r *recordingLogger) record(slot *[]string, msg string) { + r.mu.Lock() + *slot = append(*slot, msg) + r.mu.Unlock() +} + +func (r *recordingLogger) Trace(msg string, _ ...interface{}) {} +func (r *recordingLogger) Debug(msg string, _ ...interface{}) {} +func (r *recordingLogger) Info(msg string, _ ...interface{}) { + r.record(&r.infos, msg) +} +func (r *recordingLogger) Warn(msg string, _ ...interface{}) { + r.record(&r.warns, msg) +} +func (r *recordingLogger) Error(msg string, _ ...interface{}) {} +func (r *recordingLogger) LogIfError(_ error, _ ...interface{}) {} +func (r *recordingLogger) Log(_ logger.LogLevel, _ string, _ ...interface{}) {} +func (r *recordingLogger) LogLine(_ *logger.LogLine) {} +func (r *recordingLogger) SetLevel(_ logger.LogLevel) {} +func (r *recordingLogger) GetLevel() logger.LogLevel { return logger.LogTrace } +func (r *recordingLogger) IsInterfaceNil() bool { return r == nil } + +// hasInfo reports whether any captured Info message contains s. The slice +// is read under the lock to keep the helper race-safe even if a future +// caller invokes the preflight in a goroutine. +func (r *recordingLogger) hasInfo(s string) bool { + r.mu.Lock() + defer r.mu.Unlock() + for _, m := range r.infos { + if strings.Contains(m, s) { + return true + } + } + return false +} + +// hasWarn reports whether any captured Warn message contains s. +func (r *recordingLogger) hasWarn(s string) bool { + r.mu.Lock() + defer r.mu.Unlock() + for _, m := range r.warns { + if strings.Contains(m, s) { + return true + } + } + return false +} + +// fastBench returns a fixed throughput regardless of duration. Useful for +// asserting on the bench-too-slow path without waiting on the real bench. 
+func fastBench(mbps float64) func(time.Duration) float64 { + return func(time.Duration) float64 { return mbps } +} + +func TestValidatorCPUPreflight_HappyPath_Amd64(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "amd64", hasSHA: true, hasAVX512IFMA: true} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(2000)); err != nil { + t.Fatalf("expected nil error, got: %v", err) + } + if !log.hasInfo("validator CPU preflight measurement") { + t.Fatalf("expected a measurement info log, got infos=%v", log.infos) + } + if log.hasWarn("AVX-512 IFMA") { + t.Fatalf("did not expect AVX-512 IFMA warning when feature is present, got warns=%v", log.warns) + } + if log.hasWarn("SHA-256 hardware acceleration") { + t.Fatalf("did not expect SHA-NI warning when feature is present, got warns=%v", log.warns) + } +} + +func TestValidatorCPUPreflight_HappyPath_Arm64(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "arm64", hasSHA: true} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(2000)); err != nil { + t.Fatalf("expected nil error on arm64 with fast bench, got: %v", err) + } + if !log.hasInfo("validator CPU preflight measurement") { + t.Fatalf("expected measurement info log, got infos=%v", log.infos) + } + if log.hasWarn("AVX-512 IFMA") { + t.Fatalf("did not expect AVX-512 IFMA warn on arm64, got warns=%v", log.warns) + } +} + +func TestValidatorCPUPreflight_HappyPath_Amd64_NoIFMA_Warns(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + // Skylake-X case: SHA-NI present but AVX-512 IFMA missing — should pass with a warn. 
+ info := cpuInfo{arch: "amd64", hasSHA: true, hasAVX512IFMA: false} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(2000)); err != nil { + t.Fatalf("expected nil error on Skylake-X-shaped CPU, got: %v", err) + } + if !log.hasWarn("AVX-512 IFMA") { + t.Fatalf("expected AVX-512 IFMA warn, got warns=%v", log.warns) + } +} + +func TestValidatorCPUPreflight_MissingSHA_FastBench_Passes_WithWarn(t *testing.T) { + // Missing SHA-NI is no longer a hard fail on its own — only the measured + // throughput is. A (hypothetical) host without SHA-NI but with somehow + // fast-enough SHA-256 should pass with a Warn note. + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "amd64", hasSHA: false, hasAVX512IFMA: true} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(2000)); err != nil { + t.Fatalf("expected nil error when bench is fast enough, even without SHA-NI; got: %v", err) + } + if !log.hasWarn("SHA-256 hardware acceleration") { + t.Fatalf("expected SHA-NI absence warn, got warns=%v", log.warns) + } +} + +func TestValidatorCPUPreflight_MissingSHA_SlowBench_Errors(t *testing.T) { + // Realistic Skylake/Haswell case: no SHA-NI plus low measured throughput. 
+ t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "amd64", hasSHA: false, hasAVX512IFMA: false} + + err := validatorCPUPreflightWithInfo(log, info, fastBench(250)) + if err == nil { + t.Fatal("expected error when measured throughput is below the floor") + } + if !strings.Contains(err.Error(), "throughput") { + t.Fatalf("error message should mention throughput, got: %v", err) + } + if !strings.Contains(err.Error(), envSkipCPUCheck) { + t.Fatalf("error should mention the override env var, got: %v", err) + } +} + +func TestValidatorCPUPreflight_MissingSHA2_Arm64_SlowBench_Errors(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "arm64", hasSHA: false} + + err := validatorCPUPreflightWithInfo(log, info, fastBench(200)) + if err == nil { + t.Fatal("expected error when arm64 measured throughput is below the floor") + } + if !strings.Contains(err.Error(), "throughput") { + t.Fatalf("error message should mention throughput, got: %v", err) + } +} + +func TestValidatorCPUPreflight_BenchTooSlow_Errors(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "amd64", hasSHA: true, hasAVX512IFMA: true} + + err := validatorCPUPreflightWithInfo(log, info, fastBench(minSHA256ThroughputMBps-1)) + if err == nil { + t.Fatal("expected error when measured throughput is below the minimum") + } + if !strings.Contains(err.Error(), "throughput") { + t.Fatalf("error message should mention throughput, got: %v", err) + } +} + +func TestValidatorCPUPreflight_EnvBypass_NilEvenWithSlowBench(t *testing.T) { + t.Setenv(envSkipCPUCheck, "1") + log := &recordingLogger{} + info := cpuInfo{arch: "amd64", hasSHA: false} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(0)); err != nil { + t.Fatalf("expected nil error when env bypass is active, got: %v", err) + } + if !log.hasWarn("bypassed via env var") { + t.Fatalf("expected bypass warn log, got warns=%v", 
log.warns) + } +} + +func TestValidatorCPUPreflight_UnsupportedArch_NilAndSkips(t *testing.T) { + t.Setenv(envSkipCPUCheck, "") + log := &recordingLogger{} + info := cpuInfo{arch: "386"} + + if err := validatorCPUPreflightWithInfo(log, info, fastBench(0)); err != nil { + t.Fatalf("expected nil error on unsupported arch, got: %v", err) + } + if !log.hasInfo("unsupported arch") { + t.Fatalf("expected unsupported-arch info log, got infos=%v", log.infos) + } +} + +func TestBenchSHA256_NonPositiveDuration_ReturnsZero(t *testing.T) { + if got := benchSHA256(0); got != 0 { + t.Fatalf("benchSHA256(0) = %.2f, want 0", got) + } + if got := benchSHA256(-time.Second); got != 0 { + t.Fatalf("benchSHA256(-1s) = %.2f, want 0", got) + } +} + +func TestBenchSHA256_RealMeasurement_Positive(t *testing.T) { + // Smoke test: a 100 ms run on any modern CPU should produce a positive + // throughput. We do not assert a specific MB/s number to keep the test + // portable across CI runners; 100 ms (vs 50 ms) gives heavily-throttled + // cgroup runners enough wall time to complete a full inner-loop batch. + if got := benchSHA256(100 * time.Millisecond); got <= 0 { + t.Fatalf("benchSHA256(100ms) = %.2f, want > 0", got) + } +} + +// TestValidatorCPUPreflight_RealEntry_NoPanic exercises the production +// signature (which calls detectCPU() and the real benchSHA256). It does +// not assert pass/fail because the result depends on host hardware — it +// only ensures the wiring between detectCPU, benchSHA256, and the +// WithInfo seam does not panic and produces a self-consistent outcome. 
+func TestValidatorCPUPreflight_RealEntry_NoPanic(t *testing.T) { + t.Setenv(envSkipCPUCheck, "1") // bypass so we don't fail on slow CI hosts + log := &recordingLogger{} + if err := validatorCPUPreflight(log); err != nil { + t.Fatalf("env bypass should make validatorCPUPreflight nil, got: %v", err) + } + if !log.hasWarn("bypassed via env var") { + t.Fatalf("expected bypass warn log, got warns=%v", log.warns) + } +} diff --git a/cmd/node/startup.go b/cmd/node/startup.go index 5c0115bf..259a8350 100644 --- a/cmd/node/startup.go +++ b/cmd/node/startup.go @@ -249,6 +249,17 @@ func startNode(ctx *cli.Context, log logger.Logger, version string) error { return err } + // Run the CPU preflight before loading the validator's BLS private key: + // on a host that fails the gate we want to refuse startup without + // having unlocked the key file (defense-in-depth, CWE-316). + if err := validatorCPUPreflight(log); err != nil { + if cfg.Preferences.ShouldEnforceCPUPreflight() { + return err + } + log.Warn("validator CPU preflight failed (continuing because preferences.enforceCpuPreflight=false)", + "error", err.Error()) + } + validatorKeyPemFileName := ctx.GlobalString(validatorKeyPemFile.Name) cryptoParamsLoader, err := factory.NewCryptoSigningParamsLoader( validatorPubkeyConverter, diff --git a/config/node/config.yaml b/config/node/config.yaml index d066d4ee..32d6627c 100644 --- a/config/node/config.yaml +++ b/config/node/config.yaml @@ -60,6 +60,17 @@ preferences: # MaxComputableSlots represents the max number of slots computable in a round # by the validator statistics processor maxComputableSlots: 100 + # EnforceCPUPreflight refuses to start the validator when measured SHA-256 + # throughput falls below 800 MB/s on a 200 ms self-bench (stricter than + # the standalone klever-benchmark tool, which only hard-vetoes below + # 500 MB/s — see cmd/node/PREFLIGHT.md). 
Hosts lacking SHA-NI hardware + # acceleration (Skylake-X / Cascade Lake / Haswell on amd64, or ARM + # without ARMv8 SHA2) typically measure ~250 MB/s and will be rejected. + # Use KLEVER_SKIP_CPU_CHECK=1 as an emergency override. + # Set to false to downgrade the failure to a warning instead of refusing. + # If this key is omitted from an existing operator config, enforcement + # defaults to true (safe upgrade behavior). + enforceCpuPreflight: true blockSizeThrottleConfig: minSizeInBytes: 104857 # 104857 is 10% from 1MB diff --git a/config/prefsConfig.go b/config/prefsConfig.go index ab3f3853..6df2c6c2 100644 --- a/config/prefsConfig.go +++ b/config/prefsConfig.go @@ -7,4 +7,25 @@ type PreferencesConfig struct { RedundancyLevel int64 `yaml:"redundancyLevel"` StatusPollingIntervalSec int64 `yaml:"statusPollingIntervalSec"` MaxComputableSlots uint64 `yaml:"maxComputableSlots"` + // EnforceCPUPreflight controls whether the validator refuses to start + // when the startup SHA-256 throughput bench falls below the leader-mode + // floor. When true (default — including when the YAML key is absent so + // existing operator configs upgrade safely), failure aborts startup + // with a clear error. When explicitly set to false, the failure is + // downgraded to a warning so operators can observe the issue without + // bricking a running node — useful during fleet migration. + // KLEVER_SKIP_CPU_CHECK=1 bypasses the check entirely. + // + // Pointer type so an absent YAML key is distinguishable from an + // explicit false; use ShouldEnforceCPUPreflight() at call sites. + EnforceCPUPreflight *bool `yaml:"enforceCpuPreflight"` +} + +// ShouldEnforceCPUPreflight returns the effective enforcement decision, +// applying the safe default (true) when the YAML key is absent. 
+func (p PreferencesConfig) ShouldEnforceCPUPreflight() bool { + if p.EnforceCPUPreflight == nil { + return true + } + return *p.EnforceCPUPreflight } From 2a056dc6e1c41941362805329c17aa47f2501b4e Mon Sep 17 00:00:00 2001 From: Fernando Sobreira Date: Thu, 7 May 2026 21:16:46 -0400 Subject: [PATCH 2/6] [KLC-2387] address PR review feedback and CI lint - Preallocate digest buffer in benchSHA256 / benchHashMBps to remove per-iteration allocations from the hash hot loop (verified: 2039 allocs/op -> 0 allocs/op). Improves measurement stability on hosts where GC pressure could mask SHA-NI throughput. - Update PREFLIGHT.md: preflight runs immediately *before* loading the BLS key (not after), matching the implementation. - Fix doc drift: BenchmarkScore.Vetoed and CryptoResult.HasSHA_NI now correctly describe throughput-based gating rather than SHA-NI feature flag. HasSHA_NI doc clarifies the cross-platform shorthand (SHA-NI on amd64, ARMv8 SHA2 on arm64). - Document the relationship between cryptoSHA256LargeFailMBps (600, per-metric verdict) and minLeaderSHA256MBps (500, hard grade veto). - Surface preferences.enforceCpuPreflight=false escape hatch in the preflight error message so operators discover the warn-only mode without needing to find the docs first. - go mod tidy: promote klauspost/cpuid/v2 from indirect to direct dep since cmd/node/preflight.go imports it directly. 
--- cmd/benchmark/crypto.go | 17 ++++++++++++----- cmd/benchmark/report.go | 7 +++++++ cmd/benchmark/score.go | 8 ++++++-- cmd/node/PREFLIGHT.md | 10 ++++++---- cmd/node/preflight.go | 7 +++++-- go.mod | 2 +- 6 files changed, 37 insertions(+), 14 deletions(-) diff --git a/cmd/benchmark/crypto.go b/cmd/benchmark/crypto.go index 2217ae10..af102c32 100644 --- a/cmd/benchmark/crypto.go +++ b/cmd/benchmark/crypto.go @@ -62,10 +62,16 @@ type CryptoResult struct { Keccak256MBps float64 // Keccak-256 throughput on 16 KiB blocks Ed25519VerifyOpsPerSec float64 // Ed25519 signature verifications per second - // CPU feature flags — reported but not directly scored. The overall - // score applies a hard veto when HasSHA_NI is false on amd64 (see - // score.go). The other flags are informational; AVX-512 IFMA in - // particular indicates whether the BLS pairing fast path is available. + // CPU feature flags — reported but not directly scored. The hard veto + // in score.go fires on measured SHA-256 throughput, not on these flags + // (see BenchmarkScore.Vetoed). The flags are informational and help + // operators correlate measured throughput to the underlying ISA. + // + // HasSHA_NI is the cross-platform shorthand: true means SHA-256 + // hardware acceleration is available — Intel/AMD SHA-NI on amd64, + // ARMv8 SHA2 on arm64 — false on every other architecture. + // AVX-512 IFMA on amd64 indicates whether the BLS pairing fast path + // is available (~1.5x speedup vs scalar fallback). 
HasSHA_NI bool HasAVX512IFMA bool HasVAES bool @@ -149,6 +155,7 @@ func benchHashMBps(newHash func() hash.Hash, blockSize int) float64 { } } h := newHash() + digest := make([]byte, 0, h.Size()) deadline := time.Now().Add(cryptoBenchDuration) start := time.Now() var bytes int64 @@ -157,7 +164,7 @@ func benchHashMBps(newHash func() hash.Hash, blockSize int) float64 { for range innerLoop { h.Reset() _, _ = h.Write(buf) - _ = h.Sum(nil) + digest = h.Sum(digest[:0]) } bytes += int64(blockSize) * innerLoop } diff --git a/cmd/benchmark/report.go b/cmd/benchmark/report.go index 7195c17e..72c077ae 100644 --- a/cmd/benchmark/report.go +++ b/cmd/benchmark/report.go @@ -132,6 +132,13 @@ const ( cryptoSHA256SmallPassMBps = 1_200.0 // SHA-256 on 1 KiB blocks cryptoSHA256SmallFailMBps = 500.0 cryptoSHA256LargePassMBps = 1_500.0 // SHA-256 on 16 KiB blocks + // cryptoSHA256LargeFailMBps drives the per-metric category verdict + // (WARN/FAIL labels in the text report); minLeaderSHA256MBps in score.go + // (500 MB/s) drives the hard grade-F veto. The category fail floor is + // set slightly above the veto so a host in [500, 600) MB/s shows a + // per-metric FAIL label without triggering the grade-cap veto path — + // gradeToVerdict still surfaces the overall verdict as FAIL via the + // category-fail route, so behavior is consistent across both paths. cryptoSHA256LargeFailMBps = 600.0 cryptoBlake2bPassMBps = 700.0 // Blake2b-512 on 16 KiB blocks (AVX2) cryptoBlake2bFailMBps = 300.0 diff --git a/cmd/benchmark/score.go b/cmd/benchmark/score.go index 41c50737..e391cb61 100644 --- a/cmd/benchmark/score.go +++ b/cmd/benchmark/score.go @@ -150,8 +150,12 @@ type BenchmarkScore struct { MaxTotal int // sum of enabled category maxes Pct float64 // Total / MaxTotal (0.0–1.0); 0 if nothing enabled Grade string // S / A / B / C / D / F - // Vetoed is true when a hard requirement failed (e.g., missing SHA-NI - // on amd64). When set, Grade is forced to "F" regardless of point total. 
+ // Vetoed is true when a hard requirement failed — currently only the + // measured SHA-256 throughput floor (SHA256LargeMBps < minLeaderSHA256MBps). + // SHA-NI absence is the most common cause of low throughput on amd64 but + // is not asserted as the sole cause; the gate fires on the measurement, + // not on a CPU feature flag. When set, Grade is forced to "F" regardless + // of point total. VetoedReason carries an operator-facing explanation. Vetoed bool VetoedReason string } diff --git a/cmd/node/PREFLIGHT.md b/cmd/node/PREFLIGHT.md index 4617b0bc..67f01596 100644 --- a/cmd/node/PREFLIGHT.md +++ b/cmd/node/PREFLIGHT.md @@ -1,7 +1,8 @@ # Validator CPU Preflight -The validator binary runs a CPU preflight check at startup, immediately after -loading the BLS signing key. The preflight verifies that the host has +The validator binary runs a CPU preflight check at startup, immediately before +loading the BLS signing key — defense-in-depth so a host that fails the gate +never unlocks the key file. The preflight verifies that the host has sufficient SHA-256 hardware acceleration to keep up with consensus and TX processing on a production network. @@ -103,6 +104,7 @@ a validator to confirm. ## Related - `cmd/node/preflight.go` — the preflight implementation. -- `cmd/benchmark/CLI.md` — the operator-facing benchmark, which applies the - same SHA-NI veto and produces a more detailed report. +- `cmd/benchmark/CLI.md` — the operator-facing benchmark, which applies a + measured-throughput veto (SHA-256 < 500 MB/s) — not a SHA-NI feature-bit + check — and produces a more detailed report. - `config/prefsConfig.go` — `EnforceCPUPreflight` is the runtime flag. 
diff --git a/cmd/node/preflight.go b/cmd/node/preflight.go index 42bad3c8..f546f4c9 100644 --- a/cmd/node/preflight.go +++ b/cmd/node/preflight.go @@ -117,7 +117,9 @@ func validatorCPUPreflightWithInfo( "This typically indicates missing SHA-NI (Skylake-X / Cascade Lake / Haswell on amd64) "+ "or a degraded host (frequency cap, thermal throttle, hypervisor masking). "+ "Migrate to AMD Zen, Intel Ice Lake-SP+, or modern ARM with ARMv8 SHA2. "+ - "Override (NOT for production): %s=1", + "To downgrade this failure to a warning during a coordinated fleet migration, "+ + "set preferences.enforceCpuPreflight=false in the validator config. "+ + "Emergency override (NOT for production): %s=1", mbps, minSHA256ThroughputMBps, envSkipCPUCheck) } return nil @@ -144,6 +146,7 @@ func benchSHA256(d time.Duration) float64 { } } h := sha256.New() + digest := make([]byte, 0, h.Size()) var bytes int64 start := time.Now() deadline := start.Add(d) @@ -152,7 +155,7 @@ func benchSHA256(d time.Duration) float64 { for i := 0; i < innerLoop; i++ { h.Reset() _, _ = h.Write(buf) - _ = h.Sum(nil) + digest = h.Sum(digest[:0]) } bytes += benchBlockSize * innerLoop } diff --git a/go.mod b/go.mod index 761098ec..eb64971a 100644 --- a/go.mod +++ b/go.mod @@ -28,6 +28,7 @@ require ( github.com/jbenet/goprocess v0.1.4 github.com/joho/godotenv v1.4.0 github.com/keygen-sh/machineid v1.1.1 + github.com/klauspost/cpuid/v2 v2.3.0 github.com/klever-io/klever-go-logger v1.3.1 github.com/libp2p/go-libp2p v0.48.0 github.com/libp2p/go-libp2p-kad-dht v0.39.1 @@ -108,7 +109,6 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kardianos/osext v0.0.0-20190222173326-2bc1f35cddc0 // indirect - github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/koron/go-ssdp v0.0.6 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/libp2p/go-buffer-pool v0.1.0 // indirect From 1a88cfa46473ba3e7de126f302ee7bf0724ada09 Mon Sep 17 00:00:00 2001 
From: Fernando Sobreira Date: Thu, 7 May 2026 21:27:00 -0400 Subject: [PATCH 3/6] [KLC-2387] extract MB/s metric row format to a constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sonar flagged the printf template " %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n" as duplicated 5 times across printNetworkSection and printCryptoSection. Centralised as metricThroughputMBpsRowFmt so future column-width or label-format changes stay consistent across sections. --- cmd/benchmark/report.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cmd/benchmark/report.go b/cmd/benchmark/report.go index 72c077ae..5f2a0677 100644 --- a/cmd/benchmark/report.go +++ b/cmd/benchmark/report.go @@ -345,6 +345,11 @@ func PrintReport(results *BenchmarkResults, format string) { const reportWidth = 70 +// metricThroughputMBpsRowFmt is the printf template for MB/s metric rows +// (network throughput + every crypto MB/s metric). Centralised so column +// width and pass/fail label format stay consistent across sections. 
+const metricThroughputMBpsRowFmt = " %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n" + func printText(results *BenchmarkResults) { sep := strings.Repeat("─", reportWidth) si := results.SystemInfo @@ -479,7 +484,7 @@ func printNetworkSection(r *NetworkResult, v verdict, sep string) { "Latency P50:", p50us, p50v.Icon(), netLatP50PassUs, netLatP50FailUs) fmt.Printf(" %-32s %8.1f µs %s (pass<%.0f, fail≥%.0f µs)\n", "Latency P99:", p99us, p99v.Icon(), netLatP99PassUs, netLatP99FailUs) - fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + fmt.Printf(metricThroughputMBpsRowFmt, "Throughput:", r.ThroughputMBps, thrV.Icon(), netThroughputPassMBps, netThroughputFailMBps) fmt.Println() @@ -575,16 +580,16 @@ func printCryptoSection(r *CryptoResult, v verdict, sep string) { kV := metricVerdict(r.Keccak256MBps, cryptoKeccak256PassMBps, cryptoKeccak256FailMBps) edV := metricVerdict(r.Ed25519VerifyOpsPerSec, cryptoEd25519VerifyPassOps, cryptoEd25519VerifyFailOps) - fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + fmt.Printf(metricThroughputMBpsRowFmt, "SHA-256 (1 KiB blocks):", r.SHA256MBps, s256V.Icon(), cryptoSHA256SmallPassMBps, cryptoSHA256SmallFailMBps) - fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + fmt.Printf(metricThroughputMBpsRowFmt, "SHA-256 (16 KiB blocks):", r.SHA256LargeMBps, s256LV.Icon(), cryptoSHA256LargePassMBps, cryptoSHA256LargeFailMBps) - fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + fmt.Printf(metricThroughputMBpsRowFmt, "Blake2b-512 (16 KiB):", r.Blake2bMBps, b2V.Icon(), cryptoBlake2bPassMBps, cryptoBlake2bFailMBps) - fmt.Printf(" %-32s %7.1f MB/s %s (pass≥%.0f, fail<%.0f MB/s)\n", + fmt.Printf(metricThroughputMBpsRowFmt, "Keccak-256 (16 KiB):", r.Keccak256MBps, kV.Icon(), cryptoKeccak256PassMBps, cryptoKeccak256FailMBps) fmt.Printf(" %-32s %s %s (pass≥%.0fK, fail<%.0fK ops/s)\n", From dd9a3328baaadb856c1e5f6da0f4f4fbb725aded Mon Sep 17 00:00:00 2001 From: Fernando Sobreira Date: 
Thu, 7 May 2026 21:28:24 -0400 Subject: [PATCH 4/6] [KLC-2387] add tests for ShouldEnforceCPUPreflight Sonar flagged the new safe-default accessor as uncovered. The function is load-bearing for the rollout safety guarantee (absent YAML key must default to enforce=true so upgrading operators are not silently downgraded to warn-only), so the absent-key branch is the most important one to lock down. Test covers all three states: nil pointer, explicit *true, explicit *false. --- config/prefsConfig_test.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 config/prefsConfig_test.go diff --git a/config/prefsConfig_test.go b/config/prefsConfig_test.go new file mode 100644 index 00000000..53156098 --- /dev/null +++ b/config/prefsConfig_test.go @@ -0,0 +1,32 @@ +package config + +import "testing" + +// TestShouldEnforceCPUPreflight covers the safe-default semantics: when the +// YAML key is absent (pointer is nil), enforcement defaults to true so an +// existing operator config that omits the new key cannot silently +// downgrade the validator startup gate to warn-only. 
+func TestShouldEnforceCPUPreflight(t *testing.T) { + t.Run("nil pointer (absent YAML key) defaults to true", func(t *testing.T) { + p := PreferencesConfig{EnforceCPUPreflight: nil} + if !p.ShouldEnforceCPUPreflight() { + t.Fatal("expected ShouldEnforceCPUPreflight() to return true when EnforceCPUPreflight is nil") + } + }) + + t.Run("explicit true returns true", func(t *testing.T) { + v := true + p := PreferencesConfig{EnforceCPUPreflight: &v} + if !p.ShouldEnforceCPUPreflight() { + t.Fatal("expected ShouldEnforceCPUPreflight() to return true when EnforceCPUPreflight is *true") + } + }) + + t.Run("explicit false returns false", func(t *testing.T) { + v := false + p := PreferencesConfig{EnforceCPUPreflight: &v} + if p.ShouldEnforceCPUPreflight() { + t.Fatal("expected ShouldEnforceCPUPreflight() to return false when EnforceCPUPreflight is *false") + } + }) +} From 8fc73baeb88fe9195e0e35c4b9e97b65a0183a4a Mon Sep 17 00:00:00 2001 From: Fernando Sobreira Date: Thu, 7 May 2026 21:34:15 -0400 Subject: [PATCH 5/6] [KLC-2387] address re-review: drop crypto/rand, fix comments, soften docs - preflight.go: replace crypto/rand seed with deterministic init. rand.Read can block on entropy starvation during early boot, which could delay validator startup before preflight even runs. SHA-256 amd64/arm64 fast paths are data-independent so the buffer contents do not affect measured throughput. - score.go: stale comment claimed minLeaderSHA256MBps "matches the existing fail floor"; actual fail floor is 600 MB/s vs 500 veto. Document the intentional gap and explain why behavior stays consistent (category-fail path in gradeToVerdict). - PREFLIGHT.md: soften absolute "always satisfy" claim about Hetzner CCX/CPX SKUs to "typically" + emphasise running the benchmark to confirm. Cloud SKU hardware can change over time. 
--- cmd/benchmark/score.go | 8 ++++++-- cmd/node/PREFLIGHT.md | 12 +++++++----- cmd/node/preflight.go | 13 ++++++------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/cmd/benchmark/score.go b/cmd/benchmark/score.go index e391cb61..f0b64c65 100644 --- a/cmd/benchmark/score.go +++ b/cmd/benchmark/score.go @@ -50,8 +50,12 @@ import ( // tolerance window (lowerBound = 425 ms). Calibrated from field data: a // validator measured at ~250 MB/s on 16 KiB blocks took ~600 ms to process // a representative SC TX as leader, well above the 425 ms lowerBound. -// Setting the floor at 500 MB/s gives ~2× margin to the lowerBound and -// matches the existing fail floor for SHA-256 16 KiB blocks. +// Setting the floor at 500 MB/s gives ~2× margin to the lowerBound. This +// veto threshold is deliberately below the per-metric category fail +// threshold (cryptoSHA256LargeFailMBps = 600 MB/s in report.go) so a +// host in [500, 600) MB/s shows a per-metric FAIL label without the +// hard grade-cap firing — the overall verdict still resolves to FAIL via +// the category-fail path in gradeToVerdict. const minLeaderSHA256MBps = 500.0 // --------------------------------------------------------------------------- diff --git a/cmd/node/PREFLIGHT.md b/cmd/node/PREFLIGHT.md index 67f01596..f28f8b1d 100644 --- a/cmd/node/PREFLIGHT.md +++ b/cmd/node/PREFLIGHT.md @@ -94,12 +94,14 @@ Recommended CPU classes: every datacenter ARM CPU since 2018, including AWS Graviton and Apple Silicon). -For Hetzner Cloud specifically: CCX (dedicated AMD EPYC) and CPX (shared -AMD EPYC) instances always satisfy the preflight. The CX series is a mixed -Intel/AMD pool, and Skylake-class instances within it do not. Run +For Hetzner Cloud specifically (based on current fleet observations): +CCX (dedicated AMD EPYC) and CPX (shared AMD EPYC) instances typically +satisfy the preflight. The CX series is a mixed Intel/AMD pool, and +Skylake-class instances within it may not. 
Cloud SKUs and underlying +hardware can change over time, so always confirm by running `klever-benchmark --skip-disk --skip-network --skip-kv --skip-memory \ ---skip-goroutine --skip-bignum` on a candidate instance before deploying as -a validator to confirm. +--skip-goroutine --skip-bignum` on a candidate instance before +deploying as a validator. ## Related diff --git a/cmd/node/preflight.go b/cmd/node/preflight.go index f546f4c9..584cdea4 100644 --- a/cmd/node/preflight.go +++ b/cmd/node/preflight.go @@ -1,7 +1,6 @@ package main import ( - "crypto/rand" "crypto/sha256" "fmt" "os" @@ -137,13 +136,13 @@ func benchSHA256(d time.Duration) float64 { if d <= 0 { return 0 } + // Deterministic init — avoids crypto/rand which can block on entropy + // starvation during early boot (validator startup may run before the + // kernel RNG pool is fully initialised). SHA-256's amd64/arm64 fast + // paths are data-independent so the contents do not affect throughput. buf := make([]byte, benchBlockSize) - if _, err := rand.Read(buf); err != nil { - // Deterministic fallback so the bench remains representative of a - // non-zero working set even when the system RNG is unavailable. - for i := range buf { - buf[i] = byte(i) - } + for i := range buf { + buf[i] = byte(i) } h := sha256.New() digest := make([]byte, 0, h.Size()) From 9bdbaeaac9a32a4898c78390b486caebbc7214ea Mon Sep 17 00:00:00 2001 From: Fernando Sobreira Date: Thu, 7 May 2026 21:54:00 -0400 Subject: [PATCH 6/6] [KLC-2387] make SHA-256 acceleration reporting architecture-aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit's third pass surfaced a real correctness issue: text report, JSON output, log keys, and remediation messages all hardcoded "SHA-NI" — which is the Intel/AMD x86 brand. 
On arm64 the equivalent ISA is ARMv8 SHA2, so an operator running on a slow arm64 validator was being misdirected to migrate from SHA-NI-deficient x86 hardware when the actual root cause is missing ARMv8 SHA2. Rename CryptoResult.HasSHA_NI -> HasSHAAccel; JSON tag sha_ni -> sha_accel; preflight log key sha_ni -> sha_accel. Add shaAccelName(arch) helper that returns "SHA-NI" / "ARMv8 SHA2" / generic per architecture and use it in: text CPU section, post-crypto warning, JSON surface, score VetoedReason, preflight measurement log, preflight Warn line, preflight failure error. Qualify the PREFLIGHT.md key-loading guarantee to scope it to the enforced default path, since enforceCpuPreflight=false and KLEVER_SKIP_CPU_CHECK=1 deliberately allow startup past failure. Verified end-to-end: - amd64 Skylake host: text shows "SHA-NI=no", JSON has "sha_accel": false, veto_reason cites "missing SHA-NI on amd64 (Skylake-X / Cascade Lake / Haswell)". - arm64 (M4 Max) local: text shows "ARMv8 SHA2=yes" instead of the previous misleading "SHA-NI=yes". --- cmd/benchmark/crypto.go | 40 ++++++++++++++++++++++++++++++++----- cmd/benchmark/report.go | 14 +++++++------ cmd/benchmark/score.go | 6 ++++-- cmd/benchmark/score_test.go | 4 ++-- cmd/node/PREFLIGHT.md | 16 ++++++++++----- cmd/node/preflight.go | 40 +++++++++++++++++++++++++++++++++---- 6 files changed, 96 insertions(+), 24 deletions(-) diff --git a/cmd/benchmark/crypto.go b/cmd/benchmark/crypto.go index af102c32..f5f901f7 100644 --- a/cmd/benchmark/crypto.go +++ b/cmd/benchmark/crypto.go @@ -67,12 +67,13 @@ type CryptoResult struct { // (see BenchmarkScore.Vetoed). The flags are informational and help // operators correlate measured throughput to the underlying ISA. // - // HasSHA_NI is the cross-platform shorthand: true means SHA-256 - // hardware acceleration is available — Intel/AMD SHA-NI on amd64, - // ARMv8 SHA2 on arm64 — false on every other architecture. 
+ // HasSHAAccel is true when SHA-256 hardware acceleration is available + // on the current architecture: Intel/AMD SHA-NI on amd64, ARMv8 SHA2 + // on arm64. False on every other architecture. Use shaAccelName(arch) + // to render an arch-appropriate label in user-facing output. // AVX-512 IFMA on amd64 indicates whether the BLS pairing fast path // is available (~1.5x speedup vs scalar fallback). - HasSHA_NI bool + HasSHAAccel bool HasAVX512IFMA bool HasVAES bool HasGFNI bool @@ -82,7 +83,7 @@ type CryptoResult struct { // feature flags. Returns a populated CryptoResult on success. func RunCryptoBenchmark() (*CryptoResult, error) { r := &CryptoResult{ - HasSHA_NI: hasSHAAcceleration(), + HasSHAAccel: hasSHAAcceleration(), HasAVX512IFMA: cpuid.CPU.Has(cpuid.AVX512IFMA), HasVAES: cpuid.CPU.Has(cpuid.VAES), HasGFNI: cpuid.CPU.Has(cpuid.GFNI), @@ -111,6 +112,35 @@ func RunCryptoBenchmark() (*CryptoResult, error) { return r, nil } +// shaAccelName returns the operator-facing name of the SHA-256 hardware +// acceleration ISA on the given architecture. Used in reports and +// remediation messages so the output is correct on every supported arch +// (rather than hardcoding the x86 brand on arm64). +func shaAccelName(arch string) string { + switch arch { + case "amd64": + return "SHA-NI" + case "arm64": + return "ARMv8 SHA2" + default: + return "SHA-256 hardware acceleration" + } +} + +// shaCommonCauseSuffix returns the arch-specific "common cause" suffix +// for veto / preflight error messages. Empty on unknown archs so the +// generic shaAccelName label stands alone. +func shaCommonCauseSuffix(arch string) string { + switch arch { + case "amd64": + return " on amd64 (Skylake-X / Cascade Lake / Haswell)" + case "arm64": + return " on arm64" + default: + return "" + } +} + // hasSHAAcceleration reports whether SHA-256 hardware acceleration is // available on the current architecture: SHA-NI on amd64, ARMv8 SHA2 on // arm64. Returns false on every other architecture. 
diff --git a/cmd/benchmark/report.go b/cmd/benchmark/report.go index 5f2a0677..036cb474 100644 --- a/cmd/benchmark/report.go +++ b/cmd/benchmark/report.go @@ -370,8 +370,9 @@ func printText(results *BenchmarkResults) { fmt.Printf(" System : %s/%s CPUs: %d Go: %s\n", si.GOOS, si.GOARCH, si.CPUs, si.GoVersion) if c := results.CryptoResult; c != nil { - fmt.Printf(" CPU : SHA-NI=%s AVX-512 IFMA=%s VAES=%s GFNI=%s\n", - yesNo(c.HasSHA_NI), yesNo(c.HasAVX512IFMA), yesNo(c.HasVAES), yesNo(c.HasGFNI)) + fmt.Printf(" CPU : %s=%s AVX-512 IFMA=%s VAES=%s GFNI=%s\n", + shaAccelName(si.GOARCH), yesNo(c.HasSHAAccel), + yesNo(c.HasAVX512IFMA), yesNo(c.HasVAES), yesNo(c.HasGFNI)) } fmt.Println(sep) @@ -596,9 +597,10 @@ func printCryptoSection(r *CryptoResult, v verdict, sep string) { "Ed25519 verify:", humanOps(r.Ed25519VerifyOpsPerSec), edV.Icon(), cryptoEd25519VerifyPassOps/1000, cryptoEd25519VerifyFailOps/1000) - if runtime.GOARCH == "amd64" && !r.HasSHA_NI { + if !r.HasSHAAccel && (runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64") { fmt.Println() - fmt.Println(" ! CPU lacks SHA-NI; this is the most common cause of low SHA-256 throughput.") + fmt.Printf(" ! CPU lacks %s; this is the most common cause of low SHA-256 throughput.\n", + shaAccelName(runtime.GOARCH)) fmt.Println(" ! If the throughput numbers above are below the pass thresholds, migrate to") fmt.Println(" ! 
AMD Zen, Intel Ice Lake-SP+, or modern ARM (with ARMv8 SHA2).") } @@ -813,7 +815,7 @@ type jsonBigNum struct { } type jsonCPUFeatures struct { - HasSHA_NI bool `json:"sha_ni"` + HasSHAAccel bool `json:"sha_accel"` HasAVX512IFMA bool `json:"avx512_ifma"` HasVAES bool `json:"vaes"` HasGFNI bool `json:"gfni"` @@ -924,7 +926,7 @@ func printJSON(results *BenchmarkResults) { Keccak256MBps: r.Keccak256MBps, Ed25519VerifyOpsPerSec: r.Ed25519VerifyOpsPerSec, CPUFeatures: jsonCPUFeatures{ - HasSHA_NI: r.HasSHA_NI, + HasSHAAccel: r.HasSHAAccel, HasAVX512IFMA: r.HasAVX512IFMA, HasVAES: r.HasVAES, HasGFNI: r.HasGFNI, diff --git a/cmd/benchmark/score.go b/cmd/benchmark/score.go index f0b64c65..567c76a9 100644 --- a/cmd/benchmark/score.go +++ b/cmd/benchmark/score.go @@ -203,8 +203,10 @@ func ComputeScore(r *BenchmarkResults) BenchmarkScore { s.VetoedReason = fmt.Sprintf( "SHA-256 throughput %.0f MB/s < %.0f MB/s minimum — node likely cannot sustain "+ "leader-mode TX processing within the consensus hardware-tolerance window. "+ - "Most common cause: missing SHA-NI on amd64 (Skylake-X / Cascade Lake / Haswell)", - c.SHA256LargeMBps, minLeaderSHA256MBps) + "Most common cause: missing %s%s", + c.SHA256LargeMBps, minLeaderSHA256MBps, + shaAccelName(r.SystemInfo.GOARCH), + shaCommonCauseSuffix(r.SystemInfo.GOARCH)) s.Grade = "F" } diff --git a/cmd/benchmark/score_test.go b/cmd/benchmark/score_test.go index 7b0b0ceb..c532163c 100644 --- a/cmd/benchmark/score_test.go +++ b/cmd/benchmark/score_test.go @@ -44,7 +44,7 @@ func excellentResults() *BenchmarkResults { Blake2bMBps: cryptoBlake2bExcellentMBps, Keccak256MBps: cryptoKeccak256ExcellentMBps, Ed25519VerifyOpsPerSec: cryptoEd25519VerifyExcellentOps, - HasSHA_NI: true, + HasSHAAccel: true, HasAVX512IFMA: true, }, } @@ -125,7 +125,7 @@ func TestComputeScore_ThroughputVeto_DoesNotApply_AboveFloor(t *testing.T) { // Throughput just above the floor — veto must not trigger even though // the host could in principle be a non-SHA-NI amd64. 
r.CryptoResult.SHA256LargeMBps = minLeaderSHA256MBps + 1 - r.CryptoResult.HasSHA_NI = false + r.CryptoResult.HasSHAAccel = false s := ComputeScore(r) if s.Vetoed { diff --git a/cmd/node/PREFLIGHT.md b/cmd/node/PREFLIGHT.md index f28f8b1d..dae75a8c 100644 --- a/cmd/node/PREFLIGHT.md +++ b/cmd/node/PREFLIGHT.md @@ -1,10 +1,16 @@ # Validator CPU Preflight The validator binary runs a CPU preflight check at startup, immediately before -loading the BLS signing key — defense-in-depth so a host that fails the gate -never unlocks the key file. The preflight verifies that the host has -sufficient SHA-256 hardware acceleration to keep up with consensus and TX -processing on a production network. +loading the BLS signing key. When the preflight is enforced (the default, +`preferences.enforceCpuPreflight=true`), a host that fails the gate exits +before the key file is unlocked — defense-in-depth against deploying a +validator key on hardware that cannot keep up with consensus. The +warn-only path (`enforceCpuPreflight=false`) and the emergency env +bypass (`KLEVER_SKIP_CPU_CHECK=1`) deliberately allow startup to +continue past a failure; both log a loud Warn so the bypass is auditable +in fleet logs. The preflight verifies that the host has sufficient SHA-256 +hardware acceleration to keep up with consensus and TX processing on a +production network. 
## Why this exists @@ -56,7 +62,7 @@ Every preflight run logs a single `Info` line with the measured throughput failure error): ```text -INFO validator CPU preflight measurement arch=amd64 sha_ni=true avx512_ifma=true sha256_mbps=1742.3 +INFO validator CPU preflight measurement arch=amd64 sha_accel=true avx512_ifma=true sha256_mbps=1742.3 ``` ## Override diff --git a/cmd/node/preflight.go b/cmd/node/preflight.go index 584cdea4..3d42f076 100644 --- a/cmd/node/preflight.go +++ b/cmd/node/preflight.go @@ -95,7 +95,7 @@ func validatorCPUPreflightWithInfo( } log.Info("validator CPU preflight measurement", "arch", info.arch, - "sha_ni", info.hasSHA, + "sha_accel", info.hasSHA, "avx512_ifma", info.hasAVX512IFMA, "sha256_mbps", fmt.Sprintf("%.1f", mbps)) @@ -105,7 +105,7 @@ func validatorCPUPreflightWithInfo( if !info.hasSHA { log.Warn("CPU lacks SHA-256 hardware acceleration "+ - "(SHA-NI on amd64 / ARMv8 SHA2 on arm64); "+ + "("+shaAccelName(info.arch)+"); "+ "this is the most common cause of low SHA-256 throughput", "arch", info.arch) } @@ -113,17 +113,49 @@ func validatorCPUPreflightWithInfo( if mbps < minSHA256ThroughputMBps { return fmt.Errorf( "validator CPU preflight failed: measured SHA-256 throughput %.1f MB/s < %d MB/s minimum. "+ - "This typically indicates missing SHA-NI (Skylake-X / Cascade Lake / Haswell on amd64) "+ + "This typically indicates missing %s%s "+ "or a degraded host (frequency cap, thermal throttle, hypervisor masking). "+ "Migrate to AMD Zen, Intel Ice Lake-SP+, or modern ARM with ARMv8 SHA2. "+ "To downgrade this failure to a warning during a coordinated fleet migration, "+ "set preferences.enforceCpuPreflight=false in the validator config. 
"+ "Emergency override (NOT for production): %s=1", - mbps, minSHA256ThroughputMBps, envSkipCPUCheck) + mbps, minSHA256ThroughputMBps, + shaAccelName(info.arch), + shaAccelArchSuffix(info.arch), + envSkipCPUCheck) } return nil } +// shaAccelName returns the operator-facing name of the SHA-256 hardware +// acceleration ISA on the given architecture (so log lines on arm64 do +// not misdirect operators to "SHA-NI" — the arm64 instruction set is +// ARMv8 SHA2, not Intel's SHA-NI). +func shaAccelName(arch string) string { + switch arch { + case "amd64": + return "SHA-NI" + case "arm64": + return "ARMv8 SHA2" + default: + return "SHA-256 hardware acceleration" + } +} + +// shaAccelArchSuffix returns the arch-specific "common cause" suffix for +// the preflight error message. Empty on unknown archs so the label stands +// alone. +func shaAccelArchSuffix(arch string) string { + switch arch { + case "amd64": + return " (Skylake-X / Cascade Lake / Haswell on amd64)" + case "arm64": + return " (any ARMv7 or ARMv8 chip without the SHA2 feature flag)" + default: + return "" + } +} + // benchSHA256 hashes 16 KiB blocks for d and returns sustained throughput in // megabytes per second. Returns 0 on a non-positive duration. The block size // matches the openssl-speed reference used during the original investigation