diff --git a/comprehensive_test.go b/comprehensive_test.go index ea60e14..8b40337 100644 --- a/comprehensive_test.go +++ b/comprehensive_test.go @@ -1,6 +1,7 @@ package main import ( + "bytes" "context" "encoding/hex" "encoding/json" @@ -1602,3 +1603,65 @@ func TestGetCandidateNonce(t *testing.T) { } } } + +// TestResyncFromDB_BlockCountMismatch verifies that ResyncFromDB detects and +// recomputes the evolving nonce when epoch_nonces.block_count is stale — the +// scenario where InsertBlockBatch (CopyFrom) succeeds but the process is +// OOMKilled before ProcessBatch can update epoch_nonces. +func TestResyncFromDB_BlockCountMismatch(t *testing.T) { + store := newTestStore(t) + ctx := context.Background() + + // Use preview network magic for simpler slot math + networkMagic := PreprodNetworkMagic + epoch := 10 + + // Insert 5 blocks for the epoch (simulating CopyFrom that succeeded) + epochStart := GetEpochStartSlot(epoch, networkMagic) + for i := 0; i < 5; i++ { + slot := epochStart + uint64(i*20) + vrf := []byte{byte(i + 1), 0x02, 0x03, 0x04} + hash := fmt.Sprintf("hash%d", i) + nonce := vrfNonceValueForEpoch(vrf, epoch, networkMagic) + _, err := store.InsertBlock(ctx, slot, epoch, hash, vrf, nonce) + if err != nil { + t.Fatalf("InsertBlock %d: %v", i, err) + } + } + + // Set a stale evolving nonce with block_count=2 (simulating OOM after only 2 blocks processed) + staleNonce := []byte{0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} + if err := store.UpsertEvolvingNonce(ctx, epoch, staleNonce, 2); err != nil { + t.Fatalf("UpsertEvolvingNonce: %v", err) + } + + // Also need a previous epoch nonce for the recompute to start from + prevNonce := initialNonce(true) + if err := store.UpsertEvolvingNonce(ctx, epoch-1, prevNonce, 100); err != nil { + t.Fatalf("UpsertEvolvingNonce prev: %v", err) + } + + // Create NonceTracker and call ResyncFromDB + nt := NewNonceTracker(store, nil, epoch, networkMagic, true) + nt.ResyncFromDB() + + // Verify block count was corrected + _, repairedCount, err := store.GetEvolvingNonce(ctx, epoch) + if err != nil { + t.Fatalf("GetEvolvingNonce after resync: %v", err) + } + if repairedCount != 5 { + t.Fatalf("expected block_count=5 after resync repair, got %d", repairedCount) + } + + // Verify the nonce is no longer the stale value + repairedNonce, _, _ := store.GetEvolvingNonce(ctx, epoch) + if bytes.Equal(repairedNonce, staleNonce) { + t.Fatal("nonce was not recomputed — still has stale value") + } + + t.Logf("ResyncFromDB correctly recomputed nonce from 5 blocks (was stale at 2)") +} diff --git a/nonce.go b/nonce.go index b94811c..8a8c3fb 100644 --- a/nonce.go +++ b/nonce.go @@ -280,6 +280,49 @@ func (nt *NonceTracker) ResyncFromDB() { nt.blockCount = blockCount nt.candidateFroze = false log.Printf("ResyncFromDB: restored epoch %d, block count %d", epoch, blockCount) + + // Cross-check: if blocks were bulk-inserted (CopyFrom) but the process + // was OOMKilled before ProcessBatch could update epoch_nonces.block_count, + // the nonce state is stale. Detect and recompute from raw VRF outputs. + actualCount, countErr := nt.store.GetBlockCountForEpoch(ctx, epoch) + if countErr != nil || actualCount == blockCount { + return + } + + log.Printf("ResyncFromDB: block count mismatch for epoch %d (nonce tracker: %d, blocks table: %d) — recomputing", + epoch, blockCount, actualCount) + + // Start from previous epoch's evolving nonce (nonce rolls across boundaries) + var etaV []byte + if epoch > 0 { + prevNonce, _, prevErr := nt.store.GetEvolvingNonce(ctx, epoch-1) + if prevErr == nil && prevNonce != nil { + etaV = prevNonce + } + } + if etaV == nil { + etaV = initialNonce(nt.fullMode) + } + + vrfBlocks, vrfErr := nt.store.GetVrfOutputsForEpoch(ctx, epoch) + if vrfErr != nil { + log.Printf("ResyncFromDB: recompute failed for epoch %d: %v", epoch, vrfErr) + return + } + + for _, b := range vrfBlocks { + nonceValue := vrfNonceValueForEpoch(b.VrfOutput, b.Epoch, nt.networkMagic) + etaV = evolveNonce(etaV, nonceValue) + } + + if upsertErr := nt.store.UpsertEvolvingNonce(ctx, epoch, etaV, len(vrfBlocks)); upsertErr != nil { + log.Printf("ResyncFromDB: failed to persist recomputed nonce for epoch %d: %v", epoch, upsertErr) + return + } + + nt.evolvingNonce = etaV + nt.blockCount = len(vrfBlocks) + log.Printf("ResyncFromDB: epoch %d recomputed from %d blocks", epoch, len(vrfBlocks)) } // RecomputeCurrentEpochNonce recomputes the evolving nonce for a specific epoch