Skip to content

Commit ab0e22f

Browse files
authored
fix(nonce): detect block gaps, invalidate corrupt candidates, reduce OOM risk (#116)
Four fixes to make full mode nonce computation reliable: 1. Backfill: when Koios corrects a final nonce mismatch, also invalidate the candidate nonce for that epoch. Previously the corrupt candidate persisted and TICKN used it to compute wrong next-epoch nonces. 2. TICKN: before using a candidate nonce, verify its epoch has complete block coverage by comparing local block count against Koios. Skip TICKN and fall through to Koios if blocks are missing. 3. Integrity check: add Layer 3 that compares recent epoch block counts against Koios on startup. Detects silent block loss from pipeline restarts and invalidates affected candidate nonces. 4. Reduce historical sync channel buffer from 10000 to 2000 blocks (~440KB vs ~2.2MB) to prevent OOM at 512Mi pod limit. Adds DeleteCandidateNonce to Store interface (sets candidate_nonce=NULL) and fetchEpochBlockCount to NonceTracker (Koios epoch_info API).
1 parent 9e408da commit ab0e22f

File tree

5 files changed

+108
-3
lines changed

5 files changed

+108
-3
lines changed

db.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,14 @@ func (s *PgStore) SetCandidateNonce(ctx context.Context, epoch int, nonce []byte
132132
return err
133133
}
134134

135+
func (s *PgStore) DeleteCandidateNonce(ctx context.Context, epoch int) error {
136+
_, err := s.pool.Exec(ctx,
137+
`UPDATE epoch_nonces SET candidate_nonce = NULL, updated_at = NOW() WHERE epoch = $1`,
138+
epoch,
139+
)
140+
return err
141+
}
142+
135143
func (s *PgStore) SetFinalNonce(ctx context.Context, epoch int, nonce []byte, source string) error {
136144
_, err := s.pool.Exec(ctx,
137145
`INSERT INTO epoch_nonces (epoch, evolving_nonce, final_nonce, source, updated_at)

integrity.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,42 @@ func ValidateDBIntegrity(ctx context.Context, store Store, nonceTracker *NonceTr
106106

107107
log.Println("Layer 2 passed: blocks found on canonical chain")
108108

109+
// Layer 3: Block gap detection — compare recent epoch block counts against Koios.
110+
// Catches silent block loss from pipeline restarts where stored blocks are valid
111+
// (on canonical chain) but incomplete (missing blocks = corrupt nonce computation).
112+
if nonceTracker != nil && nonceTracker.koiosClient != nil {
113+
gapCtx, gapCancel := context.WithTimeout(ctx, 30*time.Second)
114+
defer gapCancel()
115+
gapsFound := false
116+
for checkEpoch := epoch; checkEpoch >= epoch-4 && checkEpoch >= 0; checkEpoch-- {
117+
localCount, countErr := store.GetBlockCountForEpoch(gapCtx, checkEpoch)
118+
if countErr != nil || localCount == 0 {
119+
continue
120+
}
121+
koiosCount, koiosErr := nonceTracker.fetchEpochBlockCount(gapCtx, checkEpoch)
122+
if koiosErr != nil || koiosCount == 0 {
123+
continue
124+
}
125+
if localCount < koiosCount {
126+
log.Printf("BLOCK GAP: epoch %d has %d blocks, Koios has %d (missing %d)",
127+
checkEpoch, localCount, koiosCount, koiosCount-localCount)
128+
gapsFound = true
129+
// Invalidate candidate nonce for this epoch — it was computed from incomplete data
130+
if delErr := store.DeleteCandidateNonce(gapCtx, checkEpoch); delErr != nil {
131+
log.Printf("Failed to invalidate candidate for epoch %d: %v", checkEpoch, delErr)
132+
}
133+
} else {
134+
log.Printf("Layer 3: epoch %d block count OK (%d)", checkEpoch, localCount)
135+
}
136+
time.Sleep(50 * time.Millisecond)
137+
}
138+
if gapsFound {
139+
log.Println("Layer 3: block gaps detected in recent epochs — candidate nonces invalidated")
140+
} else {
141+
log.Println("Layer 3 passed: recent epoch block counts match Koios")
142+
}
143+
}
144+
109145
// If blocks are valid but nonce is stale, repair it
110146
if nonceStale {
111147
log.Printf("Repairing stale nonce for epoch %d from blocks table...", epoch)

main.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -590,8 +590,9 @@ func (i *Indexer) runChainTail() error {
590590
syncCtx, syncCancel := context.WithCancel(context.Background())
591591
defer syncCancel()
592592

593-
// Buffered channel decouples fast chain sync from slower DB writes
594-
blockCh := make(chan BlockData, 10000)
593+
// Buffered channel decouples fast chain sync from slower DB writes.
594+
// 2000 blocks × ~220 bytes ≈ 440KB — safe for 512Mi pod limit.
595+
blockCh := make(chan BlockData, 2000)
595596

596597
onCaughtUp := func() {
597598
log.Println("Historical sync caught up, stopping ChainSyncer...")

nonce.go

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,21 @@ func (nt *NonceTracker) GetNonceForEpoch(epoch int) ([]byte, error) {
430430
if candErr != nil {
431431
log.Printf("TICKN: GetCandidateNonce(%d) failed: %v", candidateEpoch, candErr)
432432
} else if candidate == nil {
433-
log.Printf("TICKN: GetCandidateNonce(%d) returned nil", candidateEpoch)
433+
log.Printf("TICKN: GetCandidateNonce(%d) returned nil (invalidated or missing)", candidateEpoch)
434434
} else {
435+
// Verify the candidate epoch has complete block coverage before trusting it.
436+
// A candidate computed from incomplete chain data (block gaps) is wrong.
437+
localCount, _ := nt.store.GetBlockCountForEpoch(ctx, candidateEpoch)
438+
if nt.koiosClient != nil && localCount > 0 {
439+
koiosCount, koiosErr := nt.fetchEpochBlockCount(ctx, candidateEpoch)
440+
if koiosErr == nil && koiosCount > 0 && localCount < koiosCount {
441+
log.Printf("TICKN: candidate epoch %d has %d blocks, Koios has %d — skipping TICKN (incomplete data)",
442+
candidateEpoch, localCount, koiosCount)
443+
candidate = nil
444+
}
445+
}
446+
}
447+
if candidate != nil {
435448
log.Printf("TICKN: got candidate for epoch %d: %s", candidateEpoch, hex.EncodeToString(candidate))
436449
// η_ph = prevHash of the last block of etaPhEpoch = hash of second-to-last block.
437450
// This matches how ComputeEpochNonce tracks labNonce via prevBlockHash.
@@ -673,6 +686,11 @@ func (nt *NonceTracker) BackfillNonces(ctx context.Context) error {
673686
log.Printf("Epoch %d: computed %s… != Koios %s… — using Koios",
674687
epoch, computedHex[:16], koiosHex[:16])
675688
eta0 = koiosNonce
689+
// Candidate nonce was computed from the same incomplete chain data.
690+
// Invalidate it so TICKN won't use a corrupt candidate for epoch+1.
691+
if delErr := nt.store.DeleteCandidateNonce(ctx, epoch); delErr != nil {
692+
log.Printf("Failed to invalidate candidate nonce for epoch %d: %v", epoch, delErr)
693+
}
676694
}
677695
}
678696
time.Sleep(50 * time.Millisecond) // rate limit
@@ -1051,3 +1069,36 @@ func (nt *NonceTracker) fetchNonceFromKoios(ctx context.Context, epoch int) ([]b
10511069

10521070
return nonce, nil
10531071
}
1072+
1073+
// fetchEpochBlockCount returns the total block count for an epoch from Koios epoch_info.
1074+
func (nt *NonceTracker) fetchEpochBlockCount(ctx context.Context, epoch int) (int, error) {
1075+
url := fmt.Sprintf(koiosRESTBase(nt.networkMagic)+"/epoch_info?_epoch_no=%d&select=blk_count", epoch)
1076+
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
1077+
if err != nil {
1078+
return 0, err
1079+
}
1080+
resp, err := koiosHTTPClient.Do(req)
1081+
if err != nil {
1082+
return 0, err
1083+
}
1084+
defer resp.Body.Close()
1085+
1086+
body, err := io.ReadAll(resp.Body)
1087+
if err != nil {
1088+
return 0, err
1089+
}
1090+
if resp.StatusCode != 200 {
1091+
return 0, fmt.Errorf("koios returned %d", resp.StatusCode)
1092+
}
1093+
1094+
var result []struct {
1095+
BlkCount int `json:"blk_count"`
1096+
}
1097+
if err := json.Unmarshal(body, &result); err != nil {
1098+
return 0, err
1099+
}
1100+
if len(result) == 0 {
1101+
return 0, fmt.Errorf("no epoch_info for epoch %d", epoch)
1102+
}
1103+
return result[0].BlkCount, nil
1104+
}

store.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ type Store interface {
3636
InsertBlockBatch(ctx context.Context, blocks []BlockData) (int, error)
3737
UpsertEvolvingNonce(ctx context.Context, epoch int, nonce []byte, blockCount int) error
3838
SetCandidateNonce(ctx context.Context, epoch int, nonce []byte) error
39+
DeleteCandidateNonce(ctx context.Context, epoch int) error
3940
SetFinalNonce(ctx context.Context, epoch int, nonce []byte, source string) error
4041
GetFinalNonce(ctx context.Context, epoch int) ([]byte, error)
4142
GetEvolvingNonce(ctx context.Context, epoch int) ([]byte, int, error)
@@ -208,6 +209,14 @@ func (s *SqliteStore) SetCandidateNonce(ctx context.Context, epoch int, nonce []
208209
return err
209210
}
210211

212+
func (s *SqliteStore) DeleteCandidateNonce(ctx context.Context, epoch int) error {
213+
_, err := s.db.ExecContext(ctx,
214+
`UPDATE epoch_nonces SET candidate_nonce = NULL, updated_at = datetime('now') WHERE epoch = ?`,
215+
epoch,
216+
)
217+
return err
218+
}
219+
211220
func (s *SqliteStore) SetFinalNonce(ctx context.Context, epoch int, nonce []byte, source string) error {
212221
_, err := s.db.ExecContext(ctx,
213222
`INSERT INTO epoch_nonces (epoch, evolving_nonce, final_nonce, source, updated_at)

0 commit comments

Comments
 (0)