Skip to content

Commit c73747a

Browse files
JkLondonJkLondonclaude
authored
[r3.5] execution/stagedsync: prune in-RAM overlay when execution unwind is a disk no-op (#21826)
Cherry-pick of #21824 (release/3.4) / #21825 (main) to `release/3.5`. `release/3.5` has the identical gap: `UnwindExecutionStage` (`execution/stagedsync/stage_execute.go`) early-returns when `u.UnwindPoint >= s.BlockNumber` without calling `sd.Unwind`, so the in-RAM `SharedDomains` / `TemporalMemBatch` overlay is not pruned when a block fails its post-execution gas check mid-batch (before its step is committed). The same overlay is reused across the unwind→retry loop in `sync.Run`, so the stale write survives and the re-execution reads it — skipping an `SSTORE_SET`, undercharging gas, and looping. Full root-cause analysis in #21824. ## Adaptations Clean cherry-pick of the `main` commit (`release/3.5` is main-like): the early return is byte-identical, `stage_execute_unwind_test.go` does not exist on `release/3.5` so the new test is added fresh, and `common.Address`/`common.Hash` are raw array types (the test slices them with `[:]`). No manual conflict resolution was needed. ## Verification on release/3.5 + this patch - `go build ./execution/stagedsync/...` — clean - `go test ./execution/stagedsync/ -run TestUnwindExecutionStage_PrunesUncommittedOverlayWrite` — pass - red-check (revert just the prune): the test fails with the stale value `d7549f2a…a3a34` surviving the unwind, then passes with the fix restored - `gofmt` / `golangci-lint run ./execution/stagedsync/` — 0 issues Relates to #21681. Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: JkLondon <me@ilyamikheev.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent f56b8fe commit c73747a

2 files changed

Lines changed: 184 additions & 0 deletions

File tree

execution/stagedsync/stage_execute.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,25 @@ func SpawnExecuteBlocksStage(s *StageState, u Unwinder, doms *execctx.SharedDoma
393393
func UnwindExecutionStage(u *UnwindState, s *StageState, doms *execctx.SharedDomains, rwTx kv.TemporalRwTx, ctx context.Context, cfg ExecuteBlockCfg, logger log.Logger) (err error) {
394394
//fmt.Printf("unwind: %d -> %d\n", u.CurrentBlockNumber, u.UnwindPoint)
395395
if u.UnwindPoint >= s.BlockNumber {
396+
// The committed execution progress (s.BlockNumber) is at or below the
397+
// unwind point, so MDBX has nothing above u.UnwindPoint to roll back and
398+
// the disk unwind below would be a no-op. However the in-RAM
399+
// SharedDomains / TemporalMemBatch overlay can still hold *uncommitted*
400+
// writes for blocks above u.UnwindPoint — e.g. a block whose
401+
// post-execution gas check failed mid-batch, before its step was ever
402+
// flushed. Those entries must still be pruned: the overlay is reused
403+
// across the unwind→retry loop within a single sync.Run, so leaving them
404+
// makes the re-execution read a stale value (observed on Hoodi 3004265:
405+
// ca5daf64 slot0 kept tx19's first-write, the contract took the
406+
// "already initialised" branch, skipped an SSTORE_SET, came up 21045 gas
407+
// short, and the node spun in an unwind/retry loop). The committed prune
408+
// (#20625) only runs from unwindExec3, which the early return skipped.
409+
txNum, err := cfg.blockReader.TxnumReader().Min(ctx, rwTx, u.UnwindPoint+1)
410+
if err != nil {
411+
return err
412+
}
413+
doms.Unwind(txNum, nil)
414+
doms.SetTxNum(txNum)
396415
return nil
397416
}
398417

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
// Copyright 2026 The Erigon Authors
2+
// This file is part of Erigon.
3+
//
4+
// Erigon is free software: you can redistribute it and/or modify
5+
// it under the terms of the GNU Lesser General Public License as published by
6+
// the Free Software Foundation, either version 3 of the License, or
7+
// (at your option) any later version.
8+
//
9+
// Erigon is distributed in the hope that it will be useful,
10+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
// GNU Lesser General Public License for more details.
13+
//
14+
// You should have received a copy of the GNU Lesser General Public License
15+
// along with Erigon. If not, see <http://www.gnu.org/licenses/>.
16+
17+
package stagedsync
18+
19+
import (
20+
"context"
21+
"testing"
22+
23+
"github.com/stretchr/testify/require"
24+
25+
"github.com/erigontech/erigon/common"
26+
"github.com/erigontech/erigon/common/log/v3"
27+
"github.com/erigontech/erigon/db/datadir"
28+
"github.com/erigontech/erigon/db/kv"
29+
"github.com/erigontech/erigon/db/kv/dbcfg"
30+
"github.com/erigontech/erigon/db/kv/mdbx"
31+
"github.com/erigontech/erigon/db/kv/rawdbv3"
32+
"github.com/erigontech/erigon/db/kv/temporal"
33+
"github.com/erigontech/erigon/db/snapshotsync/freezeblocks"
34+
dbstate "github.com/erigontech/erigon/db/state"
35+
"github.com/erigontech/erigon/db/state/changeset"
36+
"github.com/erigontech/erigon/db/state/execctx"
37+
"github.com/erigontech/erigon/execution/chain/networkname"
38+
"github.com/erigontech/erigon/execution/stagedsync/stages"
39+
"github.com/erigontech/erigon/node/ethconfig"
40+
)
41+
42+
// TestUnwindExecutionStage_PrunesUncommittedOverlayWrite is a regression test
43+
// for the Hoodi block-3004265 gas-used mismatch (originally found on
44+
// release/3.4, same gap on main).
45+
//
46+
// Repro of the original chain of events:
47+
//
48+
// 1. In serial batch execution, block 3004265 tx19 does a first-time SSTORE to
49+
// ca5daf64 slot0. That write lands in the in-RAM SharedDomains /
50+
// TemporalMemBatch overlay (stamped with tx19's txNum) but the block then
51+
// fails its post-execution gas check, so its step is never committed.
52+
// 2. The executor schedules UnwindTo(3004264). Because 3004265 was never
53+
// committed, the committed execution-stage progress (s.BlockNumber) sits at
54+
// or below the unwind point, so UnwindExecutionStage hit the
55+
// `u.UnwindPoint >= s.BlockNumber` early return and skipped unwindExec3 —
56+
// i.e. it never called sd.Unwind, so the overlay prune added by #20625
57+
// (which only runs from unwindExec3) never executed.
58+
// 3. The same overlay is reused across the unwind→retry loop inside one
59+
// sync.Run, so on retry the storage read returned tx19's own stale
60+
// first-write (…a3a34) instead of the committed 0. The contract took the
61+
// "already initialised" branch, skipped an SSTORE_SET (20000 gas), the block
62+
// came up exactly 21045 gas short, and the node spun in an unwind/retry loop.
63+
//
64+
// Before the fix, step 2's early return leaves the overlay untouched, so the
65+
// failed block's write is still visible after the unwind — this test asserts it
66+
// is gone, while a write at/below the unwind point survives (no over-pruning).
67+
func TestUnwindExecutionStage_PrunesUncommittedOverlayWrite(t *testing.T) {
68+
t.Parallel()
69+
70+
logger := log.New()
71+
dirs := datadir.New(t.TempDir())
72+
rawDb := mdbx.New(dbcfg.ChainDB, logger).InMem(t, dirs.Chaindata).MustOpen()
73+
t.Cleanup(rawDb.Close)
74+
75+
// stepSize far above every txNum used here keeps everything in step 0 — the
76+
// overlay prune compares raw txNums, so the commitment/step machinery is moot.
77+
agg, err := dbstate.NewTest(dirs).StepSize(10_000).Logger(logger).Open(context.Background(), rawDb)
78+
require.NoError(t, err)
79+
t.Cleanup(agg.Close)
80+
81+
db, err := temporal.New(rawDb, agg)
82+
require.NoError(t, err)
83+
84+
// Block reader backed only by MDBX — the unwind range is at the tip, above
85+
// any frozen snapshot boundary, so no snapshots are needed.
86+
snaps := freezeblocks.NewRoSnapshots(ethconfig.BlocksFreezing{ChainName: networkname.Mainnet}, dirs.Snap, logger)
87+
t.Cleanup(snaps.Close)
88+
br := freezeblocks.NewBlockReader(snaps, nil)
89+
90+
ctx := context.Background()
91+
tx, err := db.BeginTemporalRw(ctx)
92+
require.NoError(t, err)
93+
defer tx.Rollback()
94+
95+
// txNum layout: block b owns txNums [perBlock*b, perBlock*b+perBlock-1], so
96+
// TxnumReader().Min(b) == perBlock*b (the block's first/system txNum).
97+
const perBlock = uint64(10)
98+
for b := uint64(0); b <= 8; b++ {
99+
require.NoError(t, rawdbv3.TxNums.Append(tx, b, b*perBlock+perBlock-1))
100+
}
101+
102+
doms, err := execctx.NewSharedDomains(ctx, tx, logger)
103+
require.NoError(t, err)
104+
defer doms.Close()
105+
doms.SetChangesetAccumulator(&changeset.StateChangeSet{})
106+
107+
const (
108+
committedBlock = uint64(5) // execution-stage progress (last flushed block)
109+
unwindPoint = uint64(7) // = failedBlock-1; >= committedBlock => disk no-op
110+
failedBlock = uint64(8) // block whose post-exec gas check failed mid-batch
111+
)
112+
boundaryTxNum, err := br.TxnumReader().Min(ctx, tx, unwindPoint+1)
113+
require.NoError(t, err)
114+
require.Equal(t, failedBlock*perBlock, boundaryTxNum, "sanity: Min(failedBlock) == first txNum of failedBlock")
115+
116+
// staleKey mirrors ca5daf64 slot0: first-written by failedBlock's tx19 at a
117+
// txNum strictly above the unwind boundary — must be pruned on unwind.
118+
staleAddr := common.HexToAddress("0xca5daf6473971693b760cc65d726f72c6849d615")
119+
staleSlot := common.Hash{} // slot 0
120+
staleKey := append(append([]byte{}, staleAddr[:]...), staleSlot[:]...)
121+
staleValHash := common.HexToHash("0xd7549f2a387fa81a1d5a77adc7bd3f782ac0780c460689d88e22aee6916a3a34")
122+
staleVal := staleValHash[:]
123+
const staleTxNum = failedBlock*perBlock + 5 // inside failedBlock, above the boundary
124+
125+
// keepKey is written by a block at/below the unwind point — must survive.
126+
keepAddr := common.HexToAddress("0x00000000000000000000000000000000000000aa")
127+
keepSlot := common.Hash{31: 0x01}
128+
keepKey := append(append([]byte{}, keepAddr[:]...), keepSlot[:]...)
129+
keepVal := []byte{0xbe, 0xef}
130+
const keepTxNum = unwindPoint * perBlock // inside the unwind-target block
131+
132+
doms.SetTxNum(keepTxNum)
133+
require.NoError(t, doms.DomainPut(kv.StorageDomain, tx, keepKey, keepVal, keepTxNum, nil))
134+
doms.SetTxNum(staleTxNum)
135+
require.NoError(t, doms.DomainPut(kv.StorageDomain, tx, staleKey, staleVal, staleTxNum, nil))
136+
137+
// Sanity: both visible through the overlay before the unwind.
138+
got, _, err := doms.GetLatest(kv.StorageDomain, tx, staleKey)
139+
require.NoError(t, err)
140+
require.Equal(t, staleVal, got, "precondition: stale write must be visible pre-unwind")
141+
got, _, err = doms.GetLatest(kv.StorageDomain, tx, keepKey)
142+
require.NoError(t, err)
143+
require.Equal(t, keepVal, got, "precondition: keep write must be visible pre-unwind")
144+
145+
cfg := ExecuteBlockCfg{blockReader: br}
146+
s := &StageState{ID: stages.Execution, BlockNumber: committedBlock}
147+
u := &UnwindState{ID: stages.Execution, UnwindPoint: unwindPoint, CurrentBlockNumber: failedBlock}
148+
require.GreaterOrEqual(t, u.UnwindPoint, s.BlockNumber, "test must exercise the no-op-disk-unwind branch")
149+
150+
require.NoError(t, UnwindExecutionStage(u, s, doms, tx, ctx, cfg, logger))
151+
152+
// The failed block's first-time write must be gone, so a re-execution reads
153+
// the committed value (absent here -> empty) and charges SSTORE_SET again.
154+
got, _, err = doms.GetLatest(kv.StorageDomain, tx, staleKey)
155+
require.NoError(t, err)
156+
require.Empty(t, got,
157+
"after Unwind to block %d, the overlay must not return failedBlock(%d) tx write %x (got %x): "+
158+
"UnwindExecutionStage skipped the overlay prune on the no-op-disk-unwind path",
159+
unwindPoint, failedBlock, staleVal, got)
160+
161+
// A write at/below the unwind point must be untouched (no over-pruning).
162+
got, _, err = doms.GetLatest(kv.StorageDomain, tx, keepKey)
163+
require.NoError(t, err)
164+
require.Equal(t, keepVal, got, "write at/below the unwind point must survive the prune")
165+
}

0 commit comments

Comments
 (0)