Skip to content

Commit 185bd6e

Browse files
committed
fix(engine): scope engine-API retry budget to consensus phase
1 parent fd10658 commit 185bd6e

24 files changed

Lines changed: 482 additions & 158 deletions

File tree

beacon/blockchain/execution_engine.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func (s *Service) sendPostBlockFCU(
6363
fcuData,
6464
s.chainSpec.ActiveForkVersionForTimestamp(lph.GetTimestamp()),
6565
)
66-
if _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req); err != nil {
66+
if _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req, engineprimitives.PhaseFinalize); err != nil {
6767
return fmt.Errorf("failed forkchoice update, head %s: %w",
6868
lph.GetBlockHash().String(),
6969
err,

beacon/blockchain/finalize_block.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
ctypes "github.com/berachain/beacon-kit/consensus-types/types"
3030
"github.com/berachain/beacon-kit/consensus/types"
3131
datypes "github.com/berachain/beacon-kit/da/types"
32+
engineprimitives "github.com/berachain/beacon-kit/engine-primitives/engine-primitives"
3233
"github.com/berachain/beacon-kit/primitives/crypto"
3334
"github.com/berachain/beacon-kit/primitives/math"
3435
"github.com/berachain/beacon-kit/primitives/transition"
@@ -214,6 +215,7 @@ func (s *Service) executeStateTransition(
214215
ctx,
215216
blk.GetConsensusTime(),
216217
blk.GetProposerAddress(),
218+
engineprimitives.PhaseFinalize,
217219
).
218220
WithVerifyPayload(true).
219221
WithVerifyRandao(false).

beacon/blockchain/interfaces.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,14 @@ type ExecutionEngine interface {
5151
NotifyNewPayload(
5252
ctx context.Context,
5353
req ctypes.NewPayloadRequest,
54-
retryOnSyncingStatus bool,
54+
phase engineprimitives.EnginePhase,
5555
) error
5656
// NotifyForkchoiceUpdate notifies the execution client of a forkchoice
5757
// update.
5858
NotifyForkchoiceUpdate(
5959
ctx context.Context,
6060
req *ctypes.ForkchoiceUpdateRequest,
61+
phase engineprimitives.EnginePhase,
6162
) (*engineprimitives.PayloadID, error)
6263
}
6364

beacon/blockchain/payload.go

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ import (
2727
payloadtime "github.com/berachain/beacon-kit/beacon/payload-time"
2828
ctypes "github.com/berachain/beacon-kit/consensus-types/types"
2929
engineprimitives "github.com/berachain/beacon-kit/engine-primitives/engine-primitives"
30-
engineerrors "github.com/berachain/beacon-kit/engine-primitives/errors"
31-
"github.com/berachain/beacon-kit/errors"
3230
"github.com/berachain/beacon-kit/payload/builder"
3331
"github.com/berachain/beacon-kit/primitives/crypto"
3432
"github.com/berachain/beacon-kit/primitives/math"
@@ -66,7 +64,7 @@ func (s *Service) forceSyncUponProcess(
6664
},
6765
s.chainSpec.ActiveForkVersionForTimestamp(lph.GetTimestamp()),
6866
)
69-
if _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req); err != nil {
67+
if _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req, engineprimitives.PhaseStartup); err != nil {
7068
s.logger.Error(
7169
"failed to send force head FCU",
7270
"error", err,
@@ -93,9 +91,10 @@ func (s *Service) forceSyncUponFinalize(
9391
return err
9492
}
9593

96-
// We set retryOnSyncingStatus to false here. We can ignore SYNCING status and proceed
97-
// to the FCU.
98-
err = s.executionEngine.NotifyNewPayload(ctx, payloadReq, false)
94+
// EL may return SYNCING (payload accepted but not yet validated — it's
95+
// still catching up). Fine at startup: the block is already finalized
96+
// in our state, and the FCU below kicks off the EL sync.
97+
err = s.executionEngine.NotifyNewPayload(ctx, payloadReq, engineprimitives.PhaseStartup)
9998
if err != nil {
10099
return fmt.Errorf("startSyncUponFinalize NotifyNewPayload failed: %w", err)
101100
}
@@ -111,22 +110,10 @@ func (s *Service) forceSyncUponFinalize(
111110
s.chainSpec.ActiveForkVersionForTimestamp(executionPayload.GetTimestamp()),
112111
)
113112

114-
switch _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req); {
115-
case err == nil:
116-
return nil
117-
118-
case errors.IsAny(err,
119-
engineerrors.ErrSyncingPayloadStatus,
120-
engineerrors.ErrAcceptedPayloadStatus):
121-
s.logger.Warn(
122-
//nolint:lll // long message on one line for readability.
123-
`Your execution client is syncing. It should be downloading eth blocks from its peers. Restart the beacon node once the execution client is caught up.`,
124-
)
125-
return err
126-
127-
default:
113+
if _, err = s.executionEngine.NotifyForkchoiceUpdate(ctx, req, engineprimitives.PhaseStartup); err != nil {
128114
return fmt.Errorf("force startup NotifyForkchoiceUpdate failed: %w", err)
129115
}
116+
return nil
130117
}
131118

132119
// Once you provide the right state, we really need to carry out the very same operations

beacon/blockchain/payload_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ func TestOptimisticBlockBuildingRejectedBlockStateChecks(t *testing.T) {
8585
// Since this is the first block called post genesis
8686
// forceSyncUponProcess will be called.
8787
dummyPayloadID := &engineprimitives.PayloadID{1, 2, 3}
88-
eng.EXPECT().NotifyForkchoiceUpdate(mock.Anything, mock.Anything).Return(dummyPayloadID, nil)
88+
eng.EXPECT().NotifyForkchoiceUpdate(mock.Anything, mock.Anything, mock.Anything).Return(dummyPayloadID, nil)
8989

9090
// we set just enough data in invalid block to let it pass
9191
// the first validations in chain before state processor is invoked
@@ -175,7 +175,7 @@ func TestOptimisticBlockBuildingVerifiedBlockStateChecks(t *testing.T) {
175175
// Since this is the first block called post genesis
176176
// forceSyncUponProcess will be called.
177177
dummyPayloadID := &engineprimitives.PayloadID{1, 2, 3}
178-
eng.EXPECT().NotifyForkchoiceUpdate(mock.Anything, mock.Anything).Return(dummyPayloadID, nil)
178+
eng.EXPECT().NotifyForkchoiceUpdate(mock.Anything, mock.Anything, mock.Anything).Return(dummyPayloadID, nil)
179179

180180
// BUILD A VALID BLOCK (without polluting state st)
181181
sdkCtx := sdk.NewContext(cms.CacheMultiStore(), true, log.NewNopLogger())
@@ -403,6 +403,7 @@ func computeStateRoot(
403403
ctx,
404404
consensusTime,
405405
proposerAddress,
406+
engineprimitives.PhaseBuild,
406407
).
407408
WithVerifyPayload(false).
408409
WithVerifyRandao(false).

beacon/blockchain/process_proposal.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ func (s *Service) verifyStateRoot(
403403
ctx,
404404
blk.GetConsensusTime(),
405405
blk.GetProposerAddress(),
406+
engineprimitives.PhaseValidate,
406407
).
407408
WithVerifyPayload(true).
408409
WithVerifyRandao(true).

beacon/validator/block_builder.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,7 @@ func (s *Service) computeStateRoot(
418418
ctx,
419419
consensusTime,
420420
proposerAddress,
421+
engineprimitives.PhaseBuild,
421422
).
422423
WithVerifyPayload(false).
423424
WithVerifyRandao(false).
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// SPDX-License-Identifier: BUSL-1.1
2+
//
3+
// Copyright (C) 2025, Berachain Foundation. All rights reserved.
4+
// Use of this software is governed by the Business Source License included
5+
// in the LICENSE file of this repository and at www.mariadb.com/bsl11.
6+
//
7+
// ANY USE OF THE LICENSED WORK IN VIOLATION OF THIS LICENSE WILL AUTOMATICALLY
8+
// TERMINATE YOUR RIGHTS UNDER THIS LICENSE FOR THE CURRENT AND ALL OTHER
9+
// VERSIONS OF THE LICENSED WORK.
10+
//
11+
// THIS LICENSE DOES NOT GRANT YOU ANY RIGHT IN ANY TRADEMARK OR LOGO OF
12+
// LICENSOR OR ITS AFFILIATES (PROVIDED THAT YOU MAY USE A TRADEMARK OR LOGO OF
13+
// LICENSOR AS EXPRESSLY REQUIRED BY THIS LICENSE).
14+
//
15+
// TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
16+
// AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
17+
// EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
18+
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
19+
// TITLE.
20+
21+
package engineprimitives
22+
23+
// EnginePhase tags an engine API call with the consensus phase that issued it,
24+
// so the retry wrapper can pick a budget that fits the phase's safety/liveness
25+
// requirements:
26+
//
27+
// - PhaseBuild and PhaseValidate are bounded: a stuck call must return so
28+
// CometBFT can move to a new round/proposer. This is what closes the
29+
// malicious-payload retry-loop class of bugs.
30+
// - PhaseFinalize is unbounded: the block is already agreed by >=2/3 of
31+
// validators, so the node must eventually apply it or fall out of
32+
// consensus. A brief EL outage is absorbed by retrying; the loop logs
33+
// loudly so operator alerting can catch a persistently stuck node.
34+
// - PhaseStartup has an unbounded time budget so a slow cold-starting EL
35+
// (re-importing chain state, snap-syncing) is acceptable. Fatal errors
36+
// still propagate immediately — a misconfigured boot (wrong JWT, wrong
37+
// chain ID) fails fast rather than hot-looping before serving consensus.
38+
type EnginePhase int
39+
40+
const (
41+
// PhaseBuild is used from PrepareProposal when this node is the proposer
42+
// and is asking the EL to start building a payload.
43+
PhaseBuild EnginePhase = iota
44+
// PhaseValidate is used from ProcessProposal when validating a block
45+
// proposed by another validator.
46+
PhaseValidate
47+
// PhaseFinalize is used from FinalizeBlock and the post-block FCU when
48+
// applying a block that consensus has already agreed on.
49+
PhaseFinalize
50+
// PhaseStartup is used from one-shot startup paths (forceSyncUponProcess
51+
// / forceSyncUponFinalize).
52+
PhaseStartup
53+
)
54+
55+
func (p EnginePhase) String() string {
56+
switch p {
57+
case PhaseBuild:
58+
return "build"
59+
case PhaseValidate:
60+
return "validate"
61+
case PhaseFinalize:
62+
return "finalize"
63+
case PhaseStartup:
64+
return "startup"
65+
default:
66+
return "unknown"
67+
}
68+
}

execution/client/errors.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
package client
2222

2323
import (
24+
nethttp "net/http"
25+
2426
engineerrors "github.com/berachain/beacon-kit/engine-primitives/errors"
2527
"github.com/berachain/beacon-kit/errors"
2628
"github.com/berachain/beacon-kit/execution/client/ethclient/rpc"
@@ -76,7 +78,14 @@ func (s *EngineClient) handleRPCError(
7678
var httpErr *rpc.HTTPStatusError
7779
if errors.As(err, &httpErr) && httpErr != nil &&
7880
httpErr.StatusCode >= 400 && httpErr.StatusCode < 500 {
79-
// HTTP 4xx is a request-level rejection; retrying with the same payload will never
81+
// 408/425/429 are RFC-defined retryable signals — a reverse proxy,
82+
// rate limiter, or WAF in front of the EL must not turn brief
83+
// backpressure into missed slots. Treat as transport-level transient.
84+
switch httpErr.StatusCode {
85+
case nethttp.StatusRequestTimeout, nethttp.StatusTooEarly, nethttp.StatusTooManyRequests:
86+
return errors.Join(ErrBadConnection, err)
87+
}
88+
// Other 4xx is a request-level rejection; retrying with the same payload will never
8089
// succeed. Fatal so the proposer skips the slot instead of looping.
8190
return errors.Join(ErrHTTPClientError, err)
8291
}

execution/client/errors_test.go

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,20 +68,53 @@ func TestHandleRPCError_Classification(t *testing.T) {
6868
wantRetry: false,
6969
},
7070
{
71-
name: "HTTP 413 (PoC) → ErrHTTPClientError (fatal)",
72-
in: &rpc.HTTPStatusError{StatusCode: 413, Body: `{"code":-32007,"message":"Request is too big"}`},
71+
name: "HTTP 400 → ErrHTTPClientError (fatal, malformed request)",
72+
in: &rpc.HTTPStatusError{StatusCode: 400, Body: ""},
7373
wantIs: ErrHTTPClientError,
7474
wantFatal: true,
7575
},
7676
{
77-
name: "HTTP 499 → ErrHTTPClientError (fatal)",
78-
in: &rpc.HTTPStatusError{StatusCode: 499, Body: ""},
77+
name: "HTTP 401 → ErrHTTPClientError (fatal, JWT misconfig)",
78+
in: &rpc.HTTPStatusError{StatusCode: 401, Body: ""},
7979
wantIs: ErrHTTPClientError,
8080
wantFatal: true,
8181
},
8282
{
83-
name: "HTTP 401 → ErrHTTPClientError (fatal)",
84-
in: &rpc.HTTPStatusError{StatusCode: 401, Body: ""},
83+
name: "HTTP 404 → ErrHTTPClientError (fatal, wrong RPC path)",
84+
in: &rpc.HTTPStatusError{StatusCode: 404, Body: ""},
85+
wantIs: ErrHTTPClientError,
86+
wantFatal: true,
87+
},
88+
{
89+
name: "HTTP 408 → ErrBadConnection (retryable, idle-conn cleanup from proxy)",
90+
in: &rpc.HTTPStatusError{StatusCode: 408, Body: ""},
91+
wantIs: ErrBadConnection,
92+
wantFatal: false,
93+
wantRetry: true,
94+
},
95+
{
96+
name: "HTTP 413 (PoC) → ErrHTTPClientError (fatal)",
97+
in: &rpc.HTTPStatusError{StatusCode: 413, Body: `{"code":-32007,"message":"Request is too big"}`},
98+
wantIs: ErrHTTPClientError,
99+
wantFatal: true,
100+
},
101+
{
102+
name: "HTTP 425 → ErrBadConnection (retryable, TLS 0-RTT replay)",
103+
in: &rpc.HTTPStatusError{StatusCode: 425, Body: ""},
104+
wantIs: ErrBadConnection,
105+
wantFatal: false,
106+
wantRetry: true,
107+
},
108+
{
109+
name: "HTTP 429 → ErrBadConnection (retryable, rate limited by proxy/WAF)",
110+
in: &rpc.HTTPStatusError{StatusCode: 429, Body: ""},
111+
wantIs: ErrBadConnection,
112+
wantFatal: false,
113+
wantRetry: true,
114+
},
115+
{
116+
name: "HTTP 499 → ErrHTTPClientError (fatal, non-retryable 4xx catch-all)",
117+
in: &rpc.HTTPStatusError{StatusCode: 499, Body: ""},
85118
wantIs: ErrHTTPClientError,
86119
wantFatal: true,
87120
},

0 commit comments

Comments
 (0)