From 10dbefda99c9bedde8b6888c5f332dadd92dc7eb Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Wed, 29 Apr 2026 16:22:47 +0800 Subject: [PATCH 1/6] fix: use buffered channel --- op-node/p2p/sync.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go index 58569f326..2a33dfb99 100644 --- a/op-node/p2p/sync.go +++ b/op-node/p2p/sync.go @@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID), peers: make(map[peer.ID]context.CancelFunc), quarantineByNum: make(map[uint64]common.Hash), - rangeRequests: make(chan rangeRequest), // blocking + rangeRequests: make(chan rangeRequest, 16), // blocking activeRangeRequests: newRequestIdMap(), peerRequests: make(chan peerRequest, 128), results: make(chan syncResult, 128), From 4ab78be8a271417e7ff62009d2e2cd402bb339cb Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Fri, 1 May 2026 11:35:47 +0800 Subject: [PATCH 2/6] feat: add --catch-up flag to defer gossip on RPC startup --- op-node/flags/flags.go | 11 +++ op-node/node/config.go | 7 ++ op-node/node/node.go | 166 +++++++++++++++++++++++++++++++++++++++-- op-node/p2p/sync.go | 2 +- op-node/service.go | 2 + 5 files changed, 182 insertions(+), 6 deletions(-) diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go index 0dbd9c4d2..f1c9fed7e 100644 --- a/op-node/flags/flags.go +++ b/op-node/flags/flags.go @@ -360,6 +360,16 @@ var ( Value: 86400, EnvVars: prefixEnvVars("EL_TRIGGER_GAP"), } + CatchUpFlag = &cli.BoolFlag{ + Name: "catch-up", + Usage: "When enabled, op-node defers gossip subscription on startup until op-geth's unsafe head " + + "has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " + + "that occurs when an RPC node restarts with a large unsafe-head gap. 
" + + "Recommended for RPC and verifier nodes; not needed for sequencer nodes.", + EnvVars: prefixEnvVars("CATCH_UP"), + Value: false, + Category: RollupCategory, + } /* Deprecated Flags */ L2EngineSyncEnabled = &cli.BoolFlag{ Name: "l2.engine-sync", @@ -440,6 +450,7 @@ var optionalFlags = []cli.Flag{ SyncModeFlag, FastnodeMode, ELTriggerGap, + CatchUpFlag, RPCListenAddr, RPCListenPort, L1TrustRPC, diff --git a/op-node/node/config.go b/op-node/node/config.go index 4f67d0f78..0b22e653a 100644 --- a/op-node/node/config.go +++ b/op-node/node/config.go @@ -77,6 +77,13 @@ type Config struct { // Plasma DA config Plasma plasma.CLIConfig + + // CatchUp toggles the optional pre-gossip catch-up phase at startup. + // When true, op-node defers enabling gossip until op-geth's unsafe head has caught up + // to the live tip via L1 derivation, preventing the driver/alt-sync activity loop + // that otherwise occurs after restarts with a large unsafe-head gap. + // Recommended for RPC and verifier nodes; sequencer nodes should leave it disabled. + CatchUp bool } type RPCConfig struct { diff --git a/op-node/node/node.go b/op-node/node/node.go index 822aa7706..3be2e305c 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -79,11 +79,51 @@ type OpNode struct { closed atomic.Bool + // catchUpEnabled mirrors cfg.CatchUp; controls whether Start() runs the pre-gossip catch-up phase. + catchUpEnabled bool + + // gossipReady gates incoming gossip payloads during the startup catch-up phase. + // While false, gossip payloads received via OnUnsafeL2Payload are silently dropped + // to prevent the clSync queue from accumulating orphan payloads (parent != op-geth.UnsafeL2Head) + // while op-geth's unsafe head is still being advanced via L1 derivation. + // Set to true once catch-up completes (or is disabled / times out). + gossipReady atomic.Bool + + // firstPayloadAllowed lets exactly one gossip payload pass through OnUnsafeL2Payload + // while gossipReady is still false. 
This is required when running in ELSync mode: + // the engineController initial state is syncStatusWillStartEL, which causes + // IsEngineSyncing() to return true and prevents the driver eventLoop from running + // derivation (the stepReqCh handler short-circuits with `continue`). Until at least + // one payload reaches Driver.OnUnsafeL2Payload -> InsertUnsafePayload, the engine's + // "Skipping EL sync" finalized-block check never fires and syncStatus is stuck. + // Allowing exactly one payload through unblocks that transition and lets derivation + // drive op-geth forward during the catch-up phase. After this single payload, + // subsequent payloads continue to be dropped until gossipReady is flipped to true. + firstPayloadAllowed atomic.Bool + // cancels execution prematurely, e.g. to halt. This may be nil. cancel context.CancelCauseFunc halted atomic.Bool } +// Startup catch-up parameters. Hardcoded; tweak here if needed. +const ( + // catchUpLagThreshold is how close op-geth's unsafe head timestamp must be + // to the current wall-clock time before gossip is enabled. + // On opBNB (~500ms blocks) 30s ≈ 60 blocks of remaining gap, well below the + // threshold that triggers the activity loop in tested scenarios. + catchUpLagThreshold = 30 * time.Second + + // catchUpMaxWait is the absolute maximum time we are willing to defer gossip. + // If catch-up does not complete within this window (e.g. L1 derivation is unhealthy), + // gossip is enabled regardless and the system degrades to the pre-fix behavior + // rather than blocking forever. + catchUpMaxWait = 10 * time.Minute + + // catchUpPollInterval is how often we re-check op-geth's unsafe head during catch-up. 
+ catchUpPollInterval = 5 * time.Second +) + // The OpNode handles incoming gossip var _ p2p.GossipIn = (*OpNode)(nil) @@ -96,11 +136,17 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge } n := &OpNode{ - log: log, - appVersion: appVersion, - metrics: m, - rollupHalt: cfg.RollupHalt, - cancel: cfg.Cancel, + log: log, + appVersion: appVersion, + metrics: m, + rollupHalt: cfg.RollupHalt, + cancel: cfg.Cancel, + catchUpEnabled: cfg.CatchUp, + } + // If catch-up is disabled, gossip should be processed immediately as before. + // Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path. + if !n.catchUpEnabled { + n.gossipReady.Store(true) } // not a context leak, gossipsub is closed with a context. n.resourcesCtx, n.resourcesClose = context.WithCancel(context.Background()) @@ -117,6 +163,72 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge return n, nil } +// waitForOpGethCatchUp blocks until op-geth's unsafe head timestamp is within +// catchUpLagThreshold of the current time, or until catchUpMaxWait elapses. +// +// Background: +// On RPC node restart, op-geth's unsafe head is frozen at the pre-restart height for +// the duration of the pod outage. When op-node comes back up and immediately subscribes +// to gossip, incoming gossip payloads have a parent that does not match op-geth's +// stale unsafe head; the clSync queue accumulates orphan payloads. The driver's +// checkForGapInUnsafeQueue then triggers alt-sync via an unbuffered rangeRequests +// channel, while alt-sync's mainLoop -- when promoting results back via receivePayload +// -- itself blocks on the driver's unsafeL2Payloads channel (buf=10). The two goroutines +// form a livelock that only releases through ctx timeouts, leaving the unsafe head +// stalled for some time. 
+// +// This function defers gossip subscription (via the gossipReady gate) until the L1 +// derivation pipeline has advanced op-geth's unsafe head close enough to the live tip +// that no significant gap exists when gossip is finally enabled, eliminating the +// activity loop's preconditions at the source. +// +// Returns nil on successful catch-up; returns an error on context cancellation or timeout. +// In case of timeout, the caller should still enable gossip and degrade gracefully. +func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error { + n.log.Info("starting op-geth catch-up phase before enabling gossip", + "lag_threshold", catchUpLagThreshold, + "max_wait", catchUpMaxWait, + ) + + deadline := time.Now().Add(catchUpMaxWait) + ticker := time.NewTicker(catchUpPollInterval) + defer ticker.Stop() + + for { + // Query op-geth's current unsafe head. + queryCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + unsafeHead, err := n.l2Source.L2BlockRefByLabel(queryCtx, eth.Unsafe) + cancel() + + if err != nil { + n.log.Warn("Failed to query op-geth unsafe head during catch-up, will retry", "error", err) + } else { + headTime := time.Unix(int64(unsafeHead.Time), 0) + lag := time.Since(headTime) + + // Treat negative lag (clock skew or future-timestamp head) as caught up. 
+ if lag < catchUpLagThreshold { + n.log.Info("op-geth caught up; enabling gossip", "unsafe_head", unsafeHead.Number, "lag", lag) + return nil + } + + n.log.Info("op-geth still catching up via L1 derivation", "unsafe_head", unsafeHead.Number, + "lag", lag, "deadline_in", time.Until(deadline)) + } + + if time.Now().After(deadline) { + return fmt.Errorf("startup catch-up timeout after %v", catchUpMaxWait) + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + continue + } + } +} + func (n *OpNode) init(ctx context.Context, cfg *Config, snapshotLog log.Logger) error { n.log.Info("Initializing rollup node", "version", n.appVersion) if err := n.initTracer(ctx, cfg); err != nil { @@ -484,6 +596,24 @@ func (n *OpNode) Start(ctx context.Context) error { n.log.Error("Could not start a rollup node", "err", err) return err } + + // Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip + // via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when + // gossip floods in with payloads whose parent does not match op-geth's stale unsafe head. + // Disabled by default; enable via --catch-up for RPC / verifier nodes. + // See waitForOpGethCatchUp for full background. + if n.catchUpEnabled { + if err := n.waitForOpGethCatchUp(ctx); err != nil { + // Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway + // to avoid blocking the node forever; the system degrades to the pre-fix behavior. + n.log.Warn("startup catch-up did not complete cleanly; enabling gossip anyway", "err", err) + } + n.gossipReady.Store(true) + n.log.Info("gossip enabled; op-node fully active") + } + // If catch-up is disabled, gossipReady was already set to true in New(), + // so OnUnsafeL2Payload behaves identically to the pre-fix code path. 
+ log.Info("Rollup node started") return nil } @@ -543,6 +673,32 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa } func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error { + // Drop gossip payloads received during the startup catch-up phase. + // While op-geth's unsafe head is still catching up via L1 derivation, accepting + // real-time gossip payloads would fill the clSync queue with orphan payloads + // (parent != op-geth.UnsafeL2Head) and trigger the driver/alt-sync livelock. + // Gossip is re-enabled once waitForOpGethCatchUp completes. + // Any payloads dropped here are recovered by gossipsub mesh re-broadcasts and + // alt-sync backfill once gossipReady is set. + // + // Exception: the very first payload is always let through, regardless of catch-up + // state. In ELSync mode this is required to trigger the WillStartEL → FinishedEL + // transition inside InsertUnsafePayload (the "Skipping EL sync ..." finalized-block + // check). Without this, IsEngineSyncing() stays true and derivation is blocked + // from running during catch-up, defeating the purpose of the wait. In CLSync mode + // this single payload simply enters the clSync queue and is harmless. 
+ if !n.gossipReady.Load() { + if !n.firstPayloadAllowed.Swap(true) { + n.log.Info("allowing first gossip payload through during catch-up to unblock engine sync state", + "id", envelope.ExecutionPayload.ID(), "peer", from) + // fall through to the regular processing path below + } else { + n.log.Debug("dropping gossip payload during startup catch-up phase", + "id", envelope.ExecutionPayload.ID(), "peer", from) + return nil + } + } + // ignore if it's from ourselves if n.p2pNode != nil && from == n.p2pNode.Host().ID() { return nil diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go index 2a33dfb99..58569f326 100644 --- a/op-node/p2p/sync.go +++ b/op-node/p2p/sync.go @@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID), peers: make(map[peer.ID]context.CancelFunc), quarantineByNum: make(map[uint64]common.Hash), - rangeRequests: make(chan rangeRequest, 16), // blocking + rangeRequests: make(chan rangeRequest), // blocking activeRangeRequests: newRequestIdMap(), peerRequests: make(chan peerRequest, 128), results: make(chan syncResult, 128), diff --git a/op-node/service.go b/op-node/service.go index e162834c5..5c6d709bf 100644 --- a/op-node/service.go +++ b/op-node/service.go @@ -115,6 +115,8 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) { ConductorRpcTimeout: ctx.Duration(flags.ConductorRpcTimeoutFlag.Name), Plasma: plasma.ReadCLIConfig(ctx), + + CatchUp: ctx.Bool(flags.CatchUpFlag.Name), } if err := cfg.LoadPersisted(log); err != nil { From 63ca20efd15728673556b22ccbb8e0edc926da12 Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Tue, 19 May 2026 16:59:29 +0800 Subject: [PATCH 3/6] fix: rename flag --- op-node/flags/flags.go | 13 ++++++------- op-node/node/node.go | 2 +- op-node/p2p/sync.go | 2 +- op-node/service.go | 2 +- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git 
a/op-node/flags/flags.go b/op-node/flags/flags.go index f1c9fed7e..9318f3618 100644 --- a/op-node/flags/flags.go +++ b/op-node/flags/flags.go @@ -360,13 +360,12 @@ var ( Value: 86400, EnvVars: prefixEnvVars("EL_TRIGGER_GAP"), } - CatchUpFlag = &cli.BoolFlag{ - Name: "catch-up", - Usage: "When enabled, op-node defers gossip subscription on startup until op-geth's unsafe head " + + StartupCatchUpFlag = &cli.BoolFlag{ + Name: "startup.catch-up", + Usage: "When enabled, op-node defers gossip subscription during startup until op-geth's unsafe head " + "has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " + - "that occurs when an RPC node restarts with a large unsafe-head gap. " + - "Recommended for RPC and verifier nodes; not needed for sequencer nodes.", - EnvVars: prefixEnvVars("CATCH_UP"), + "that occurs when an RPC node restarts with a large unsafe-head gap. Recommended for RPC nodes; not needed for sequencer nodes.", + EnvVars: prefixEnvVars("STARTUP_CATCH_UP"), Value: false, Category: RollupCategory, } @@ -450,7 +449,7 @@ var optionalFlags = []cli.Flag{ SyncModeFlag, FastnodeMode, ELTriggerGap, - CatchUpFlag, + StartupCatchUpFlag, RPCListenAddr, RPCListenPort, L1TrustRPC, diff --git a/op-node/node/node.go b/op-node/node/node.go index 3be2e305c..65731d9f8 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -600,7 +600,7 @@ func (n *OpNode) Start(ctx context.Context) error { // Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip // via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when // gossip floods in with payloads whose parent does not match op-geth's stale unsafe head. - // Disabled by default; enable via --catch-up for RPC / verifier nodes. + // Disabled by default; enable via --startup.catch-up for RPC nodes. // See waitForOpGethCatchUp for full background. 
if n.catchUpEnabled { if err := n.waitForOpGethCatchUp(ctx); err != nil { diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go index 58569f326..2a33dfb99 100644 --- a/op-node/p2p/sync.go +++ b/op-node/p2p/sync.go @@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID), peers: make(map[peer.ID]context.CancelFunc), quarantineByNum: make(map[uint64]common.Hash), - rangeRequests: make(chan rangeRequest), // blocking + rangeRequests: make(chan rangeRequest, 16), // blocking activeRangeRequests: newRequestIdMap(), peerRequests: make(chan peerRequest, 128), results: make(chan syncResult, 128), diff --git a/op-node/service.go b/op-node/service.go index 5c6d709bf..abe350df7 100644 --- a/op-node/service.go +++ b/op-node/service.go @@ -116,7 +116,7 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) { Plasma: plasma.ReadCLIConfig(ctx), - CatchUp: ctx.Bool(flags.CatchUpFlag.Name), + CatchUp: ctx.Bool(flags.StartupCatchUpFlag.Name), } if err := cfg.LoadPersisted(log); err != nil { From 430a91f504a2774e24c3a7f7eb6a18c7339bebe5 Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Wed, 20 May 2026 10:03:33 +0800 Subject: [PATCH 4/6] fix: set catchupLagThreshold to 10s --- op-node/node/node.go | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/op-node/node/node.go b/op-node/node/node.go index 65731d9f8..a9c3f0d56 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -108,20 +108,20 @@ type OpNode struct { // Startup catch-up parameters. Hardcoded; tweak here if needed. const ( - // catchUpLagThreshold is how close op-geth's unsafe head timestamp must be + // catchupLagThreshold is how close op-geth's unsafe head timestamp must be // to the current wall-clock time before gossip is enabled. 
- // On opBNB (~500ms blocks) 30s ≈ 60 blocks of remaining gap, well below the + // On opBNB (~250ms blocks) 10s ≈ 40 blocks of remaining gap, well below the // threshold that triggers the activity loop in tested scenarios. - catchUpLagThreshold = 30 * time.Second + catchupLagThreshold = 10 * time.Second - // catchUpMaxWait is the absolute maximum time we are willing to defer gossip. + // catchupMaxWait is the absolute maximum time we are willing to defer gossip. // If catch-up does not complete within this window (e.g. L1 derivation is unhealthy), // gossip is enabled regardless and the system degrades to the pre-fix behavior // rather than blocking forever. - catchUpMaxWait = 10 * time.Minute + catchupMaxWait = 10 * time.Minute - // catchUpPollInterval is how often we re-check op-geth's unsafe head during catch-up. - catchUpPollInterval = 5 * time.Second + // catchupPollInterval is how often we re-check op-geth's unsafe head during catch-up. + catchupPollInterval = 5 * time.Second ) // The OpNode handles incoming gossip @@ -185,13 +185,11 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge // Returns nil on successful catch-up; returns an error on context cancellation or timeout. // In case of timeout, the caller should still enable gossip and degrade gracefully. 
func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error { - n.log.Info("starting op-geth catch-up phase before enabling gossip", - "lag_threshold", catchUpLagThreshold, - "max_wait", catchUpMaxWait, - ) + n.log.Info("Starting op-geth catch-up phase before enabling gossip", "lag_threshold", catchupLagThreshold, + "max_wait", catchupMaxWait) - deadline := time.Now().Add(catchUpMaxWait) - ticker := time.NewTicker(catchUpPollInterval) + deadline := time.Now().Add(catchupMaxWait) + ticker := time.NewTicker(catchupPollInterval) defer ticker.Stop() for { @@ -207,7 +205,7 @@ func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error { lag := time.Since(headTime) // Treat negative lag (clock skew or future-timestamp head) as caught up. - if lag < catchUpLagThreshold { + if lag < catchupLagThreshold { n.log.Info("op-geth caught up; enabling gossip", "unsafe_head", unsafeHead.Number, "lag", lag) return nil } @@ -217,7 +215,7 @@ func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error { } if time.Now().After(deadline) { - return fmt.Errorf("startup catch-up timeout after %v", catchUpMaxWait) + return fmt.Errorf("startup catch-up timeout after %v", catchupMaxWait) } select { @@ -606,10 +604,10 @@ func (n *OpNode) Start(ctx context.Context) error { if err := n.waitForOpGethCatchUp(ctx); err != nil { // Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway // to avoid blocking the node forever; the system degrades to the pre-fix behavior. - n.log.Warn("startup catch-up did not complete cleanly; enabling gossip anyway", "err", err) + n.log.Warn("Startup catch-up did not complete cleanly; enabling gossip anyway", "err", err) } n.gossipReady.Store(true) - n.log.Info("gossip enabled; op-node fully active") + n.log.Info("Gossip enabled; op-node fully active") } // If catch-up is disabled, gossipReady was already set to true in New(), // so OnUnsafeL2Payload behaves identically to the pre-fix code path. 
@@ -689,12 +687,12 @@ func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope * // this single payload simply enters the clSync queue and is harmless. if !n.gossipReady.Load() { if !n.firstPayloadAllowed.Swap(true) { - n.log.Info("allowing first gossip payload through during catch-up to unblock engine sync state", - "id", envelope.ExecutionPayload.ID(), "peer", from) + n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state", + "peer", from, "id", envelope.ExecutionPayload.ID()) // fall through to the regular processing path below } else { - n.log.Debug("dropping gossip payload during startup catch-up phase", - "id", envelope.ExecutionPayload.ID(), "peer", from) + n.log.Debug("Dropping gossip payload during startup catch-up phase", "peer", from, + "id", envelope.ExecutionPayload.ID()) return nil } } From 1383bad7e5692fd0a8e71778ac11c24bc7b11813 Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Wed, 20 May 2026 10:57:54 +0800 Subject: [PATCH 5/6] fix: optimize codes --- op-node/flags/flags.go | 14 ++++++------ op-node/node/config.go | 6 ++--- op-node/node/node.go | 52 +++++++++++++----------------------------- op-node/p2p/sync.go | 2 +- op-node/service.go | 2 +- 5 files changed, 28 insertions(+), 48 deletions(-) diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go index 9318f3618..f8b414c5d 100644 --- a/op-node/flags/flags.go +++ b/op-node/flags/flags.go @@ -360,12 +360,12 @@ var ( Value: 86400, EnvVars: prefixEnvVars("EL_TRIGGER_GAP"), } - StartupCatchUpFlag = &cli.BoolFlag{ - Name: "startup.catch-up", - Usage: "When enabled, op-node defers gossip subscription during startup until op-geth's unsafe head " + - "has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " + - "that occurs when an RPC node restarts with a large unsafe-head gap. 
Recommended for RPC nodes; not needed for sequencer nodes.", - EnvVars: prefixEnvVars("STARTUP_CATCH_UP"), + StartupDeferGossipFlag = &cli.BoolFlag{ + Name: "startup.defer-gossip", + Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to " + + "the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when an RPC node " + + "restarts with a large unsafe-head gap. Recommended for RPC and bridge nodes; not needed for sequencer and P2P nodes.", + EnvVars: prefixEnvVars("STARTUP_DEFER_GOSSIP"), Value: false, Category: RollupCategory, } @@ -449,7 +449,7 @@ var optionalFlags = []cli.Flag{ SyncModeFlag, FastnodeMode, ELTriggerGap, - StartupCatchUpFlag, + StartupDeferGossipFlag, RPCListenAddr, RPCListenPort, L1TrustRPC, diff --git a/op-node/node/config.go b/op-node/node/config.go index 0b22e653a..06d5c25c1 100644 --- a/op-node/node/config.go +++ b/op-node/node/config.go @@ -78,12 +78,12 @@ type Config struct { // Plasma DA config Plasma plasma.CLIConfig - // CatchUp toggles the optional pre-gossip catch-up phase at startup. + // StartupDeferGossip toggles the optional pre-gossip catch-up phase at startup. // When true, op-node defers enabling gossip until op-geth's unsafe head has caught up // to the live tip via L1 derivation, preventing the driver/alt-sync activity loop // that otherwise occurs after restarts with a large unsafe-head gap. - // Recommended for RPC and verifier nodes; sequencer nodes should leave it disabled. - CatchUp bool + // Recommended for RPC and bridge nodes; sequencer and P2P nodes should leave it disabled. + StartupDeferGossip bool } type RPCConfig struct { diff --git a/op-node/node/node.go b/op-node/node/node.go index a9c3f0d56..c8b941d43 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -79,8 +79,8 @@ type OpNode struct { closed atomic.Bool - // catchUpEnabled mirrors cfg.CatchUp; controls whether Start() runs the pre-gossip catch-up phase. 
- catchUpEnabled bool + // deferGossipEnabled mirrors cfg.StartupDeferGossip; controls whether Start() runs the pre-gossip catch-up phase. + deferGossipEnabled bool // gossipReady gates incoming gossip payloads during the startup catch-up phase. // While false, gossip payloads received via OnUnsafeL2Payload are silently dropped @@ -106,13 +106,10 @@ type OpNode struct { halted atomic.Bool } -// Startup catch-up parameters. Hardcoded; tweak here if needed. const ( // catchupLagThreshold is how close op-geth's unsafe head timestamp must be // to the current wall-clock time before gossip is enabled. - // On opBNB (~250ms blocks) 10s ≈ 40 blocks of remaining gap, well below the - // threshold that triggers the activity loop in tested scenarios. - catchupLagThreshold = 10 * time.Second + catchupLagThreshold = 30 * time.Second // catchupMaxWait is the absolute maximum time we are willing to defer gossip. // If catch-up does not complete within this window (e.g. L1 derivation is unhealthy), @@ -136,16 +133,17 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge } n := &OpNode{ - log: log, - appVersion: appVersion, - metrics: m, - rollupHalt: cfg.RollupHalt, - cancel: cfg.Cancel, - catchUpEnabled: cfg.CatchUp, - } - // If catch-up is disabled, gossip should be processed immediately as before. + log: log, + appVersion: appVersion, + metrics: m, + rollupHalt: cfg.RollupHalt, + cancel: cfg.Cancel, + deferGossipEnabled: cfg.StartupDeferGossip, + } + + // If defer gossip is disabled, gossip should be processed immediately as before. // Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path. - if !n.catchUpEnabled { + if !n.deferGossipEnabled { n.gossipReady.Store(true) } // not a context leak, gossipsub is closed with a context. 
@@ -595,12 +593,7 @@ func (n *OpNode) Start(ctx context.Context) error { return err } - // Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip - // via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when - // gossip floods in with payloads whose parent does not match op-geth's stale unsafe head. - // Disabled by default; enable via --startup.catch-up for RPC nodes. - // See waitForOpGethCatchUp for full background. - if n.catchUpEnabled { + if n.deferGossipEnabled { if err := n.waitForOpGethCatchUp(ctx); err != nil { // Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway // to avoid blocking the node forever; the system degrades to the pre-fix behavior. @@ -609,7 +602,7 @@ func (n *OpNode) Start(ctx context.Context) error { n.gossipReady.Store(true) n.log.Info("Gossip enabled; op-node fully active") } - // If catch-up is disabled, gossipReady was already set to true in New(), + // If defer gossip is disabled, gossipReady was already set to true in New(), // so OnUnsafeL2Payload behaves identically to the pre-fix code path. log.Info("Rollup node started") @@ -671,20 +664,7 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa } func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error { - // Drop gossip payloads received during the startup catch-up phase. - // While op-geth's unsafe head is still catching up via L1 derivation, accepting - // real-time gossip payloads would fill the clSync queue with orphan payloads - // (parent != op-geth.UnsafeL2Head) and trigger the driver/alt-sync livelock. - // Gossip is re-enabled once waitForOpGethCatchUp completes. - // Any payloads dropped here are recovered by gossipsub mesh re-broadcasts and - // alt-sync backfill once gossipReady is set. - // - // Exception: the very first payload is always let through, regardless of catch-up - // state. 
In ELSync mode this is required to trigger the WillStartEL → FinishedEL - // transition inside InsertUnsafePayload (the "Skipping EL sync ..." finalized-block - // check). Without this, IsEngineSyncing() stays true and derivation is blocked - // from running during catch-up, defeating the purpose of the wait. In CLSync mode - // this single payload simply enters the clSync queue and is harmless. + // If defer gossip is enabled, drop gossip payloads received during the startup catch-up phase. if !n.gossipReady.Load() { if !n.firstPayloadAllowed.Swap(true) { n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state", diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go index 2a33dfb99..58569f326 100644 --- a/op-node/p2p/sync.go +++ b/op-node/p2p/sync.go @@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc payloadByNumber: PayloadByNumberProtocolID(cfg.L2ChainID), peers: make(map[peer.ID]context.CancelFunc), quarantineByNum: make(map[uint64]common.Hash), - rangeRequests: make(chan rangeRequest, 16), // blocking + rangeRequests: make(chan rangeRequest), // blocking activeRangeRequests: newRequestIdMap(), peerRequests: make(chan peerRequest, 128), results: make(chan syncResult, 128), diff --git a/op-node/service.go b/op-node/service.go index abe350df7..53f29429e 100644 --- a/op-node/service.go +++ b/op-node/service.go @@ -116,7 +116,7 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) { Plasma: plasma.ReadCLIConfig(ctx), - CatchUp: ctx.Bool(flags.StartupCatchUpFlag.Name), + StartupDeferGossip: ctx.Bool(flags.StartupDeferGossipFlag.Name), } if err := cfg.LoadPersisted(log); err != nil { From 247432cc83b85cfccfaf66f9ef344a6aa14cd6a4 Mon Sep 17 00:00:00 2001 From: sysvm <112189277+sysvm@users.noreply.github.com> Date: Wed, 20 May 2026 12:27:16 +0800 Subject: [PATCH 6/6] fix: set startup.defer-gossip default value to true --- op-node/flags/flags.go | 10 ++++++---- 
op-node/node/config.go | 13 ++++++++----- op-node/node/node.go | 29 +++++++++++++++++++---------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go index f8b414c5d..a6d6841b2 100644 --- a/op-node/flags/flags.go +++ b/op-node/flags/flags.go @@ -362,11 +362,13 @@ var ( } StartupDeferGossipFlag = &cli.BoolFlag{ Name: "startup.defer-gossip", - Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to" + - "the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when an RPC node" + - "restarts with a large unsafe-head gap. Recommended for RPC and bridge nodes; not needed for sequencer and P2P nodes.", + Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to " + + "the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when a node " + + "restarts with a large unsafe-head gap. Default enabled for all node types (rpc / bridge / sequencer / p2p); " + + "the catch-up loop returns quickly when no gap exists, so the cost is negligible for nodes that don't need it. " + + "Set to false to opt out and restore the pre-fix startup behavior.", EnvVars: prefixEnvVars("STARTUP_DEFER_GOSSIP"), - Value: false, + Value: true, Category: RollupCategory, } /* Deprecated Flags */ diff --git a/op-node/node/config.go b/op-node/node/config.go index 06d5c25c1..b949666ad 100644 --- a/op-node/node/config.go +++ b/op-node/node/config.go @@ -78,11 +78,14 @@ type Config struct { // Plasma DA config Plasma plasma.CLIConfig - // StartupDeferGossip toggles the optional pre-gossip catch-up phase at startup. - // When true, op-node defers enabling gossip until op-geth's unsafe head has caught up - // to the live tip via L1 derivation, preventing the driver/alt-sync activity loop - // that otherwise occurs after restarts with a large unsafe-head gap. 
- // Recommended for RPC and bridge nodes; sequencer and P2P nodes should leave it disabled. + // StartupDeferGossip toggles the pre-gossip catch-up phase at startup. + // When true (the --startup.defer-gossip CLI default; the Go zero value is false), op-node defers enabling gossip until op-geth's unsafe head + // has caught up to the live tip via L1 derivation, preventing the driver/alt-sync + // activity loop that otherwise occurs after restarts with a large unsafe-head gap. + // The CLI flag enables it by default for all node types (rpc / bridge / sequencer / p2p); the catch-up + // loop returns immediately when op-geth is already at the live tip, so the cost for + // nodes that don't need it (e.g. a healthy sequencer) is negligible. Set to false only + // to deliberately restore the pre-fix startup behavior. StartupDeferGossip bool } diff --git a/op-node/node/node.go b/op-node/node/node.go index c8b941d43..2254fdc8e 100644 --- a/op-node/node/node.go +++ b/op-node/node/node.go @@ -141,8 +141,12 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge deferGossipEnabled: cfg.StartupDeferGossip, } - // If defer gossip is disabled, gossip should be processed immediately as before. - // Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path. + // Opt-out path: if the startup defer-gossip phase is disabled (--startup.defer-gossip=false, + // or a Config built in code that leaves StartupDeferGossip at its zero value), flip the gate to "ready" immediately so that + // OnUnsafeL2Payload processes gossip without delay, matching the pre-fix code path exactly. + // In the deferral path (deferGossipEnabled=true), gossipReady stays at its zero value + // (false) here and is flipped to true at the end of Start() after waitForOpGethCatchUp + // completes (or times out).
 if !n.deferGossipEnabled { n.gossipReady.Store(true) } @@ -602,8 +606,6 @@ func (n *OpNode) Start(ctx context.Context) error { n.gossipReady.Store(true) n.log.Info("Gossip enabled; op-node fully active") } - // If defer gossip is disabled, gossipReady was already set to true in New(), - // so OnUnsafeL2Payload behaves identically to the pre-fix code path. log.Info("Rollup node started") return nil @@ -664,7 +666,19 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa } func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error { - // If defer gossip is enabled, drop gossip payloads received during the startup catch-up phase. + // Ignore payloads that originated from ourselves. + // Note: this check intentionally runs BEFORE the gossipReady gate so that self-published + // payloads (sequencer publishing its own blocks via gossipsub, which loops back to the + // local subscriber) do not consume the firstPayloadAllowed slot. Without this ordering, + // a sequencer running in ELSync mode could waste its only "free" payload on a self-loop, + // leaving syncStatus stuck at WillStartEL. + if n.p2pNode != nil && from == n.p2pNode.Host().ID() { + return nil + } + + // Drop external gossip payloads received during the startup catch-up phase, except + // for the very first payload, which is needed to trigger the ELSync syncStatus + // transition inside InsertUnsafePayload (see firstPayloadAllowed comment on OpNode).
if !n.gossipReady.Load() { if !n.firstPayloadAllowed.Swap(true) { n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state", @@ -677,11 +691,6 @@ func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope * } } - // ignore if it's from ourselves - if n.p2pNode != nil && from == n.p2pNode.Host().ID() { - return nil - } - n.tracer.OnUnsafeL2Payload(ctx, from, envelope) n.log.Info("Received signed execution payload from p2p", "id", envelope.ExecutionPayload.ID(), "peer", from)