From 10dbefda99c9bedde8b6888c5f332dadd92dc7eb Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Wed, 29 Apr 2026 16:22:47 +0800
Subject: [PATCH 1/6] fix: use buffered channel

---
 op-node/p2p/sync.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go
index 58569f326..2a33dfb99 100644
--- a/op-node/p2p/sync.go
+++ b/op-node/p2p/sync.go
@@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc
 		payloadByNumber:     PayloadByNumberProtocolID(cfg.L2ChainID),
 		peers:               make(map[peer.ID]context.CancelFunc),
 		quarantineByNum:     make(map[uint64]common.Hash),
-		rangeRequests:       make(chan rangeRequest), // blocking
+		rangeRequests:       make(chan rangeRequest, 16), // blocking
 		activeRangeRequests: newRequestIdMap(),
 		peerRequests:        make(chan peerRequest, 128),
 		results:             make(chan syncResult, 128),

From 4ab78be8a271417e7ff62009d2e2cd402bb339cb Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Fri, 1 May 2026 11:35:47 +0800
Subject: [PATCH 2/6] feat: add --catch-up flag to defer gossip on RPC startup

---
 op-node/flags/flags.go |  11 +++
 op-node/node/config.go |   7 ++
 op-node/node/node.go   | 166 +++++++++++++++++++++++++++++++++++++++--
 op-node/p2p/sync.go    |   2 +-
 op-node/service.go     |   2 +
 5 files changed, 182 insertions(+), 6 deletions(-)

diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go
index 0dbd9c4d2..f1c9fed7e 100644
--- a/op-node/flags/flags.go
+++ b/op-node/flags/flags.go
@@ -360,6 +360,16 @@ var (
 		Value:   86400,
 		EnvVars: prefixEnvVars("EL_TRIGGER_GAP"),
 	}
+	CatchUpFlag = &cli.BoolFlag{
+		Name: "catch-up",
+		Usage: "When enabled, op-node defers gossip subscription on startup until op-geth's unsafe head " +
+			"has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " +
+			"that occurs when an RPC node restarts with a large unsafe-head gap. " +
+			"Recommended for RPC and verifier nodes; not needed for sequencer nodes.",
+		EnvVars:  prefixEnvVars("CATCH_UP"),
+		Value:    false,
+		Category: RollupCategory,
+	}
 	/* Deprecated Flags */
 	L2EngineSyncEnabled = &cli.BoolFlag{
 		Name:    "l2.engine-sync",
@@ -440,6 +450,7 @@ var optionalFlags = []cli.Flag{
 	SyncModeFlag,
 	FastnodeMode,
 	ELTriggerGap,
+	CatchUpFlag,
 	RPCListenAddr,
 	RPCListenPort,
 	L1TrustRPC,
diff --git a/op-node/node/config.go b/op-node/node/config.go
index 4f67d0f78..0b22e653a 100644
--- a/op-node/node/config.go
+++ b/op-node/node/config.go
@@ -77,6 +77,13 @@ type Config struct {
 
 	// Plasma DA config
 	Plasma plasma.CLIConfig
+
+	// CatchUp toggles the optional pre-gossip catch-up phase at startup.
+	// When true, op-node defers enabling gossip until op-geth's unsafe head has caught up
+	// to the live tip via L1 derivation, preventing the driver/alt-sync activity loop
+	// that otherwise occurs after restarts with a large unsafe-head gap.
+	// Recommended for RPC and verifier nodes; sequencer nodes should leave it disabled.
+	CatchUp bool
 }
 
 type RPCConfig struct {
diff --git a/op-node/node/node.go b/op-node/node/node.go
index 822aa7706..3be2e305c 100644
--- a/op-node/node/node.go
+++ b/op-node/node/node.go
@@ -79,11 +79,51 @@ type OpNode struct {
 
 	closed atomic.Bool
 
+	// catchUpEnabled mirrors cfg.CatchUp; controls whether Start() runs the pre-gossip catch-up phase.
+	catchUpEnabled bool
+
+	// gossipReady gates incoming gossip payloads during the startup catch-up phase.
+	// While false, gossip payloads received via OnUnsafeL2Payload are silently dropped
+	// to prevent the clSync queue from accumulating orphan payloads (parent != op-geth.UnsafeL2Head)
+	// while op-geth's unsafe head is still being advanced via L1 derivation.
+	// Set to true once catch-up completes (or is disabled / times out).
+	gossipReady atomic.Bool
+
+	// firstPayloadAllowed lets exactly one gossip payload pass through OnUnsafeL2Payload
+	// while gossipReady is still false. This is required when running in ELSync mode:
+	// the engineController initial state is syncStatusWillStartEL, which causes
+	// IsEngineSyncing() to return true and prevents the driver eventLoop from running
+	// derivation (the stepReqCh handler short-circuits with `continue`). Until at least
+	// one payload reaches Driver.OnUnsafeL2Payload -> InsertUnsafePayload, the engine's
+	// "Skipping EL sync" finalized-block check never fires and syncStatus is stuck.
+	// Allowing exactly one payload through unblocks that transition and lets derivation
+	// drive op-geth forward during the catch-up phase. After this single payload,
+	// subsequent payloads continue to be dropped until gossipReady is flipped to true.
+	firstPayloadAllowed atomic.Bool
+
 	// cancels execution prematurely, e.g. to halt. This may be nil.
 	cancel context.CancelCauseFunc
 	halted atomic.Bool
 }
 
+// Startup catch-up parameters. Hardcoded; tweak here if needed.
+const (
+	// catchUpLagThreshold is how close op-geth's unsafe head timestamp must be
+	// to the current wall-clock time before gossip is enabled.
+	// On opBNB (~500ms blocks) 30s ≈ 60 blocks of remaining gap, well below the
+	// threshold that triggers the activity loop in tested scenarios.
+	catchUpLagThreshold = 30 * time.Second
+
+	// catchUpMaxWait is the absolute maximum time we are willing to defer gossip.
+	// If catch-up does not complete within this window (e.g. L1 derivation is unhealthy),
+	// gossip is enabled regardless and the system degrades to the pre-fix behavior
+	// rather than blocking forever.
+	catchUpMaxWait = 10 * time.Minute
+
+	// catchUpPollInterval is how often we re-check op-geth's unsafe head during catch-up.
+	catchUpPollInterval = 5 * time.Second
+)
+
 // The OpNode handles incoming gossip
 var _ p2p.GossipIn = (*OpNode)(nil)
 
@@ -96,11 +136,17 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge
 	}
 
 	n := &OpNode{
-		log:        log,
-		appVersion: appVersion,
-		metrics:    m,
-		rollupHalt: cfg.RollupHalt,
-		cancel:     cfg.Cancel,
+		log:            log,
+		appVersion:     appVersion,
+		metrics:        m,
+		rollupHalt:     cfg.RollupHalt,
+		cancel:         cfg.Cancel,
+		catchUpEnabled: cfg.CatchUp,
+	}
+	// If catch-up is disabled, gossip should be processed immediately as before.
+	// Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path.
+	if !n.catchUpEnabled {
+		n.gossipReady.Store(true)
 	}
 	// not a context leak, gossipsub is closed with a context.
 	n.resourcesCtx, n.resourcesClose = context.WithCancel(context.Background())
@@ -117,6 +163,72 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge
 	return n, nil
 }
 
+// waitForOpGethCatchUp blocks until op-geth's unsafe head timestamp is within
+// catchUpLagThreshold of the current time, or until catchUpMaxWait elapses.
+//
+// Background:
+// On RPC node restart, op-geth's unsafe head is frozen at the pre-restart height for
+// the duration of the pod outage. When op-node comes back up and immediately subscribes
+// to gossip, incoming gossip payloads have a parent that does not match op-geth's
+// stale unsafe head; the clSync queue accumulates orphan payloads. The driver's
+// checkForGapInUnsafeQueue then triggers alt-sync via an unbuffered rangeRequests
+// channel, while alt-sync's mainLoop -- when promoting results back via receivePayload
+// -- itself blocks on the driver's unsafeL2Payloads channel (buf=10). The two goroutines
+// form a livelock that only releases through ctx timeouts, leaving the unsafe head
+// stalled for some time.
+//
+// This function defers gossip subscription (via the gossipReady gate) until the L1
+// derivation pipeline has advanced op-geth's unsafe head close enough to the live tip
+// that no significant gap exists when gossip is finally enabled, eliminating the
+// activity loop's preconditions at the source.
+//
+// Returns nil on successful catch-up; returns an error on context cancellation or timeout.
+// In case of timeout, the caller should still enable gossip and degrade gracefully.
+func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error {
+	n.log.Info("starting op-geth catch-up phase before enabling gossip",
+		"lag_threshold", catchUpLagThreshold,
+		"max_wait", catchUpMaxWait,
+	)
+
+	deadline := time.Now().Add(catchUpMaxWait)
+	ticker := time.NewTicker(catchUpPollInterval)
+	defer ticker.Stop()
+
+	for {
+		// Query op-geth's current unsafe head.
+		queryCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
+		unsafeHead, err := n.l2Source.L2BlockRefByLabel(queryCtx, eth.Unsafe)
+		cancel()
+
+		if err != nil {
+			n.log.Warn("Failed to query op-geth unsafe head during catch-up, will retry", "error", err)
+		} else {
+			headTime := time.Unix(int64(unsafeHead.Time), 0)
+			lag := time.Since(headTime)
+
+			// Treat negative lag (clock skew or future-timestamp head) as caught up.
+			if lag < catchUpLagThreshold {
+				n.log.Info("op-geth caught up; enabling gossip", "unsafe_head", unsafeHead.Number, "lag", lag)
+				return nil
+			}
+
+			n.log.Info("op-geth still catching up via L1 derivation", "unsafe_head", unsafeHead.Number,
+				"lag", lag, "deadline_in", time.Until(deadline))
+		}
+
+		if time.Now().After(deadline) {
+			return fmt.Errorf("startup catch-up timeout after %v", catchUpMaxWait)
+		}
+
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-ticker.C:
+			continue
+		}
+	}
+}
+
 func (n *OpNode) init(ctx context.Context, cfg *Config, snapshotLog log.Logger) error {
 	n.log.Info("Initializing rollup node", "version", n.appVersion)
 	if err := n.initTracer(ctx, cfg); err != nil {
@@ -484,6 +596,24 @@ func (n *OpNode) Start(ctx context.Context) error {
 		n.log.Error("Could not start a rollup node", "err", err)
 		return err
 	}
+
+	// Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip
+	// via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when
+	// gossip floods in with payloads whose parent does not match op-geth's stale unsafe head.
+	// Disabled by default; enable via --catch-up for RPC / verifier nodes.
+	// See waitForOpGethCatchUp for full background.
+	if n.catchUpEnabled {
+		if err := n.waitForOpGethCatchUp(ctx); err != nil {
+			// Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway
+			// to avoid blocking the node forever; the system degrades to the pre-fix behavior.
+			n.log.Warn("startup catch-up did not complete cleanly; enabling gossip anyway", "err", err)
+		}
+		n.gossipReady.Store(true)
+		n.log.Info("gossip enabled; op-node fully active")
+	}
+	// If catch-up is disabled, gossipReady was already set to true in New(),
+	// so OnUnsafeL2Payload behaves identically to the pre-fix code path.
+
 	log.Info("Rollup node started")
 	return nil
 }
@@ -543,6 +673,32 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa
 }
 
 func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error {
+	// Drop gossip payloads received during the startup catch-up phase.
+	// While op-geth's unsafe head is still catching up via L1 derivation, accepting
+	// real-time gossip payloads would fill the clSync queue with orphan payloads
+	// (parent != op-geth.UnsafeL2Head) and trigger the driver/alt-sync livelock.
+	// Gossip is re-enabled once waitForOpGethCatchUp completes.
+	// Any payloads dropped here are recovered by gossipsub mesh re-broadcasts and
+	// alt-sync backfill once gossipReady is set.
+	//
+	// Exception: the very first payload is always let through, regardless of catch-up
+	// state. In ELSync mode this is required to trigger the WillStartEL → FinishedEL
+	// transition inside InsertUnsafePayload (the "Skipping EL sync ..." finalized-block
+	// check). Without this, IsEngineSyncing() stays true and derivation is blocked
+	// from running during catch-up, defeating the purpose of the wait. In CLSync mode
+	// this single payload simply enters the clSync queue and is harmless.
+	if !n.gossipReady.Load() {
+		if !n.firstPayloadAllowed.Swap(true) {
+			n.log.Info("allowing first gossip payload through during catch-up to unblock engine sync state",
+				"id", envelope.ExecutionPayload.ID(), "peer", from)
+			// fall through to the regular processing path below
+		} else {
+			n.log.Debug("dropping gossip payload during startup catch-up phase",
+				"id", envelope.ExecutionPayload.ID(), "peer", from)
+			return nil
+		}
+	}
+
 	// ignore if it's from ourselves
 	if n.p2pNode != nil && from == n.p2pNode.Host().ID() {
 		return nil
diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go
index 2a33dfb99..58569f326 100644
--- a/op-node/p2p/sync.go
+++ b/op-node/p2p/sync.go
@@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc
 		payloadByNumber:     PayloadByNumberProtocolID(cfg.L2ChainID),
 		peers:               make(map[peer.ID]context.CancelFunc),
 		quarantineByNum:     make(map[uint64]common.Hash),
-		rangeRequests:       make(chan rangeRequest, 16), // blocking
+		rangeRequests:       make(chan rangeRequest), // blocking
 		activeRangeRequests: newRequestIdMap(),
 		peerRequests:        make(chan peerRequest, 128),
 		results:             make(chan syncResult, 128),
diff --git a/op-node/service.go b/op-node/service.go
index e162834c5..5c6d709bf 100644
--- a/op-node/service.go
+++ b/op-node/service.go
@@ -115,6 +115,8 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) {
 		ConductorRpcTimeout: ctx.Duration(flags.ConductorRpcTimeoutFlag.Name),
 
 		Plasma: plasma.ReadCLIConfig(ctx),
+
+		CatchUp: ctx.Bool(flags.CatchUpFlag.Name),
 	}
 
 	if err := cfg.LoadPersisted(log); err != nil {

From 63ca20efd15728673556b22ccbb8e0edc926da12 Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Tue, 19 May 2026 16:59:29 +0800
Subject: [PATCH 3/6] fix: rename --catch-up flag to --startup.catch-up

---
 op-node/flags/flags.go | 13 ++++++-------
 op-node/node/node.go   |  2 +-
 op-node/p2p/sync.go    |  2 +-
 op-node/service.go     |  2 +-
 4 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go
index f1c9fed7e..9318f3618 100644
--- a/op-node/flags/flags.go
+++ b/op-node/flags/flags.go
@@ -360,13 +360,12 @@ var (
 		Value:   86400,
 		EnvVars: prefixEnvVars("EL_TRIGGER_GAP"),
 	}
-	CatchUpFlag = &cli.BoolFlag{
-		Name: "catch-up",
-		Usage: "When enabled, op-node defers gossip subscription on startup until op-geth's unsafe head " +
+	StartupCatchUpFlag = &cli.BoolFlag{
+		Name: "startup.catch-up",
+		Usage: "When enabled, op-node defers gossip subscription during startup until op-geth's unsafe head " +
 			"has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " +
-			"that occurs when an RPC node restarts with a large unsafe-head gap. " +
-			"Recommended for RPC and verifier nodes; not needed for sequencer nodes.",
-		EnvVars:  prefixEnvVars("CATCH_UP"),
+			"that occurs when an RPC node restarts with a large unsafe-head gap. Recommended for RPC nodes; not needed for sequencer nodes.",
+		EnvVars:  prefixEnvVars("STARTUP_CATCH_UP"),
 		Value:    false,
 		Category: RollupCategory,
 	}
@@ -450,7 +449,7 @@ var optionalFlags = []cli.Flag{
 	SyncModeFlag,
 	FastnodeMode,
 	ELTriggerGap,
-	CatchUpFlag,
+	StartupCatchUpFlag,
 	RPCListenAddr,
 	RPCListenPort,
 	L1TrustRPC,
diff --git a/op-node/node/node.go b/op-node/node/node.go
index 3be2e305c..65731d9f8 100644
--- a/op-node/node/node.go
+++ b/op-node/node/node.go
@@ -600,7 +600,7 @@ func (n *OpNode) Start(ctx context.Context) error {
 	// Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip
 	// via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when
 	// gossip floods in with payloads whose parent does not match op-geth's stale unsafe head.
-	// Disabled by default; enable via --catch-up for RPC / verifier nodes.
+	// Disabled by default; enable via --startup.catch-up for RPC nodes.
 	// See waitForOpGethCatchUp for full background.
 	if n.catchUpEnabled {
 		if err := n.waitForOpGethCatchUp(ctx); err != nil {
diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go
index 58569f326..2a33dfb99 100644
--- a/op-node/p2p/sync.go
+++ b/op-node/p2p/sync.go
@@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc
 		payloadByNumber:     PayloadByNumberProtocolID(cfg.L2ChainID),
 		peers:               make(map[peer.ID]context.CancelFunc),
 		quarantineByNum:     make(map[uint64]common.Hash),
-		rangeRequests:       make(chan rangeRequest), // blocking
+		rangeRequests:       make(chan rangeRequest, 16), // blocking
 		activeRangeRequests: newRequestIdMap(),
 		peerRequests:        make(chan peerRequest, 128),
 		results:             make(chan syncResult, 128),
diff --git a/op-node/service.go b/op-node/service.go
index 5c6d709bf..abe350df7 100644
--- a/op-node/service.go
+++ b/op-node/service.go
@@ -116,7 +116,7 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) {
 
 		Plasma: plasma.ReadCLIConfig(ctx),
 
-		CatchUp: ctx.Bool(flags.CatchUpFlag.Name),
+		CatchUp: ctx.Bool(flags.StartupCatchUpFlag.Name),
 	}
 
 	if err := cfg.LoadPersisted(log); err != nil {

From 430a91f504a2774e24c3a7f7eb6a18c7339bebe5 Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Wed, 20 May 2026 10:03:33 +0800
Subject: [PATCH 4/6] fix: set catchupLagThreshold to 10s

---
 op-node/node/node.go | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/op-node/node/node.go b/op-node/node/node.go
index 65731d9f8..a9c3f0d56 100644
--- a/op-node/node/node.go
+++ b/op-node/node/node.go
@@ -108,20 +108,20 @@ type OpNode struct {
 
 // Startup catch-up parameters. Hardcoded; tweak here if needed.
 const (
-	// catchUpLagThreshold is how close op-geth's unsafe head timestamp must be
+	// catchupLagThreshold is how close op-geth's unsafe head timestamp must be
 	// to the current wall-clock time before gossip is enabled.
-	// On opBNB (~500ms blocks) 30s ≈ 60 blocks of remaining gap, well below the
+	// On opBNB (~250ms blocks) 10s ≈ 40 blocks of remaining gap, well below the
 	// threshold that triggers the activity loop in tested scenarios.
-	catchUpLagThreshold = 30 * time.Second
+	catchupLagThreshold = 10 * time.Second
 
-	// catchUpMaxWait is the absolute maximum time we are willing to defer gossip.
+	// catchupMaxWait is the absolute maximum time we are willing to defer gossip.
 	// If catch-up does not complete within this window (e.g. L1 derivation is unhealthy),
 	// gossip is enabled regardless and the system degrades to the pre-fix behavior
 	// rather than blocking forever.
-	catchUpMaxWait = 10 * time.Minute
+	catchupMaxWait = 10 * time.Minute
 
-	// catchUpPollInterval is how often we re-check op-geth's unsafe head during catch-up.
-	catchUpPollInterval = 5 * time.Second
+	// catchupPollInterval is how often we re-check op-geth's unsafe head during catch-up.
+	catchupPollInterval = 5 * time.Second
 )
 
 // The OpNode handles incoming gossip
@@ -185,13 +185,11 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge
 // Returns nil on successful catch-up; returns an error on context cancellation or timeout.
 // In case of timeout, the caller should still enable gossip and degrade gracefully.
 func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error {
-	n.log.Info("starting op-geth catch-up phase before enabling gossip",
-		"lag_threshold", catchUpLagThreshold,
-		"max_wait", catchUpMaxWait,
-	)
+	n.log.Info("Starting op-geth catch-up phase before enabling gossip", "lag_threshold", catchupLagThreshold,
+		"max_wait", catchupMaxWait)
 
-	deadline := time.Now().Add(catchUpMaxWait)
-	ticker := time.NewTicker(catchUpPollInterval)
+	deadline := time.Now().Add(catchupMaxWait)
+	ticker := time.NewTicker(catchupPollInterval)
 	defer ticker.Stop()
 
 	for {
@@ -207,7 +205,7 @@ func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error {
 			lag := time.Since(headTime)
 
 			// Treat negative lag (clock skew or future-timestamp head) as caught up.
-			if lag < catchUpLagThreshold {
+			if lag < catchupLagThreshold {
 				n.log.Info("op-geth caught up; enabling gossip", "unsafe_head", unsafeHead.Number, "lag", lag)
 				return nil
 			}
@@ -217,7 +215,7 @@ func (n *OpNode) waitForOpGethCatchUp(ctx context.Context) error {
 		}
 
 		if time.Now().After(deadline) {
-			return fmt.Errorf("startup catch-up timeout after %v", catchUpMaxWait)
+			return fmt.Errorf("startup catch-up timeout after %v", catchupMaxWait)
 		}
 
 		select {
@@ -606,10 +604,10 @@ func (n *OpNode) Start(ctx context.Context) error {
 		if err := n.waitForOpGethCatchUp(ctx); err != nil {
 			// Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway
 			// to avoid blocking the node forever; the system degrades to the pre-fix behavior.
-			n.log.Warn("startup catch-up did not complete cleanly; enabling gossip anyway", "err", err)
+			n.log.Warn("Startup catch-up did not complete cleanly; enabling gossip anyway", "err", err)
 		}
 		n.gossipReady.Store(true)
-		n.log.Info("gossip enabled; op-node fully active")
+		n.log.Info("Gossip enabled; op-node fully active")
 	}
 	// If catch-up is disabled, gossipReady was already set to true in New(),
 	// so OnUnsafeL2Payload behaves identically to the pre-fix code path.
@@ -689,12 +687,12 @@ func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *
 	// this single payload simply enters the clSync queue and is harmless.
 	if !n.gossipReady.Load() {
 		if !n.firstPayloadAllowed.Swap(true) {
-			n.log.Info("allowing first gossip payload through during catch-up to unblock engine sync state",
-				"id", envelope.ExecutionPayload.ID(), "peer", from)
+			n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state",
+				"peer", from, "id", envelope.ExecutionPayload.ID())
 			// fall through to the regular processing path below
 		} else {
-			n.log.Debug("dropping gossip payload during startup catch-up phase",
-				"id", envelope.ExecutionPayload.ID(), "peer", from)
+			n.log.Debug("Dropping gossip payload during startup catch-up phase", "peer", from,
+				"id", envelope.ExecutionPayload.ID())
 			return nil
 		}
 	}

From 1383bad7e5692fd0a8e71778ac11c24bc7b11813 Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Wed, 20 May 2026 10:57:54 +0800
Subject: [PATCH 5/6] fix: rename flag to --startup.defer-gossip, trim comments, restore 30s lag threshold

---
 op-node/flags/flags.go | 14 ++++++------
 op-node/node/config.go |  6 ++---
 op-node/node/node.go   | 52 +++++++++++++-----------------------------
 op-node/p2p/sync.go    |  2 +-
 op-node/service.go     |  2 +-
 5 files changed, 28 insertions(+), 48 deletions(-)

diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go
index 9318f3618..f8b414c5d 100644
--- a/op-node/flags/flags.go
+++ b/op-node/flags/flags.go
@@ -360,12 +360,12 @@ var (
 		Value:   86400,
 		EnvVars: prefixEnvVars("EL_TRIGGER_GAP"),
 	}
-	StartupCatchUpFlag = &cli.BoolFlag{
-		Name: "startup.catch-up",
-		Usage: "When enabled, op-node defers gossip subscription during startup until op-geth's unsafe head " +
-			"has caught up to the live tip via L1 derivation. This avoids the driver/alt-sync activity loop " +
-			"that occurs when an RPC node restarts with a large unsafe-head gap. Recommended for RPC nodes; not needed for sequencer nodes.",
-		EnvVars:  prefixEnvVars("STARTUP_CATCH_UP"),
+	StartupDeferGossipFlag = &cli.BoolFlag{
+		Name: "startup.defer-gossip",
+		Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to" +
+			"the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when an RPC node" +
+			"restarts with a large unsafe-head gap. Recommended for RPC and bridge nodes; not needed for sequencer and P2P nodes.",
+		EnvVars:  prefixEnvVars("STARTUP_DEFER_GOSSIP"),
 		Value:    false,
 		Category: RollupCategory,
 	}
@@ -449,7 +449,7 @@ var optionalFlags = []cli.Flag{
 	SyncModeFlag,
 	FastnodeMode,
 	ELTriggerGap,
-	StartupCatchUpFlag,
+	StartupDeferGossipFlag,
 	RPCListenAddr,
 	RPCListenPort,
 	L1TrustRPC,
diff --git a/op-node/node/config.go b/op-node/node/config.go
index 0b22e653a..06d5c25c1 100644
--- a/op-node/node/config.go
+++ b/op-node/node/config.go
@@ -78,12 +78,12 @@ type Config struct {
 	// Plasma DA config
 	Plasma plasma.CLIConfig
 
-	// CatchUp toggles the optional pre-gossip catch-up phase at startup.
+	// StartupDeferGossip toggles the optional pre-gossip catch-up phase at startup.
 	// When true, op-node defers enabling gossip until op-geth's unsafe head has caught up
 	// to the live tip via L1 derivation, preventing the driver/alt-sync activity loop
 	// that otherwise occurs after restarts with a large unsafe-head gap.
-	// Recommended for RPC and verifier nodes; sequencer nodes should leave it disabled.
-	CatchUp bool
+	// Recommended for RPC and bridge nodes; sequencer and P2P nodes should leave it disabled.
+	StartupDeferGossip bool
 }
 
 type RPCConfig struct {
diff --git a/op-node/node/node.go b/op-node/node/node.go
index a9c3f0d56..c8b941d43 100644
--- a/op-node/node/node.go
+++ b/op-node/node/node.go
@@ -79,8 +79,8 @@ type OpNode struct {
 
 	closed atomic.Bool
 
-	// catchUpEnabled mirrors cfg.CatchUp; controls whether Start() runs the pre-gossip catch-up phase.
-	catchUpEnabled bool
+	// deferGossipEnabled mirrors cfg.StartupDeferGossip; controls whether Start() runs the pre-gossip catch-up phase.
+	deferGossipEnabled bool
 
 	// gossipReady gates incoming gossip payloads during the startup catch-up phase.
 	// While false, gossip payloads received via OnUnsafeL2Payload are silently dropped
@@ -106,13 +106,10 @@ type OpNode struct {
 	halted atomic.Bool
 }
 
-// Startup catch-up parameters. Hardcoded; tweak here if needed.
 const (
 	// catchupLagThreshold is how close op-geth's unsafe head timestamp must be
 	// to the current wall-clock time before gossip is enabled.
-	// On opBNB (~250ms blocks) 10s ≈ 40 blocks of remaining gap, well below the
-	// threshold that triggers the activity loop in tested scenarios.
-	catchupLagThreshold = 10 * time.Second
+	catchupLagThreshold = 30 * time.Second
 
 	// catchupMaxWait is the absolute maximum time we are willing to defer gossip.
 	// If catch-up does not complete within this window (e.g. L1 derivation is unhealthy),
@@ -136,16 +133,17 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge
 	}
 
 	n := &OpNode{
-		log:            log,
-		appVersion:     appVersion,
-		metrics:        m,
-		rollupHalt:     cfg.RollupHalt,
-		cancel:         cfg.Cancel,
-		catchUpEnabled: cfg.CatchUp,
-	}
-	// If catch-up is disabled, gossip should be processed immediately as before.
+		log:                log,
+		appVersion:         appVersion,
+		metrics:            m,
+		rollupHalt:         cfg.RollupHalt,
+		cancel:             cfg.Cancel,
+		deferGossipEnabled: cfg.StartupDeferGossip,
+	}
+
+	// If defer gossip is disabled, gossip should be processed immediately as before.
 	// Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path.
-	if !n.catchUpEnabled {
+	if !n.deferGossipEnabled {
 		n.gossipReady.Store(true)
 	}
 	// not a context leak, gossipsub is closed with a context.
@@ -595,12 +593,7 @@ func (n *OpNode) Start(ctx context.Context) error {
 		return err
 	}
 
-	// Optionally defer enabling gossip until op-geth's unsafe head has caught up to the live tip
-	// via the L1 derivation pipeline. This avoids the driver/alt-sync livelock that occurs when
-	// gossip floods in with payloads whose parent does not match op-geth's stale unsafe head.
-	// Disabled by default; enable via --startup.catch-up for RPC nodes.
-	// See waitForOpGethCatchUp for full background.
-	if n.catchUpEnabled {
+	if n.deferGossipEnabled {
 		if err := n.waitForOpGethCatchUp(ctx); err != nil {
 			// Catch-up failed (e.g. timeout, L1 derivation unhealthy). Enable gossip anyway
 			// to avoid blocking the node forever; the system degrades to the pre-fix behavior.
@@ -609,7 +602,7 @@ func (n *OpNode) Start(ctx context.Context) error {
 		n.gossipReady.Store(true)
 		n.log.Info("Gossip enabled; op-node fully active")
 	}
-	// If catch-up is disabled, gossipReady was already set to true in New(),
+	// If defer gossip is disabled, gossipReady was already set to true in New(),
 	// so OnUnsafeL2Payload behaves identically to the pre-fix code path.
 
 	log.Info("Rollup node started")
@@ -671,20 +664,7 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa
 }
 
 func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error {
-	// Drop gossip payloads received during the startup catch-up phase.
-	// While op-geth's unsafe head is still catching up via L1 derivation, accepting
-	// real-time gossip payloads would fill the clSync queue with orphan payloads
-	// (parent != op-geth.UnsafeL2Head) and trigger the driver/alt-sync livelock.
-	// Gossip is re-enabled once waitForOpGethCatchUp completes.
-	// Any payloads dropped here are recovered by gossipsub mesh re-broadcasts and
-	// alt-sync backfill once gossipReady is set.
-	//
-	// Exception: the very first payload is always let through, regardless of catch-up
-	// state. In ELSync mode this is required to trigger the WillStartEL → FinishedEL
-	// transition inside InsertUnsafePayload (the "Skipping EL sync ..." finalized-block
-	// check). Without this, IsEngineSyncing() stays true and derivation is blocked
-	// from running during catch-up, defeating the purpose of the wait. In CLSync mode
-	// this single payload simply enters the clSync queue and is harmless.
+	// If defer gossip is enabled, drop gossip payloads received during the startup catch-up phase.
 	if !n.gossipReady.Load() {
 		if !n.firstPayloadAllowed.Swap(true) {
 			n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state",
diff --git a/op-node/p2p/sync.go b/op-node/p2p/sync.go
index 2a33dfb99..58569f326 100644
--- a/op-node/p2p/sync.go
+++ b/op-node/p2p/sync.go
@@ -290,7 +290,7 @@ func NewSyncClient(log log.Logger, cfg *rollup.Config, newStream newStreamFn, rc
 		payloadByNumber:     PayloadByNumberProtocolID(cfg.L2ChainID),
 		peers:               make(map[peer.ID]context.CancelFunc),
 		quarantineByNum:     make(map[uint64]common.Hash),
-		rangeRequests:       make(chan rangeRequest, 16), // blocking
+		rangeRequests:       make(chan rangeRequest), // blocking
 		activeRangeRequests: newRequestIdMap(),
 		peerRequests:        make(chan peerRequest, 128),
 		results:             make(chan syncResult, 128),
diff --git a/op-node/service.go b/op-node/service.go
index abe350df7..53f29429e 100644
--- a/op-node/service.go
+++ b/op-node/service.go
@@ -116,7 +116,7 @@ func NewConfig(ctx *cli.Context, log log.Logger) (*node.Config, error) {
 
 		Plasma: plasma.ReadCLIConfig(ctx),
 
-		CatchUp: ctx.Bool(flags.StartupCatchUpFlag.Name),
+		StartupDeferGossip: ctx.Bool(flags.StartupDeferGossipFlag.Name),
 	}
 
 	if err := cfg.LoadPersisted(log); err != nil {

From 247432cc83b85cfccfaf66f9ef344a6aa14cd6a4 Mon Sep 17 00:00:00 2001
From: sysvm <112189277+sysvm@users.noreply.github.com>
Date: Wed, 20 May 2026 12:27:16 +0800
Subject: [PATCH 6/6] fix: set startup.defer-gossip default value to true

---
 op-node/flags/flags.go | 10 ++++++----
 op-node/node/config.go | 13 ++++++++-----
 op-node/node/node.go   | 29 +++++++++++++++++++----------
 3 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/op-node/flags/flags.go b/op-node/flags/flags.go
index f8b414c5d..a6d6841b2 100644
--- a/op-node/flags/flags.go
+++ b/op-node/flags/flags.go
@@ -362,11 +362,13 @@ var (
 	}
 	StartupDeferGossipFlag = &cli.BoolFlag{
 		Name: "startup.defer-gossip",
-		Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to" +
-			"the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when an RPC node" +
-			"restarts with a large unsafe-head gap. Recommended for RPC and bridge nodes; not needed for sequencer and P2P nodes.",
+		Usage: "Defers P2P gossip processing during startup until op-geth's unsafe head has caught up to " +
+			"the live tip via L1 derivation. This avoids the driver/alt-sync activity loop that occurs when a node " +
+			"restarts with a large unsafe-head gap. Default enabled for all node types (rpc / bridge / sequencer / p2p); " +
+			"the catch-up loop returns quickly when no gap exists, so the cost is negligible for nodes that don't need it. " +
+			"Set to false to opt out and restore the pre-fix startup behavior.",
 		EnvVars:  prefixEnvVars("STARTUP_DEFER_GOSSIP"),
-		Value:    false,
+		Value:    true,
 		Category: RollupCategory,
 	}
 	/* Deprecated Flags */
diff --git a/op-node/node/config.go b/op-node/node/config.go
index 06d5c25c1..b949666ad 100644
--- a/op-node/node/config.go
+++ b/op-node/node/config.go
@@ -78,11 +78,14 @@ type Config struct {
 	// Plasma DA config
 	Plasma plasma.CLIConfig
 
-	// StartupDeferGossip toggles the optional pre-gossip catch-up phase at startup.
-	// When true, op-node defers enabling gossip until op-geth's unsafe head has caught up
-	// to the live tip via L1 derivation, preventing the driver/alt-sync activity loop
-	// that otherwise occurs after restarts with a large unsafe-head gap.
-	// Recommended for RPC and bridge nodes; sequencer and P2P nodes should leave it disabled.
+	// StartupDeferGossip toggles the pre-gossip catch-up phase at startup.
+	// When true (the default), op-node defers enabling gossip until op-geth's unsafe head
+	// has caught up to the live tip via L1 derivation, preventing the driver/alt-sync
+	// activity loop that otherwise occurs after restarts with a large unsafe-head gap.
+	// Enabled by default for all node types (rpc / bridge / sequencer / p2p); the catch-up
+	// loop returns immediately when op-geth is already at the live tip, so the cost for
+	// nodes that don't need it (e.g. a healthy sequencer) is negligible. Set to false only
+	// to deliberately restore the pre-fix startup behavior.
 	StartupDeferGossip bool
 }
 
diff --git a/op-node/node/node.go b/op-node/node/node.go
index c8b941d43..2254fdc8e 100644
--- a/op-node/node/node.go
+++ b/op-node/node/node.go
@@ -141,8 +141,12 @@ func New(ctx context.Context, cfg *Config, log log.Logger, snapshotLog log.Logge
 		deferGossipEnabled: cfg.StartupDeferGossip,
 	}
 
-	// If defer gossip is disabled, gossip should be processed immediately as before.
-	// Set the gate to "ready" up front so OnUnsafeL2Payload behaves identically to the pre-fix code path.
+	// Opt-out path: if the operator has explicitly disabled the startup defer-gossip phase
+	// (--startup.defer-gossip=false), flip the gate to "ready" immediately so that
+	// OnUnsafeL2Payload processes gossip without delay, matching the pre-fix code path exactly.
+	// In the default path (deferGossipEnabled=true), gossipReady stays at its zero value
+	// (false) here and is flipped to true at the end of Start() after waitForOpGethCatchUp
+	// completes (or times out).
 	if !n.deferGossipEnabled {
 		n.gossipReady.Store(true)
 	}
@@ -602,8 +606,6 @@ func (n *OpNode) Start(ctx context.Context) error {
 		n.gossipReady.Store(true)
 		n.log.Info("Gossip enabled; op-node fully active")
 	}
-	// If defer gossip is disabled, gossipReady was already set to true in New(),
-	// so OnUnsafeL2Payload behaves identically to the pre-fix code path.
 
 	log.Info("Rollup node started")
 	return nil
@@ -664,7 +666,19 @@ func (n *OpNode) PublishL2Payload(ctx context.Context, envelope *eth.ExecutionPa
 }
 
 func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *eth.ExecutionPayloadEnvelope) error {
-	// If defer gossip is enabled, drop gossip payloads received during the startup catch-up phase.
+	// Ignore payloads that originate from ourselves.
+	// Note: this check intentionally runs BEFORE the gossipReady gate so that self-published
+	// payloads (sequencer publishing its own blocks via gossipsub, which loops back to the
+	// local subscriber) do not consume the firstPayloadAllowed slot. Without this ordering,
+	// a sequencer running in ELSync mode could waste its only "free" payload on a self-loop,
+	// leaving syncStatus stuck at WillStartEL.
+	if n.p2pNode != nil && from == n.p2pNode.Host().ID() {
+		return nil
+	}
+
+	// Drop external gossip payloads received during the startup catch-up phase, except
+	// for the very first payload, which is needed to trigger the ELSync syncStatus
+	// transition inside InsertUnsafePayload (see firstPayloadAllowed comment on OpNode).
 	if !n.gossipReady.Load() {
 		if !n.firstPayloadAllowed.Swap(true) {
 			n.log.Info("Allowing first gossip payload through during catch-up to unblock engine sync state",
@@ -677,11 +691,6 @@ func (n *OpNode) OnUnsafeL2Payload(ctx context.Context, from peer.ID, envelope *
 		}
 	}
 
-	// ignore if it's from ourselves
-	if n.p2pNode != nil && from == n.p2pNode.Host().ID() {
-		return nil
-	}
-
 	n.tracer.OnUnsafeL2Payload(ctx, from, envelope)
 
 	n.log.Info("Received signed execution payload from p2p", "id", envelope.ExecutionPayload.ID(), "peer", from)