diff --git a/src/metrics.rs b/src/metrics.rs index b660619c..72cba99d 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -295,6 +295,41 @@ pub struct BscParliaGethMetrics { pub doublesign: Counter, } +/// Metrics for the BSC block import service (`block_import/service.rs`). +/// +/// Exposes the per-outcome distribution of `engine.new_payload` results so the +/// reorg-race vs. structural-invalid ratio is visible at the Prometheus layer. +/// This is the diagnostic surface that lets us tune the race-shaped error +/// classifier in `service.rs` without grepping logs, and tells us whether a +/// peer-drop incident is correlated with a spike of structural rejects (which +/// trigger the BadBlock penalty path) versus race rejects (which do not). +#[derive(Metrics, Clone)] +#[metrics(scope = "bsc.block_import")] +pub struct BscBlockImportMetrics { + /// `Invalid` outcomes whose error string matched a reorg-race heuristic + /// (`state root`, `parent block`, `unknown parent`, `missing trie`, + /// `receipts root`, `insert_canonical`, `reorg`). These do NOT emit an Err + /// outcome to the network manager and therefore do NOT trigger the + /// `BadBlock` reputation penalty. + pub invalid_race: Counter, + + /// `Invalid` outcomes whose error string did NOT match any race heuristic. + /// These are treated as structural failures and DO emit an Err outcome, + /// which means reth's `NetworkManager::on_block_imported` applies a + /// `BadBlock` reputation change to the source peer. + pub invalid_structural: Counter, + + /// `Valid` outcomes — block accepted by the engine. + pub valid: Counter, + + /// `Syncing` outcomes — engine couldn't validate yet, FCU was forced. + pub syncing: Counter, + + /// Payloads rejected before reaching the engine because the + /// announced-hash and computed-hash diverged. + pub hash_mismatch: Counter, +} + #[cfg(test)] mod tests { use super::*; @@ -310,5 +345,6 @@ mod tests { let _miner_metrics = BscMinerMetrics::default(); let _finality_metrics = BscFinalityMetrics::default(); let _blockchain_metrics = BscBlockchainMetrics::default(); + let _block_import_metrics = BscBlockImportMetrics::default(); } } diff --git a/src/node/network/block_import/service.rs b/src/node/network/block_import/service.rs index 031b2424..e8a2fec2 100644 --- a/src/node/network/block_import/service.rs +++ b/src/node/network/block_import/service.rs @@ -114,6 +114,9 @@ where failed_heads: crate::node::network::block_import::fork_recover::FailedHeadsCooler, /// Periodic timer for head announcement. announce_interval: tokio::time::Interval, + /// Per-outcome counters for `engine.new_payload` results. Cloned into each + /// import future since `Counter` is a cheap handle into the global registry. + metrics: crate::metrics::BscBlockImportMetrics, } fn resolve_bsc_peer_static(announcer: PeerId) -> Option { @@ -174,6 +177,7 @@ where interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); interval }, + metrics: crate::metrics::BscBlockImportMetrics::default(), } } @@ -181,6 +185,7 @@ where fn new_payload(&self, block: BlockMsg, peer_id: PeerId) -> ImportFut { let engine = self.engine.clone(); let forkchoice_engine = self.forkchoice_engine.clone(); + let metrics = self.metrics.clone(); let recovering_heads = self.recovering_heads.clone(); let failed_heads = self.failed_heads.clone(); @@ -189,6 +194,7 @@ where tracing::debug!(target: "bsc::block_import", "New payload: block = ({:?}, {:?}), peer_id = {:?}", block.block.0.block.header.number, block_hash, peer_id); Box::pin(async move { if announced_hash != block_hash { + metrics.hash_mismatch.increment(1); tracing::warn!( target: "bsc::block_import", number = block.block.0.block.header.number, @@ -215,6 +221,7 @@ where match engine.new_payload(payload).await { Ok(payload_status) => match payload_status.status { PayloadStatusEnum::Valid => { + metrics.valid.increment(1); tracing::debug!(target: "bsc::block_import", "New payload is valid, block_hash = {:?}, block_number = {}, peer_id = {:?}", block.hash, header.number, peer_id); // handle fork choice update with valid payload if let Err(e) = forkchoice_engine.update_forkchoice(&header).await { @@ -226,32 +233,81 @@ where .into() } PayloadStatusEnum::Invalid { validation_error } => { - // Do NOT penalize the peer for Invalid blocks. + // Issue #320 / PR #297 context: on BSC's fast block time + // (post-Lorentz down to 0.45s) concurrent reorgs are + // routine. When two peers relay racing siblings, the + // loser returns Invalid from new_payload — but the block + // itself is legitimate, just executed against the wrong + // parent state. Reth's NetworkManager::on_block_imported + // penalizes EVERY Err with BadBlock (-16384 reputation), + // regardless of variant; 4 hits = peer banned (threshold + // -51200), recovery +1/sec = 4.5h to re-earn. Under BSC + // tip-follow, races arrive faster than that recovery, so + // the peer pool drains to zero (#320). // - // In BSC's PoSA with devp2p block propagation, Invalid - // frequently results from timing issues during concurrent - // reorgs — the block itself is legitimate but was executed - // against the wrong state. Penalizing the peer with BadBlock - // (-16384 reputation) for this drains peers rapidly, - // especially under BSC's fast block time (0.45s), where - // concurrent forks are routine. + // PR #297 suppressed the penalty for EVERY Invalid. + // That's too blunt: a truly malformed block from a bad + // peer (wrong Parlia signature, structural hash + // mismatch, etc.) now also escapes the BadBlock + // mechanism. We classify the error string instead — + // race-shaped failures skip the penalty, structural + // failures keep the original behaviour so BadBlock + // still catches real offenders. // - // This aligns with geth's behavior: geth's fetcher only - // drops a peer when header verification fails - // (verifyHeader), never when block execution fails - // (insertChain). Truly malicious peers are still caught by - // the network layer's BadMessage / BadProtocol penalties. - tracing::debug!( - target: "bsc::block_import", - block_hash = %header.hash_slow(), - block_number = header.number, - %validation_error, - peer = %peer_id, - "New payload returned Invalid - not penalizing peer" - ); - None + // Aligns with geth's fetcher semantics: drop on header + // verification failure, not on execution failure. + let msg = validation_error.to_string(); + let lower = msg.to_ascii_lowercase(); + let looks_like_reorg_race = [ + // EVM state root mismatch — classic "executed + // against wrong parent" race. + "state root", + "receipts root", + // Parent resolution failures — the engine didn't + // have the parent tip by the time we executed. + "parent block", + "unknown parent", + "missing trie", + // Canonical insertion races. + "insert_canonical", + "reorg", + ] + .iter() + .any(|needle| lower.contains(needle)); + + if looks_like_reorg_race { + metrics.invalid_race.increment(1); + tracing::debug!( + target: "bsc::block_import", + block_hash = %header.hash_slow(), + block_number = header.number, + peer = %peer_id, + error = %msg, + "Invalid attributed to reorg race; not penalizing peer", + ); + // No Outcome → NetworkManager never sees an Err → + // no BadBlock penalty applied. Early ValidHeader + // announcement from on_new_block still stands. + None + } else { + metrics.invalid_structural.increment(1); + tracing::info!( + target: "bsc::block_import", + block_hash = %header.hash_slow(), + block_number = header.number, + peer = %peer_id, + error = %msg, + "Invalid treated as structural; peer will be penalized", + ); + Outcome { + peer: peer_id, + result: Err(BlockImportError::Other(msg.into())), + } + .into() + } } PayloadStatusEnum::Syncing => { + metrics.syncing.increment(1); // Parent block is missing. Launch fork-aware ancestor // recovery rather than a naive range fetch + premature // FCU. The recovery task also owns the final FCU. @@ -925,10 +981,37 @@ mod tests { #[tokio::test] async fn can_handle_invalid_new_payload() { - // When new_payload returns Invalid, the peer should NOT be penalized. - // The only event emitted is the early ValidHeader announcement from - // on_new_block; no BlockImportOutcome error should follow. + // "test error" doesn't hit any of the race-shaped heuristic needles, + // so it flows through the "structural" branch and must still emit an + // Err outcome — the BadBlock penalty path is preserved for genuine + // structural invalidity (bad signature, wrong turn difficulty, etc.). let mut fixture = TestFixture::new(EngineResponses::invalid_new_payload()).await; + fixture + .assert_block_import(|outcome| { + matches!( + outcome, + BlockImportEvent::Outcome(BlockImportOutcome { + peer: _, + result: Err(BlockImportError::Other(_)) + }) + ) + }) + .await; + } + + #[tokio::test] + async fn reorg_race_invalid_does_not_penalize() { + // Error string contains "state root" — our heuristic classifies this + // as a reorg race, so NO Err outcome should be emitted. Only the + // early ValidHeader announcement (from on_new_block) is observed. + let mut fixture = TestFixture::new(EngineResponses { + new_payload: PayloadStatusEnum::Invalid { + validation_error: "invalid state root (local != remote)".into(), + }, + fcu: PayloadStatusEnum::Syncing, + }) + .await; + fixture .assert_block_import(|outcome| { matches!( @@ -938,29 +1021,25 @@ mod tests { }) .await; - // Verify no error outcome was emitted + // Give the service time to process and confirm no follow-up Err + // outcome is emitted. let waker = futures::task::noop_waker(); let mut cx = Context::from_waker(&waker); - let mut extra = Vec::new(); let deadline = tokio::time::Instant::now() + tokio::time::Duration::from_millis(200); - loop { + let mut extras = Vec::new(); + while tokio::time::Instant::now() < deadline { match fixture.handle.poll_outcome(&mut cx) { - Poll::Ready(Some(event)) => extra.push(event), + Poll::Ready(Some(ev)) => extras.push(ev), Poll::Ready(None) => break, - Poll::Pending => { - if tokio::time::Instant::now() >= deadline { - break; - } - tokio::task::yield_now().await; - } + Poll::Pending => tokio::task::yield_now().await, } } assert!( - !extra.iter().any(|e| matches!( - e, + !extras.iter().any(|ev| matches!( + ev, BlockImportEvent::Outcome(BlockImportOutcome { result: Err(_), .. }) )), - "Should not penalize peer for Invalid new_payload. Extra events: {extra:?}" + "race-shaped Invalid must not emit an Err outcome (would drain peer reputation). extras = {extras:?}", ); }