|
| 1 | +// Copyright (C) Parity Technologies (UK) Ltd. |
| 2 | +// SPDX-License-Identifier: Apache-2.0 |
| 3 | + |
| 4 | +//! Test: Basic Persistence on Graceful Shutdown with Startup Lookback Verification |
| 5 | +//! |
| 6 | +//! This test verifies that collator reputation data is correctly persisted to disk |
| 7 | +//! during normal operation, survives a graceful validator restart, and that the |
| 8 | +//! startup lookback mechanism correctly catches up on reputation bumps from blocks |
| 9 | +//! finalized between persistence and restart. |
| 10 | +//! |
| 11 | +//! ## Test Scenario - Phase 1: Large Gap (>= 20 blocks) |
| 12 | +//! |
| 13 | +//! 1. Spawn a network with 4 validators and 1 parachain (using experimental collator protocol) |
| 14 | +//! 2. Wait for parachain blocks to be produced (establishing reputation through backed candidates) |
| 15 | +//! 3. Wait for periodic persistence (using short interval for testing) |
| 16 | +//! 4. Record the finalized block number at persistence time |
| 17 | +//! 5. **Pause validator-0** (so it misses blocks being finalized) |
| 18 | +//! 6. Wait for 22+ finalized blocks while validator-0 is paused (creating a real gap) |
| 19 | +//! 7. Restart validator-0 (triggers full startup sequence) |
| 20 | +//! 8. Verify validator loads existing reputation from disk on restart |
| 21 | +//! 9. Verify the startup lookback processes blocks it missed while paused |
| 22 | +//! 10. Verify validator continues normal operation |
| 23 | +//! |
| 24 | +//! ## Test Scenario - Phase 2: Small Gap (< 20 blocks) |
| 25 | +//! |
| 26 | +//! 11. Pause validator-0 again |
| 27 | +//! 12. Wait for ~10 finalized blocks (smaller gap) |
| 28 | +//! 13. Restart validator-0 again |
| 29 | +//! 14. Verify the startup lookback processes the entire gap (not limited by MAX_STARTUP_ANCESTRY_LOOKBACK) |
| 30 | +//! 15. Verify processed block count matches the actual gap size |
| 31 | +//! |
| 32 | +//! ## Success Criteria |
| 33 | +//! |
| 34 | +//! - Validator logs show "Loaded existing reputation DB from disk" on both restarts |
| 35 | +//! - First lookback processes at least 20 blocks (large gap) |
//! - Second lookback processes approximately 10 blocks (the entire small gap, within tolerance)
| 37 | +//! - Validator resumes backing candidates after both restarts |
| 38 | +//! - No errors about missing or corrupted data |
| 39 | +
|
| 40 | +use anyhow::anyhow; |
| 41 | +use regex::Regex; |
| 42 | +use tokio::time::Duration; |
| 43 | + |
| 44 | +use cumulus_zombienet_sdk_helpers::assert_para_throughput; |
| 45 | +use polkadot_primitives::Id as ParaId; |
| 46 | +use serde_json::json; |
| 47 | +use zombienet_orchestrator::network::node::LogLineCountOptions; |
| 48 | +use zombienet_sdk::{ |
| 49 | + subxt::{OnlineClient, PolkadotConfig}, |
| 50 | + NetworkConfigBuilder, |
| 51 | +}; |
| 52 | + |
/// Parachain ID registered for the single test parachain.
const PARA_ID: u32 = 2000;
| 54 | + |
| 55 | +#[tokio::test(flavor = "multi_thread")] |
| 56 | +async fn basic_persistence_test() -> Result<(), anyhow::Error> { |
| 57 | + let _ = env_logger::try_init_from_env( |
| 58 | + env_logger::Env::default().filter_or(env_logger::DEFAULT_FILTER_ENV, "info"), |
| 59 | + ); |
| 60 | + |
| 61 | + let images = zombienet_sdk::environment::get_images_from_env(); |
| 62 | + |
| 63 | + let config = NetworkConfigBuilder::new() |
| 64 | + .with_relaychain(|r| { |
| 65 | + let r = r |
| 66 | + .with_chain("rococo-local") |
| 67 | + .with_default_command("polkadot") |
| 68 | + .with_default_image(images.polkadot.as_str()) |
| 69 | + .with_default_args(vec![ |
| 70 | + ("-lparachain=debug,parachain::collator-protocol=trace").into(), |
| 71 | + ("--experimental-collator-protocol").into(), |
| 72 | + ]) |
| 73 | + .with_genesis_overrides(json!({ |
| 74 | + "configuration": { |
| 75 | + "config": { |
| 76 | + "scheduler_params": { |
| 77 | + "group_rotation_frequency": 4, |
| 78 | + "num_cores": 1 |
| 79 | + } |
| 80 | + } |
| 81 | + } |
| 82 | + })) |
| 83 | + .with_node(|node| node.with_name("validator-0")); |
| 84 | + |
| 85 | + (1..4) |
| 86 | + .fold(r, |acc, i| acc.with_node(|node| node.with_name(&format!("validator-{i}")))) |
| 87 | + }) |
| 88 | + .with_parachain(|p| { |
| 89 | + p.with_id(PARA_ID) |
| 90 | + .with_default_command("undying-collator") |
| 91 | + .cumulus_based(false) |
| 92 | + .with_default_image( |
| 93 | + std::env::var("COL_IMAGE") |
| 94 | + .unwrap_or("docker.io/paritypr/colander:latest".to_string()) |
| 95 | + .as_str(), |
| 96 | + ) |
| 97 | + .with_default_args(vec![ |
| 98 | + ("-lparachain=debug").into(), |
| 99 | + ("--experimental-send-approved-peer").into(), |
| 100 | + ]) |
| 101 | + .with_collator(|n| n.with_name("collator")) |
| 102 | + }) |
| 103 | + .build() |
| 104 | + .map_err(|e| { |
| 105 | + let errs = e.into_iter().map(|e| e.to_string()).collect::<Vec<_>>().join(" "); |
| 106 | + anyhow!("config errs: {errs}") |
| 107 | + })?; |
| 108 | + |
| 109 | + let spawn_fn = zombienet_sdk::environment::get_spawn_fn(); |
| 110 | + let network = spawn_fn(config).await?; |
| 111 | + |
| 112 | + let validator_0 = network.get_node("validator-0")?; |
| 113 | + let validator0_client: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?; |
| 114 | + |
| 115 | + // Verify validator-0 shows fresh start initially (no existing data) |
| 116 | + let fresh_start_result = validator_0 |
| 117 | + .wait_log_line_count_with_timeout( |
| 118 | + "Reputation DB initialized fresh", |
| 119 | + false, |
| 120 | + LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false), |
| 121 | + ) |
| 122 | + .await?; |
| 123 | + assert!( |
| 124 | + fresh_start_result.success(), |
| 125 | + "Expected validator to log 'Reputation DB initialized fresh' on initial startup" |
| 126 | + ); |
| 127 | + |
| 128 | + log::info!("Network spawned, waiting for parachain blocks to be produced"); |
| 129 | + assert_para_throughput(&validator0_client, 10, [(ParaId::from(PARA_ID), 8..12)]).await?; |
| 130 | + |
| 131 | + log::info!("Parachain blocks produced, waiting for periodic persistence"); |
| 132 | + let persistence_result = validator_0 |
| 133 | + .wait_log_line_count_with_timeout( |
| 134 | + "Periodic persistence completed: reputation DB written to disk", |
| 135 | + false, |
| 136 | + LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false), |
| 137 | + ) |
| 138 | + .await?; |
| 139 | + assert!(persistence_result.success(), "Periodic persistence should have completed"); |
| 140 | + |
| 141 | + let logs_before_pause = validator_0.logs().await?; |
| 142 | + let persistence_re = Regex::new( |
| 143 | + r"Periodic persistence completed: reputation DB written to disk.*last_finalized=Some\((\d+)\)" |
| 144 | + )?; |
| 145 | + |
| 146 | + let mut block_at_persistence: Option<u32> = None; |
| 147 | + for line in logs_before_pause.lines() { |
| 148 | + if let Some(caps) = persistence_re.captures(line) { |
| 149 | + block_at_persistence = caps.get(1).and_then(|m| m.as_str().parse().ok()); |
| 150 | + } |
| 151 | + } |
| 152 | + |
| 153 | + let block_at_persistence = block_at_persistence |
| 154 | + .ok_or(anyhow!("Could not parse last_finalized from persistence log"))?; |
| 155 | + log::info!("Periodic persistence completed at finalized block {}", block_at_persistence); |
| 156 | + |
| 157 | + log::info!("Pausing validator-0 to create a block gap"); |
| 158 | + validator_0.pause().await?; |
| 159 | + |
| 160 | + let validator_1 = network.get_node("validator-1")?; |
| 161 | + let validator_1_client: OnlineClient<PolkadotConfig> = validator_1.wait_client().await?; |
| 162 | + let mut finalized_blocks_1 = validator_1_client.blocks().subscribe_finalized().await?; |
| 163 | + |
| 164 | + log::info!("Waiting for finalized blocks while validator-0 is paused"); |
| 165 | + let target_gap = 30u32; |
| 166 | + let mut block_at_restart = block_at_persistence; |
| 167 | + while block_at_restart < block_at_persistence + target_gap { |
| 168 | + if let Some(Ok(block)) = finalized_blocks_1.next().await { |
| 169 | + block_at_restart = block.number(); |
| 170 | + log::info!("Finalized block {} (gap: {})", block_at_restart, block_at_restart.saturating_sub(block_at_persistence)); |
| 171 | + } |
| 172 | + } |
| 173 | + log::info!( |
| 174 | + "Gap created while validator-0 was paused: finalized block now at {}, gap of {} blocks", |
| 175 | + block_at_restart, |
| 176 | + block_at_restart.saturating_sub(block_at_persistence) |
| 177 | + ); |
| 178 | + |
| 179 | + log::info!("Restarting validator-0 (full restart to trigger startup lookback)"); |
| 180 | + validator_0.restart(None).await?; |
| 181 | + let _: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?; |
| 182 | + log::info!("Validator-0 restarted, verifying reputation loaded from disk"); |
| 183 | + |
| 184 | + let load_result = validator_0 |
| 185 | + .wait_log_line_count_with_timeout( |
| 186 | + "Loaded existing reputation DB from disk", |
| 187 | + false, |
| 188 | + LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false), |
| 189 | + ) |
| 190 | + .await?; |
| 191 | + assert!( |
| 192 | + load_result.success(), |
| 193 | + "Expected validator to log 'Loaded existing reputation DB from disk' after restart" |
| 194 | + ); |
| 195 | + |
| 196 | + let lookback_completed_result = validator_0 |
| 197 | + .wait_log_line_count_with_timeout( |
| 198 | + "Startup lookback completed", |
| 199 | + false, |
| 200 | + LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(30), false), |
| 201 | + ) |
| 202 | + .await?; |
| 203 | + assert!( |
| 204 | + lookback_completed_result.success(), |
| 205 | + "Expected validator to log 'Startup lookback completed' after restart" |
| 206 | + ); |
| 207 | + |
| 208 | + let logs = validator_0.logs().await?; |
| 209 | + let lookback_completed_re = Regex::new( |
| 210 | + r"Startup lookback completed.*blocks_processed=(\d+)" |
| 211 | + )?; |
| 212 | + |
| 213 | + let mut found_lookback_completed = false; |
| 214 | + let mut blocks_processed: Option<u32> = None; |
| 215 | + |
| 216 | + for line in logs.lines() { |
| 217 | + if let Some(caps) = lookback_completed_re.captures(line) { |
| 218 | + found_lookback_completed = true; |
| 219 | + blocks_processed = caps.get(1).and_then(|m| m.as_str().parse().ok()); |
| 220 | + log::info!( |
| 221 | + "Found startup lookback completed log: blocks_processed={}", |
| 222 | + blocks_processed.unwrap() |
| 223 | + ); |
| 224 | + break; |
| 225 | + } |
| 226 | + } |
| 227 | + |
| 228 | + assert!( |
| 229 | + found_lookback_completed, |
| 230 | + "Expected to find 'Startup lookback completed' log with blocks_processed field" |
| 231 | + ); |
| 232 | + |
| 233 | + let processed = blocks_processed.expect("blocks_processed should be present in log"); |
| 234 | + assert!( |
| 235 | + processed == 20, |
| 236 | + "Expected blocks_processed ({}) == MAX_STARTUP_ANCESTRY_LOOKBACK ({})", |
| 237 | + processed, target_gap |
| 238 | + ); |
| 239 | + log::info!( |
| 240 | + "Lookback verification passed: processed {} blocks (< existing gap {})", |
| 241 | + processed, target_gap |
| 242 | + ); |
| 243 | + |
| 244 | + log::info!("Verifying validator resumes normal operation"); |
| 245 | + |
| 246 | + let relay_client_after: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?; |
| 247 | + assert_para_throughput(&relay_client_after, 5, [(ParaId::from(PARA_ID), 4..6)]).await?; |
| 248 | + |
| 249 | + // === Phase 2: Verify lookback processes entire gap when gap < MAX_STARTUP_ANCESTRY_LOOKBACK === |
| 250 | + log::info!("Phase 2: Testing lookback with smaller gap (< 20 blocks)"); |
| 251 | + |
| 252 | + // Wait for another periodic persistence to get a precise starting point |
| 253 | + log::info!("Waiting for second periodic persistence"); |
| 254 | + let persistence_result_2 = validator_0 |
| 255 | + .wait_log_line_count_with_timeout( |
| 256 | + "Periodic persistence completed: reputation DB written to disk", |
| 257 | + false, |
| 258 | + LogLineCountOptions::new(|n| n >= 2, Duration::from_secs(60), false), |
| 259 | + ) |
| 260 | + .await?; |
| 261 | + assert!(persistence_result_2.success(), "Second periodic persistence should have completed"); |
| 262 | + |
| 263 | + validator_0.pause().await?; |
| 264 | + log::info!("Pausing validator-0 again to create a smaller gap"); |
| 265 | + |
| 266 | + let logs_before_second_pause = validator_0.logs().await?; |
| 267 | + let mut block_before_second_pause: Option<u32> = None; |
| 268 | + |
| 269 | + for line in logs_before_second_pause.lines().rev() { |
| 270 | + if let Some(caps) = persistence_re.captures(line) { |
| 271 | + block_before_second_pause = caps.get(1).and_then(|m| m.as_str().parse().ok()); |
| 272 | + if block_before_second_pause.is_some() { |
| 273 | + break; |
| 274 | + } |
| 275 | + } |
| 276 | + } |
| 277 | + |
| 278 | + let block_before_second_pause = block_before_second_pause |
| 279 | + .ok_or(anyhow!("Could not parse last_finalized from second persistence log"))?; |
| 280 | + log::info!("Second periodic persistence completed at finalized block {}", block_before_second_pause); |
| 281 | + |
| 282 | + |
| 283 | + let small_gap_target = 10u32; |
| 284 | + let mut block_at_second_restart = block_before_second_pause; |
| 285 | + while block_at_second_restart < block_before_second_pause + small_gap_target { |
| 286 | + if let Some(Ok(block)) = finalized_blocks_1.next().await { |
| 287 | + block_at_second_restart = block.number(); |
| 288 | + log::info!( |
| 289 | + "Finalized block {} (gap: {})", |
| 290 | + block_at_second_restart, |
| 291 | + block_at_second_restart.saturating_sub(block_before_second_pause) |
| 292 | + ); |
| 293 | + } |
| 294 | + } |
| 295 | + log::info!( |
| 296 | + "Small gap created: {} blocks (from {} to {})", |
| 297 | + block_at_second_restart.saturating_sub(block_before_second_pause), |
| 298 | + block_before_second_pause, |
| 299 | + block_at_second_restart |
| 300 | + ); |
| 301 | + |
| 302 | + log::info!("Restarting validator-0 (second restart)"); |
| 303 | + validator_0.restart(None).await?; |
| 304 | + let _: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?; |
| 305 | + |
| 306 | + let lookback_completed_result_2 = validator_0 |
| 307 | + .wait_log_line_count_with_timeout( |
| 308 | + "Startup lookback completed", |
| 309 | + false, |
| 310 | + LogLineCountOptions::new(|n| n >= 2, Duration::from_secs(30), false), |
| 311 | + ) |
| 312 | + .await?; |
| 313 | + assert!( |
| 314 | + lookback_completed_result_2.success(), |
| 315 | + "Expected second 'Startup lookback completed' log" |
| 316 | + ); |
| 317 | + |
| 318 | + let logs_second_restart = validator_0.logs().await?; |
| 319 | + |
| 320 | + let mut last_blocks_processed: Option<u32> = None; |
| 321 | + for line in logs_second_restart.lines().rev() { |
| 322 | + if let Some(caps) = lookback_completed_re.captures(line) { |
| 323 | + last_blocks_processed = caps.get(1).and_then(|m| m.as_str().parse().ok()); |
| 324 | + if last_blocks_processed.is_some() { |
| 325 | + break; |
| 326 | + } |
| 327 | + } |
| 328 | + } |
| 329 | + |
| 330 | + let processed_second = last_blocks_processed.expect("Should find second lookback completed log"); |
| 331 | + log::info!("Second lookback processed {} blocks", processed_second); |
| 332 | + |
| 333 | + let expected_gap = block_at_second_restart.saturating_sub(block_before_second_pause); |
| 334 | + log::info!( |
| 335 | + "Second lookback: gap was {} blocks (from {} to {}), processed {}", |
| 336 | + expected_gap, |
| 337 | + block_before_second_pause, |
| 338 | + block_at_second_restart, |
| 339 | + processed_second |
| 340 | + ); |
| 341 | + |
| 342 | + assert!( |
| 343 | + expected_gap < 20, |
| 344 | + "Expected second gap to be < 20 (to test no artificial limit), but got {}", |
| 345 | + expected_gap |
| 346 | + ); |
| 347 | + |
| 348 | + assert!( |
| 349 | + processed_second >= expected_gap.saturating_sub(4) && |
| 350 | + processed_second <= expected_gap + 4, |
| 351 | + "Expected second lookback to process entire gap (~{} blocks), but got {}", |
| 352 | + expected_gap, |
| 353 | + processed_second |
| 354 | + ); |
| 355 | + |
| 356 | + log::info!("Basic persistence test completed successfully - both large and small gap tests passed"); |
| 357 | + |
| 358 | + Ok(()) |
| 359 | +} |
0 commit comments