Skip to content

Commit d1ae2f5

Browse files
Add zombienet tests for collator reputation persistence
Tests added: - basic_persistence.rs: Validates that reputation data persists across validator restarts and that the startup lookback mechanism correctly processes missed blocks during downtime. Tests both large gaps (20+ blocks) and small gaps (<20 blocks). - pruning.rs: Verifies that reputation data is correctly pruned when parachains are deregistered. Tests para cleanup via sudo, session boundary detection, immediate persistence of pruned state, and correct loading of only remaining paras after restart. Signed-off-by: Alexandru Cihodaru <[email protected]>
1 parent 2991942 commit d1ae2f5

File tree

5 files changed

+632
-0
lines changed

5 files changed

+632
-0
lines changed

polkadot/zombienet-sdk-tests/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ log = { workspace = true }
1919
pallet-revive = { workspace = true, features = ["std"] }
2020
polkadot-primitives = { workspace = true, default-features = true }
2121
rand = { workspace = true }
22+
regex = { workspace = true }
2223
serde = { workspace = true }
2324
serde_json = { workspace = true }
2425
sp-core = { workspace = true }

polkadot/zombienet-sdk-tests/tests/functional/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ mod approved_peer_mixed_validators;
66
mod async_backing_6_seconds_rate;
77
mod dispute_old_finalized;
88
mod duplicate_collations;
9+
mod reputation_persistence;
910
mod shared_core_idle_parachain;
1011
mod spam_statement_distribution_requests;
1112
mod sync_backing;
Lines changed: 359 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
// Copyright (C) Parity Technologies (UK) Ltd.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Test: Basic Persistence on Graceful Shutdown with Startup Lookback Verification
5+
//!
6+
//! This test verifies that collator reputation data is correctly persisted to disk
7+
//! during normal operation, survives a graceful validator restart, and that the
8+
//! startup lookback mechanism correctly catches up on reputation bumps from blocks
9+
//! finalized between persistence and restart.
10+
//!
11+
//! ## Test Scenario - Phase 1: Large Gap (>= 20 blocks)
12+
//!
13+
//! 1. Spawn a network with 4 validators and 1 parachain (using experimental collator protocol)
14+
//! 2. Wait for parachain blocks to be produced (establishing reputation through backed candidates)
15+
//! 3. Wait for periodic persistence (using short interval for testing)
16+
//! 4. Record the finalized block number at persistence time
17+
//! 5. **Pause validator-0** (so it misses blocks being finalized)
18+
//! 6. Wait for 22+ finalized blocks while validator-0 is paused (creating a real gap)
19+
//! 7. Restart validator-0 (triggers full startup sequence)
20+
//! 8. Verify validator loads existing reputation from disk on restart
21+
//! 9. Verify the startup lookback processes blocks it missed while paused
22+
//! 10. Verify validator continues normal operation
23+
//!
24+
//! ## Test Scenario - Phase 2: Small Gap (< 20 blocks)
25+
//!
26+
//! 11. Pause validator-0 again
27+
//! 12. Wait for ~10 finalized blocks (smaller gap)
28+
//! 13. Restart validator-0 again
29+
//! 14. Verify the startup lookback processes the entire gap (not limited by MAX_STARTUP_ANCESTRY_LOOKBACK)
30+
//! 15. Verify processed block count matches the actual gap size
31+
//!
32+
//! ## Success Criteria
33+
//!
34+
//! - Validator logs show "Loaded existing reputation DB from disk" on both restarts
35+
//! - First lookback processes at least 20 blocks (large gap)
36+
//! - Second lookback processes exactly ~10 blocks (entire small gap)
37+
//! - Validator resumes backing candidates after both restarts
38+
//! - No errors about missing or corrupted data
39+
40+
use anyhow::anyhow;
41+
use regex::Regex;
42+
use tokio::time::Duration;
43+
44+
use cumulus_zombienet_sdk_helpers::assert_para_throughput;
45+
use polkadot_primitives::Id as ParaId;
46+
use serde_json::json;
47+
use zombienet_orchestrator::network::node::LogLineCountOptions;
48+
use zombienet_sdk::{
49+
subxt::{OnlineClient, PolkadotConfig},
50+
NetworkConfigBuilder,
51+
};
52+
53+
const PARA_ID: u32 = 2000;
54+
55+
#[tokio::test(flavor = "multi_thread")]
56+
async fn basic_persistence_test() -> Result<(), anyhow::Error> {
57+
let _ = env_logger::try_init_from_env(
58+
env_logger::Env::default().filter_or(env_logger::DEFAULT_FILTER_ENV, "info"),
59+
);
60+
61+
let images = zombienet_sdk::environment::get_images_from_env();
62+
63+
let config = NetworkConfigBuilder::new()
64+
.with_relaychain(|r| {
65+
let r = r
66+
.with_chain("rococo-local")
67+
.with_default_command("polkadot")
68+
.with_default_image(images.polkadot.as_str())
69+
.with_default_args(vec![
70+
("-lparachain=debug,parachain::collator-protocol=trace").into(),
71+
("--experimental-collator-protocol").into(),
72+
])
73+
.with_genesis_overrides(json!({
74+
"configuration": {
75+
"config": {
76+
"scheduler_params": {
77+
"group_rotation_frequency": 4,
78+
"num_cores": 1
79+
}
80+
}
81+
}
82+
}))
83+
.with_node(|node| node.with_name("validator-0"));
84+
85+
(1..4)
86+
.fold(r, |acc, i| acc.with_node(|node| node.with_name(&format!("validator-{i}"))))
87+
})
88+
.with_parachain(|p| {
89+
p.with_id(PARA_ID)
90+
.with_default_command("undying-collator")
91+
.cumulus_based(false)
92+
.with_default_image(
93+
std::env::var("COL_IMAGE")
94+
.unwrap_or("docker.io/paritypr/colander:latest".to_string())
95+
.as_str(),
96+
)
97+
.with_default_args(vec![
98+
("-lparachain=debug").into(),
99+
("--experimental-send-approved-peer").into(),
100+
])
101+
.with_collator(|n| n.with_name("collator"))
102+
})
103+
.build()
104+
.map_err(|e| {
105+
let errs = e.into_iter().map(|e| e.to_string()).collect::<Vec<_>>().join(" ");
106+
anyhow!("config errs: {errs}")
107+
})?;
108+
109+
let spawn_fn = zombienet_sdk::environment::get_spawn_fn();
110+
let network = spawn_fn(config).await?;
111+
112+
let validator_0 = network.get_node("validator-0")?;
113+
let validator0_client: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?;
114+
115+
// Verify validator-0 shows fresh start initially (no existing data)
116+
let fresh_start_result = validator_0
117+
.wait_log_line_count_with_timeout(
118+
"Reputation DB initialized fresh",
119+
false,
120+
LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false),
121+
)
122+
.await?;
123+
assert!(
124+
fresh_start_result.success(),
125+
"Expected validator to log 'Reputation DB initialized fresh' on initial startup"
126+
);
127+
128+
log::info!("Network spawned, waiting for parachain blocks to be produced");
129+
assert_para_throughput(&validator0_client, 10, [(ParaId::from(PARA_ID), 8..12)]).await?;
130+
131+
log::info!("Parachain blocks produced, waiting for periodic persistence");
132+
let persistence_result = validator_0
133+
.wait_log_line_count_with_timeout(
134+
"Periodic persistence completed: reputation DB written to disk",
135+
false,
136+
LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false),
137+
)
138+
.await?;
139+
assert!(persistence_result.success(), "Periodic persistence should have completed");
140+
141+
let logs_before_pause = validator_0.logs().await?;
142+
let persistence_re = Regex::new(
143+
r"Periodic persistence completed: reputation DB written to disk.*last_finalized=Some\((\d+)\)"
144+
)?;
145+
146+
let mut block_at_persistence: Option<u32> = None;
147+
for line in logs_before_pause.lines() {
148+
if let Some(caps) = persistence_re.captures(line) {
149+
block_at_persistence = caps.get(1).and_then(|m| m.as_str().parse().ok());
150+
}
151+
}
152+
153+
let block_at_persistence = block_at_persistence
154+
.ok_or(anyhow!("Could not parse last_finalized from persistence log"))?;
155+
log::info!("Periodic persistence completed at finalized block {}", block_at_persistence);
156+
157+
log::info!("Pausing validator-0 to create a block gap");
158+
validator_0.pause().await?;
159+
160+
let validator_1 = network.get_node("validator-1")?;
161+
let validator_1_client: OnlineClient<PolkadotConfig> = validator_1.wait_client().await?;
162+
let mut finalized_blocks_1 = validator_1_client.blocks().subscribe_finalized().await?;
163+
164+
log::info!("Waiting for finalized blocks while validator-0 is paused");
165+
let target_gap = 30u32;
166+
let mut block_at_restart = block_at_persistence;
167+
while block_at_restart < block_at_persistence + target_gap {
168+
if let Some(Ok(block)) = finalized_blocks_1.next().await {
169+
block_at_restart = block.number();
170+
log::info!("Finalized block {} (gap: {})", block_at_restart, block_at_restart.saturating_sub(block_at_persistence));
171+
}
172+
}
173+
log::info!(
174+
"Gap created while validator-0 was paused: finalized block now at {}, gap of {} blocks",
175+
block_at_restart,
176+
block_at_restart.saturating_sub(block_at_persistence)
177+
);
178+
179+
log::info!("Restarting validator-0 (full restart to trigger startup lookback)");
180+
validator_0.restart(None).await?;
181+
let _: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?;
182+
log::info!("Validator-0 restarted, verifying reputation loaded from disk");
183+
184+
let load_result = validator_0
185+
.wait_log_line_count_with_timeout(
186+
"Loaded existing reputation DB from disk",
187+
false,
188+
LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(60), false),
189+
)
190+
.await?;
191+
assert!(
192+
load_result.success(),
193+
"Expected validator to log 'Loaded existing reputation DB from disk' after restart"
194+
);
195+
196+
let lookback_completed_result = validator_0
197+
.wait_log_line_count_with_timeout(
198+
"Startup lookback completed",
199+
false,
200+
LogLineCountOptions::new(|n| n >= 1, Duration::from_secs(30), false),
201+
)
202+
.await?;
203+
assert!(
204+
lookback_completed_result.success(),
205+
"Expected validator to log 'Startup lookback completed' after restart"
206+
);
207+
208+
let logs = validator_0.logs().await?;
209+
let lookback_completed_re = Regex::new(
210+
r"Startup lookback completed.*blocks_processed=(\d+)"
211+
)?;
212+
213+
let mut found_lookback_completed = false;
214+
let mut blocks_processed: Option<u32> = None;
215+
216+
for line in logs.lines() {
217+
if let Some(caps) = lookback_completed_re.captures(line) {
218+
found_lookback_completed = true;
219+
blocks_processed = caps.get(1).and_then(|m| m.as_str().parse().ok());
220+
log::info!(
221+
"Found startup lookback completed log: blocks_processed={}",
222+
blocks_processed.unwrap()
223+
);
224+
break;
225+
}
226+
}
227+
228+
assert!(
229+
found_lookback_completed,
230+
"Expected to find 'Startup lookback completed' log with blocks_processed field"
231+
);
232+
233+
let processed = blocks_processed.expect("blocks_processed should be present in log");
234+
assert!(
235+
processed == 20,
236+
"Expected blocks_processed ({}) == MAX_STARTUP_ANCESTRY_LOOKBACK ({})",
237+
processed, target_gap
238+
);
239+
log::info!(
240+
"Lookback verification passed: processed {} blocks (< existing gap {})",
241+
processed, target_gap
242+
);
243+
244+
log::info!("Verifying validator resumes normal operation");
245+
246+
let relay_client_after: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?;
247+
assert_para_throughput(&relay_client_after, 5, [(ParaId::from(PARA_ID), 4..6)]).await?;
248+
249+
// === Phase 2: Verify lookback processes entire gap when gap < MAX_STARTUP_ANCESTRY_LOOKBACK ===
250+
log::info!("Phase 2: Testing lookback with smaller gap (< 20 blocks)");
251+
252+
// Wait for another periodic persistence to get a precise starting point
253+
log::info!("Waiting for second periodic persistence");
254+
let persistence_result_2 = validator_0
255+
.wait_log_line_count_with_timeout(
256+
"Periodic persistence completed: reputation DB written to disk",
257+
false,
258+
LogLineCountOptions::new(|n| n >= 2, Duration::from_secs(60), false),
259+
)
260+
.await?;
261+
assert!(persistence_result_2.success(), "Second periodic persistence should have completed");
262+
263+
validator_0.pause().await?;
264+
log::info!("Pausing validator-0 again to create a smaller gap");
265+
266+
let logs_before_second_pause = validator_0.logs().await?;
267+
let mut block_before_second_pause: Option<u32> = None;
268+
269+
for line in logs_before_second_pause.lines().rev() {
270+
if let Some(caps) = persistence_re.captures(line) {
271+
block_before_second_pause = caps.get(1).and_then(|m| m.as_str().parse().ok());
272+
if block_before_second_pause.is_some() {
273+
break;
274+
}
275+
}
276+
}
277+
278+
let block_before_second_pause = block_before_second_pause
279+
.ok_or(anyhow!("Could not parse last_finalized from second persistence log"))?;
280+
log::info!("Second periodic persistence completed at finalized block {}", block_before_second_pause);
281+
282+
283+
let small_gap_target = 10u32;
284+
let mut block_at_second_restart = block_before_second_pause;
285+
while block_at_second_restart < block_before_second_pause + small_gap_target {
286+
if let Some(Ok(block)) = finalized_blocks_1.next().await {
287+
block_at_second_restart = block.number();
288+
log::info!(
289+
"Finalized block {} (gap: {})",
290+
block_at_second_restart,
291+
block_at_second_restart.saturating_sub(block_before_second_pause)
292+
);
293+
}
294+
}
295+
log::info!(
296+
"Small gap created: {} blocks (from {} to {})",
297+
block_at_second_restart.saturating_sub(block_before_second_pause),
298+
block_before_second_pause,
299+
block_at_second_restart
300+
);
301+
302+
log::info!("Restarting validator-0 (second restart)");
303+
validator_0.restart(None).await?;
304+
let _: OnlineClient<PolkadotConfig> = validator_0.wait_client().await?;
305+
306+
let lookback_completed_result_2 = validator_0
307+
.wait_log_line_count_with_timeout(
308+
"Startup lookback completed",
309+
false,
310+
LogLineCountOptions::new(|n| n >= 2, Duration::from_secs(30), false),
311+
)
312+
.await?;
313+
assert!(
314+
lookback_completed_result_2.success(),
315+
"Expected second 'Startup lookback completed' log"
316+
);
317+
318+
let logs_second_restart = validator_0.logs().await?;
319+
320+
let mut last_blocks_processed: Option<u32> = None;
321+
for line in logs_second_restart.lines().rev() {
322+
if let Some(caps) = lookback_completed_re.captures(line) {
323+
last_blocks_processed = caps.get(1).and_then(|m| m.as_str().parse().ok());
324+
if last_blocks_processed.is_some() {
325+
break;
326+
}
327+
}
328+
}
329+
330+
let processed_second = last_blocks_processed.expect("Should find second lookback completed log");
331+
log::info!("Second lookback processed {} blocks", processed_second);
332+
333+
let expected_gap = block_at_second_restart.saturating_sub(block_before_second_pause);
334+
log::info!(
335+
"Second lookback: gap was {} blocks (from {} to {}), processed {}",
336+
expected_gap,
337+
block_before_second_pause,
338+
block_at_second_restart,
339+
processed_second
340+
);
341+
342+
assert!(
343+
expected_gap < 20,
344+
"Expected second gap to be < 20 (to test no artificial limit), but got {}",
345+
expected_gap
346+
);
347+
348+
assert!(
349+
processed_second >= expected_gap.saturating_sub(4) &&
350+
processed_second <= expected_gap + 4,
351+
"Expected second lookback to process entire gap (~{} blocks), but got {}",
352+
expected_gap,
353+
processed_second
354+
);
355+
356+
log::info!("Basic persistence test completed successfully - both large and small gap tests passed");
357+
358+
Ok(())
359+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Copyright (C) Parity Technologies (UK) Ltd.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
mod basic_persistence;
5+
mod pruning;

0 commit comments

Comments
 (0)