Skip to content

Commit eb60175

Browse files
scripts: confirm before restarting all Core pods at once
Restarting every Core pod simultaneously brings all consensus validators down at once and can halt the chain. In the parallel (ALL_AT_ONCE) restart flow, prompt the user for an explicit y/n confirmation when the service is Core, and abort the restart if they decline. Non-Core services are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 53882ed commit eb60175

2 files changed

Lines changed: 51 additions & 0 deletions

File tree

scripts/prod/restarter_lib.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,15 @@ def restart_all(self, max_parallelism: int) -> None:
191191

192192
indices = list(range(self.namespace_and_instruction_args.size()))
193193

194+
# Restarting every Core pod simultaneously brings all consensus validators down at once and
195+
# can halt the chain. Require an explicit extra confirmation before doing so.
196+
if self.service == Service.Core and not wait_until_y_or_n(
197+
f"WARNING: this will restart ALL {len(indices)} Core pods at the same time, which can "
198+
"halt consensus. Are you sure you want to continue?"
199+
):
200+
print_colored("\nAborting restart process.")
201+
sys.exit(1)
202+
194203
# Phase 1: restart every node's pod concurrently (pod deletes have no ordering dependency).
195204
print_colored(f"\nRestarting {len(indices)} node(s) in parallel...", Colors.YELLOW)
196205
run_in_parallel(

scripts/prod/test_parallel_restart.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def enter(self, key):
4343

4444

4545
def test_all_at_once_restarts_every_node_concurrently(monkeypatch):
46+
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
4647
recorder = _ConcurrencyRecorder()
4748
monkeypatch.setattr(
4849
ServiceRestarter,
@@ -65,6 +66,7 @@ def test_all_at_once_restarts_every_node_concurrently(monkeypatch):
6566

6667

6768
def test_max_parallelism_caps_concurrency(monkeypatch):
69+
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
6870
recorder = _ConcurrencyRecorder()
6971
monkeypatch.setattr(
7072
ServiceRestarter,
@@ -82,6 +84,7 @@ def test_max_parallelism_caps_concurrency(monkeypatch):
8284

8385

8486
def test_all_at_once_waits_for_every_node_concurrently(monkeypatch):
87+
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
8588
# Restarts are mocked to be instant; the wait phase is what we measure.
8689
monkeypatch.setattr(
8790
ServiceRestarter, "_restart_pod", staticmethod(lambda *args, **kwargs: None)
@@ -133,5 +136,44 @@ def test_no_restart_metric_restarter_is_sequential():
133136
assert restarter.parallel is False
134137

135138

139+
def test_core_all_at_once_aborts_when_user_declines(monkeypatch):
    """Declining the Core confirmation must exit with code 1 and restart no pod."""
    restarted_namespaces = []

    def decline(question):
        # Simulate the operator answering "n" to the confirmation prompt.
        return False

    def record_restart(namespace, service, index, cluster=None):
        # Stand-in for the real pod restart: only remember which namespace was hit.
        restarted_namespaces.append(namespace)

    monkeypatch.setattr("restarter_lib.wait_until_y_or_n", decline)
    monkeypatch.setattr(ServiceRestarter, "_restart_pod", staticmethod(record_restart))

    core_restarter = ServiceRestarter.from_restart_strategy(
        RestartStrategy.ALL_AT_ONCE, _make_args(), Service.Core
    )
    with pytest.raises(SystemExit) as exit_info:
        core_restarter.restart_all(max_parallelism=len(NAMESPACES))

    assert exit_info.value.code == 1
    # Declining means nothing is restarted.
    assert not restarted_namespaces
156+
157+
158+
def test_non_core_all_at_once_does_not_prompt(monkeypatch):
    """A non-Core (Gateway) parallel restart must skip the prompt and restart every namespace."""
    restarted_namespaces = []

    def fail_if_called(question):
        # Any call to the confirmation helper is a test failure for non-Core services.
        raise AssertionError("Non-Core restarts must not prompt for confirmation")

    def record_restart(namespace, service, index, cluster=None):
        # Stand-in for the real pod restart: only remember which namespace was hit.
        restarted_namespaces.append(namespace)

    monkeypatch.setattr("restarter_lib.wait_until_y_or_n", fail_if_called)
    monkeypatch.setattr(ServiceRestarter, "_restart_pod", staticmethod(record_restart))

    gateway_restarter = ServiceRestarter.from_restart_strategy(
        RestartStrategy.ALL_AT_ONCE, _make_args(), Service.Gateway
    )
    gateway_restarter.restart_all(max_parallelism=len(NAMESPACES))

    # Restarts run concurrently, so compare order-insensitively.
    expected_namespaces = sorted(NAMESPACES)
    assert sorted(restarted_namespaces) == expected_namespaces
176+
177+
136178
if __name__ == "__main__":
    # Allow running this test module directly; propagate pytest's exit status.
    exit_status = pytest.main([__file__, "-v"])
    raise SystemExit(exit_status)

0 commit comments

Comments
 (0)