Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions scripts/prod/restarter_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,15 @@ def restart_all(self, max_parallelism: int) -> None:

indices = list(range(self.namespace_and_instruction_args.size()))

# Restarting every Core pod simultaneously brings all consensus validators down at once and
# can halt the chain. Require an explicit extra confirmation before doing so.
if self.service == Service.Core and not wait_until_y_or_n(
f"WARNING: this will restart ALL {len(indices)} Core pods at the same time, which can "
"halt consensus. Are you sure you want to continue?"
):
print_colored("\nAborting restart process.")
sys.exit(1)
Comment thread
cursor[bot] marked this conversation as resolved.

# Phase 1: restart every node's pod concurrently (pod deletes have no ordering dependency).
print_colored(f"\nRestarting {len(indices)} node(s) in parallel...", Colors.YELLOW)
run_in_parallel(
Expand Down
42 changes: 42 additions & 0 deletions scripts/prod/test_parallel_restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def enter(self, key):


def test_all_at_once_restarts_every_node_concurrently(monkeypatch):
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
recorder = _ConcurrencyRecorder()
monkeypatch.setattr(
ServiceRestarter,
Expand All @@ -65,6 +66,7 @@ def test_all_at_once_restarts_every_node_concurrently(monkeypatch):


def test_max_parallelism_caps_concurrency(monkeypatch):
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
recorder = _ConcurrencyRecorder()
monkeypatch.setattr(
ServiceRestarter,
Expand All @@ -82,6 +84,7 @@ def test_max_parallelism_caps_concurrency(monkeypatch):


def test_all_at_once_waits_for_every_node_concurrently(monkeypatch):
monkeypatch.setattr("restarter_lib.wait_until_y_or_n", lambda question: True)
# Restarts are mocked to be instant; the wait phase is what we measure.
monkeypatch.setattr(
ServiceRestarter, "_restart_pod", staticmethod(lambda *args, **kwargs: None)
Expand Down Expand Up @@ -133,5 +136,44 @@ def test_no_restart_metric_restarter_is_sequential():
assert restarter.parallel is False


def test_core_all_at_once_aborts_when_user_declines(monkeypatch):
    """Declining the Core confirmation must exit with code 1 before any pod is restarted."""
    restarted_namespaces = []

    def decline(question):
        # Stand-in for the interactive prompt: always answer "no".
        return False

    def record_restart(namespace, service, index, cluster=None):
        # Stand-in for the real pod restart: only remember which namespace was hit.
        restarted_namespaces.append(namespace)

    monkeypatch.setattr("restarter_lib.wait_until_y_or_n", decline)
    monkeypatch.setattr(ServiceRestarter, "_restart_pod", staticmethod(record_restart))

    core_restarter = ServiceRestarter.from_restart_strategy(
        RestartStrategy.ALL_AT_ONCE, _make_args(), Service.Core
    )
    with pytest.raises(SystemExit) as raised:
        core_restarter.restart_all(max_parallelism=len(NAMESPACES))

    assert raised.value.code == 1
    # Declining means nothing is restarted.
    assert restarted_namespaces == []


def test_non_core_all_at_once_does_not_prompt(monkeypatch):
    """A Gateway all-at-once restart must hit every namespace without asking for confirmation."""
    restarted_namespaces = []

    def fail_if_called(question):
        # Any call to the prompt is a test failure for non-Core services.
        raise AssertionError("Non-Core restarts must not prompt for confirmation")

    def record_restart(namespace, service, index, cluster=None):
        # Stand-in for the real pod restart: only remember which namespace was hit.
        restarted_namespaces.append(namespace)

    monkeypatch.setattr("restarter_lib.wait_until_y_or_n", fail_if_called)
    monkeypatch.setattr(ServiceRestarter, "_restart_pod", staticmethod(record_restart))

    gateway_restarter = ServiceRestarter.from_restart_strategy(
        RestartStrategy.ALL_AT_ONCE, _make_args(), Service.Gateway
    )
    gateway_restarter.restart_all(max_parallelism=len(NAMESPACES))

    # Order of restarts is not asserted, only that each namespace was restarted.
    assert sorted(restarted_namespaces) == sorted(NAMESPACES)


# Allow running this test module directly; pytest's return value becomes the exit status.
if __name__ == "__main__":
    exit_status = pytest.main([__file__, "-v"])
    raise SystemExit(exit_status)
Loading