Skip to content

Commit 66fd9fc

Browse files
committed
[iris] Move periodic checkpoint to its own thread
The checkpoint write was piggybacking on the autoscaler loop iteration, so a multi-second backup+upload would stall the next autoscaler tick. Spawn a dedicated checkpoint-loop thread that owns the limiter and runs write_checkpoint independently.
1 parent 02a6261 commit 66fd9fc

File tree

1 file changed

+15
-6
lines changed

1 file changed

+15
-6
lines changed

lib/iris/src/iris/cluster/controller/controller.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,9 @@ def start(self) -> None:
12231223
logger.info("Autoscaler configured with %d scale groups", len(self._autoscaler.groups))
12241224
self._autoscaler_thread = self._threads.spawn(self._run_autoscaler_loop, name="autoscaler-loop")
12251225

1226+
if self._periodic_checkpoint_limiter is not None and not self._config.dry_run:
1227+
self._checkpoint_thread = self._threads.spawn(self._run_checkpoint_loop, name="checkpoint-loop")
1228+
12261229
# Register atexit hook to capture final state for post-mortem analysis.
12271230
# Unregistered in stop() so it doesn't fire against a closed DB.
12281231
self._atexit_registered = True
@@ -1390,12 +1393,18 @@ def _run_autoscaler_loop(self, stop_event: threading.Event) -> None:
13901393
except Exception:
13911394
logger.exception("Autoscaler loop iteration failed")
13921395

1393-
if self._periodic_checkpoint_limiter is not None and self._periodic_checkpoint_limiter.should_run():
1394-
if not self._config.dry_run:
1395-
try:
1396-
write_checkpoint(self._db, self._config.remote_state_dir)
1397-
except Exception:
1398-
logger.exception("Periodic checkpoint failed")
1396+
def _run_checkpoint_loop(self, stop_event: threading.Event) -> None:
    """Write periodic checkpoints from a dedicated thread.

    Runs on its own thread so the multi-second backup+upload doesn't stall
    the autoscaler cadence.

    Args:
        stop_event: Event checked before each iteration and also handed to
            the limiter as its ``cancel`` argument.
    """
    checkpoint_limiter = self._periodic_checkpoint_limiter
    assert checkpoint_limiter is not None, "checkpoint loop spawned without configured limiter"
    # A falsy result from wait() ends the loop; presumably that means the
    # wait was cancelled via stop_event -- confirm against the limiter's API.
    while not stop_event.is_set() and checkpoint_limiter.wait(cancel=stop_event):
        try:
            write_checkpoint(self._db, self._config.remote_state_dir)
        except Exception:
            # Log and keep looping so a single failed write does not end
            # the checkpoint thread.
            logger.exception("Periodic checkpoint failed")
13991408

14001409
def _run_provider_loop(self, stop_event: threading.Event) -> None:
14011410
"""Provider sync loop on its own thread so slow RPCs don't block scheduling."""

0 commit comments

Comments
 (0)