1111
1212import asyncio
1313import logging
14+ import random
1415import threading
1516import time
1617from contextlib import suppress
@@ -72,6 +73,7 @@ def __init__(
7273 max_rollouts : Optional [int ] = None ,
7374 poll_interval : float = 5.0 ,
7475 heartbeat_interval : float = 10.0 ,
76+ interval_jitter : float = 0.1 ,
7577 heartbeat_launch_mode : Literal ["asyncio" , "thread" ] = "asyncio" ,
7678 ) -> None :
7779 """Initialize the agent runner.
@@ -82,6 +84,9 @@ def __init__(
8284 [`iter`][agentlightning.LitAgentRunner.iter].
8385 poll_interval: Seconds to wait between store polls when no work is available.
8486 heartbeat_interval: Seconds to wait between sending heartbeats to the store.
87+ interval_jitter: Maximum jitter, in seconds, added to both the poll interval and the heartbeat interval.
88+ Each wait is drawn uniformly from [interval - interval_jitter, interval + interval_jitter] and is never shorter than 0.01 seconds.
89+ This avoids the overload caused by runners synchronizing their polls and heartbeats.
8590 heartbeat_launch_mode: Launch mode for the heartbeat loop. Can be "asyncio" or "thread".
8691 "asyncio" is the default and recommended mode. Use "thread" if you are experiencing blocking coroutines.
8792 """
@@ -90,7 +95,9 @@ def __init__(
9095 self ._max_rollouts = max_rollouts
9196 self ._poll_interval = poll_interval
9297 self ._heartbeat_interval = heartbeat_interval
98+ self ._interval_jitter = interval_jitter
9399 self ._heartbeat_launch_mode = heartbeat_launch_mode
100+ self ._random_state = random .Random ()
94101
95102 # Set later
96103 self ._agent : Optional [LitAgent [T_task ]] = None
@@ -360,7 +367,11 @@ async def heartbeat_loop() -> None:
360367 while not stop_event .is_set ():
361368 await self ._emit_heartbeat (store )
362369 with suppress (asyncio .TimeoutError ):
363- await asyncio .wait_for (stop_event .wait (), timeout = self ._heartbeat_interval )
370+ interval = self ._heartbeat_interval + self ._random_state .uniform (
371+ - self ._interval_jitter , self ._interval_jitter
372+ )
373+ interval = max (interval , 0.01 )
374+ await asyncio .wait_for (stop_event .wait (), timeout = interval )
364375
365376 task = asyncio .create_task (heartbeat_loop (), name = f"{ self .get_worker_id ()} -heartbeat" )
366377
@@ -379,7 +390,11 @@ def thread_worker() -> None:
379390 asyncio .set_event_loop (loop )
380391 while not stop_evt .is_set ():
381392 loop .run_until_complete (self ._emit_heartbeat (store ))
382- stop_evt .wait (self ._heartbeat_interval )
393+ interval = self ._heartbeat_interval + self ._random_state .uniform (
394+ - self ._interval_jitter , self ._interval_jitter
395+ )
396+ interval = max (interval , 0.01 )
397+ stop_evt .wait (interval )
383398
384399 thread = threading .Thread (target = thread_worker , name = f"{ self .get_worker_id ()} -heartbeat" , daemon = True )
385400 thread .start ()
@@ -402,11 +417,13 @@ async def _sleep_until_next_poll(self, event: Optional[ExecutionEvent] = None) -
402417 event: Optional [`ExecutionEvent`][agentlightning.ExecutionEvent] object that can be used to interrupt the sleep.
403418 If set during the sleep period, the method returns immediately.
404419 """
420+ interval = self ._poll_interval + self ._random_state .uniform (- self ._interval_jitter , self ._interval_jitter )
421+ interval = max (interval , 0.01 )
405422 if event is None :
406- await asyncio .sleep (self . _poll_interval )
423+ await asyncio .sleep (interval )
407424 return
408425 current_time = time .time ()
409- next_time = current_time + self . _poll_interval
426+ next_time = current_time + interval
410427 while time .time () < next_time :
411428 await asyncio .sleep (0.1 )
412429 if event .is_set ():
0 commit comments