88import random
99import sys
1010import threading
11+ import time
1112from typing import Any , Dict , List , Literal , Optional , Sequence , Set , Tuple , cast
1213
1314from rich .console import Console
1920
2021console = Console ()
2122
22- MAX_RUNTIME_SECONDS = 30 * 60
23+ # Subtract 10 minutes from the job timeout to leave time for environment setup.
24+ MAX_RUNTIME_SECONDS = (int (os .getenv ("GITHUB_ACTIONS_TIMEOUT_MINUTES" , "30" )) - 10 ) * 60
25+ MAX_STALE_SECONDS = 300
2326
2427
2528def _abort_due_to_timeout () -> None :
@@ -157,7 +160,7 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
157160
158161 pending = {rollout_id : task_name for rollout_id , task_name in batch_rollouts }
159162 completed_ids : Set [str ] = set ()
160- completed_ids_last_updated : int = 0
163+ completed_ids_last_updated : float = time . perf_counter ()
161164 while len (completed_ids ) < len (batch_rollouts ):
162165 finished_rollouts = await store .wait_for_rollouts (
163166 rollout_ids = [rollout_id for rollout_id , _ in batch_rollouts ],
@@ -177,13 +180,17 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
177180
178181 # Warn about stale rollouts, and abort if progress has stalled for too long
179182 if complete_ids_updated :
180- completed_ids_last_updated = 0
183+ completed_ids_last_updated = time . perf_counter ()
181184 else :
182- completed_ids_last_updated += 1
183- if completed_ids_last_updated >= 10 :
185+ if time .perf_counter () - completed_ids_last_updated > MAX_STALE_SECONDS / 2 :
184186 unfinished_ids = set (rollout_id for rollout_id , _ in batch_rollouts ) - completed_ids
185187 print (f"Stale rollouts: { unfinished_ids } " )
186- completed_ids_last_updated = 0
188+ if time .perf_counter () - completed_ids_last_updated > MAX_STALE_SECONDS :
189+ current_workers = await store .query_workers ()
190+ console .print (f"Stalled. Current worker status shown below:" )
191+ for worker in current_workers :
192+ console .print (f" Worker: { worker } " , width = 1024 ) # Avoid wrapping
193+ raise RuntimeError ("Rollout progress has stalled for too long" )
187194
188195 await asyncio .sleep (5.0 )
189196
0 commit comments