Skip to content

Commit bb51bf1

Browse files
authored
Fix #1615: tolerate abort_request connection failures (#1632)
1 parent 3b81f14 commit bb51bf1

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

slime/rollout/sglang_rollout.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -319,7 +319,11 @@ async def abort(args: Namespace, rollout_id: int) -> list[list[Sample]]:
319 319
urls = [worker["url"] for worker in response["workers"]]
320 320

321 321
logger.info(f"Abort request for {urls}")
322-
await asyncio.gather(*[post(f"{url}/abort_request", {"abort_all": True}) for url in urls])
322+
abort_tasks = [post(f"{url}/abort_request", {"abort_all": True}) for url in urls]
323+
abort_results = await asyncio.gather(*abort_tasks, return_exceptions=True)
324+
for url, result in zip(urls, abort_results, strict=False):
325+
if isinstance(result, Exception):
326+
logger.warning(f"Failed to abort worker at {url}: {result}")
323 327

324 328
# make sure all the pending tasks are finished
325 329
count = 0

0 commit comments

Comments
 (0)