Skip to content

Commit 6ec076e

Browse files
committed
Handle image pull failure
1 parent e902955 commit 6ec076e

File tree

3 files changed

+43
-16
lines changed

3 files changed

+43
-16
lines changed

src/ai/backend/agent/server.py

Lines changed: 36 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,7 @@
5353
from ai.backend.common.docker import ImageRef
5454
from ai.backend.common.etcd import AsyncEtcd, ConfigScopes
5555
from ai.backend.common.events import (
56+
ImagePullFailedEvent,
5657
ImagePullFinishedEvent,
5758
ImagePullStartedEvent,
5859
KernelLifecycleEventReason,
@@ -522,15 +523,42 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None:
522523
image_pull_timeout = cast(
523524
Optional[float], self.local_config["agent"]["api"]["pull-timeout"]
524525
)
525-
await self.agent.pull_image(
526-
img_ref, img_conf["registry"], timeout=image_pull_timeout
527-
)
528-
await self.agent.produce_event(
529-
ImagePullFinishedEvent(
530-
image=str(img_ref),
531-
agent_id=self.agent.id,
526+
try:
527+
await self.agent.pull_image(
528+
img_ref, img_conf["registry"], timeout=image_pull_timeout
529+
)
530+
except asyncio.TimeoutError:
531+
log.exception(f"Image pull timeout after {image_pull_timeout} sec")
532+
await self.agent.produce_event(
533+
ImagePullFailedEvent(
534+
image=str(img_ref),
535+
agent_id=self.agent.id,
536+
msg=f"timeout (s:{image_pull_timeout})",
537+
)
538+
)
539+
except Exception as e:
540+
log.exception(f"Image pull failed (e:{repr(e)})")
541+
await self.agent.produce_event(
542+
ImagePullFailedEvent(
543+
image=str(img_ref),
544+
agent_id=self.agent.id,
545+
msg=repr(e),
546+
)
547+
)
548+
else:
549+
await self.agent.produce_event(
550+
ImagePullFinishedEvent(
551+
image=str(img_ref),
552+
agent_id=self.agent.id,
553+
)
554+
)
555+
else:
556+
await self.agent.produce_event(
557+
ImagePullFinishedEvent(
558+
image=str(img_ref),
559+
agent_id=self.agent.id,
560+
)
532561
)
533-
)
534562

535563
ret: dict[str, str] = {}
536564
for img, img_conf in image_configs.items():

src/ai/backend/manager/models/kernel.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -670,13 +670,10 @@ def set_status(
670670
self.terminated_at = now
671671
self.status_changed = now
672672
self.status = status
673-
self.status_history = sql_json_merge(
674-
KernelRow.status_history,
675-
(),
676-
{
677-
status.name: now.isoformat(),
678-
},
679-
)
673+
self.status_history = {
674+
**self.status_history,
675+
status.name: now.isoformat(),
676+
}
680677
if status_info is not None:
681678
self.status_info = status_info
682679
if status_data is not None:

src/ai/backend/manager/registry.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3332,7 +3332,9 @@ async def _transit(db_session: AsyncSession) -> set[SessionId]:
33323332
for row in await db_session.scalars(_stmt):
33333333
kernel_row = cast(KernelRow, row)
33343334
is_transited = kernel_row.transit_status(
3335-
KernelStatus.CANCELLED, status_info="image-pull-failed"
3335+
KernelStatus.CANCELLED,
3336+
status_info="image-pull-failed",
3337+
status_data={"error": {"src": "other", "repr": msg}},
33363338
)
33373339
if is_transited:
33383340
session_ids.add(kernel_row.session_id)

0 commit comments

Comments
 (0)