Skip to content

Commit

Permalink
Handle image pull failure
Browse files: browse the repository at this point in the history
  • Loading branch information
fregataa committed Nov 17, 2024
1 parent e902955 commit 6ec076e
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 16 deletions.
44 changes: 36 additions & 8 deletions src/ai/backend/agent/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
from ai.backend.common.docker import ImageRef
from ai.backend.common.etcd import AsyncEtcd, ConfigScopes
from ai.backend.common.events import (
ImagePullFailedEvent,
ImagePullFinishedEvent,
ImagePullStartedEvent,
KernelLifecycleEventReason,
Expand Down Expand Up @@ -522,15 +523,42 @@ async def _pull(reporter: ProgressReporter, *, img_conf: ImageConfig) -> None:
image_pull_timeout = cast(
Optional[float], self.local_config["agent"]["api"]["pull-timeout"]
)
await self.agent.pull_image(
img_ref, img_conf["registry"], timeout=image_pull_timeout
)
await self.agent.produce_event(
ImagePullFinishedEvent(
image=str(img_ref),
agent_id=self.agent.id,
try:
await self.agent.pull_image(
img_ref, img_conf["registry"], timeout=image_pull_timeout
)
except asyncio.TimeoutError:
log.exception(f"Image pull timeout after {image_pull_timeout} sec")
await self.agent.produce_event(
ImagePullFailedEvent(
image=str(img_ref),
agent_id=self.agent.id,
msg=f"timeout (s:{image_pull_timeout})",
)
)
except Exception as e:
log.exception(f"Image pull failed (e:{repr(e)})")
await self.agent.produce_event(
ImagePullFailedEvent(
image=str(img_ref),
agent_id=self.agent.id,
msg=repr(e),
)
)
else:
await self.agent.produce_event(
ImagePullFinishedEvent(
image=str(img_ref),
agent_id=self.agent.id,
)
)
else:
await self.agent.produce_event(
ImagePullFinishedEvent(
image=str(img_ref),
agent_id=self.agent.id,
)
)
)

ret: dict[str, str] = {}
for img, img_conf in image_configs.items():
Expand Down
11 changes: 4 additions & 7 deletions src/ai/backend/manager/models/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,13 +670,10 @@ def set_status(
self.terminated_at = now
self.status_changed = now
self.status = status
self.status_history = sql_json_merge(
KernelRow.status_history,
(),
{
status.name: now.isoformat(),
},
)
self.status_history = {
**self.status_history,
status.name: now.isoformat(),
}
if status_info is not None:
self.status_info = status_info
if status_data is not None:
Expand Down
4 changes: 3 additions & 1 deletion src/ai/backend/manager/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3332,7 +3332,9 @@ async def _transit(db_session: AsyncSession) -> set[SessionId]:
for row in await db_session.scalars(_stmt):
kernel_row = cast(KernelRow, row)
is_transited = kernel_row.transit_status(
KernelStatus.CANCELLED, status_info="image-pull-failed"
KernelStatus.CANCELLED,
status_info="image-pull-failed",
status_data={"error": {"src": "other", "repr": msg}},
)
if is_transited:
session_ids.add(kernel_row.session_id)
Expand Down

0 comments on commit 6ec076e

Please sign in to comment.