Skip to content

Commit 6a31219

Browse files
rapsealkclaude
andcommitted
test(agent): Lock DockerKernel pickling invariant and harden attach_docker
Addresses review feedback on PR #11228: - Add a pickle round-trip test asserting __getstate__ excludes _docker, locking the RPC-marshalling invariant for future contributors. - Document on DockerKernel that _docker is local-only and must be re-attached by the agent via attach_docker(). - Warn (but still overwrite) when attach_docker is called with a connector different from the currently-attached one; same-connector reattach remains silent. - Align the self.docker type annotation on DockerAgent with the one introduced in #11226 (no-op if already present). Refs #11227 Refs #11228 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 87da619 commit 6a31219

3 files changed

Lines changed: 105 additions & 1 deletion

File tree

src/ai/backend/agent/docker/agent.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,6 +1418,7 @@ async def _rollback_container_creation() -> None:
14181418

14191419

14201420
class DockerAgent(AbstractAgent[DockerKernel, DockerKernelCreationContext]):
1421+
docker: Docker
14211422
docker_info: Mapping[str, Any]
14221423
monitor_docker_task: asyncio.Task[Any]
14231424
agent_sockpath: Path

src/ai/backend/agent/docker/kernel.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@
4545

4646

4747
class DockerKernel(AbstractKernel):
48+
"""
49+
Docker-backed kernel implementation.
50+
51+
Note: ``_docker`` is a local, process-bound reference to the agent-owned
52+
``aiodocker.Docker`` client and MUST NOT be serialized (aiohttp sessions
53+
are not picklable). It is excluded from ``__getstate__`` / ``__setstate__``
54+
and must be re-attached by the agent on recovery via ``attach_docker()``.
55+
"""
56+
4857
network_driver: str
4958
_docker: Docker
5059

@@ -80,6 +89,12 @@ def __init__(
8089

8190
def attach_docker(self, docker: Docker) -> None:
8291
"""Re-attach the agent-owned aiodocker client after unpickling or recovery."""
92+
existing = getattr(self, "_docker", None)
93+
if existing is not None and existing is not docker:
94+
log.warning(
95+
"DockerKernel {} re-attached with a different Docker client; this is likely a bug",
96+
self.kernel_id,
97+
)
8398
self._docker = docker
8499

85100
@override

tests/unit/agent/test_container_id_sync.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,17 @@
77

88
from __future__ import annotations
99

10+
import pickle
1011
from unittest.mock import Mock
1112
from uuid import uuid4
1213

1314
import pytest
1415

1516
from ai.backend.agent.docker.kernel import DockerKernel
17+
from ai.backend.agent.resources import KernelResourceSpec
1618
from ai.backend.agent.types import KernelOwnershipData
1719
from ai.backend.common.docker import ImageRef
18-
from ai.backend.common.types import AgentId, ContainerId, KernelId, SessionId
20+
from ai.backend.common.types import AgentId, ContainerId, KernelId, ResourceSlot, SessionId
1921

2022

2123
@pytest.fixture
@@ -80,3 +82,89 @@ def test_second_call_updates_both_sources(self, mock_kernel_obj: DockerKernel) -
8082
mock_kernel_obj.set_container_id(second)
8183
assert mock_kernel_obj.container_id == second
8284
assert mock_kernel_obj["container_id"] == second
85+
86+
87+
@pytest.fixture
88+
def picklable_kernel_obj() -> DockerKernel:
89+
"""
90+
Create a DockerKernel whose state (apart from the excluded ``_docker``
91+
reference) is fully picklable.
92+
93+
The shared ``mock_kernel_obj`` fixture uses ``Mock()`` for
94+
``resource_spec``, which cannot be pickled. Pickling tests therefore
95+
need a kernel constructed with a real, lightweight ``KernelResourceSpec``
96+
so the only non-picklable member is the intentionally-excluded
97+
``_docker`` client.
98+
"""
99+
kernel_id = KernelId(uuid4())
100+
session_id = SessionId(uuid4())
101+
agent_id = AgentId("test-agent-id")
102+
ownership_data = KernelOwnershipData(
103+
kernel_id=kernel_id,
104+
session_id=session_id,
105+
agent_id=agent_id,
106+
)
107+
image = ImageRef(
108+
name="test-image",
109+
project="test-project",
110+
registry="registry.local",
111+
tag="latest",
112+
architecture="x86_64",
113+
is_local=False,
114+
)
115+
resource_spec = KernelResourceSpec(
116+
slots=ResourceSlot(),
117+
allocations={},
118+
scratch_disk_size=0,
119+
)
120+
return DockerKernel(
121+
ownership_data=ownership_data,
122+
network_id="test-network",
123+
image=image,
124+
version=1,
125+
network_driver="bridge",
126+
agent_config={},
127+
resource_spec=resource_spec,
128+
service_ports=[],
129+
environ={},
130+
data={},
131+
docker=Mock(),
132+
)
133+
134+
135+
class TestDockerKernelPickling:
136+
"""
137+
Tests locking the DockerKernel pickling invariant.
138+
139+
The agent marshals DockerKernel instances via pickle for RPC / recovery.
140+
The live ``_docker`` client (aiohttp-backed aiodocker.Docker) is NOT
141+
picklable, so ``__getstate__`` must drop it and ``__setstate__`` must
142+
tolerate its absence. The agent re-attaches the client after unpickling
143+
via ``attach_docker()``. These tests guard against future regressions
144+
that would accidentally put ``_docker`` back into the pickled payload.
145+
"""
146+
147+
def test_pickle_round_trip_excludes_docker(self, picklable_kernel_obj: DockerKernel) -> None:
148+
"""pickle.dumps/loads must succeed and drop the _docker reference."""
149+
# The Mock() Docker client is not picklable; the test confirms that
150+
# __getstate__ drops it so pickling succeeds anyway, and that non-
151+
# excluded state is preserved across the round trip.
152+
cid = ContainerId("pickled-container-xyz")
153+
picklable_kernel_obj.set_container_id(cid)
154+
155+
data = pickle.dumps(picklable_kernel_obj)
156+
restored = pickle.loads(data)
157+
158+
assert isinstance(restored, DockerKernel)
159+
# _docker must not be carried across pickle; the agent re-attaches
160+
# it via attach_docker() on recovery.
161+
assert getattr(restored, "_docker", None) is None
162+
# Sanity check: non-excluded state survives the round trip.
163+
assert restored.container_id == cid
164+
assert restored["container_id"] == cid
165+
assert restored.network_driver == picklable_kernel_obj.network_driver
166+
167+
def test_getstate_does_not_contain_docker_key(self, picklable_kernel_obj: DockerKernel) -> None:
168+
"""__getstate__ output must not carry the live _docker reference."""
169+
state = picklable_kernel_obj.__getstate__()
170+
assert "_docker" not in state

0 commit comments

Comments
 (0)