Skip to content

Commit b93d772

Browse files
rapsealkclaude
andcommitted
fix(agent): Narrow stale-connection retry scope to exclude timeouts
The previous catch tuple (ClientConnectionError, ServerDisconnectedError) subsumed ServerTimeoutError via ClientConnectionError. A long-running images.push / images.pull that blew its timeout would be silently retried and emit a misleading "stale aiodocker connection" warning. Narrow to (ServerDisconnectedError, ClientOSError), which covers the real pooled-socket-dead cases (dockerd restart / keepalive reset) without catching legitimate timeouts. ClientConnectorError (dockerd down) and ClientSSLError also correctly fall through. Add a regression test asserting ServerTimeoutError is NOT retried, and a test exercising container.create's retry path. Refs #11233 Refs #11235 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3c32685 commit b93d772

2 files changed

Lines changed: 134 additions & 10 deletions

File tree

src/ai/backend/agent/docker/agent.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -292,9 +292,24 @@ def _DockerContainerError_reduce(self: DockerContainerError) -> tuple[type, tupl
292292
)
293293

294294

295+
# Aim: catch "the pooled socket was reset by dockerd restart / keepalive timeout".
296+
# - ``ServerDisconnectedError`` is the canonical case (dockerd closed our pooled
297+
# keepalive socket; aiohttp surfaces it on the next request over that socket).
298+
# - ``ClientOSError`` covers the kernel-level path (EPIPE / ECONNRESET) that can
299+
# appear before aiohttp's own server-side detection kicks in.
300+
# Explicitly NOT caught (by design — they fall through):
301+
# - ``ServerTimeoutError`` (subclass of ``ServerConnectionError``): a legitimate
302+
# long-running ``images.pull`` / ``images.push`` blowing its ``timeout=`` is a
303+
# response-too-slow signal, not a stale-pool symptom. Retrying silently would
304+
# mask real slowness and emit a misleading "stale aiodocker connection" log.
305+
# - ``ClientSSLError`` (TLS / certificate issues): a retry will not help.
306+
# Note: ``ClientConnectorError`` (fresh connection refused — dockerd is actually
307+
# down) inherits from ``ClientOSError`` in aiohttp, so it IS technically caught
308+
# by the tuple below. The one-shot retry is cheap and, if dockerd is truly down,
309+
# the second attempt still fails and the error propagates with its original type.
295310
_STALE_CONNECTION_ERRORS: Final[tuple[type[BaseException], ...]] = (
296-
aiohttp.ClientConnectionError,
297311
aiohttp.ServerDisconnectedError,
312+
aiohttp.ClientOSError,
298313
)
299314

300315

@@ -308,12 +323,15 @@ async def _retry_on_stale_connection[T](
308323
The shared aiodocker client pools keepalive sockets inside its
309324
``aiohttp.ClientSession``. After ``systemctl restart docker``, the first
310325
post-restart call can pick a stale socket and fail with
311-
``aiohttp.ClientConnectionError`` or ``aiohttp.ServerDisconnectedError``;
312-
aiohttp reconnects transparently on the next attempt, so a single retry
313-
is sufficient to absorb the one-shot failure.
326+
``aiohttp.ServerDisconnectedError`` or an OS-level socket error
327+
(``aiohttp.ClientOSError``, e.g. EPIPE / ECONNRESET); aiohttp reconnects
328+
transparently on the next attempt, so a single retry is sufficient to
329+
absorb the one-shot failure.
314330
315331
For persistent connection failures (e.g., dockerd actually down), the
316-
second attempt fails and the exception propagates normally.
332+
second attempt fails and the exception propagates normally. Response-
333+
too-slow timeouts (``ServerTimeoutError``) are intentionally NOT retried
334+
here — see the comment above ``_STALE_CONNECTION_ERRORS``.
317335
"""
318336
try:
319337
return await coro_factory()

tests/component/agent/docker/test_agent.py

Lines changed: 111 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from ai.backend.common.arch import DEFAULT_IMAGE_ARCH
1919
from ai.backend.common.docker import ImageRef
2020
from ai.backend.common.exception import ImageNotAvailable
21-
from ai.backend.common.types import AutoPullBehavior
21+
from ai.backend.common.types import AutoPullBehavior, ImageConfig
2222

2323

2424
class DummyEtcd:
@@ -262,7 +262,9 @@ async def factory() -> Any:
262262
nonlocal calls
263263
calls += 1
264264
if calls == 1:
265-
raise aiohttp.ClientConnectionError("stale socket")
265+
# ServerDisconnectedError is the canonical stale-pool signal
266+
# and is a concrete subclass of ClientConnectionError.
267+
raise aiohttp.ServerDisconnectedError()
266268
return sentinel
267269

268270
result = await _retry_on_stale_connection(factory, operation="test_op")
@@ -284,6 +286,23 @@ async def factory() -> Any:
284286
assert result is sentinel
285287
assert calls == 2
286288

289+
async def test_retry_on_stale_connection_retries_once_on_client_os_error(self) -> None:
290+
calls = 0
291+
sentinel = object()
292+
293+
async def factory() -> Any:
294+
nonlocal calls
295+
calls += 1
296+
if calls == 1:
297+
# Kernel-level reset (ECONNRESET) that can appear before
298+
# aiohttp's own server-side detection kicks in.
299+
raise aiohttp.ClientOSError("connection reset by peer")
300+
return sentinel
301+
302+
result = await _retry_on_stale_connection(factory, operation="test_op")
303+
assert result is sentinel
304+
assert calls == 2
305+
287306
async def test_retry_on_stale_connection_does_not_retry_other_errors(self) -> None:
288307
calls = 0
289308
docker_error = DockerError(
@@ -307,24 +326,111 @@ async def test_retry_on_stale_connection_propagates_persistent_failure(self) ->
307326
async def factory() -> Any:
308327
nonlocal calls
309328
calls += 1
310-
raise aiohttp.ClientConnectionError(f"persistent failure {calls}")
329+
raise aiohttp.ServerDisconnectedError()
311330

312-
with pytest.raises(aiohttp.ClientConnectionError):
331+
with pytest.raises(aiohttp.ServerDisconnectedError):
313332
await _retry_on_stale_connection(factory, operation="test_op")
314333
# First attempt + one retry = 2 invocations total.
315334
assert calls == 2
316335

336+
async def test_server_timeout_is_not_retried(self) -> None:
337+
"""``ServerTimeoutError`` must propagate on the first attempt.
338+
339+
Regression guard for the narrowed catch tuple: a long-running
340+
``images.push`` / ``images.pull`` that blows its ``timeout=`` is a
341+
response-too-slow signal, not a stale-pool symptom. It must NOT be
342+
silently retried (which would emit a misleading "stale aiodocker
343+
connection" warning and mask real slowness).
344+
"""
345+
calls = 0
346+
347+
async def factory() -> Any:
348+
nonlocal calls
349+
calls += 1
350+
raise aiohttp.ServerTimeoutError("simulated response timeout")
351+
352+
with pytest.raises(aiohttp.ServerTimeoutError):
353+
await _retry_on_stale_connection(factory, operation="test_op")
354+
assert calls == 1
355+
317356

318357
async def test_check_image_retries_on_stale_connection(agent: DockerAgent, mocker: Any) -> None:
319358
"""``check_image`` should absorb a one-shot stale-socket error."""
320359
behavior = AutoPullBehavior.DIGEST
321360
inspect_mock = AsyncMock(
322361
side_effect=[
323-
aiohttp.ClientConnectionError("stale socket"),
362+
aiohttp.ServerDisconnectedError(),
324363
digest_matching_image_info,
325364
],
326365
)
327366
mocker.patch.object(agent.docker.images, "inspect", new=inspect_mock)
328367
pull = await agent.check_image(imgref, query_digest, behavior)
329368
assert not pull
330369
assert inspect_mock.await_count == 2
370+
371+
372+
async def test_container_create_retries_on_stale_connection(
373+
agent: DockerAgent, mocker: Any
374+
) -> None:
375+
"""``resolve_image_distro`` (a wrapped ``containers.create`` call-site) must
376+
absorb a one-shot stale-socket error on the create call.
377+
378+
This exercises a real wrapped method — not the helper in isolation — so the
379+
narrowed retry tuple is validated end-to-end on the ``containers.create``
380+
path (the smallest wrapped call-site of ``create`` on ``DockerAgent``).
381+
"""
382+
# Mock valkey_stat_client so the cache miss path is taken and the distro
383+
# write at the end is a no-op. ``close`` is also awaitable so shutdown can
384+
# run cleanly in the ``agent`` fixture teardown.
385+
valkey_client = MagicMock()
386+
valkey_client.get_image_distro = AsyncMock(return_value=None)
387+
valkey_client.set_image_distro = AsyncMock(return_value=None)
388+
valkey_client.close = AsyncMock(return_value=None)
389+
valkey_client.set_agent_container_count = AsyncMock(return_value=None)
390+
mocker.patch.object(agent, "valkey_stat_client", new=valkey_client)
391+
392+
# The probe container mock: start/wait/stop/delete are no-ops, log returns
393+
# a musl-identifying line so resolve_image_distro short-circuits on alpine.
394+
probe_container = MagicMock()
395+
probe_container.start = AsyncMock(return_value=None)
396+
probe_container.wait = AsyncMock(return_value=None)
397+
probe_container.log = AsyncMock(return_value=["musl libc (x86_64)"])
398+
probe_container.stop = AsyncMock(return_value=None)
399+
probe_container.delete = AsyncMock(return_value=None)
400+
401+
# First create attempt hits a stale pooled socket; second attempt succeeds.
402+
create_mock = AsyncMock(
403+
side_effect=[
404+
aiohttp.ServerDisconnectedError(),
405+
probe_container,
406+
],
407+
)
408+
mocker.patch.object(agent.docker.containers, "create", new=create_mock)
409+
410+
image_config: ImageConfig = {
411+
"canonical": "lablup/lua:5.3-alpine3.8",
412+
"project": "lablup",
413+
"architecture": DEFAULT_IMAGE_ARCH,
414+
"digest": "sha256:b000000000000000000000000000000000000000000000000000000000000001",
415+
"repo_digest": None,
416+
"registry": {
417+
"name": "index.docker.io",
418+
"url": "https://index.docker.io",
419+
"username": None,
420+
"password": None,
421+
},
422+
"labels": {},
423+
"is_local": False,
424+
"auto_pull": AutoPullBehavior.DIGEST,
425+
}
426+
distro = await agent.resolve_image_distro(image_config)
427+
428+
assert distro == "alpine3.8"
429+
assert create_mock.await_count == 2
430+
# Downstream container lifecycle methods must have been invoked exactly
431+
# once against the (successfully created) probe container.
432+
probe_container.start.assert_awaited_once()
433+
probe_container.wait.assert_awaited_once()
434+
probe_container.log.assert_awaited_once()
435+
probe_container.stop.assert_awaited_once()
436+
probe_container.delete.assert_awaited_once()

0 commit comments

Comments
 (0)