1818from ai .backend .common .arch import DEFAULT_IMAGE_ARCH
1919from ai .backend .common .docker import ImageRef
2020from ai .backend .common .exception import ImageNotAvailable
21- from ai .backend .common .types import AutoPullBehavior
21+ from ai .backend .common .types import AutoPullBehavior , ImageConfig
2222
2323
2424class DummyEtcd :
@@ -262,7 +262,9 @@ async def factory() -> Any:
262262 nonlocal calls
263263 calls += 1
264264 if calls == 1 :
265- raise aiohttp .ClientConnectionError ("stale socket" )
265+ # ServerDisconnectedError is the canonical stale-pool signal
266+ # and is a concrete subclass of ClientConnectionError.
267+ raise aiohttp .ServerDisconnectedError ()
266268 return sentinel
267269
268270 result = await _retry_on_stale_connection (factory , operation = "test_op" )
@@ -284,6 +286,23 @@ async def factory() -> Any:
284286 assert result is sentinel
285287 assert calls == 2
286288
289+ async def test_retry_on_stale_connection_retries_once_on_client_os_error (self ) -> None :
290+ calls = 0
291+ sentinel = object ()
292+
293+ async def factory () -> Any :
294+ nonlocal calls
295+ calls += 1
296+ if calls == 1 :
297+ # Kernel-level reset (ECONNRESET) that can appear before
298+ # aiohttp's own server-side detection kicks in.
299+ raise aiohttp .ClientOSError ("connection reset by peer" )
300+ return sentinel
301+
302+ result = await _retry_on_stale_connection (factory , operation = "test_op" )
303+ assert result is sentinel
304+ assert calls == 2
305+
287306 async def test_retry_on_stale_connection_does_not_retry_other_errors (self ) -> None :
288307 calls = 0
289308 docker_error = DockerError (
@@ -307,24 +326,111 @@ async def test_retry_on_stale_connection_propagates_persistent_failure(self) ->
307326 async def factory () -> Any :
308327 nonlocal calls
309328 calls += 1
310- raise aiohttp .ClientConnectionError ( f"persistent failure { calls } " )
329+ raise aiohttp .ServerDisconnectedError ( )
311330
312- with pytest .raises (aiohttp .ClientConnectionError ):
331+ with pytest .raises (aiohttp .ServerDisconnectedError ):
313332 await _retry_on_stale_connection (factory , operation = "test_op" )
314333 # First attempt + one retry = 2 invocations total.
315334 assert calls == 2
316335
336+ async def test_server_timeout_is_not_retried (self ) -> None :
337+ """``ServerTimeoutError`` must propagate on the first attempt.
338+
339+ Regression guard for the narrowed catch tuple: a long-running
340+ ``images.push`` / ``images.pull`` that blows its ``timeout=`` is a
341+ response-too-slow signal, not a stale-pool symptom. It must NOT be
342+ silently retried (which would emit a misleading "stale aiodocker
343+ connection" warning and mask real slowness).
344+ """
345+ calls = 0
346+
347+ async def factory () -> Any :
348+ nonlocal calls
349+ calls += 1
350+ raise aiohttp .ServerTimeoutError ("simulated response timeout" )
351+
352+ with pytest .raises (aiohttp .ServerTimeoutError ):
353+ await _retry_on_stale_connection (factory , operation = "test_op" )
354+ assert calls == 1
355+
317356
318357async def test_check_image_retries_on_stale_connection (agent : DockerAgent , mocker : Any ) -> None :
319358 """``check_image`` should absorb a one-shot stale-socket error."""
320359 behavior = AutoPullBehavior .DIGEST
321360 inspect_mock = AsyncMock (
322361 side_effect = [
323- aiohttp .ClientConnectionError ( "stale socket" ),
362+ aiohttp .ServerDisconnectedError ( ),
324363 digest_matching_image_info ,
325364 ],
326365 )
327366 mocker .patch .object (agent .docker .images , "inspect" , new = inspect_mock )
328367 pull = await agent .check_image (imgref , query_digest , behavior )
329368 assert not pull
330369 assert inspect_mock .await_count == 2
370+
371+
372+ async def test_container_create_retries_on_stale_connection (
373+ agent : DockerAgent , mocker : Any
374+ ) -> None :
375+ """``resolve_image_distro`` (a wrapped ``containers.create`` call-site) must
376+ absorb a one-shot stale-socket error on the create call.
377+
378+ This exercises a real wrapped method — not the helper in isolation — so the
379+ narrowed retry tuple is validated end-to-end on the ``containers.create``
380+ path (the smallest wrapped call-site of ``create`` on ``DockerAgent``).
381+ """
382+ # Mock valkey_stat_client so the cache miss path is taken and the distro
383+ # write at the end is a no-op. ``close`` is also awaitable so shutdown can
384+ # run cleanly in the ``agent`` fixture teardown.
385+ valkey_client = MagicMock ()
386+ valkey_client .get_image_distro = AsyncMock (return_value = None )
387+ valkey_client .set_image_distro = AsyncMock (return_value = None )
388+ valkey_client .close = AsyncMock (return_value = None )
389+ valkey_client .set_agent_container_count = AsyncMock (return_value = None )
390+ mocker .patch .object (agent , "valkey_stat_client" , new = valkey_client )
391+
392+ # The probe container mock: start/wait/stop/delete are no-ops, log returns
393+ # a musl-identifying line so resolve_image_distro short-circuits on alpine.
394+ probe_container = MagicMock ()
395+ probe_container .start = AsyncMock (return_value = None )
396+ probe_container .wait = AsyncMock (return_value = None )
397+ probe_container .log = AsyncMock (return_value = ["musl libc (x86_64)" ])
398+ probe_container .stop = AsyncMock (return_value = None )
399+ probe_container .delete = AsyncMock (return_value = None )
400+
401+ # First create attempt hits a stale pooled socket; second attempt succeeds.
402+ create_mock = AsyncMock (
403+ side_effect = [
404+ aiohttp .ServerDisconnectedError (),
405+ probe_container ,
406+ ],
407+ )
408+ mocker .patch .object (agent .docker .containers , "create" , new = create_mock )
409+
410+ image_config : ImageConfig = {
411+ "canonical" : "lablup/lua:5.3-alpine3.8" ,
412+ "project" : "lablup" ,
413+ "architecture" : DEFAULT_IMAGE_ARCH ,
414+ "digest" : "sha256:b000000000000000000000000000000000000000000000000000000000000001" ,
415+ "repo_digest" : None ,
416+ "registry" : {
417+ "name" : "index.docker.io" ,
418+ "url" : "https://index.docker.io" ,
419+ "username" : None ,
420+ "password" : None ,
421+ },
422+ "labels" : {},
423+ "is_local" : False ,
424+ "auto_pull" : AutoPullBehavior .DIGEST ,
425+ }
426+ distro = await agent .resolve_image_distro (image_config )
427+
428+ assert distro == "alpine3.8"
429+ assert create_mock .await_count == 2
430+ # Downstream container lifecycle methods must have been invoked exactly
431+ # once against the (successfully created) probe container.
432+ probe_container .start .assert_awaited_once ()
433+ probe_container .wait .assert_awaited_once ()
434+ probe_container .log .assert_awaited_once ()
435+ probe_container .stop .assert_awaited_once ()
436+ probe_container .delete .assert_awaited_once ()
0 commit comments