Skip to content

Commit 9cc2b7e

Browse files
rapsealkclaude
andcommitted
refactor(agent): Address review feedback on sysfs-first stats
- Extend changelog to call out the memory metric shift (sysfs excludes inactive file cache; dashboards may step-down). - Bound the _cgroup_fallback_warned set to prevent unbounded growth on hosts with high container churn. - Replace RuntimeError("should not reach here") with assert_never() to satisfy the BackendAIError-only rule and strengthen exhaustive match-statement type checking. - Add parallel MemoryPlugin fallback + linuxkit tests and a dedup-warn test. Refs #11220 Refs #11223 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent b4fe2ce commit 9cc2b7e

3 files changed

Lines changed: 111 additions & 5 deletions

File tree

changes/11223.enhance.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
Default to cgroup (sysfs) stat collection on native Linux hosts, falling back to the Docker API only on linuxkit or read failure.
2+
Note: reported container memory usage may step down on hosts previously using `stats-type: docker`, because sysfs excludes inactive file cache (matching `docker stats`).
3+
Dashboards or autoscaling thresholds tuned to the old (higher) values should be re-evaluated after upgrade.

src/ai/backend/agent/docker/intrinsic.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@
88
from collections.abc import Collection, Iterable, Mapping, Sequence
99
from decimal import Decimal
1010
from pathlib import Path
11-
from typing import Any, cast
11+
from typing import Any, assert_never, cast
1212

1313
import aiohttp
1414
import psutil
1515
from aiodocker.docker import Docker, DockerContainer
1616
from aiodocker.exceptions import DockerError
17+
from cachetools import LRUCache
1718

1819
from ai.backend.agent import __version__ # pants: no-infer-dep
1920
from ai.backend.agent.alloc_map import AllocationStrategy
@@ -75,14 +76,20 @@
7576

7677
# Tracks containers for which we have already logged a cgroup->Docker-API
7778
# fallback warning, to avoid log spam on persistent read failures.
78-
_cgroup_fallback_warned: set[str] = set()
79+
# Bounded LRU cache to prevent unbounded growth on hosts with high container churn;
80+
# oldest entries are evicted on overflow, so a recreated container may re-warn.
81+
# Shared across CPUPlugin / MemoryPlugin since keys are namespaced via the `plugin:` prefix.
82+
_CGROUP_FALLBACK_WARN_CACHE_SIZE = 1024
83+
_cgroup_fallback_warned: LRUCache[str, None] = LRUCache(
84+
maxsize=_CGROUP_FALLBACK_WARN_CACHE_SIZE,
85+
)
7986

8087

8188
def _warn_cgroup_fallback_once(plugin: str, container_id: str) -> None:
8289
key = f"{plugin}:{container_id}"
8390
if key in _cgroup_fallback_warned:
8491
return
85-
_cgroup_fallback_warned.add(key)
92+
_cgroup_fallback_warned[key] = None
8693
log.warning(
8794
"{0}: cgroup sysfs read failed for container {1}; falling back to Docker API",
8895
plugin,
@@ -340,7 +347,7 @@ async def cgroup_first_impl(container_id: str) -> float | None:
340347
case StatModes.CGROUP | StatModes.DOCKER:
341348
impl = api_impl
342349
case _:
343-
raise RuntimeError("should not reach here")
350+
assert_never(ctx.mode)
344351

345352
tasks = []
346353
for cid in container_ids:
@@ -863,7 +870,7 @@ async def cgroup_first_impl(
863870
case StatModes.CGROUP | StatModes.DOCKER:
864871
impl = api_impl
865872
case _:
866-
raise RuntimeError("should not reach here")
873+
assert_never(ctx.mode)
867874

868875
per_container_mem_used_bytes = {}
869876
per_container_io_read_bytes = {}

tests/unit/agent/test_docker_intrinsic.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010

1111
import pytest
1212

13+
from ai.backend.agent.docker import intrinsic
1314
from ai.backend.agent.docker.intrinsic import (
1415
ContainerNetStat,
1516
CPUPlugin,
1617
MemoryPlugin,
18+
_warn_cgroup_fallback_once,
1719
read_proc_net_dev,
1820
)
1921
from ai.backend.agent.stats import StatModes
@@ -277,6 +279,101 @@ async def test_sysfs_mode_uses_instance_docker_client(
277279
await memory_plugin.gather_container_measures(memory_cgroup_context, container_ids)
278280
mock_docker_cls.assert_not_called()
279281

282+
async def test_cgroup_mode_falls_back_to_api_on_sysfs_failure(
283+
self,
284+
memory_plugin: MemoryPlugin,
285+
container_ids: list[str],
286+
cgroup_stat_context: MagicMock,
287+
mock_fetch_api_stats: MagicMock,
288+
) -> None:
289+
"""When sysfs read fails in CGROUP mode, the Docker API is used
290+
as a per-read fallback instead of silently returning zero."""
291+
# Arrange: cgroup version that triggers "return None" in sysfs_impl.
292+
cgroup_stat_context.agent.get_cgroup_version = MagicMock(return_value="invalid")
293+
cgroup_stat_context.agent.get_cgroup_path = MagicMock(return_value=MagicMock())
294+
295+
results = await memory_plugin.gather_container_measures(cgroup_stat_context, container_ids)
296+
297+
assert mock_fetch_api_stats.call_count == len(container_ids)
298+
# api_impl returns mem_cur = 1024 * 1024 * 100 = 104857600 bytes
299+
for cid in container_ids:
300+
assert results[0].per_container[cid].value == 1024 * 1024 * 100
301+
302+
async def test_linuxkit_forces_api_even_in_cgroup_mode(
303+
self,
304+
container_ids: list[str],
305+
cgroup_stat_context: MagicMock,
306+
mock_fetch_api_stats: MagicMock,
307+
) -> None:
308+
"""On linuxkit hosts the API path is used even when mode is CGROUP."""
309+
plugin = MemoryPlugin.__new__(MemoryPlugin)
310+
plugin.local_config = {"agent": {"docker-mode": "linuxkit"}}
311+
plugin._docker = AsyncMock()
312+
313+
await plugin.gather_container_measures(cgroup_stat_context, container_ids)
314+
assert mock_fetch_api_stats.call_count == len(container_ids)
315+
316+
317+
class TestWarnCgroupFallbackOnce:
318+
"""Tests for _warn_cgroup_fallback_once() dedup and bounded-cache semantics."""
319+
320+
@pytest.fixture(autouse=True)
321+
def _reset_warn_cache(self) -> Generator[None, None, None]:
322+
"""Reset the module-level warn cache between tests to avoid bleed-through."""
323+
intrinsic._cgroup_fallback_warned.clear()
324+
yield
325+
intrinsic._cgroup_fallback_warned.clear()
326+
327+
def test_deduplicates_per_container(
328+
self,
329+
caplog: pytest.LogCaptureFixture,
330+
) -> None:
331+
"""The helper logs once per (plugin, container_id) and stays silent on
332+
subsequent calls for the same container."""
333+
with caplog.at_level("WARNING", logger="ai.backend.agent.docker.intrinsic"):
334+
_warn_cgroup_fallback_once("CPUPlugin", "container_abc")
335+
_warn_cgroup_fallback_once("CPUPlugin", "container_abc")
336+
_warn_cgroup_fallback_once("CPUPlugin", "container_abc")
337+
338+
warn_records = [r for r in caplog.records if r.levelname == "WARNING"]
339+
assert len(warn_records) == 1
340+
341+
# A different container should still warn.
342+
caplog.clear()
343+
with caplog.at_level("WARNING", logger="ai.backend.agent.docker.intrinsic"):
344+
_warn_cgroup_fallback_once("CPUPlugin", "container_xyz")
345+
warn_records = [r for r in caplog.records if r.levelname == "WARNING"]
346+
assert len(warn_records) == 1
347+
348+
# Same container under a different plugin namespace should also warn once.
349+
caplog.clear()
350+
with caplog.at_level("WARNING", logger="ai.backend.agent.docker.intrinsic"):
351+
_warn_cgroup_fallback_once("MemoryPlugin", "container_abc")
352+
warn_records = [r for r in caplog.records if r.levelname == "WARNING"]
353+
assert len(warn_records) == 1
354+
355+
def test_evicts_beyond_limit(
356+
self,
357+
caplog: pytest.LogCaptureFixture,
358+
) -> None:
359+
"""When the bounded cache overflows, the oldest entry is evicted and a
360+
previously-seen container may warn again."""
361+
cap = intrinsic._CGROUP_FALLBACK_WARN_CACHE_SIZE
362+
first_cid = "first_container"
363+
364+
with caplog.at_level("WARNING", logger="ai.backend.agent.docker.intrinsic"):
365+
# First warn for `first_cid`.
366+
_warn_cgroup_fallback_once("CPUPlugin", first_cid)
367+
# Fill the cache with `cap` distinct new entries to evict `first_cid`.
368+
for i in range(cap):
369+
_warn_cgroup_fallback_once("CPUPlugin", f"filler_{i}")
370+
# `first_cid` should have been evicted and now warn again.
371+
_warn_cgroup_fallback_once("CPUPlugin", first_cid)
372+
373+
warn_records = [r for r in caplog.records if r.levelname == "WARNING"]
374+
# 1 (first) + cap (fillers) + 1 (re-warn of first) = cap + 2
375+
assert len(warn_records) == cap + 2
376+
280377

281378
class TestMemoryPluginContainerPidValidation(BaseDockerIntrinsicTest):
282379
"""Tests for container PID validation before reading /proc/[pid]/net/dev."""

0 commit comments

Comments
 (0)