-
Notifications
You must be signed in to change notification settings - Fork 175
refactor(BA-5860): Default to sysfs-first CPU/memory stats on native Linux #11223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
edae4b0
b4fe2ce
9cc2b7e
99d5145
1a8903c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| Default to cgroup (sysfs) stat collection on native Linux hosts, falling back to the Docker API only on linuxkit or read failure. | ||
| Note: reported container memory usage may step down on hosts previously using `stats-type: docker`, because sysfs excludes inactive file cache (matching `docker stats`). | ||
| Block-I/O readings may also shift on cgroup v1 hosts, because the sysfs path reads `blkio.throttle.io_service_bytes` while the Docker API path reads `blkio_stats.io_service_bytes_recursive` (which sums across nested cgroups). | ||
| Dashboards or autoscaling thresholds tuned to the old values should be re-evaluated after upgrade. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,12 +8,13 @@ | |
| from collections.abc import Collection, Iterable, Mapping, Sequence | ||
| from decimal import Decimal | ||
| from pathlib import Path | ||
| from typing import Any, cast | ||
| from typing import Any, assert_never, cast | ||
|
|
||
| import aiohttp | ||
| import psutil | ||
| from aiodocker.docker import Docker, DockerContainer | ||
| from aiodocker.exceptions import DockerError | ||
| from cachetools import LRUCache | ||
|
|
||
| from ai.backend.agent import __version__ # pants: no-infer-dep | ||
| from ai.backend.agent.alloc_map import AllocationStrategy | ||
|
|
@@ -73,6 +74,33 @@ | |
| _CONTAINER_STAT_TIMEOUT: float = 2.0 | ||
| _INVALID_PID: int = 0 | ||
|
|
||
| # Tracks containers for which we have already logged a cgroup->Docker-API | ||
| # fallback warning, to avoid log spam on persistent read failures. | ||
| # Bounded LRU cache to prevent unbounded growth on hosts with high container churn; | ||
| # oldest entries are evicted on overflow, so a recreated container may re-warn. | ||
| # Shared across CPUPlugin / MemoryPlugin since keys are namespaced via the `plugin:` prefix. | ||
| _CGROUP_FALLBACK_WARN_CACHE_SIZE = 1024 | ||
| _cgroup_fallback_warned: LRUCache[str, None] = LRUCache( | ||
| maxsize=_CGROUP_FALLBACK_WARN_CACHE_SIZE, | ||
| ) | ||
|
|
||
|
|
||
def _warn_cgroup_fallback_once(plugin: str, container_id: str) -> None:
    """Log the cgroup->Docker-API fallback warning at most once per key.

    The dedupe key is ``"{plugin}:{container_id}"``, stored in the
    module-level ``_cgroup_fallback_warned`` cache.

    Args:
        plugin: Name of the calling plugin; used as the key prefix and
            as the first field of the log message.
        container_id: Full container ID; only its first 7 characters are
            written to the log.
    """
    key = f"{plugin}:{container_id}"
    try:
        # Probe with a subscript lookup rather than `in`: on an LRU cache only
        # item access refreshes recency, so a container that keeps failing
        # stays suppressed instead of aging out in insertion order and
        # warning again.
        _ = _cgroup_fallback_warned[key]
    except KeyError:
        _cgroup_fallback_warned[key] = None
    else:
        return
    log.warning(
        "{0}: cgroup sysfs read failed for container {1}; falling back to Docker API",
        plugin,
        container_id[:7],
    )
|
Comment on lines
+77
to
+97
|
||
|
|
||
|
Comment on lines
+88
to
+98
|
||
|
|
||
| def _is_linuxkit(local_config: Mapping[str, Any]) -> bool: | ||
| return cast(str, local_config["agent"]["docker-mode"]) == "linuxkit" | ||
|
|
||
|
|
||
| # The list of pruned fstype when checking the filesystem usage statistics. | ||
| pruned_disk_types = frozenset([ | ||
| "vfat", | ||
|
|
@@ -306,12 +334,20 @@ async def api_impl(container_id: str) -> float | None: | |
| cpu_usage = cast(float, nmget(ret, "cpu_stats.cpu_usage.total_usage", 0)) | ||
| return cpu_usage / 1e6 | ||
|
|
||
| if ctx.mode == StatModes.CGROUP: | ||
| impl = sysfs_impl | ||
| elif ctx.mode == StatModes.DOCKER: | ||
| impl = api_impl | ||
| else: | ||
| raise RuntimeError("should not reach here") | ||
| async def cgroup_first_impl(container_id: str) -> float | None: | ||
| cpu_used = await sysfs_impl(container_id) | ||
| if cpu_used is None: | ||
| _warn_cgroup_fallback_once("CPUPlugin", container_id) | ||
| return await api_impl(container_id) | ||
| return cpu_used | ||
|
|
||
| match ctx.mode: | ||
| case StatModes.CGROUP if not _is_linuxkit(self.local_config): | ||
| impl = cgroup_first_impl | ||
| case StatModes.CGROUP | StatModes.DOCKER: | ||
| impl = api_impl | ||
| case _: | ||
| assert_never(ctx.mode) | ||
|
|
||
| tasks = [] | ||
| for cid in container_ids: | ||
|
|
@@ -819,12 +855,22 @@ async def api_impl( | |
| scratch_sz, | ||
| ) | ||
|
|
||
| if ctx.mode == StatModes.CGROUP: | ||
| impl = sysfs_impl | ||
| elif ctx.mode == StatModes.DOCKER: | ||
| impl = api_impl | ||
| else: | ||
| raise RuntimeError("should not reach here") | ||
| async def cgroup_first_impl( | ||
| container_id: str, | ||
| ) -> tuple[int, int, int, int, int, int, int] | None: | ||
| result = await sysfs_impl(container_id) | ||
| if result is None: | ||
| _warn_cgroup_fallback_once("MemoryPlugin", container_id) | ||
| return await api_impl(container_id) | ||
| return result | ||
|
|
||
| match ctx.mode: | ||
| case StatModes.CGROUP if not _is_linuxkit(self.local_config): | ||
| impl = cgroup_first_impl | ||
| case StatModes.CGROUP | StatModes.DOCKER: | ||
| impl = api_impl | ||
| case _: | ||
| assert_never(ctx.mode) | ||
|
|
||
| per_container_mem_used_bytes = {} | ||
| per_container_io_read_bytes = {} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description says there are no changes to network/IO stat collection, but switching the default
`stats_type` to auto-select cgroup on native Linux will cause `MemoryPlugin.gather_container_measures()` to use its sysfs implementation by default (which collects io/net via cgroup + /proc), rather than the Docker stats API. If the intent is to keep network/IO coming from the Docker stats API by default, the mode selection likely needs to be split (CPU/memory via cgroup, net/IO via API), or the PR description should be updated to reflect the behavior change.