-
Notifications
You must be signed in to change notification settings - Fork 175
refactor(BA-5860): Default to sysfs-first CPU/memory stats on native Linux #11223
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
edae4b0
b4fe2ce
9cc2b7e
99d5145
1a8903c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Default to cgroup (sysfs) stat collection on native Linux hosts, falling back to the Docker API only on linuxkit or read failure. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,6 +73,27 @@ | |
| _CONTAINER_STAT_TIMEOUT: float = 2.0 | ||
| _INVALID_PID: int = 0 | ||
|
|
||
| # Tracks containers for which we have already logged a cgroup->Docker-API | ||
| # fallback warning, to avoid log spam on persistent read failures. | ||
| _cgroup_fallback_warned: set[str] = set() | ||
|
|
||
|
|
||
| def _warn_cgroup_fallback_once(plugin: str, container_id: str) -> None: | ||
| key = f"{plugin}:{container_id}" | ||
| if key in _cgroup_fallback_warned: | ||
| return | ||
| _cgroup_fallback_warned.add(key) | ||
| log.warning( | ||
| "{0}: cgroup sysfs read failed for container {1}; falling back to Docker API", | ||
| plugin, | ||
| container_id[:7], | ||
| ) | ||
|
Comment on lines
+77
to
+97
|
||
|
|
||
|
Comment on lines
+88
to
+98
|
||
|
|
||
| def _is_linuxkit(local_config: Mapping[str, Any]) -> bool: | ||
| return bool(local_config["agent"]["docker-mode"] == "linuxkit") | ||
|
|
||
|
|
||
| # The list of pruned fstype when checking the filesystem usage statistics. | ||
| pruned_disk_types = frozenset([ | ||
| "vfat", | ||
|
|
@@ -306,12 +327,20 @@ async def api_impl(container_id: str) -> float | None: | |
| cpu_usage = cast(float, nmget(ret, "cpu_stats.cpu_usage.total_usage", 0)) | ||
| return cpu_usage / 1e6 | ||
|
|
||
| if ctx.mode == StatModes.CGROUP: | ||
| impl = sysfs_impl | ||
| elif ctx.mode == StatModes.DOCKER: | ||
| impl = api_impl | ||
| else: | ||
| raise RuntimeError("should not reach here") | ||
| async def cgroup_first_impl(container_id: str) -> float | None: | ||
| cpu_used = await sysfs_impl(container_id) | ||
| if cpu_used is None: | ||
| _warn_cgroup_fallback_once("CPUPlugin", container_id) | ||
| return await api_impl(container_id) | ||
| return cpu_used | ||
|
|
||
| match ctx.mode: | ||
| case StatModes.CGROUP if not _is_linuxkit(self.local_config): | ||
| impl = cgroup_first_impl | ||
| case StatModes.CGROUP | StatModes.DOCKER: | ||
| impl = api_impl | ||
| case _: | ||
| raise RuntimeError("should not reach here") | ||
|
|
||
| tasks = [] | ||
| for cid in container_ids: | ||
|
|
@@ -819,12 +848,22 @@ async def api_impl( | |
| scratch_sz, | ||
| ) | ||
|
|
||
| if ctx.mode == StatModes.CGROUP: | ||
| impl = sysfs_impl | ||
| elif ctx.mode == StatModes.DOCKER: | ||
| impl = api_impl | ||
| else: | ||
| raise RuntimeError("should not reach here") | ||
| async def cgroup_first_impl( | ||
| container_id: str, | ||
| ) -> tuple[int, int, int, int, int, int, int] | None: | ||
| result = await sysfs_impl(container_id) | ||
| if result is None: | ||
| _warn_cgroup_fallback_once("MemoryPlugin", container_id) | ||
| return await api_impl(container_id) | ||
| return result | ||
|
|
||
| match ctx.mode: | ||
| case StatModes.CGROUP if not _is_linuxkit(self.local_config): | ||
| impl = cgroup_first_impl | ||
| case StatModes.CGROUP | StatModes.DOCKER: | ||
| impl = api_impl | ||
| case _: | ||
| raise RuntimeError("should not reach here") | ||
|
||
|
|
||
| per_container_mem_used_bytes = {} | ||
| per_container_io_read_bytes = {} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description says there are no changes to network/IO stat collection, but switching the default `stats_type` to auto-select cgroup on native Linux will cause `MemoryPlugin.gather_container_measures()` to use its sysfs implementation by default (which collects io/net via cgroup + /proc), rather than the Docker stats API. If the intent is to keep network/IO coming from the Docker stats API by default, the mode selection likely needs to be split (CPU/memory via cgroup, net/IO via API) or the PR description should be updated to reflect the behavior change.