Skip to content

Commit 0eda461

Browse files
rapsealkclaude
andcommitted
refactor(agent): Skip CpusetMems on non-NUMA hosts and clean dummy tombstone
Addresses review feedback on PR #11222: - Skip CpusetMems entirely when libnuma reports NUMA unsupported or the host has a single node, so non-NUMA Linux hosts and macOS/WSL dev environments do not get CpusetMems="0" pinned on every container. - Add a unit test covering the non-NUMA short-circuit branch. - Remove the twin commented-out CpusetMems tombstone from src/ai/backend/agent/dummy/intrinsic.py. Refs #11217 Refs #11222 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 64529b2 commit 0eda461

3 files changed

Lines changed: 61 additions & 18 deletions

File tree

src/ai/backend/agent/docker/intrinsic.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -437,18 +437,23 @@ async def generate_docker_args(
437437
"Cpus": len(cores),
438438
"CpusetCpus": ",".join(sorted_core_ids),
439439
}
440-
allocated_nodes: set[int] = set()
441-
for core in cores:
442-
node = libnuma.node_of_cpu(core)
443-
if node < 0:
444-
allocated_nodes.clear()
445-
break
446-
allocated_nodes.add(node)
447-
# Pin memory only when the CPU allocation is fully node-local.
448-
# For multi-node CPU allocations, intentionally leave CpusetMems unset
449-
# so Docker/kernel default NUMA memory placement policy can apply.
450-
if len(allocated_nodes) == 1:
451-
host_config["CpusetMems"] = str(next(iter(allocated_nodes)))
440+
# Skip CpusetMems entirely when NUMA is unsupported (non-Linux hosts,
441+
# Linux without libnuma.so, Docker Desktop, WSL, etc.) or when the host
442+
# exposes a single node; libnuma.node_of_cpu would otherwise fall back
443+
# to 0 and cause every container to be pinned to "CpusetMems": "0".
444+
if libnuma.num_nodes() > 1:
445+
allocated_nodes: set[int] = set()
446+
for core in cores:
447+
node = libnuma.node_of_cpu(core)
448+
if node < 0:
449+
allocated_nodes.clear()
450+
break
451+
allocated_nodes.add(node)
452+
# Pin memory only when the CPU allocation is fully node-local.
453+
# For multi-node CPU allocations, intentionally leave CpusetMems unset
454+
# so Docker/kernel default NUMA memory placement policy can apply.
455+
if len(allocated_nodes) == 1:
456+
host_config["CpusetMems"] = str(next(iter(allocated_nodes)))
452457
return {
453458
"HostConfig": host_config,
454459
}

src/ai/backend/agent/dummy/intrinsic.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ async def generate_docker_args(
145145
"CpuQuota": int(100_000 * len(cores)),
146146
"Cpus": ",".join(sorted_core_ids),
147147
"CpusetCpus": ",".join(sorted_core_ids),
148-
# 'CpusetMems': f'{resource_spec.numa_node}',
149148
},
150149
}
151150

tests/unit/agent/test_docker_intrinsic.py

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -632,13 +632,28 @@ def _device_alloc(core_ids: list[int]) -> dict[SlotName, dict[DeviceId, Decimal]
632632

633633
@staticmethod
634634
@contextmanager
635-
def _patch_node_of_cpu(core_to_node: dict[int, int]) -> Generator[None, None, None]:
635+
def _patch_node_of_cpu(
636+
core_to_node: dict[int, int],
637+
*,
638+
num_nodes: int = 2,
639+
) -> Generator[None, None, None]:
636640
"""Patch libnuma.node_of_cpu; return -1 for any core missing from the map
637641
(matches real libnuma's behavior for unknown cores when NUMA info is
638-
unavailable)."""
639-
with patch(
640-
"ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
641-
side_effect=lambda core: core_to_node.get(core, -1),
642+
unavailable).
643+
644+
Also patches libnuma.num_nodes to report a multi-node host by default
645+
so the NUMA-aware branch is exercised. Tests covering the non-NUMA
646+
short-circuit can pass ``num_nodes=1``.
647+
"""
648+
with (
649+
patch(
650+
"ai.backend.agent.docker.intrinsic.libnuma.num_nodes",
651+
return_value=num_nodes,
652+
),
653+
patch(
654+
"ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
655+
side_effect=lambda core: core_to_node.get(core, -1),
656+
),
642657
):
643658
yield
644659

@@ -710,3 +725,27 @@ async def test_unknown_or_negative_node_omits_cpuset_mems(
710725
# Sanity: core-list plumbing still works.
711726
assert host_config["Cpus"] == 2
712727
assert host_config["CpusetCpus"] == "0,1"
728+
729+
async def test_non_numa_host_omits_cpuset_mems(
730+
self,
731+
cpu_plugin: CPUPlugin,
732+
) -> None:
733+
"""On non-NUMA / non-Linux hosts (macOS, Docker Desktop, WSL, Linux
734+
without libnuma.so) libnuma.num_nodes() reports 1 and node_of_cpu()
735+
hardcodes 0. The plugin must short-circuit before inspecting per-core
736+
nodes so containers are not unconditionally pinned to CpusetMems="0".
737+
"""
738+
# node_of_cpu would return 0 for every core on a non-NUMA host; assert
739+
# we never reach that branch by mapping cores to a bogus node that
740+
# would otherwise produce a stale CpusetMems assignment.
741+
with self._patch_node_of_cpu({0: 0, 1: 0}, num_nodes=1):
742+
result = await cpu_plugin.generate_docker_args(
743+
AsyncMock(),
744+
self._device_alloc([0, 1]),
745+
)
746+
747+
host_config = result["HostConfig"]
748+
assert "CpusetMems" not in host_config
749+
# Sanity: core-list plumbing still works.
750+
assert host_config["Cpus"] == 2
751+
assert host_config["CpusetCpus"] == "0,1"

0 commit comments

Comments
 (0)