Skip to content

Commit 5d46d65

Browse files
rapsealkclaude
andcommitted
refactor(agent): Extract NUMA resolution and parametrize CpusetMems tests
Address review comments from @jopemachine on PR #11222: - Factor the NUMA-locality branch out of `generate_docker_args` into `CPUPlugin._resolve_node_local_mem`, so the main method stays flat and the helper can be unit-tested without plumbing a full `device_alloc`. - Collapse the four scenario-specific tests into a single `_NumaScenario`-keyed fixture + parametrized tests — one directly against `_resolve_node_local_mem`, one end-to-end through `generate_docker_args`. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 7026ac1 commit 5d46d65

2 files changed

Lines changed: 130 additions & 129 deletions

File tree

src/ai/backend/agent/docker/intrinsic.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,33 @@ async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
426426
# TODO: move the sysconf hook in libbaihook.so here
427427
return []
428428

429+
@staticmethod
430+
def _resolve_node_local_mem(cores: list[int]) -> str | None:
431+
"""Return the NUMA node id (as a string suitable for ``CpusetMems``) when
432+
every core in ``cores`` is on the same node, otherwise ``None``.
433+
434+
Returns ``None`` when:
435+
- NUMA is unsupported (non-Linux hosts, Linux without libnuma.so,
436+
Docker Desktop, WSL, etc.) or the host exposes a single node —
437+
otherwise ``libnuma.node_of_cpu`` would fall back to ``0`` and every
438+
container would be pinned to ``CpusetMems="0"``.
439+
- ``libnuma.node_of_cpu`` cannot resolve a core (returns a negative id).
440+
- The allocation spans multiple NUMA nodes — in which case we
441+
intentionally leave ``CpusetMems`` unset so Docker / the kernel
442+
default NUMA memory placement policy can apply.
443+
"""
444+
if libnuma.num_nodes() <= 1:
445+
return None
446+
allocated_nodes: set[int] = set()
447+
for core in cores:
448+
node = libnuma.node_of_cpu(core)
449+
if node < 0:
450+
return None
451+
allocated_nodes.add(node)
452+
if len(allocated_nodes) != 1:
453+
return None
454+
return str(next(iter(allocated_nodes)))
455+
429456
async def generate_docker_args(
430457
self,
431458
docker: Docker,
@@ -437,23 +464,9 @@ async def generate_docker_args(
437464
"Cpus": len(cores),
438465
"CpusetCpus": ",".join(sorted_core_ids),
439466
}
440-
# Skip CpusetMems entirely when NUMA is unsupported (non-Linux hosts,
441-
# Linux without libnuma.so, Docker Desktop, WSL, etc.) or when the host
442-
# exposes a single node; libnuma.node_of_cpu would otherwise fall back
443-
# to 0 and cause every container to be pinned to "CpusetMems": "0".
444-
if libnuma.num_nodes() > 1:
445-
allocated_nodes: set[int] = set()
446-
for core in cores:
447-
node = libnuma.node_of_cpu(core)
448-
if node < 0:
449-
allocated_nodes.clear()
450-
break
451-
allocated_nodes.add(node)
452-
# Pin memory only when the CPU allocation is fully node-local.
453-
# For multi-node CPU allocations, intentionally leave CpusetMems unset
454-
# so Docker/kernel default NUMA memory placement policy can apply.
455-
if len(allocated_nodes) == 1:
456-
host_config["CpusetMems"] = str(next(iter(allocated_nodes)))
467+
cpuset_mems = self._resolve_node_local_mem(cores)
468+
if cpuset_mems is not None:
469+
host_config["CpusetMems"] = cpuset_mems
457470
return {
458471
"HostConfig": host_config,
459472
}

tests/unit/agent/test_docker_intrinsic.py

Lines changed: 100 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,29 @@
2121
from ai.backend.common.types import DeviceId, SlotName
2222

2323

24+
@contextmanager
25+
def _patched_libnuma(
26+
core_to_node: dict[int, int],
27+
num_nodes: int,
28+
) -> Generator[None, None, None]:
29+
"""Patch ``libnuma.num_nodes`` and ``libnuma.node_of_cpu`` used by
30+
``CPUPlugin._resolve_node_local_mem``. ``node_of_cpu`` returns ``-1`` for
31+
cores missing from the map, matching real libnuma's behavior when NUMA
32+
info is unavailable.
33+
"""
34+
with (
35+
patch(
36+
"ai.backend.agent.docker.intrinsic.libnuma.num_nodes",
37+
return_value=num_nodes,
38+
),
39+
patch(
40+
"ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
41+
side_effect=lambda core: core_to_node.get(core, -1),
42+
),
43+
):
44+
yield
45+
46+
2447
class BaseDockerIntrinsicTest:
2548
"""Shared fixtures for Docker intrinsic plugin tests."""
2649

@@ -614,8 +637,74 @@ def test_raises_oserror_for_nonexistent_pid(self) -> None:
614637
read_proc_net_dev(999999999)
615638

616639

640+
@dataclass(frozen=True)
641+
class _NumaScenario:
642+
num_nodes: int
643+
core_to_node: dict[int, int]
644+
cores: list[int]
645+
expected_cpuset_mems: str | None
646+
647+
648+
_NUMA_SCENARIOS: dict[str, _NumaScenario] = {
649+
# All allocated cores on node 0 → pin memory to "0".
650+
"node_local_allocation": _NumaScenario(
651+
num_nodes=2,
652+
core_to_node={0: 0, 1: 0, 2: 1, 3: 1},
653+
cores=[0, 1],
654+
expected_cpuset_mems="0",
655+
),
656+
# Cores span nodes 0 and 1 → let the kernel default policy apply.
657+
"multi_node_allocation": _NumaScenario(
658+
num_nodes=2,
659+
core_to_node={0: 0, 1: 0, 2: 1, 3: 1},
660+
cores=[0, 2],
661+
expected_cpuset_mems=None,
662+
),
663+
# libnuma can't resolve core 1 → side_effect returns -1 for unmapped cores.
664+
"unknown_core": _NumaScenario(
665+
num_nodes=2,
666+
core_to_node={0: 0},
667+
cores=[0, 1],
668+
expected_cpuset_mems=None,
669+
),
670+
# libnuma reports -1 for a known core (NUMA info unavailable).
671+
"negative_node_id": _NumaScenario(
672+
num_nodes=2,
673+
core_to_node={0: 0, 1: -1},
674+
cores=[0, 1],
675+
expected_cpuset_mems=None,
676+
),
677+
# Non-NUMA host (macOS, Docker Desktop, WSL, Linux w/o libnuma.so):
678+
# num_nodes==1 must short-circuit before inspecting per-core nodes so
679+
# containers are not unconditionally pinned to CpusetMems="0".
680+
"non_numa_host": _NumaScenario(
681+
num_nodes=1,
682+
core_to_node={0: 0, 1: 0},
683+
cores=[0, 1],
684+
expected_cpuset_mems=None,
685+
),
686+
}
687+
688+
689+
@pytest.fixture(params=list(_NUMA_SCENARIOS), ids=list(_NUMA_SCENARIOS))
690+
def numa_scenario(request: pytest.FixtureRequest) -> _NumaScenario:
691+
return _NUMA_SCENARIOS[request.param]
692+
693+
694+
class TestCPUPluginResolveNodeLocalMem:
695+
"""Unit tests for ``CPUPlugin._resolve_node_local_mem`` NUMA-locality logic."""
696+
697+
def test_returns_expected_cpuset_mems(self, numa_scenario: _NumaScenario) -> None:
698+
with _patched_libnuma(numa_scenario.core_to_node, numa_scenario.num_nodes):
699+
result = CPUPlugin._resolve_node_local_mem(numa_scenario.cores)
700+
701+
assert result == numa_scenario.expected_cpuset_mems
702+
703+
617704
class TestCPUPluginGenerateDockerArgsNumaLocality:
618-
"""Tests for CPUPlugin.generate_docker_args() NUMA-locality CpusetMems logic."""
705+
"""Integration tests for ``CPUPlugin.generate_docker_args`` covering the
706+
end-to-end wiring of ``_resolve_node_local_mem`` into ``HostConfig``.
707+
"""
619708

620709
@pytest.fixture
621710
def cpu_plugin(self) -> CPUPlugin:
@@ -627,122 +716,21 @@ def _device_alloc(core_ids: list[int]) -> dict[SlotName, dict[DeviceId, Decimal]
627716
SlotName("cpu"): {DeviceId(str(cid)): Decimal("1") for cid in core_ids},
628717
}
629718

630-
@staticmethod
631-
@contextmanager
632-
def _patch_node_of_cpu(
633-
core_to_node: dict[int, int],
634-
*,
635-
num_nodes: int = 2,
636-
) -> Generator[None, None, None]:
637-
"""Patch libnuma.node_of_cpu; return -1 for any core missing from the map
638-
(matches real libnuma's behavior for unknown cores when NUMA info is
639-
unavailable).
640-
641-
Also patches libnuma.num_nodes to report a multi-node host by default
642-
so the NUMA-aware branch is exercised. Tests covering the non-NUMA
643-
short-circuit can pass ``num_nodes=1``.
644-
"""
645-
with (
646-
patch(
647-
"ai.backend.agent.docker.intrinsic.libnuma.num_nodes",
648-
return_value=num_nodes,
649-
),
650-
patch(
651-
"ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
652-
side_effect=lambda core: core_to_node.get(core, -1),
653-
),
654-
):
655-
yield
656-
657-
async def test_single_node_allocation_sets_cpuset_mems(
658-
self,
659-
cpu_plugin: CPUPlugin,
660-
) -> None:
661-
"""When all allocated cores are on the same NUMA node, CpusetMems is pinned
662-
to that node as a string."""
663-
with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}):
664-
result = await cpu_plugin.generate_docker_args(
665-
AsyncMock(),
666-
self._device_alloc([0, 1]),
667-
)
668-
669-
host_config = result["HostConfig"]
670-
assert host_config["CpusetMems"] == "0"
671-
# Sanity: core-list plumbing still works.
672-
assert host_config["Cpus"] == 2
673-
assert host_config["CpusetCpus"] == "0,1"
674-
675-
async def test_multi_node_allocation_omits_cpuset_mems(
676-
self,
677-
cpu_plugin: CPUPlugin,
678-
) -> None:
679-
"""When cores span multiple NUMA nodes, CpusetMems must be omitted so that
680-
the Docker/kernel default NUMA memory placement policy can apply."""
681-
with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}):
682-
result = await cpu_plugin.generate_docker_args(
683-
AsyncMock(),
684-
self._device_alloc([0, 2]),
685-
)
686-
687-
host_config = result["HostConfig"]
688-
assert "CpusetMems" not in host_config
689-
# Sanity: core-list plumbing still works.
690-
assert host_config["Cpus"] == 2
691-
assert host_config["CpusetCpus"] == "0,2"
692-
693-
@pytest.mark.parametrize(
694-
"core_to_node",
695-
[
696-
{0: 0},
697-
{0: 0, 1: -1},
698-
],
699-
ids=["unknown_node", "negative_node"],
700-
)
701-
async def test_unknown_or_negative_node_omits_cpuset_mems(
702-
self,
703-
cpu_plugin: CPUPlugin,
704-
core_to_node: dict[int, int],
705-
) -> None:
706-
"""When any allocated core maps to an unknown (libnuma returns -1) or
707-
explicitly negative NUMA node, CpusetMems must be omitted.
708-
709-
`unknown_node` covers the case where libnuma cannot resolve a core (the
710-
patched side_effect returns -1 for unmapped cores). `negative_node` covers
711-
the case where libnuma reports -1 for a known core (NUMA info unavailable).
712-
Both collapse to the same `node < 0` branch in the SUT.
713-
"""
714-
with self._patch_node_of_cpu(core_to_node):
715-
result = await cpu_plugin.generate_docker_args(
716-
AsyncMock(),
717-
self._device_alloc([0, 1]),
718-
)
719-
720-
host_config = result["HostConfig"]
721-
assert "CpusetMems" not in host_config
722-
# Sanity: core-list plumbing still works.
723-
assert host_config["Cpus"] == 2
724-
assert host_config["CpusetCpus"] == "0,1"
725-
726-
async def test_non_numa_host_omits_cpuset_mems(
719+
async def test_host_config_matches_scenario(
727720
self,
728721
cpu_plugin: CPUPlugin,
722+
numa_scenario: _NumaScenario,
729723
) -> None:
730-
"""On non-NUMA / non-Linux hosts (macOS, Docker Desktop, WSL, Linux
731-
without libnuma.so) libnuma.num_nodes() reports 1 and node_of_cpu()
732-
hardcodes 0. The plugin must short-circuit before inspecting per-core
733-
nodes so containers are not unconditionally pinned to CpusetMems="0".
734-
"""
735-
# node_of_cpu would return 0 for every core on a non-NUMA host; assert
736-
# we never reach that branch by mapping cores to a bogus node that
737-
# would otherwise produce a stale CpusetMems assignment.
738-
with self._patch_node_of_cpu({0: 0, 1: 0}, num_nodes=1):
724+
with _patched_libnuma(numa_scenario.core_to_node, numa_scenario.num_nodes):
739725
result = await cpu_plugin.generate_docker_args(
740726
AsyncMock(),
741-
self._device_alloc([0, 1]),
727+
self._device_alloc(numa_scenario.cores),
742728
)
743729

744730
host_config = result["HostConfig"]
745-
assert "CpusetMems" not in host_config
746-
# Sanity: core-list plumbing still works.
747-
assert host_config["Cpus"] == 2
748-
assert host_config["CpusetCpus"] == "0,1"
731+
assert host_config["Cpus"] == len(numa_scenario.cores)
732+
assert host_config["CpusetCpus"] == ",".join(str(c) for c in sorted(numa_scenario.cores))
733+
if numa_scenario.expected_cpuset_mems is None:
734+
assert "CpusetMems" not in host_config
735+
else:
736+
assert host_config["CpusetMems"] == numa_scenario.expected_cpuset_mems

0 commit comments

Comments
 (0)