refactor(agent): Extract NUMA resolution and parametrize CpusetMems tests

rapsealk · claude · rapsealk · commit 5d46d655ebb1 · 2026-04-24T22:25:27.000+09:00
Address review comments from @jopemachine on PR #11222:

- Factor the NUMA-locality branch out of `generate_docker_args` into `CPUPlugin._resolve_node_local_mem`, so the main method stays flat and the helper can be unit-tested without plumbing a full `device_alloc`.
- Collapse the four scenario-specific tests into a single `_NumaScenario`-keyed fixture plus two parametrized tests: one directly against `_resolve_node_local_mem`, and one end-to-end through `generate_docker_args`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/ai/backend/agent/docker/intrinsic.py b/src/ai/backend/agent/docker/intrinsic.py
@@ -426,6 +426,33 @@ async def get_hooks(self, distro: str, arch: str) -> Sequence[Path]:
         # TODO: move the sysconf hook in libbaihook.so here
         return []
 
+    @staticmethod
+    def _resolve_node_local_mem(cores: list[int]) -> str | None:
+        """Return the NUMA node id (as a string suitable for ``CpusetMems``) when
+        every core in ``cores`` is on the same node, otherwise ``None``.
+
+        Returns ``None`` when:
+        - NUMA is unsupported (non-Linux hosts, Linux without libnuma.so,
+          Docker Desktop, WSL, etc.) or the host exposes a single node.
+          Without this guard, ``libnuma.node_of_cpu`` would fall back to ``0``
+          and every container would be pinned to ``CpusetMems="0"``.
+        - ``libnuma.node_of_cpu`` cannot resolve a core (returns a negative id).
+        - The allocation spans multiple NUMA nodes — in which case we
+          intentionally leave ``CpusetMems`` unset so Docker / the kernel
+          default NUMA memory placement policy can apply.
+        """
+        if libnuma.num_nodes() <= 1:
+            return None
+        allocated_nodes: set[int] = set()
+        for core in cores:
+            node = libnuma.node_of_cpu(core)
+            if node < 0:
+                return None
+            allocated_nodes.add(node)
+        if len(allocated_nodes) != 1:
+            return None
+        return str(next(iter(allocated_nodes)))
+
     async def generate_docker_args(
         self,
         docker: Docker,
@@ -437,23 +464,9 @@ async def generate_docker_args(
             "Cpus": len(cores),
             "CpusetCpus": ",".join(sorted_core_ids),
         }
-        # Skip CpusetMems entirely when NUMA is unsupported (non-Linux hosts,
-        # Linux without libnuma.so, Docker Desktop, WSL, etc.) or when the host
-        # exposes a single node; libnuma.node_of_cpu would otherwise fall back
-        # to 0 and cause every container to be pinned to "CpusetMems": "0".
-        if libnuma.num_nodes() > 1:
-            allocated_nodes: set[int] = set()
-            for core in cores:
-                node = libnuma.node_of_cpu(core)
-                if node < 0:
-                    allocated_nodes.clear()
-                    break
-                allocated_nodes.add(node)
-            # Pin memory only when the CPU allocation is fully node-local.
-            # For multi-node CPU allocations, intentionally leave CpusetMems unset
-            # so Docker/kernel default NUMA memory placement policy can apply.
-            if len(allocated_nodes) == 1:
-                host_config["CpusetMems"] = str(next(iter(allocated_nodes)))
+        cpuset_mems = self._resolve_node_local_mem(cores)
+        if cpuset_mems is not None:
+            host_config["CpusetMems"] = cpuset_mems
         return {
             "HostConfig": host_config,
         }
diff --git a/tests/unit/agent/test_docker_intrinsic.py b/tests/unit/agent/test_docker_intrinsic.py
@@ -21,6 +21,29 @@
 from ai.backend.common.types import DeviceId, SlotName
 
 
+@contextmanager
+def _patched_libnuma(
+    core_to_node: dict[int, int],
+    num_nodes: int,
+) -> Generator[None, None, None]:
+    """Patch ``libnuma.num_nodes`` and ``libnuma.node_of_cpu`` used by
+    ``CPUPlugin._resolve_node_local_mem``. ``node_of_cpu`` returns ``-1`` for
+    cores missing from the map, matching real libnuma's behavior when NUMA
+    info is unavailable.
+    """
+    with (
+        patch(
+            "ai.backend.agent.docker.intrinsic.libnuma.num_nodes",
+            return_value=num_nodes,
+        ),
+        patch(
+            "ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
+            side_effect=lambda core: core_to_node.get(core, -1),
+        ),
+    ):
+        yield
+
+
 class BaseDockerIntrinsicTest:
     """Shared fixtures for Docker intrinsic plugin tests."""
 
@@ -614,8 +637,74 @@ def test_raises_oserror_for_nonexistent_pid(self) -> None:
             read_proc_net_dev(999999999)
 
 
+@dataclass(frozen=True)
+class _NumaScenario:
+    num_nodes: int
+    core_to_node: dict[int, int]
+    cores: list[int]
+    expected_cpuset_mems: str | None
+
+
+_NUMA_SCENARIOS: dict[str, _NumaScenario] = {
+    # All allocated cores on node 0 → pin memory to "0".
+    "node_local_allocation": _NumaScenario(
+        num_nodes=2,
+        core_to_node={0: 0, 1: 0, 2: 1, 3: 1},
+        cores=[0, 1],
+        expected_cpuset_mems="0",
+    ),
+    # Cores span nodes 0 and 1 → let the kernel default policy apply.
+    "multi_node_allocation": _NumaScenario(
+        num_nodes=2,
+        core_to_node={0: 0, 1: 0, 2: 1, 3: 1},
+        cores=[0, 2],
+        expected_cpuset_mems=None,
+    ),
+    # libnuma can't resolve core 1 → side_effect returns -1 for unmapped cores.
+    "unknown_core": _NumaScenario(
+        num_nodes=2,
+        core_to_node={0: 0},
+        cores=[0, 1],
+        expected_cpuset_mems=None,
+    ),
+    # libnuma reports -1 for a known core (NUMA info unavailable).
+    "negative_node_id": _NumaScenario(
+        num_nodes=2,
+        core_to_node={0: 0, 1: -1},
+        cores=[0, 1],
+        expected_cpuset_mems=None,
+    ),
+    # Non-NUMA host (macOS, Docker Desktop, WSL, Linux w/o libnuma.so):
+    # num_nodes==1 must short-circuit before inspecting per-core nodes so
+    # containers are not unconditionally pinned to CpusetMems="0".
+    "non_numa_host": _NumaScenario(
+        num_nodes=1,
+        core_to_node={0: 0, 1: 0},
+        cores=[0, 1],
+        expected_cpuset_mems=None,
+    ),
+}
+
+
+@pytest.fixture(params=list(_NUMA_SCENARIOS), ids=list(_NUMA_SCENARIOS))
+def numa_scenario(request: pytest.FixtureRequest) -> _NumaScenario:
+    return _NUMA_SCENARIOS[request.param]
+
+
+class TestCPUPluginResolveNodeLocalMem:
+    """Unit tests for ``CPUPlugin._resolve_node_local_mem`` NUMA-locality logic."""
+
+    def test_returns_expected_cpuset_mems(self, numa_scenario: _NumaScenario) -> None:
+        with _patched_libnuma(numa_scenario.core_to_node, numa_scenario.num_nodes):
+            result = CPUPlugin._resolve_node_local_mem(numa_scenario.cores)
+
+        assert result == numa_scenario.expected_cpuset_mems
+
+
 class TestCPUPluginGenerateDockerArgsNumaLocality:
-    """Tests for CPUPlugin.generate_docker_args() NUMA-locality CpusetMems logic."""
+    """Integration tests for ``CPUPlugin.generate_docker_args`` covering the
+    end-to-end wiring of ``_resolve_node_local_mem`` into ``HostConfig``.
+    """
 
     @pytest.fixture
     def cpu_plugin(self) -> CPUPlugin:
@@ -627,122 +716,21 @@ def _device_alloc(core_ids: list[int]) -> dict[SlotName, dict[DeviceId, Decimal]
             SlotName("cpu"): {DeviceId(str(cid)): Decimal("1") for cid in core_ids},
         }
 
-    @staticmethod
-    @contextmanager
-    def _patch_node_of_cpu(
-        core_to_node: dict[int, int],
-        *,
-        num_nodes: int = 2,
-    ) -> Generator[None, None, None]:
-        """Patch libnuma.node_of_cpu; return -1 for any core missing from the map
-        (matches real libnuma's behavior for unknown cores when NUMA info is
-        unavailable).
-
-        Also patches libnuma.num_nodes to report a multi-node host by default
-        so the NUMA-aware branch is exercised. Tests covering the non-NUMA
-        short-circuit can pass ``num_nodes=1``.
-        """
-        with (
-            patch(
-                "ai.backend.agent.docker.intrinsic.libnuma.num_nodes",
-                return_value=num_nodes,
-            ),
-            patch(
-                "ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu",
-                side_effect=lambda core: core_to_node.get(core, -1),
-            ),
-        ):
-            yield
-
-    async def test_single_node_allocation_sets_cpuset_mems(
-        self,
-        cpu_plugin: CPUPlugin,
-    ) -> None:
-        """When all allocated cores are on the same NUMA node, CpusetMems is pinned
-        to that node as a string."""
-        with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}):
-            result = await cpu_plugin.generate_docker_args(
-                AsyncMock(),
-                self._device_alloc([0, 1]),
-            )
-
-        host_config = result["HostConfig"]
-        assert host_config["CpusetMems"] == "0"
-        # Sanity: core-list plumbing still works.
-        assert host_config["Cpus"] == 2
-        assert host_config["CpusetCpus"] == "0,1"
-
-    async def test_multi_node_allocation_omits_cpuset_mems(
-        self,
-        cpu_plugin: CPUPlugin,
-    ) -> None:
-        """When cores span multiple NUMA nodes, CpusetMems must be omitted so that
-        the Docker/kernel default NUMA memory placement policy can apply."""
-        with self._patch_node_of_cpu({0: 0, 1: 0, 2: 1, 3: 1}):
-            result = await cpu_plugin.generate_docker_args(
-                AsyncMock(),
-                self._device_alloc([0, 2]),
-            )
-
-        host_config = result["HostConfig"]
-        assert "CpusetMems" not in host_config
-        # Sanity: core-list plumbing still works.
-        assert host_config["Cpus"] == 2
-        assert host_config["CpusetCpus"] == "0,2"
-
-    @pytest.mark.parametrize(
-        "core_to_node",
-        [
-            {0: 0},
-            {0: 0, 1: -1},
-        ],
-        ids=["unknown_node", "negative_node"],
-    )
-    async def test_unknown_or_negative_node_omits_cpuset_mems(
-        self,
-        cpu_plugin: CPUPlugin,
-        core_to_node: dict[int, int],
-    ) -> None:
-        """When any allocated core maps to an unknown (libnuma returns -1) or
-        explicitly negative NUMA node, CpusetMems must be omitted.
-
-        `unknown_node` covers the case where libnuma cannot resolve a core (the
-        patched side_effect returns -1 for unmapped cores). `negative_node` covers
-        the case where libnuma reports -1 for a known core (NUMA info unavailable).
-        Both collapse to the same `node < 0` branch in the SUT.
-        """
-        with self._patch_node_of_cpu(core_to_node):
-            result = await cpu_plugin.generate_docker_args(
-                AsyncMock(),
-                self._device_alloc([0, 1]),
-            )
-
-        host_config = result["HostConfig"]
-        assert "CpusetMems" not in host_config
-        # Sanity: core-list plumbing still works.
-        assert host_config["Cpus"] == 2
-        assert host_config["CpusetCpus"] == "0,1"
-
-    async def test_non_numa_host_omits_cpuset_mems(
+    async def test_host_config_matches_scenario(
         self,
         cpu_plugin: CPUPlugin,
+        numa_scenario: _NumaScenario,
     ) -> None:
-        """On non-NUMA / non-Linux hosts (macOS, Docker Desktop, WSL, Linux
-        without libnuma.so) libnuma.num_nodes() reports 1 and node_of_cpu()
-        hardcodes 0. The plugin must short-circuit before inspecting per-core
-        nodes so containers are not unconditionally pinned to CpusetMems="0".
-        """
-        # node_of_cpu would return 0 for every core on a non-NUMA host; assert
-        # we never reach that branch by mapping cores to a bogus node that
-        # would otherwise produce a stale CpusetMems assignment.
-        with self._patch_node_of_cpu({0: 0, 1: 0}, num_nodes=1):
+        with _patched_libnuma(numa_scenario.core_to_node, numa_scenario.num_nodes):
             result = await cpu_plugin.generate_docker_args(
                 AsyncMock(),
-                self._device_alloc([0, 1]),
+                self._device_alloc(numa_scenario.cores),
             )
 
         host_config = result["HostConfig"]
-        assert "CpusetMems" not in host_config
-        # Sanity: core-list plumbing still works.
-        assert host_config["Cpus"] == 2
-        assert host_config["CpusetCpus"] == "0,1"
+        assert host_config["Cpus"] == len(numa_scenario.cores)
+        assert host_config["CpusetCpus"] == ",".join(str(c) for c in sorted(numa_scenario.cores))
+        if numa_scenario.expected_cpuset_mems is None:
+            assert "CpusetMems" not in host_config
+        else:
+            assert host_config["CpusetMems"] == numa_scenario.expected_cpuset_mems