|
4 | 4 | from collections.abc import Generator |
5 | 5 | from contextlib import contextmanager |
6 | 6 | from dataclasses import dataclass |
| 7 | +from decimal import Decimal |
7 | 8 | from pathlib import Path |
8 | 9 | from typing import Any |
9 | 10 | from unittest.mock import AsyncMock, MagicMock, patch |
|
12 | 13 |
|
13 | 14 | from ai.backend.agent.docker.intrinsic import ( |
14 | 15 | ContainerNetStat, |
| 16 | + CPUDevice, |
15 | 17 | CPUPlugin, |
16 | 18 | MemoryPlugin, |
17 | 19 | read_proc_net_dev, |
18 | 20 | ) |
19 | 21 | from ai.backend.agent.stats import StatModes |
| 22 | +from ai.backend.common.types import DeviceId, DeviceName, SlotName |
20 | 23 |
|
21 | 24 |
|
22 | 25 | class BaseDockerIntrinsicTest: |
@@ -610,3 +613,127 @@ def test_raises_oserror_for_nonexistent_pid(self) -> None: |
610 | 613 | """Raises OSError when /proc/[pid]/net/dev does not exist.""" |
611 | 614 | with pytest.raises(OSError): |
612 | 615 | read_proc_net_dev(999999999) |
| 616 | + |
| 617 | + |
class TestCPUPluginGenerateDockerArgsNumaLocality:
    """Check when CPUPlugin.generate_docker_args() emits ``HostConfig["CpusetMems"]``.

    Every test patches ``CPUPlugin.list_devices`` with a fixed device list, calls
    ``generate_docker_args`` with a per-core allocation, and inspects the
    ``HostConfig`` entry of the returned mapping.
    """

    # (core_id, numa_node) pairs: cores 0-1 on node 0, cores 2-3 on node 1.
    _TWO_NODE_LAYOUT = ((0, 0), (1, 0), (2, 1), (3, 1))

    @pytest.fixture
    def cpu_plugin(self) -> CPUPlugin:
        """Return a CPUPlugin created via ``__new__`` so ``__init__`` is not run.

        Only ``local_config`` and ``_docker`` are populated on the instance.
        """
        instance = CPUPlugin.__new__(CPUPlugin)
        instance.local_config = {"agent": {"docker-mode": "default"}}
        instance._docker = AsyncMock()
        return instance

    @staticmethod
    def _make_device(core_id: int, numa_node: int | None) -> CPUDevice:
        """Build a CPUDevice whose id is ``str(core_id)`` placed on ``numa_node``."""
        identifier = DeviceId(str(core_id))
        return CPUDevice(
            device_id=identifier,
            hw_location="root",
            memory_size=0,
            processing_units=1,
            numa_node=numa_node,
            device_name=DeviceName("cpu"),
        )

    @staticmethod
    def _device_alloc(core_ids: list[int]) -> dict[SlotName, dict[DeviceId, Decimal]]:
        """Return a ``cpu`` slot allocation granting ``Decimal("1")`` to each listed core."""
        per_core: dict[DeviceId, Decimal] = {}
        for cid in core_ids:
            per_core[DeviceId(str(cid))] = Decimal("1")
        return {SlotName("cpu"): per_core}

    async def _host_config_for(
        self,
        plugin: CPUPlugin,
        devices: list[CPUDevice],
        core_ids: list[int],
    ) -> Any:
        """Run ``generate_docker_args`` against ``devices`` and return its ``HostConfig``.

        ``CPUPlugin.list_devices`` is replaced on the class (not the instance) for
        the duration of the call only.
        """
        list_devices_stub = AsyncMock(return_value=devices)
        with patch.object(CPUPlugin, "list_devices", list_devices_stub):
            docker_args = await plugin.generate_docker_args(
                AsyncMock(),
                self._device_alloc(core_ids),
            )
        return docker_args["HostConfig"]

    async def test_single_node_allocation_sets_cpuset_mems(
        self,
        cpu_plugin: CPUPlugin,
    ) -> None:
        """Cores 0 and 1 both sit on NUMA node 0, so CpusetMems is the string "0"."""
        devices = [self._make_device(core, node) for core, node in self._TWO_NODE_LAYOUT]

        host_config = await self._host_config_for(cpu_plugin, devices, [0, 1])

        assert host_config["CpusetMems"] == "0"
        # Sanity: the core count and core list are still reported.
        assert host_config["Cpus"] == 2
        assert host_config["CpusetCpus"] == "0,1"

    async def test_multi_node_allocation_omits_cpuset_mems(
        self,
        cpu_plugin: CPUPlugin,
    ) -> None:
        """Cores 0 and 2 sit on different NUMA nodes, so CpusetMems must be absent."""
        devices = [self._make_device(core, node) for core, node in self._TWO_NODE_LAYOUT]

        host_config = await self._host_config_for(cpu_plugin, devices, [0, 2])

        assert "CpusetMems" not in host_config
        # Sanity: the core count and core list are still reported.
        assert host_config["Cpus"] == 2
        assert host_config["CpusetCpus"] == "0,2"

    @pytest.mark.parametrize(
        ("missing_core_numa", "case_id"),
        [
            (None, "unknown_node"),
            (-1, "negative_node"),
        ],
    )
    async def test_unknown_or_negative_node_omits_cpuset_mems(
        self,
        cpu_plugin: CPUPlugin,
        missing_core_numa: int | None,
        case_id: str,
    ) -> None:
        """CpusetMems must be absent when an allocated core has no usable NUMA node.

        ``negative_node``: core 1 is listed with ``numa_node`` set to -1.
        ``unknown_node``: core 5 is allocated but missing from the device list,
        so no NUMA node is known for it.
        """
        if missing_core_numa is not None:
            node_layout = [(0, 0), (1, missing_core_numa)]
            allocated_cores = [0, 1]
            expected_cpuset_cpus = "0,1"
        else:
            # Core 5 is allocated below but deliberately has no device entry.
            node_layout = [(0, 0), (1, 0)]
            allocated_cores = [0, 5]
            expected_cpuset_cpus = "0,5"
        devices = [self._make_device(core, node) for core, node in node_layout]

        host_config = await self._host_config_for(cpu_plugin, devices, allocated_cores)

        assert "CpusetMems" not in host_config, f"case={case_id}"
        # Sanity: the core count and core list are still reported.
        assert host_config["Cpus"] == len(allocated_cores)
        assert host_config["CpusetCpus"] == expected_cpuset_cpus
0 commit comments