1313
1414from ai .backend .agent .docker .intrinsic import (
1515 ContainerNetStat ,
16- CPUDevice ,
1716 CPUPlugin ,
1817 MemoryPlugin ,
1918 read_proc_net_dev ,
2019)
2120from ai .backend .agent .stats import StatModes
22- from ai .backend .common .types import DeviceId , DeviceName , SlotName
21+ from ai .backend .common .types import DeviceId , SlotName
2322
2423
2524class BaseDockerIntrinsicTest :
@@ -625,36 +624,31 @@ def cpu_plugin(self) -> CPUPlugin:
625624 plugin ._docker = AsyncMock ()
626625 return plugin
627626
628- @staticmethod
629- def _make_device (core_id : int , numa_node : int | None ) -> CPUDevice :
630- return CPUDevice (
631- device_id = DeviceId (str (core_id )),
632- hw_location = "root" ,
633- memory_size = 0 ,
634- processing_units = 1 ,
635- numa_node = numa_node ,
636- device_name = DeviceName ("cpu" ),
637- )
638-
639627 @staticmethod
640628 def _device_alloc (core_ids : list [int ]) -> dict [SlotName , dict [DeviceId , Decimal ]]:
641629 return {
642630 SlotName ("cpu" ): {DeviceId (str (cid )): Decimal ("1" ) for cid in core_ids },
643631 }
644632
633+ @staticmethod
634+ @contextmanager
635+ def _patch_node_of_cpu (core_to_node : dict [int , int ]) -> Generator [None , None , None ]:
636+ """Patch libnuma.node_of_cpu; return -1 for any core missing from the map
637+ (matches real libnuma's behavior for unknown cores when NUMA info is
638+ unavailable)."""
639+ with patch (
640+ "ai.backend.agent.docker.intrinsic.libnuma.node_of_cpu" ,
641+ side_effect = lambda core : core_to_node .get (core , - 1 ),
642+ ):
643+ yield
644+
645645 async def test_single_node_allocation_sets_cpuset_mems (
646646 self ,
647647 cpu_plugin : CPUPlugin ,
648648 ) -> None :
649649 """When all allocated cores are on the same NUMA node, CpusetMems is pinned
650650 to that node as a string."""
651- devices = [
652- self ._make_device (0 , 0 ),
653- self ._make_device (1 , 0 ),
654- self ._make_device (2 , 1 ),
655- self ._make_device (3 , 1 ),
656- ]
657- with patch .object (CPUPlugin , "list_devices" , AsyncMock (return_value = devices )):
651+ with self ._patch_node_of_cpu ({0 : 0 , 1 : 0 , 2 : 1 , 3 : 1 }):
658652 result = await cpu_plugin .generate_docker_args (
659653 AsyncMock (),
660654 self ._device_alloc ([0 , 1 ]),
@@ -670,15 +664,9 @@ async def test_multi_node_allocation_omits_cpuset_mems(
670664 self ,
671665 cpu_plugin : CPUPlugin ,
672666 ) -> None :
673- """When cores span multiple NUMA nodes, CpusetMems must be omitted
674- because Docker's HostConfig cannot express a multi-node cpuset.mems."""
675- devices = [
676- self ._make_device (0 , 0 ),
677- self ._make_device (1 , 0 ),
678- self ._make_device (2 , 1 ),
679- self ._make_device (3 , 1 ),
680- ]
681- with patch .object (CPUPlugin , "list_devices" , AsyncMock (return_value = devices )):
667+ """When cores span multiple NUMA nodes, CpusetMems must be omitted so that
668+ the Docker/kernel default NUMA memory placement policy can apply."""
669+ with self ._patch_node_of_cpu ({0 : 0 , 1 : 0 , 2 : 1 , 3 : 1 }):
682670 result = await cpu_plugin .generate_docker_args (
683671 AsyncMock (),
684672 self ._device_alloc ([0 , 2 ]),
@@ -691,49 +679,34 @@ async def test_multi_node_allocation_omits_cpuset_mems(
691679 assert host_config ["CpusetCpus" ] == "0,2"
692680
693681 @pytest .mark .parametrize (
694- ("missing_core_numa " , "case_id" ),
682+ ("core_to_node " , "case_id" ),
695683 [
696- (None , "unknown_node" ),
697- (- 1 , "negative_node" ),
684+ ({ 0 : 0 } , "unknown_node" ),
685+ ({ 0 : 0 , 1 : - 1 } , "negative_node" ),
698686 ],
699687 )
700688 async def test_unknown_or_negative_node_omits_cpuset_mems (
701689 self ,
702690 cpu_plugin : CPUPlugin ,
703- missing_core_numa : int | None ,
691+ core_to_node : dict [ int , int ] ,
704692 case_id : str ,
705693 ) -> None :
706- """When any allocated core maps to an unknown (None) or negative NUMA node,
707- CpusetMems must be omitted.
694+ """When any allocated core maps to an unknown (libnuma returns -1) or
695+ explicitly negative NUMA node, CpusetMems must be omitted.
708696
709- For the `unknown_node` case, we simulate a core missing from the device list
710- (so `core_to_node.get(core)` returns None). For the `negative_node` case, we
711- include a device with numa_node = -1.
697+ `unknown_node` covers the case where libnuma cannot resolve a core (the
698+ patched side_effect returns -1 for unmapped cores). `negative_node` covers
699+ the case where libnuma reports -1 for a known core (NUMA info unavailable).
700+ Both collapse to the same `node < 0` branch in the SUT.
712701 """
713- if missing_core_numa is None :
714- # Core 5 is allocated but not present in the device list.
715- devices = [
716- self ._make_device (0 , 0 ),
717- self ._make_device (1 , 0 ),
718- ]
719- allocated_cores = [0 , 5 ]
720- expected_cpuset_cpus = "0,5"
721- else :
722- devices = [
723- self ._make_device (0 , 0 ),
724- self ._make_device (1 , missing_core_numa ),
725- ]
726- allocated_cores = [0 , 1 ]
727- expected_cpuset_cpus = "0,1"
728-
729- with patch .object (CPUPlugin , "list_devices" , AsyncMock (return_value = devices )):
702+ with self ._patch_node_of_cpu (core_to_node ):
730703 result = await cpu_plugin .generate_docker_args (
731704 AsyncMock (),
732- self ._device_alloc (allocated_cores ),
705+ self ._device_alloc ([ 0 , 1 ] ),
733706 )
734707
735708 host_config = result ["HostConfig" ]
736709 assert "CpusetMems" not in host_config , f"case={ case_id } "
737710 # Sanity: core-list plumbing still works.
738- assert host_config ["Cpus" ] == len ( allocated_cores )
739- assert host_config ["CpusetCpus" ] == expected_cpuset_cpus
711+ assert host_config ["Cpus" ] == 2
712+ assert host_config ["CpusetCpus" ] == "0,1"
0 commit comments