diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go index 28720845a4..b8b314793f 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go @@ -198,10 +198,18 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( return allocatedDevices, nil } - // Process affinity groups from highest to lowest priority - for priority := 0; priority < len(affinityGroupsMap); priority++ { - affinityGroups, exists := affinityGroupsMap[priority] - if !exists || len(affinityGroups) == 0 { + // Process affinity groups from highest to lowest priority. + // Do not assume priorities are consecutive (e.g., keys can be 0,2,3,5). + priorityLevels := make([]int, 0, len(affinityGroupsMap)) + for p := range affinityGroupsMap { + priorityLevels = append(priorityLevels, p) + } + // Sort priorities ascending so stronger affinity (lower number) is processed first + sort.Ints(priorityLevels) + + for idx, priority := range priorityLevels { + affinityGroups := affinityGroupsMap[priority] + if len(affinityGroups) == 0 { continue } @@ -221,8 +229,8 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices( return result, nil } - // For the lowest priority, use more flexible allocation strategies - if priority == len(affinityGroupsMap)-1 { + // For the lowest priority (last in the sorted list), use more flexible allocation strategies + if idx == len(priorityLevels)-1 { return s.handleLowestPriorityAllocation( groupInfos, affinityGroupsMap, candidateDevicesSet, devicesToAllocate, allocatedDevices, remainingDevicesToAllocate, diff --git a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go index 2b6769a7af..5597f012f7 100644 --- a/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go +++ b/pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind_test.go @@ -873,6 +873,81 @@ func TestBind_NumberOfDevicesAllocated(t *testing.T) { Success: true, }, }, + { + name: "priority levels can be non-consecutive", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7"}, // gpu-1 and gpu-2 have affinity in 1st level, gpu-5 and gpu-7 have affinity in 2nd level + DeviceRequest: 4, + }, + // Level 0: [gpu-1, gpu-2], [gpu-3, gpu-4], [gpu-5, gpu-6], [gpu-7, gpu-8] + // Level 2: [gpu-1, gpu-2, gpu-3, gpu-4], [gpu-5, gpu-6, gpu-7, gpu-8] + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-2"}, + convertIntToAffinityPriority(2): {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-1"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-4"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-3"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-6"}, + convertIntToAffinityPriority(2): {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-5"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-8"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-7"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + // Allocate gpu-1, gpu-2 in first level, then allocate gpu-3 as it has affinity with gpu-1 and gpu-2 + // Then allocate gpu-5 as gpu-6 is already allocated + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5", "gpu-7", "gpu-8"}, + expectedResult: &allocate.AllocationResult{ + AllocatedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-5"}, + Success: true, + }, + }, { name: "allocate reusable devices first, then allocate available devices with affinity to the allocated reusable devices", ctx: &allocate.AllocationContext{ @@ -2437,6 +2512,74 @@ func TestBind_DeviceAffinity(t *testing.T) { sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, expectedAffinityPriorityLevel: 1, }, + { + name: "2 devices in affinity priority 0, 4 devices in affinity priority 2, levels are non-consecutive", + ctx: &allocate.AllocationContext{ + ResourceReq: &pluginapi.ResourceRequest{ + PodUid: "pod-1", + ContainerName: "container-1", + }, + DeviceReq: &pluginapi.DeviceRequest{ + DeviceName: "gpu", + ReusableDevices: nil, + DeviceRequest: 4, + }, + DeviceTopology: &machine.DeviceTopology{ + Devices: map[string]machine.DeviceInfo{ + "gpu-1": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-2"}, + convertIntToAffinityPriority(2): {"gpu-2", "gpu-3", "gpu-4"}, + }, + }, + "gpu-2": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-1"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-3", "gpu-4"}, + }, + }, + "gpu-3": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-4"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-2", "gpu-4"}, + }, + }, + "gpu-4": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-3"}, + convertIntToAffinityPriority(2): {"gpu-1", "gpu-2", "gpu-3"}, + }, + }, + "gpu-5": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-6"}, + convertIntToAffinityPriority(2): {"gpu-6", "gpu-7", "gpu-8"}, + }, + }, + "gpu-6": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-5"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-7", "gpu-8"}, + }, + }, + "gpu-7": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-8"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-6", "gpu-8"}, + }, + }, + "gpu-8": { + DeviceAffinity: map[machine.AffinityPriority]machine.DeviceIDs{ + convertIntToAffinityPriority(0): {"gpu-7"}, + convertIntToAffinityPriority(2): {"gpu-5", "gpu-6", "gpu-7"}, + }, + }, + }, + }, + }, + sortedDevices: []string{"gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7", "gpu-8"}, + expectedAffinityPriorityLevel: 2, + }, { name: "4 devices in affinity priority 0, 8 devices in affinity priority 1, allocate 4 devices", ctx: &allocate.AllocationContext{