Skip to content

Commit 2d74007

Browse files
committed
fix: maintain affinity subgroup sequence in larger affinity groups
fix: maintain affinity subgroup sequence in larger affinity groups
1 parent 3b68867 commit 2d74007

3 files changed

Lines changed: 963 additions & 122 deletions

File tree

pkg/agent/qrm-plugins/gpu/strategy/allocate/strategies/deviceaffinity/bind.go

Lines changed: 64 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ import (
3434
// It is uniquely identified by an id.
3535
type affinityGroup struct {
3636
id string
37-
unallocatedDevices sets.String
37+
unallocatedDevices []string
3838
}
3939

4040
// possibleAllocation refers to information about a certain affinity group, which includes the number of unallocated devices in the group,
@@ -74,6 +74,7 @@ func (s *DeviceAffinityStrategy) Bind(
7474

7575
// Get a map of affinity groups that is grouped by priority
7676
affinityMap := ctx.DeviceTopology.GroupDeviceAffinity()
77+
klog.Infof("affinity map: %v", affinityMap)
7778
affinityGroupByPriority := s.getAffinityGroupsByPriority(affinityMap, unallocatedDevicesSet)
7879

7980
idToAffinityGroupMap := s.getAffinityGroupById(affinityGroupByPriority)
@@ -144,6 +145,7 @@ func (s *DeviceAffinityStrategy) getAffinityGroupsByPriority(
144145
for priority, affinityDevices := range affinityMap {
145146
affinityGroupsMap[priority] = s.getAffinityGroups(affinityDevices, unallocatedDevicesSet)
146147
}
148+
147149
return affinityGroupsMap
148150
}
149151

@@ -162,7 +164,7 @@ func (s *DeviceAffinityStrategy) getAffinityGroups(
162164
}
163165
}
164166
affinityGroups = append(affinityGroups, affinityGroup{
165-
unallocatedDevices: sets.NewString(unallocatedDevices...),
167+
unallocatedDevices: unallocatedDevices,
166168
id: uuid.NewString(),
167169
})
168170
}
@@ -196,22 +198,24 @@ func (s *DeviceAffinityStrategy) allocateCandidateDevices(
196198
return allocatedDevices, nil
197199
}
198200

199-
// Otherwise, we need to allocate these devices by their affinity
200-
for priority := 0; priority < len(affinityMap); priority++ {
201-
groups, ok := affinityMap[machine.AffinityPriority(priority)]
202-
if !ok {
203-
return nil, fmt.Errorf("affinity priority %v not found", priority)
204-
}
201+
for allocatedDevices.Len() < devicesToAllocate {
202+
// Otherwise, we need to allocate these devices by their affinity
203+
for priority := 0; priority < len(affinityMap); priority++ {
204+
groups, ok := affinityMap[machine.AffinityPriority(priority)]
205+
if !ok {
206+
return nil, fmt.Errorf("affinity priority %v not found", priority)
207+
}
205208

206-
intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices)
209+
intersectionToPossibleAllocationsMap := s.makeIntersectionToPossibleAllocationsMap(groups, availableDevicesSet, allocatedDevices)
207210

208-
allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1)
209-
if allocateByIntersectionRes.err != nil {
210-
return nil, allocateByIntersectionRes.err
211-
}
211+
allocateByIntersectionRes := s.allocateByIntersection(intersectionToPossibleAllocationsMap, allocatedDevices, devicesToAllocate, priority == len(affinityMap)-1)
212+
if allocateByIntersectionRes.err != nil {
213+
return nil, allocateByIntersectionRes.err
214+
}
212215

213-
if allocateByIntersectionRes.finished {
214-
return allocatedDevices, nil
216+
if allocateByIntersectionRes.finished {
217+
return allocatedDevices, nil
218+
}
215219
}
216220
}
217221

@@ -244,15 +248,15 @@ func (s *DeviceAffinityStrategy) allocateAvailableDevicesWithAffinity(
244248
return nil, fmt.Errorf("affinity group %v not found", groupID)
245249
}
246250

247-
deviceIntersection := group.unallocatedDevices.Intersection(unallocatedAvailableDevices)
251+
deviceIntersection := sets.NewString(group.unallocatedDevices...).Intersection(unallocatedAvailableDevices)
248252
if _, ok = intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok {
249253
intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0)
250254
}
251255

252256
intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{
253257
// The number of unallocated devices in the group is retrieved by taking a difference between
254258
// the unallocated devices in the group and the already allocated devices
255-
unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(),
259+
unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(),
256260
candidateDevices: deviceIntersection.UnsortedList(),
257261
})
258262
}
@@ -303,25 +307,38 @@ func (s *DeviceAffinityStrategy) makeIntersectionToPossibleAllocationsMap(
303307
intersectionToPossibleAllocationsMap := make(map[int][]possibleAllocation)
304308
for _, group := range groups {
305309
// Find intersection of affinity group and the available reusable devices
306-
deviceIntersection := group.unallocatedDevices.Intersection(availableDevicesSet)
307-
if _, ok := intersectionToPossibleAllocationsMap[deviceIntersection.Len()]; !ok {
308-
intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = make([]possibleAllocation, 0)
310+
deviceIntersection := getDeviceIntersection(group.unallocatedDevices, availableDevicesSet)
311+
if len(deviceIntersection) == 0 {
312+
continue
309313
}
310-
intersectionToPossibleAllocationsMap[deviceIntersection.Len()] = append(intersectionToPossibleAllocationsMap[deviceIntersection.Len()], possibleAllocation{
314+
if _, ok := intersectionToPossibleAllocationsMap[len(deviceIntersection)]; !ok {
315+
intersectionToPossibleAllocationsMap[len(deviceIntersection)] = make([]possibleAllocation, 0)
316+
}
317+
intersectionToPossibleAllocationsMap[len(deviceIntersection)] = append(intersectionToPossibleAllocationsMap[len(deviceIntersection)], possibleAllocation{
311318
// The number of unallocated devices in the group is retrieved by taking a difference between
312319
// the unallocated devices in the group and the already allocated devices
313-
unallocatedSize: group.unallocatedDevices.Difference(allocatedDevices).Len(),
314-
candidateDevices: deviceIntersection.UnsortedList(),
320+
unallocatedSize: sets.NewString(group.unallocatedDevices...).Difference(allocatedDevices).Len(),
321+
candidateDevices: deviceIntersection,
315322
})
316323
}
317324

318325
return intersectionToPossibleAllocationsMap
319326
}
320327

328+
func getDeviceIntersection(unallocatedDevices []string, availableDevices sets.String) []string {
329+
deviceIntersection := make([]string, 0)
330+
for _, device := range unallocatedDevices {
331+
if availableDevices.Has(device) {
332+
deviceIntersection = append(deviceIntersection, device)
333+
}
334+
}
335+
return deviceIntersection
336+
}
337+
321338
// allocateByIntersection allocates devices by the following algorithm
322339
// 1. Sort the intersection sizes of possible allocations in descending order, we want to allocate devices with larger intersection size with an affinity group.
323340
// 2. For each intersection size, merge and sort the possible allocations by their unallocated size in ascending order, this is to maximize
324-
// bin-packing (try to fill up an affinity group that is already allocated with other devices.
341+
// bin-packing (try to fill up an affinity group that is already allocated with other devices).
325342
// 3. For each intersection size, allocate devices in the order of the sorted possible allocations.
326343
// 4. If a possible allocation has a number of intersected devices larger than the devices needed for allocation, we go to the next priority and try to find an allocation from there.
327344
// 5. If we are currently at the last affinity priority level, we go through the other possible allocations (that are in sorted ascending order of number of unallocated devices)
@@ -330,7 +347,8 @@ func (s *DeviceAffinityStrategy) allocateByIntersection(
330347
intersectionToPossibleAllocationsMap map[int][]possibleAllocation, allocatedDevices sets.String,
331348
devicesToAllocate int, isLastPriority bool,
332349
) allocationByIntersectionResult {
333-
// Sort the intersection sizes of possible allocations in descending order
350+
// Sort the intersection sizes of possible allocations in descending order because we want to process the larger intersections first.
351+
// A larger intersection means that we are able to find more devices that have an affinity with an affinity group.
334352
intersectionSizes := make([]int, 0, len(intersectionToPossibleAllocationsMap))
335353
for intersectionSize := range intersectionToPossibleAllocationsMap {
336354
intersectionSizes = append(intersectionSizes, intersectionSize)
@@ -340,33 +358,40 @@ func (s *DeviceAffinityStrategy) allocateByIntersection(
340358
return intersectionSizes[i] > intersectionSizes[j]
341359
})
342360

343-
if len(intersectionSizes) > 0 {
344-
// Find the first intersection size that is larger than or equal to the devices needed for allocation
345-
maxIntersection := intersectionSizes[0]
346-
for _, intersectionSize := range intersectionSizes {
347-
if intersectionSize <= devicesToAllocate {
348-
maxIntersection = intersectionSize
349-
break
350-
}
361+
// Intersection sizes are sorted in descending order. Skip the sizes that are larger than the number of devices to allocate,
362+
// and start from the largest intersection size that does not exceed it. This is so that we try to reduce fragmentation as much as possible.
363+
// For example, if we have 1 device to allocate, and we have intersectionSizes of 2 and 1, we want to allocate to the group with
364+
// intersection of size 1, as this means that we are able to successfully do bin-packing (fill up an affinity group that
365+
// is already allocated with other devices).
366+
start := 0
367+
for i, intersectionSize := range intersectionSizes {
368+
if intersectionSize <= devicesToAllocate {
369+
start = i
370+
break
351371
}
372+
}
352373

353-
possibleAllocations, ok := intersectionToPossibleAllocationsMap[maxIntersection]
374+
klog.Infof("intersection to possible allocations map: %v", intersectionToPossibleAllocationsMap)
375+
376+
for i := start; i < len(intersectionSizes); i++ {
377+
intersectionSize := intersectionSizes[i]
378+
possibleAllocations, ok := intersectionToPossibleAllocationsMap[intersectionSize]
354379
if !ok {
355380
return allocationByIntersectionResult{
356381
finished: false,
357-
err: fmt.Errorf("possible reusable devices of intersection size %v not found", maxIntersection),
382+
err: fmt.Errorf("possible reusable devices of intersection size %v not found", intersectionSize),
358383
}
359384
}
360385

361-
// TODO: Dont need to merge if last priority
362386
mergedPossibleAllocations := s.mergePossibleAllocationsAndSort(possibleAllocations)
363387

364388
klog.Infof("possible allocations: %v", mergedPossibleAllocations)
365389

366390
for _, possibleAlloc := range mergedPossibleAllocations {
367391
// If the possible allocation has more candidate devices than the number of devices still needed, and it is not the last priority level,
368392
// go to the next priority and try to allocate
369-
if !isLastPriority && len(possibleAlloc.candidateDevices) > devicesToAllocate-allocatedDevices.Len() {
393+
remainingToAllocate := devicesToAllocate - allocatedDevices.Len()
394+
if !isLastPriority && len(possibleAlloc.candidateDevices) > remainingToAllocate {
370395
return allocationByIntersectionResult{
371396
finished: false,
372397
err: nil,
@@ -383,37 +408,8 @@ func (s *DeviceAffinityStrategy) allocateByIntersection(
383408
}
384409
}
385410
}
386-
387-
// At the last priority, we just go through the other possible allocations of the other intersection sizes if we have not allocated finish the candidate devices
388-
if isLastPriority {
389-
for _, intersectionSize := range intersectionSizes {
390-
possibleAllocations, ok = intersectionToPossibleAllocationsMap[intersectionSize]
391-
if !ok {
392-
return allocationByIntersectionResult{
393-
finished: false,
394-
err: fmt.Errorf("possible device allocation of intersection size %v not found", intersectionSize),
395-
}
396-
}
397-
398-
// Sort possible allocations by their unallocated size in ascending order
399-
sort.Slice(possibleAllocations, func(i, j int) bool {
400-
return possibleAllocations[i].unallocatedSize < possibleAllocations[j].unallocatedSize
401-
})
402-
403-
for _, possibleAlloc := range possibleAllocations {
404-
for _, device := range possibleAlloc.candidateDevices {
405-
allocatedDevices.Insert(device)
406-
if allocatedDevices.Len() == devicesToAllocate {
407-
return allocationByIntersectionResult{
408-
finished: true,
409-
err: nil,
410-
}
411-
}
412-
}
413-
}
414-
}
415-
}
416411
}
412+
417413
return allocationByIntersectionResult{
418414
finished: false,
419415
err: nil,
@@ -428,7 +424,7 @@ func (s *DeviceAffinityStrategy) findAllAffinityGroupIdsByPriority(
428424
for _, device := range allocatedDevices {
429425
for priority, groups := range affinityMap {
430426
for _, group := range groups {
431-
if group.unallocatedDevices.Has(device) {
427+
if sets.NewString(group.unallocatedDevices...).Has(device) {
432428
if _, ok := affinityGroupIds[priority]; !ok {
433429
affinityGroupIds[priority] = sets.NewString()
434430
}

0 commit comments

Comments
 (0)