Skip to content

Commit 9cd3e0d

Browse files
committed
feat: refactor of rdma allocation into a new generic preallocatedevice strategy
1 parent b917052 commit 9cd3e0d

24 files changed

Lines changed: 3193 additions & 2330 deletions

File tree

pkg/agent/qrm-plugins/gpu/customdeviceplugin/gpu/gpu.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,17 @@ func (p *GPUDevicePlugin) AllocateAssociatedDevice(
145145
}
146146

147147
// Use the strategy framework to allocate GPU devices
148-
result, err := manager.AllocateGPUUsingStrategy(
148+
result, err := manager.AllocateDevicesUsingStrategy(
149149
resReq,
150150
deviceReq,
151-
gpuTopology,
151+
p.DeviceTopologyRegistry,
152152
p.Conf.GPUQRMPluginConfig,
153153
p.Emitter,
154154
p.MetaServer,
155155
p.GetState().GetMachineState(),
156156
qosLevel,
157157
"",
158+
"",
158159
)
159160
if err != nil {
160161
return nil, fmt.Errorf("GPU allocation using strategy failed: %v", err)

pkg/agent/qrm-plugins/gpu/customdeviceplugin/rdma/rdma.go

Lines changed: 23 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,15 @@ package rdma
1919
import (
2020
"context"
2121
"fmt"
22-
"math"
2322

24-
v1 "k8s.io/api/core/v1"
25-
"k8s.io/apimachinery/pkg/util/sets"
2623
pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
2724

2825
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
2926
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
3027
gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
3128
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
3229
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
30+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/strategy/allocate/manager"
3331
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
3432
"github.com/kubewharf/katalyst-core/pkg/util/general"
3533
"github.com/kubewharf/katalyst-core/pkg/util/machine"
@@ -123,21 +121,31 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
123121
return nil, err
124122
}
125123

126-
var allocatedRdmaDevices []string
124+
accompanyResourceName = p.ResolveResourceName(accompanyResourceName, false)
125+
126+
// Use strategy framework to allocate RDMA devices
127+
result, err := manager.AllocateDevicesUsingStrategy(
128+
resReq,
129+
deviceReq,
130+
p.DeviceTopologyRegistry,
131+
p.Conf.GPUQRMPluginConfig,
132+
p.Emitter,
133+
p.MetaServer,
134+
p.GetState().GetMachineState(),
135+
qosLevel,
136+
gpuconsts.RDMADeviceType,
137+
accompanyResourceName,
138+
)
139+
if err != nil {
140+
return nil, fmt.Errorf("RDMA allocation using strategy failed: %v", err)
141+
}
127142

128-
// No accompany resource name
129-
if accompanyResourceName == "" {
130-
allocatedRdmaDevices, err = p.allocateWithNoAccompanyResource(deviceReq)
131-
if err != nil {
132-
return nil, fmt.Errorf("failed to allocate with no accompany resource: %v", err)
133-
}
134-
} else {
135-
allocatedRdmaDevices, err = p.allocateWithAccompanyResource(deviceReq, resReq, accompanyResourceName)
136-
if err != nil {
137-
return nil, fmt.Errorf("failed to allocate with accompany resource: %v", err)
138-
}
143+
if !result.Success {
144+
return nil, fmt.Errorf("RDMA allocation failed: %v", result.ErrorMessage)
139145
}
140146

147+
allocatedRdmaDevices := result.AllocatedDevices
148+
141149
// Modify rdma state
142150
topologyAwareAllocations := make(map[string]state.Allocation)
143151
for _, deviceID := range allocatedRdmaDevices {
@@ -182,101 +190,3 @@ func (p *RDMADevicePlugin) AllocateAssociatedDevice(
182190
},
183191
}, nil
184192
}
185-
186-
// allocateWithNoAccompanyResource allocates the rdma devices on a best-effort basis, making sure that
187-
// they fit the hint nodes.
188-
func (p *RDMADevicePlugin) allocateWithNoAccompanyResource(deviceReq *pluginapi.DeviceRequest) ([]string, error) {
189-
reqQuantity := deviceReq.GetDeviceRequest()
190-
191-
machineState, ok := p.GetState().GetMachineState()[gpuconsts.RDMADeviceType]
192-
if !ok {
193-
return nil, fmt.Errorf("no machine state for resource %s", gpuconsts.RDMADeviceType)
194-
}
195-
196-
allocatedDevices := sets.NewString()
197-
allocateDevices := func(devices ...string) bool {
198-
for _, device := range devices {
199-
allocatedDevices.Insert(device)
200-
if allocatedDevices.Len() >= int(reqQuantity) {
201-
return true
202-
}
203-
}
204-
return false
205-
}
206-
207-
availableDevices := deviceReq.GetAvailableDevices()
208-
reusableDevices := deviceReq.GetReusableDevices()
209-
210-
// allocate reusable devices first
211-
allocated := allocateDevices(reusableDevices...)
212-
if allocated {
213-
return allocatedDevices.UnsortedList(), nil
214-
}
215-
216-
for _, device := range availableDevices {
217-
if !machineState.IsRequestSatisfied(device, 1, 1) {
218-
general.Infof("available numa affinity rdma %s is already allocated", device)
219-
continue
220-
}
221-
222-
if allocateDevices(device) {
223-
return allocatedDevices.UnsortedList(), nil
224-
}
225-
}
226-
227-
return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices))
228-
}
229-
230-
// allocateWithAccompanyResource allocates the rdma devices by first allocating the reusable devices, then allocating the
231-
// available devices proportionally by ensuring NUMA affinity with the accompany resource
232-
func (p *RDMADevicePlugin) allocateWithAccompanyResource(
233-
deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string,
234-
) ([]string, error) {
235-
// Find out the accompany devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of accompany device
236-
accompanyDeviceType := p.ResolveResourceName(accompanyResourceName, false)
237-
if accompanyDeviceType == "" {
238-
return nil, fmt.Errorf("failed to get device type for accompany resource: %s", accompanyResourceName)
239-
}
240-
241-
// Allocate all the reusable devices first
242-
allocatedDevices := sets.NewString(deviceReq.ReusableDevices...)
243-
244-
// Get ratio of accompany resource to target device
245-
accompanyResourceToTargetDeviceRatio := p.GetState().GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyDeviceType, gpuconsts.RDMADeviceType)
246-
247-
// Allocate target device according to ratio of accompany resource to target device
248-
podResourceEntries := p.GetState().GetPodResourceEntries()
249-
totalAllocated := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName)
250-
251-
rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) / accompanyResourceToTargetDeviceRatio))
252-
253-
machineState := p.GetState().GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)]
254-
255-
allocateDevices := func(devices ...string) bool {
256-
for _, device := range devices {
257-
if allocatedDevices.Len() >= rdmaToBeAllocated {
258-
return true
259-
}
260-
allocatedDevices.Insert(device)
261-
}
262-
if allocatedDevices.Len() >= rdmaToBeAllocated {
263-
return true
264-
}
265-
return false
266-
}
267-
268-
// Allocate the rest of the available rdma devices in best-effort manner
269-
for _, deviceId := range deviceReq.AvailableDevices {
270-
// Skip rdma devices that are already allocated to other containers
271-
if !machineState.IsRequestSatisfied(deviceId, 1, 1) {
272-
continue
273-
}
274-
275-
if allocateDevices(deviceId) {
276-
return allocatedDevices.UnsortedList(), nil
277-
}
278-
}
279-
280-
// Did not find enough available rdma devices to allocate, return the devices that are already allocated
281-
return allocatedDevices.UnsortedList(), nil
282-
}

0 commit comments

Comments
 (0)