Skip to content

Commit efe914c

Browse files
committed
feat: implement rdma associated device allocation logic
1 parent 07dcf1d commit efe914c

3 files changed

Lines changed: 901 additions & 0 deletions

File tree

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
/*
2+
Copyright 2022 The Katalyst Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package rdma
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"math"
23+
24+
v1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/util/sets"
26+
pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
27+
28+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate"
29+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin"
30+
gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts"
31+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin"
32+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state"
33+
gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util"
34+
"github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util"
35+
"github.com/kubewharf/katalyst-core/pkg/util/general"
36+
"github.com/kubewharf/katalyst-core/pkg/util/machine"
37+
)
38+
39+
// RDMACustomDevicePluginName is the name string of the RDMA custom device plugin.
const RDMACustomDevicePluginName = "rdma-custom-device-plugin"
40+
41+
type RDMADevicePlugin struct {
42+
*baseplugin.BasePlugin
43+
deviceNames []string
44+
}
45+
46+
func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin {
47+
rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.RDMADeviceNames)
48+
base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider)
49+
base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType,
50+
state.NewGenericDefaultResourceStateGenerator(gpuconsts.RDMADeviceType, base.DeviceTopologyRegistry))
51+
base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType)
52+
53+
return &RDMADevicePlugin{
54+
BasePlugin: base,
55+
deviceNames: base.Conf.RDMADeviceNames,
56+
}
57+
}
58+
59+
func (p *RDMADevicePlugin) DefaultPreAllocateResourceName() string {
60+
return ""
61+
}
62+
63+
func (p *RDMADevicePlugin) DeviceNames() []string {
64+
return p.deviceNames
65+
}
66+
67+
func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
68+
return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType)
69+
}
70+
71+
func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
72+
return &pluginapi.AssociatedDeviceHintsResponse{}, nil
73+
}
74+
75+
// AllocateAssociatedDevice check if rdma is allocated to other containers, make sure they do not share rdma
76+
func (p *RDMADevicePlugin) AllocateAssociatedDevice(
77+
ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string,
78+
) (*pluginapi.AssociatedDeviceAllocationResponse, error) {
79+
qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys)
80+
if err != nil {
81+
err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v",
82+
resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err)
83+
general.Errorf("%s", err.Error())
84+
return nil, err
85+
}
86+
87+
general.InfoS("called",
88+
"podNamespace", resReq.PodNamespace,
89+
"podName", resReq.PodName,
90+
"containerName", resReq.ContainerName,
91+
"qosLevel", qosLevel,
92+
"reqAnnotations", resReq.Annotations,
93+
"resourceRequests", resReq.ResourceRequests,
94+
"deviceName", deviceReq.DeviceName,
95+
"resourceHint", resReq.Hint,
96+
"deviceHint", deviceReq.Hint,
97+
"availableDevices", deviceReq.AvailableDevices,
98+
"reusableDevices", deviceReq.ReusableDevices,
99+
"deviceRequest", deviceReq.DeviceRequest,
100+
)
101+
102+
// Check if there is state for the device name
103+
rdmaAllocationInfo := p.GetState().GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName)
104+
if rdmaAllocationInfo != nil && rdmaAllocationInfo.TopologyAwareAllocations != nil {
105+
allocatedDevices := make([]string, 0, len(rdmaAllocationInfo.TopologyAwareAllocations))
106+
for rdmaID := range rdmaAllocationInfo.TopologyAwareAllocations {
107+
allocatedDevices = append(allocatedDevices, rdmaID)
108+
}
109+
return &pluginapi.AssociatedDeviceAllocationResponse{
110+
AllocationResult: &pluginapi.AssociatedDeviceAllocation{
111+
AllocatedDevices: allocatedDevices,
112+
},
113+
}, nil
114+
}
115+
116+
rdmaTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType)
117+
if err != nil {
118+
return nil, fmt.Errorf("failed to get gpu device topology: %v", err)
119+
}
120+
if !numaTopologyReady {
121+
return nil, fmt.Errorf("gpu device topology is not ready")
122+
}
123+
124+
hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...)
125+
if err != nil {
126+
general.Warningf("failed to get hint nodes: %v", err)
127+
return nil, err
128+
}
129+
130+
var allocatedRdmaDevices []string
131+
132+
// No accompany resource name
133+
if accompanyResourceName == "" {
134+
allocatedRdmaDevices, err = p.allocateWithNoAccompanyResource(deviceReq, rdmaTopology, hintNodes)
135+
if err != nil {
136+
return nil, fmt.Errorf("failed to allocate with no accompany resource: %v", err)
137+
}
138+
} else {
139+
allocatedRdmaDevices, err = p.allocateWithAccompanyResource(deviceReq, resReq, accompanyResourceName)
140+
if err != nil {
141+
return nil, fmt.Errorf("failed to allocate with accompany resource: %v", err)
142+
}
143+
}
144+
145+
// Modify rdma state
146+
topologyAwareAllocations := make(map[string]state.Allocation)
147+
for _, deviceID := range allocatedRdmaDevices {
148+
info, ok := rdmaTopology.Devices[deviceID]
149+
if !ok {
150+
return nil, fmt.Errorf("failed to get rdma info for device %s", deviceID)
151+
}
152+
153+
topologyAwareAllocations[deviceID] = state.Allocation{
154+
Quantity: 1,
155+
NUMANodes: info.GetNUMANodes(),
156+
}
157+
}
158+
159+
allocationInfo := &state.AllocationInfo{
160+
AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel),
161+
AllocatedAllocation: state.Allocation{
162+
Quantity: 1,
163+
NUMANodes: hintNodes.ToSliceInt(),
164+
},
165+
}
166+
167+
allocationInfo.TopologyAwareAllocations = topologyAwareAllocations
168+
p.GetState().SetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName, allocationInfo, false)
169+
resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.RDMADeviceType, nil)
170+
if err != nil {
171+
return nil, fmt.Errorf("failed to generate rdma device state from pod entries: %v", err)
172+
}
173+
174+
p.GetState().SetResourceState(gpuconsts.RDMADeviceType, resourceState, true)
175+
176+
general.InfoS("allocated rdma devices",
177+
"podNamespace", resReq.PodNamespace,
178+
"podName", resReq.PodName,
179+
"containerName", resReq.ContainerName,
180+
"qosLevel", qosLevel,
181+
"allocatedRdmaDevices", allocatedRdmaDevices)
182+
183+
return &pluginapi.AssociatedDeviceAllocationResponse{
184+
AllocationResult: &pluginapi.AssociatedDeviceAllocation{
185+
AllocatedDevices: allocatedRdmaDevices,
186+
},
187+
}, nil
188+
}
189+
190+
// allocateWithNoAccompanyResource allocates the rdma devices by best effort basis on the by making sure that
191+
// it fits the hint nodes.
192+
func (p *RDMADevicePlugin) allocateWithNoAccompanyResource(
193+
deviceReq *pluginapi.DeviceRequest, rdmaTopology *machine.DeviceTopology, hintNodes machine.CPUSet,
194+
) ([]string, error) {
195+
reqQuantity := deviceReq.GetDeviceRequest()
196+
197+
machineState, ok := p.GetState().GetMachineState()[gpuconsts.RDMADeviceType]
198+
if !ok {
199+
return nil, fmt.Errorf("no machine state for resource %s", gpuconsts.RDMADeviceType)
200+
}
201+
202+
allocatedDevices := sets.NewString()
203+
allocateDevices := func(devices ...string) bool {
204+
for _, device := range devices {
205+
allocatedDevices.Insert(device)
206+
if allocatedDevices.Len() >= int(reqQuantity) {
207+
return true
208+
}
209+
}
210+
return false
211+
}
212+
213+
availableDevices := deviceReq.GetAvailableDevices()
214+
reusableDevices := deviceReq.GetReusableDevices()
215+
216+
// allocate reusable devices first
217+
allocated := allocateDevices(reusableDevices...)
218+
if allocated {
219+
return allocatedDevices.UnsortedList(), nil
220+
}
221+
222+
for _, device := range availableDevices {
223+
if !gpuutil.IsNUMAAffinityDevice(device, rdmaTopology, hintNodes) {
224+
continue
225+
}
226+
227+
if !machineState.IsRequestSatisfied(device, 1, 1) {
228+
general.Infof("available numa affinity rdma %s is already allocated", device)
229+
continue
230+
}
231+
232+
if allocateDevices(device) {
233+
return allocatedDevices.UnsortedList(), nil
234+
}
235+
}
236+
237+
return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices))
238+
}
239+
240+
// allocateWithAccompanyResource allocates the rdma devices by first allocating the reusable devices, then allocating the
241+
// available devices proportionally by ensuring NUMA affinity with the accompany resource
242+
func (p *RDMADevicePlugin) allocateWithAccompanyResource(
243+
deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string,
244+
) ([]string, error) {
245+
var err error
246+
247+
// Find out the accompany devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of accompany device
248+
accompanyDeviceType := p.ResolveResourceName(accompanyResourceName, false)
249+
if accompanyDeviceType == "" {
250+
return nil, fmt.Errorf("failed to get device type for accompany resource: %s", accompanyResourceName)
251+
}
252+
253+
// Allocate all the reusable devices first
254+
allocatedDevices := sets.NewString(deviceReq.ReusableDevices...)
255+
256+
// Get ratio of accompany resource to target device
257+
accompanyResourceToTargetDeviceRatio := p.GetState().GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyDeviceType, gpuconsts.RDMADeviceType)
258+
259+
// Allocate target device according to ratio of accompany resource to target device
260+
podResourceEntries := p.GetState().GetPodResourceEntries()
261+
totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName)
262+
263+
rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) * accompanyResourceToTargetDeviceRatio))
264+
265+
// For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same
266+
// numa nodes as the gpu and allocate them
267+
accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyDeviceType, gpuconsts.RDMADeviceType)
268+
if err != nil {
269+
general.Warningf("failed to get gpu to rdma affinity map: %v", err)
270+
return nil, err
271+
}
272+
273+
machineState := p.GetState().GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)]
274+
275+
allocateDevices := func(devices ...string) bool {
276+
for _, device := range devices {
277+
if allocatedDevices.Len() >= rdmaToBeAllocated {
278+
return true
279+
}
280+
allocatedDevices.Insert(device)
281+
}
282+
if allocatedDevices.Len() >= rdmaToBeAllocated {
283+
return true
284+
}
285+
return false
286+
}
287+
288+
for accompanyResourceId := range accompanyResourceIds {
289+
rdmaDevices, ok := accompanyResourceToRdmaAffinityMap[accompanyResourceId]
290+
if !ok {
291+
general.Warningf("failed to get rdma device with accompany device id: %s", accompanyResourceId)
292+
continue
293+
}
294+
295+
// Iterate through the rdma devices and check if they are already allocated
296+
for _, rdmaDevice := range rdmaDevices {
297+
if !machineState.IsRequestSatisfied(rdmaDevice, 1, 1) {
298+
continue
299+
}
300+
301+
if allocateDevices(rdmaDevice) {
302+
return allocatedDevices.UnsortedList(), nil
303+
}
304+
}
305+
}
306+
307+
// Did not find enough available rdma devices to allocate, return the devices that are already allocated
308+
return allocatedDevices.UnsortedList(), nil
309+
}

0 commit comments

Comments
 (0)