|
| 1 | +/* |
| 2 | +Copyright 2022 The Katalyst Authors. |
| 3 | +
|
| 4 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +you may not use this file except in compliance with the License. |
| 6 | +You may obtain a copy of the License at |
| 7 | +
|
| 8 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | +
|
| 10 | +Unless required by applicable law or agreed to in writing, software |
| 11 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +See the License for the specific language governing permissions and |
| 14 | +limitations under the License. |
| 15 | +*/ |
| 16 | + |
| 17 | +package rdma |
| 18 | + |
| 19 | +import ( |
| 20 | + "context" |
| 21 | + "fmt" |
| 22 | + "math" |
| 23 | + |
| 24 | + v1 "k8s.io/api/core/v1" |
| 25 | + "k8s.io/apimachinery/pkg/util/sets" |
| 26 | + pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1" |
| 27 | + |
| 28 | + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/commonstate" |
| 29 | + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/baseplugin" |
| 30 | + gpuconsts "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/consts" |
| 31 | + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/customdeviceplugin" |
| 32 | + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/state" |
| 33 | + gpuutil "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/gpu/util" |
| 34 | + "github.com/kubewharf/katalyst-core/pkg/agent/qrm-plugins/util" |
| 35 | + "github.com/kubewharf/katalyst-core/pkg/util/general" |
| 36 | + "github.com/kubewharf/katalyst-core/pkg/util/machine" |
| 37 | +) |
| 38 | + |
// RDMACustomDevicePluginName is the name identifying the RDMA custom device plugin.
const RDMACustomDevicePluginName = "rdma-custom-device-plugin"
| 40 | + |
// RDMADevicePlugin is the custom device plugin that allocates RDMA devices.
// It embeds the shared BasePlugin, which provides state access, configuration
// and the device topology registry used by the methods below.
type RDMADevicePlugin struct {
	*baseplugin.BasePlugin
	// deviceNames holds the RDMA device names taken from the base plugin
	// configuration at construction time.
	deviceNames []string
}
| 45 | + |
| 46 | +func NewRDMADevicePlugin(base *baseplugin.BasePlugin) customdeviceplugin.CustomDevicePlugin { |
| 47 | + rdmaTopologyProvider := machine.NewDeviceTopologyProvider(base.Conf.RDMADeviceNames) |
| 48 | + base.DeviceTopologyRegistry.RegisterDeviceTopologyProvider(gpuconsts.RDMADeviceType, rdmaTopologyProvider) |
| 49 | + base.DefaultResourceStateGeneratorRegistry.RegisterResourceStateGenerator(gpuconsts.RDMADeviceType, |
| 50 | + state.NewGenericDefaultResourceStateGenerator(gpuconsts.RDMADeviceType, base.DeviceTopologyRegistry)) |
| 51 | + base.RegisterDeviceNameToType(base.Conf.RDMADeviceNames, gpuconsts.RDMADeviceType) |
| 52 | + |
| 53 | + return &RDMADevicePlugin{ |
| 54 | + BasePlugin: base, |
| 55 | + deviceNames: base.Conf.RDMADeviceNames, |
| 56 | + } |
| 57 | +} |
| 58 | + |
// DefaultPreAllocateResourceName always returns the empty string.
// NOTE(review): presumably this signals that RDMA has no default resource to
// pre-allocate — confirm against the CustomDevicePlugin interface contract.
func (p *RDMADevicePlugin) DefaultPreAllocateResourceName() string {
	return ""
}
| 62 | + |
// DeviceNames returns the RDMA device names this plugin was constructed with.
// The returned slice is the plugin's own slice, not a copy.
func (p *RDMADevicePlugin) DeviceNames() []string {
	return p.deviceNames
}
| 66 | + |
// UpdateAllocatableAssociatedDevices forwards the request to
// UpdateAllocatableAssociatedDevicesByDeviceType with the RDMA device type and
// returns its result unchanged. The ctx argument is not used.
func (p *RDMADevicePlugin) UpdateAllocatableAssociatedDevices(ctx context.Context, request *pluginapi.UpdateAllocatableAssociatedDevicesRequest) (*pluginapi.UpdateAllocatableAssociatedDevicesResponse, error) {
	return p.UpdateAllocatableAssociatedDevicesByDeviceType(request, gpuconsts.RDMADeviceType)
}
| 70 | + |
// GetAssociatedDeviceTopologyHints ignores its arguments and always returns an
// empty hints response with a nil error; this plugin computes no topology hints here.
func (p *RDMADevicePlugin) GetAssociatedDeviceTopologyHints(context.Context, *pluginapi.AssociatedDeviceRequest) (*pluginapi.AssociatedDeviceHintsResponse, error) {
	return &pluginapi.AssociatedDeviceHintsResponse{}, nil
}
| 74 | + |
| 75 | +// AllocateAssociatedDevice check if rdma is allocated to other containers, make sure they do not share rdma |
| 76 | +func (p *RDMADevicePlugin) AllocateAssociatedDevice( |
| 77 | + ctx context.Context, resReq *pluginapi.ResourceRequest, deviceReq *pluginapi.DeviceRequest, accompanyResourceName string, |
| 78 | +) (*pluginapi.AssociatedDeviceAllocationResponse, error) { |
| 79 | + qosLevel, err := util.GetKatalystQoSLevelFromResourceReq(p.Conf.QoSConfiguration, resReq, p.PodAnnotationKeptKeys, p.PodLabelKeptKeys) |
| 80 | + if err != nil { |
| 81 | + err = fmt.Errorf("GetKatalystQoSLevelFromResourceReq for pod: %s/%s, container: %s failed with error: %v", |
| 82 | + resReq.PodNamespace, resReq.PodName, resReq.ContainerName, err) |
| 83 | + general.Errorf("%s", err.Error()) |
| 84 | + return nil, err |
| 85 | + } |
| 86 | + |
| 87 | + general.InfoS("called", |
| 88 | + "podNamespace", resReq.PodNamespace, |
| 89 | + "podName", resReq.PodName, |
| 90 | + "containerName", resReq.ContainerName, |
| 91 | + "qosLevel", qosLevel, |
| 92 | + "reqAnnotations", resReq.Annotations, |
| 93 | + "resourceRequests", resReq.ResourceRequests, |
| 94 | + "deviceName", deviceReq.DeviceName, |
| 95 | + "resourceHint", resReq.Hint, |
| 96 | + "deviceHint", deviceReq.Hint, |
| 97 | + "availableDevices", deviceReq.AvailableDevices, |
| 98 | + "reusableDevices", deviceReq.ReusableDevices, |
| 99 | + "deviceRequest", deviceReq.DeviceRequest, |
| 100 | + ) |
| 101 | + |
| 102 | + // Check if there is state for the device name |
| 103 | + rdmaAllocationInfo := p.GetState().GetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName) |
| 104 | + if rdmaAllocationInfo != nil && rdmaAllocationInfo.TopologyAwareAllocations != nil { |
| 105 | + allocatedDevices := make([]string, 0, len(rdmaAllocationInfo.TopologyAwareAllocations)) |
| 106 | + for rdmaID := range rdmaAllocationInfo.TopologyAwareAllocations { |
| 107 | + allocatedDevices = append(allocatedDevices, rdmaID) |
| 108 | + } |
| 109 | + return &pluginapi.AssociatedDeviceAllocationResponse{ |
| 110 | + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ |
| 111 | + AllocatedDevices: allocatedDevices, |
| 112 | + }, |
| 113 | + }, nil |
| 114 | + } |
| 115 | + |
| 116 | + rdmaTopology, numaTopologyReady, err := p.DeviceTopologyRegistry.GetDeviceTopology(gpuconsts.RDMADeviceType) |
| 117 | + if err != nil { |
| 118 | + return nil, fmt.Errorf("failed to get gpu device topology: %v", err) |
| 119 | + } |
| 120 | + if !numaTopologyReady { |
| 121 | + return nil, fmt.Errorf("gpu device topology is not ready") |
| 122 | + } |
| 123 | + |
| 124 | + hintNodes, err := machine.NewCPUSetUint64(deviceReq.GetHint().GetNodes()...) |
| 125 | + if err != nil { |
| 126 | + general.Warningf("failed to get hint nodes: %v", err) |
| 127 | + return nil, err |
| 128 | + } |
| 129 | + |
| 130 | + var allocatedRdmaDevices []string |
| 131 | + |
| 132 | + // No accompany resource name |
| 133 | + if accompanyResourceName == "" { |
| 134 | + allocatedRdmaDevices, err = p.allocateWithNoAccompanyResource(deviceReq, rdmaTopology, hintNodes) |
| 135 | + if err != nil { |
| 136 | + return nil, fmt.Errorf("failed to allocate with no accompany resource: %v", err) |
| 137 | + } |
| 138 | + } else { |
| 139 | + allocatedRdmaDevices, err = p.allocateWithAccompanyResource(deviceReq, resReq, accompanyResourceName) |
| 140 | + if err != nil { |
| 141 | + return nil, fmt.Errorf("failed to allocate with accompany resource: %v", err) |
| 142 | + } |
| 143 | + } |
| 144 | + |
| 145 | + // Modify rdma state |
| 146 | + topologyAwareAllocations := make(map[string]state.Allocation) |
| 147 | + for _, deviceID := range allocatedRdmaDevices { |
| 148 | + info, ok := rdmaTopology.Devices[deviceID] |
| 149 | + if !ok { |
| 150 | + return nil, fmt.Errorf("failed to get rdma info for device %s", deviceID) |
| 151 | + } |
| 152 | + |
| 153 | + topologyAwareAllocations[deviceID] = state.Allocation{ |
| 154 | + Quantity: 1, |
| 155 | + NUMANodes: info.GetNUMANodes(), |
| 156 | + } |
| 157 | + } |
| 158 | + |
| 159 | + allocationInfo := &state.AllocationInfo{ |
| 160 | + AllocationMeta: commonstate.GenerateGenericContainerAllocationMeta(resReq, commonstate.EmptyOwnerPoolName, qosLevel), |
| 161 | + AllocatedAllocation: state.Allocation{ |
| 162 | + Quantity: 1, |
| 163 | + NUMANodes: hintNodes.ToSliceInt(), |
| 164 | + }, |
| 165 | + } |
| 166 | + |
| 167 | + allocationInfo.TopologyAwareAllocations = topologyAwareAllocations |
| 168 | + p.GetState().SetAllocationInfo(gpuconsts.RDMADeviceType, resReq.PodUid, resReq.ContainerName, allocationInfo, false) |
| 169 | + resourceState, err := p.GenerateResourceStateFromPodEntries(gpuconsts.RDMADeviceType, nil) |
| 170 | + if err != nil { |
| 171 | + return nil, fmt.Errorf("failed to generate rdma device state from pod entries: %v", err) |
| 172 | + } |
| 173 | + |
| 174 | + p.GetState().SetResourceState(gpuconsts.RDMADeviceType, resourceState, true) |
| 175 | + |
| 176 | + general.InfoS("allocated rdma devices", |
| 177 | + "podNamespace", resReq.PodNamespace, |
| 178 | + "podName", resReq.PodName, |
| 179 | + "containerName", resReq.ContainerName, |
| 180 | + "qosLevel", qosLevel, |
| 181 | + "allocatedRdmaDevices", allocatedRdmaDevices) |
| 182 | + |
| 183 | + return &pluginapi.AssociatedDeviceAllocationResponse{ |
| 184 | + AllocationResult: &pluginapi.AssociatedDeviceAllocation{ |
| 185 | + AllocatedDevices: allocatedRdmaDevices, |
| 186 | + }, |
| 187 | + }, nil |
| 188 | +} |
| 189 | + |
| 190 | +// allocateWithNoAccompanyResource allocates the rdma devices by best effort basis on the by making sure that |
| 191 | +// it fits the hint nodes. |
| 192 | +func (p *RDMADevicePlugin) allocateWithNoAccompanyResource( |
| 193 | + deviceReq *pluginapi.DeviceRequest, rdmaTopology *machine.DeviceTopology, hintNodes machine.CPUSet, |
| 194 | +) ([]string, error) { |
| 195 | + reqQuantity := deviceReq.GetDeviceRequest() |
| 196 | + |
| 197 | + machineState, ok := p.GetState().GetMachineState()[gpuconsts.RDMADeviceType] |
| 198 | + if !ok { |
| 199 | + return nil, fmt.Errorf("no machine state for resource %s", gpuconsts.RDMADeviceType) |
| 200 | + } |
| 201 | + |
| 202 | + allocatedDevices := sets.NewString() |
| 203 | + allocateDevices := func(devices ...string) bool { |
| 204 | + for _, device := range devices { |
| 205 | + allocatedDevices.Insert(device) |
| 206 | + if allocatedDevices.Len() >= int(reqQuantity) { |
| 207 | + return true |
| 208 | + } |
| 209 | + } |
| 210 | + return false |
| 211 | + } |
| 212 | + |
| 213 | + availableDevices := deviceReq.GetAvailableDevices() |
| 214 | + reusableDevices := deviceReq.GetReusableDevices() |
| 215 | + |
| 216 | + // allocate reusable devices first |
| 217 | + allocated := allocateDevices(reusableDevices...) |
| 218 | + if allocated { |
| 219 | + return allocatedDevices.UnsortedList(), nil |
| 220 | + } |
| 221 | + |
| 222 | + for _, device := range availableDevices { |
| 223 | + if !gpuutil.IsNUMAAffinityDevice(device, rdmaTopology, hintNodes) { |
| 224 | + continue |
| 225 | + } |
| 226 | + |
| 227 | + if !machineState.IsRequestSatisfied(device, 1, 1) { |
| 228 | + general.Infof("available numa affinity rdma %s is already allocated", device) |
| 229 | + continue |
| 230 | + } |
| 231 | + |
| 232 | + if allocateDevices(device) { |
| 233 | + return allocatedDevices.UnsortedList(), nil |
| 234 | + } |
| 235 | + } |
| 236 | + |
| 237 | + return nil, fmt.Errorf("not enough available RDMAs found in rdmaTopology, number of needed RDMAs: %d, availableDevices len: %d, allocatedDevices len: %d", reqQuantity, len(availableDevices), len(allocatedDevices)) |
| 238 | +} |
| 239 | + |
| 240 | +// allocateWithAccompanyResource allocates the rdma devices by first allocating the reusable devices, then allocating the |
| 241 | +// available devices proportionally by ensuring NUMA affinity with the accompany resource |
| 242 | +func (p *RDMADevicePlugin) allocateWithAccompanyResource( |
| 243 | + deviceReq *pluginapi.DeviceRequest, resReq *pluginapi.ResourceRequest, accompanyResourceName string, |
| 244 | +) ([]string, error) { |
| 245 | + var err error |
| 246 | + |
| 247 | + // Find out the accompany devices that are allocated to the container and allocate RDMA devices that correspond to the numa nodes of accompany device |
| 248 | + accompanyDeviceType := p.ResolveResourceName(accompanyResourceName, false) |
| 249 | + if accompanyDeviceType == "" { |
| 250 | + return nil, fmt.Errorf("failed to get device type for accompany resource: %s", accompanyResourceName) |
| 251 | + } |
| 252 | + |
| 253 | + // Allocate all the reusable devices first |
| 254 | + allocatedDevices := sets.NewString(deviceReq.ReusableDevices...) |
| 255 | + |
| 256 | + // Get ratio of accompany resource to target device |
| 257 | + accompanyResourceToTargetDeviceRatio := p.GetState().GetMachineState().GetRatioOfAccompanyResourceToTargetResource(accompanyDeviceType, gpuconsts.RDMADeviceType) |
| 258 | + |
| 259 | + // Allocate target device according to ratio of accompany resource to target device |
| 260 | + podResourceEntries := p.GetState().GetPodResourceEntries() |
| 261 | + totalAllocated, accompanyResourceIds := podResourceEntries.GetTotalAllocatedResourceOfContainer(v1.ResourceName(accompanyDeviceType), resReq.PodUid, resReq.ContainerName) |
| 262 | + |
| 263 | + rdmaToBeAllocated := int(math.Ceil(float64(totalAllocated) * accompanyResourceToTargetDeviceRatio)) |
| 264 | + |
| 265 | + // For every gpu that is allocated to the container, find out the rdma devices that have affinity to the same |
| 266 | + // numa nodes as the gpu and allocate them |
| 267 | + accompanyResourceToRdmaAffinityMap, err := p.DeviceTopologyRegistry.GetDeviceNUMAAffinity(accompanyDeviceType, gpuconsts.RDMADeviceType) |
| 268 | + if err != nil { |
| 269 | + general.Warningf("failed to get gpu to rdma affinity map: %v", err) |
| 270 | + return nil, err |
| 271 | + } |
| 272 | + |
| 273 | + machineState := p.GetState().GetMachineState()[v1.ResourceName(gpuconsts.RDMADeviceType)] |
| 274 | + |
| 275 | + allocateDevices := func(devices ...string) bool { |
| 276 | + for _, device := range devices { |
| 277 | + if allocatedDevices.Len() >= rdmaToBeAllocated { |
| 278 | + return true |
| 279 | + } |
| 280 | + allocatedDevices.Insert(device) |
| 281 | + } |
| 282 | + if allocatedDevices.Len() >= rdmaToBeAllocated { |
| 283 | + return true |
| 284 | + } |
| 285 | + return false |
| 286 | + } |
| 287 | + |
| 288 | + for accompanyResourceId := range accompanyResourceIds { |
| 289 | + rdmaDevices, ok := accompanyResourceToRdmaAffinityMap[accompanyResourceId] |
| 290 | + if !ok { |
| 291 | + general.Warningf("failed to get rdma device with accompany device id: %s", accompanyResourceId) |
| 292 | + continue |
| 293 | + } |
| 294 | + |
| 295 | + // Iterate through the rdma devices and check if they are already allocated |
| 296 | + for _, rdmaDevice := range rdmaDevices { |
| 297 | + if !machineState.IsRequestSatisfied(rdmaDevice, 1, 1) { |
| 298 | + continue |
| 299 | + } |
| 300 | + |
| 301 | + if allocateDevices(rdmaDevice) { |
| 302 | + return allocatedDevices.UnsortedList(), nil |
| 303 | + } |
| 304 | + } |
| 305 | + } |
| 306 | + |
| 307 | + // Did not find enough available rdma devices to allocate, return the devices that are already allocated |
| 308 | + return allocatedDevices.UnsortedList(), nil |
| 309 | +} |
0 commit comments