Skip to content

Commit 40c71bb

Browse files
committed
cluster-api: node template in scale-from-0-nodes scenario with DRA
Modify TemplateNodeInfo() to also return a ResourceSlice template. This supports the DRA (Dynamic Resource Allocation) extension of Cluster Autoscaler by letting users declare the number of GPUs and the DRA driver name through annotations on the NodeGroup provided by cluster-api. Signed-off-by: Tsubasa Watanabe <[email protected]>
1 parent dffe7ac commit 40c71bb

File tree

3 files changed

+63
-1
lines changed

3 files changed

+63
-1
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,12 @@ func (ng *nodegroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
283283
return nil, err
284284
}
285285

286-
nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
286+
resourceSlices, err := ng.scalableResource.InstanceResourceSlices(nodeName)
287+
if err != nil {
288+
return nil, err
289+
}
290+
291+
nodeInfo := framework.NewNodeInfo(&node, resourceSlices, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.scalableResource.Name())})
287292
return nodeInfo, nil
288293
}
289294

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go

+47
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,21 @@ import (
2020
"context"
2121
"fmt"
2222
"path"
23+
"strconv"
2324
"strings"
2425
"time"
2526

2627
"github.com/pkg/errors"
2728
apiv1 "k8s.io/api/core/v1"
2829
corev1 "k8s.io/api/core/v1"
30+
resourceapi "k8s.io/api/resource/v1beta1"
2931
"k8s.io/apimachinery/pkg/api/resource"
3032
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3133
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
3234
"k8s.io/apimachinery/pkg/runtime/schema"
3335
"k8s.io/apimachinery/pkg/util/validation"
3436
klog "k8s.io/klog/v2"
37+
"k8s.io/utils/ptr"
3538
)
3639

3740
type unstructuredScalableResource struct {
@@ -297,6 +300,46 @@ func (r unstructuredScalableResource) InstanceCapacity() (map[corev1.ResourceNam
297300
return capacity, nil
298301
}
299302

303+
func (r unstructuredScalableResource) InstanceResourceSlices(nodeName string) ([]*resourceapi.ResourceSlice, error) {
304+
driver := r.InstanceDRADriver()
305+
gpuCount, err := r.InstanceGPUCapacityAnnotation()
306+
if err != nil {
307+
return nil, err
308+
}
309+
310+
var result []*resourceapi.ResourceSlice
311+
if driver != "" && !gpuCount.IsZero() {
312+
resourceslice := &resourceapi.ResourceSlice{
313+
ObjectMeta: metav1.ObjectMeta{
314+
Name: nodeName + "-" + driver,
315+
},
316+
Spec: resourceapi.ResourceSliceSpec{
317+
Driver: driver,
318+
NodeName: nodeName,
319+
Pool: resourceapi.ResourcePool{
320+
Name: nodeName,
321+
},
322+
},
323+
}
324+
for i := 0; i < int(gpuCount.Value()); i++ {
325+
device := resourceapi.Device{
326+
Name: "gpu-" + strconv.Itoa(i),
327+
Basic: &resourceapi.BasicDevice{
328+
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
329+
"type": {
330+
StringValue: ptr.To(GpuDeviceType),
331+
},
332+
},
333+
},
334+
}
335+
resourceslice.Spec.Devices = append(resourceslice.Spec.Devices, device)
336+
}
337+
result = append(result, resourceslice)
338+
return result, nil
339+
}
340+
return nil, nil
341+
}
342+
300343
// InstanceEphemeralDiskCapacityAnnotation reads the annotations of the
// underlying unstructured resource and returns the result of
// parseEphemeralDiskCapacity applied to them.
func (r unstructuredScalableResource) InstanceEphemeralDiskCapacityAnnotation() (resource.Quantity, error) {
	annotations := r.unstructured.GetAnnotations()
	return parseEphemeralDiskCapacity(annotations)
}
@@ -321,6 +364,10 @@ func (r unstructuredScalableResource) InstanceMaxPodsCapacityAnnotation() (resou
321364
return parseMaxPodsCapacity(r.unstructured.GetAnnotations())
322365
}
323366

367+
func (r unstructuredScalableResource) InstanceDRADriver() string {
368+
return parseDRADriver(r.unstructured.GetAnnotations())
369+
}
370+
324371
func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*unstructured.Unstructured, error) {
325372
infraref, found, err := unstructured.NestedStringMap(r.unstructured.Object, "spec", "template", "spec", "infrastructureRef")
326373
if !found || err != nil {

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go

+10
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ const (
4040
maxPodsKey = "capacity.cluster-autoscaler.kubernetes.io/maxPods"
4141
taintsKey = "capacity.cluster-autoscaler.kubernetes.io/taints"
4242
labelsKey = "capacity.cluster-autoscaler.kubernetes.io/labels"
43+
draDriverKey = "capacity.cluster-autoscaler.kubernetes.io/dra-driver"
4344
// UnknownArch is used if the Architecture is Unknown
4445
UnknownArch SystemArchitecture = ""
4546
// Amd64 is used if the Architecture is x86_64
@@ -54,6 +55,8 @@ const (
5455
DefaultArch = Amd64
5556
// scaleUpFromZeroDefaultArchEnvVar is the name of the env var for the default architecture
5657
scaleUpFromZeroDefaultArchEnvVar = "CAPI_SCALE_ZERO_DEFAULT_ARCH"
58+
// GpuDeviceType is the device type used when the DRA device is a GPU
59+
GpuDeviceType = "gpu"
5760
)
5861

5962
var (
@@ -282,6 +285,13 @@ func parseMaxPodsCapacity(annotations map[string]string) (resource.Quantity, err
282285
return parseIntKey(annotations, maxPodsKey)
283286
}
284287

288+
func parseDRADriver(annotations map[string]string) string {
289+
if val, found := annotations[draDriverKey]; found {
290+
return val
291+
}
292+
return ""
293+
}
294+
285295
func clusterNameFromResource(r *unstructured.Unstructured) string {
286296
// Use Spec.ClusterName if defined (only available on v1alpha3+ types)
287297
clusterName, found, err := unstructured.NestedString(r.Object, "spec", "clusterName")

0 commit comments

Comments
 (0)