Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions docs/autoscaling/autoscaling.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,25 @@ As stated earlier, Claudie deploys Cluster Autoscaler and Autoscaler Adapter for
## Considerations

As Claudie just extends Cluster Autoscaler, it is important that you follow their [best practices](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-are-the-key-best-practices-for-running-cluster-autoscaler). Furthermore, as the number of nodes in autoscaled node pools can be volatile, you should carefully plan out how you will use the storage on such node pools. Longhorn support for Cluster Autoscaler is still in an experimental phase ([longhorn documentation](https://longhorn.io/docs/1.4.0/high-availability/k8s-cluster-autoscaler/)).

## GPUs

The custom Claudie-Provider for the [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) does not automatically determine whether the provided instance types have GPU capabilities. If you want autoscaling for a nodepool with GPUs, you must explicitly specify how many GPUs each node in the nodepool has.

```
- name: autoscaled
providerSpec:
name: aws
region: eu-central-1
zone: eu-central-1a
autoscaler:
min: 0
max: 20
# GPU machine type name.
serverType: g4dn.xlarge
machineSpec:
        # Explicitly specify how many GPUs the instance type provides.
nvidiaGpu: 1
# OS image name
image: ami-07eef52105e8a2059
```
5 changes: 3 additions & 2 deletions docs/input-manifest/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,9 @@ Dynamic nodepools are defined for cloud provider machines that Claudie is expect

Further describes the selected server type, if available by the cloud provider.

- `cpuCount`: specifies the number of cpu to be used by the `serverType`
- `memory`: specifies the memory in GB to be used by the `serverType`
- `cpuCount`: specifies the number of CPUs used by the `serverType`
- `memory`: specifies the memory in GB used by the `serverType`
- `nvidiaGpu`: specifies the number of nvidia GPUs used by the `serverType`

- `image`

Expand Down
13 changes: 9 additions & 4 deletions internal/api/manifest/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,15 @@ type Kubernetes struct {

// MachineSpec specifies further the configuration of the requested server type in DynamicNodePool.
type MachineSpec struct {
// CpuCount specifies the number of CPU cores to be used.
CpuCount int `validate:"required,gte=1" yaml:"cpuCount" json:"cpuCount"`
Memory int `validate:"required,gte=1" yaml:"memory" json:"memory"`
// CpuCount specifies the number of CPU cores the provided instance type will have.
// +optional
CpuCount int `validate:"required_with=Memory,gte=0" yaml:"cpuCount" json:"cpuCount"`
// Memory specifies the memory the provided instance type will have.
// +optional
Memory int `validate:"required_with=CpuCount,gte=0" yaml:"memory" json:"memory"`
	// NvidiaGpu specifies the number of NVIDIA GPUs the provided instance type will have.
// +optional
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu"`
}

// DynamicNodePool List of dynamically to-be-created nodepools of not yet existing machines, used for Kubernetes or loadbalancer clusters.
Expand Down Expand Up @@ -167,7 +173,6 @@ type DynamicNodePool struct {
Taints []k8sV1.Taint `validate:"omitempty" yaml:"taints" json:"taints"`
// MachineSpec further describe the properties of the selected server type.
MachineSpec *MachineSpec `validate:"omitempty" yaml:"machineSpec,omitempty" json:"machineSpec,omitempty"`
// Templates for setting up the Nodepool. (optional)
}

// Autoscaler configuration on per nodepool basis. Defines the number of nodes, autoscaler will scale up or down specific nodepool.
Expand Down
5 changes: 3 additions & 2 deletions internal/api/manifest/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,9 @@ func (ds *Manifest) CreateNodepools(pools []string, isControl bool) ([]*spec.Nod
var machineSpec *spec.MachineSpec
if nodePool.MachineSpec != nil {
machineSpec = &spec.MachineSpec{
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpu: int32(nodePool.MachineSpec.NvidiaGpu),
}
}

Expand Down
14 changes: 14 additions & 0 deletions internal/nodepools/nodepools_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package nodepools

import (
"sort"
"testing"

"github.com/berops/claudie/proto/pb/spec"
Expand Down Expand Up @@ -311,6 +312,19 @@ func TestLabelsTaintsAnnotationsToRemove(t *testing.T) {
for _, tt := range tests {
t.Run(tt.Name, func(t *testing.T) {
got := LabelsTaintsAnnotationsDiff(tt.Args.current, tt.Args.desired)

			// Sort the output so it is the same on every call.

for _, v := range got.LabelKeys {
sort.Strings(v)
}
for _, v := range got.AnnotationKeys {
sort.Strings(v)
}
for _, v := range got.TaintKeys {
sort.Slice(v, func(i, j int) bool { return v[i].Key < v[j].Key })
}

if diff := cmp.Diff(got, tt.Want, protocmp.Transform()); diff != "" {
t.Fatalf("labelsTaintsAnnotationsToRemove(%s) = %s", tt.Name, diff)
}
Expand Down
11 changes: 7 additions & 4 deletions manifests/claudie/crd/claudie.io_inputmanifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,16 @@ spec:
properties:
cpuCount:
description: CpuCount specifies the number of CPU cores
to be used.
the provided instance type will have.
type: integer
memory:
description: Memory specifies the memory the provided
instance type will have.
type: integer
nvidiaGpu:
description: Nvidia specifies the number of NVIDIA GPUs
the provided instance type will have.
type: integer
required:
- cpuCount
- memory
type: object
name:
description: Name of the nodepool. Each nodepool will have
Expand Down
85 changes: 85 additions & 0 deletions manifests/claudie/crd/claudie.io_settings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.18.0
labels:
app.kubernetes.io/part-of: claudie
name: settings.claudie.io
spec:
group: claudie.io
names:
kind: Setting
listKind: SettingList
plural: settings
singular: setting
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description:
Settings used for customization of deployed clusters via the
InputManifest.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
properties:
envoy:
description: |-
Envoy configuration to be referenced by a Role
in a LoadBalancer cluster in the InputManifest.
properties:
cds:
description: |-
Specifies the cluster dynamic configuration which will replace
the default claudie provided configuration.

Be careful when replacing the default configuration as you may break
the 'settings' configurable options for the role definition in the
InputManifest.

If you need to change the default behaviour, it is advisable to start
with the default configuration provided by claudie, which matches the
configurable options in the InputManifest, and then make your own changes
from there.
type: string
lds:
description: |-
Specifies the dynamic listener configuration that will replace the
default configuration provided by claudie.

Be careful when replacing the default configuration as you may break
the 'settings' configurable options for the role definition in the
InputManifest.

If you need to change the default behaviour, it is advisable to start
with the default configuration provided by claudie, which matches the
configurable options in the InputManifest, and then make your own changes
from there.
type: string
type: object
type: object
required:
- metadata
- spec
type: object
served: true
storage: true
16 changes: 8 additions & 8 deletions manifests/claudie/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,18 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: ghcr.io/berops/claudie/ansibler
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/autoscaler-adapter
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/builder
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/claudie-operator
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/kube-eleven
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/kuber
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/manager
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/terraformer
newTag: ae81a20-3587
newTag: 2634ffc-3590
2 changes: 1 addition & 1 deletion manifests/testing-framework/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,4 @@ secretGenerator:

images:
- name: ghcr.io/berops/claudie/testing-framework
newTag: ae81a20-3587
newTag: 2634ffc-3590
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
13 changes: 11 additions & 2 deletions proto/pb/spec/nodepool.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion proto/spec/nodepool.proto
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ message DynamicNodePool {
message MachineSpec {
int32 cpuCount = 1;
int32 memory = 2;
int32 nvidiaGpu = 3;
}

// Autoscaler configuration on per node pool basis.
Expand All @@ -112,4 +113,4 @@ enum StaticNodepoolInfo {
STATIC_PROVIDER = 0;
STATIC_REGION = 1;
STATIC_ZONE = 2;
}
}
51 changes: 35 additions & 16 deletions services/autoscaler-adapter/node_manager/node_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ type typeInfo struct {
memory int64
// Size in bytes
disk int64
	// Number of NVIDIA GPUs.
nvidiaGpus int64
}

// NewNodeManager returns a NodeManager pointer with initialised caches about nodes.
Expand Down Expand Up @@ -63,23 +65,40 @@ func (nm *NodeManager) Refresh(nodepools []*spec.NodePool) error {

// GetCapacity returns a theoretical capacity for a new node from specified nodepool.
func (nm *NodeManager) GetCapacity(np *spec.NodePool) k8sV1.ResourceList {
typeInfo := nm.getTypeInfo(np.GetDynamicNodePool().Provider.CloudProviderName, np.GetDynamicNodePool())
if typeInfo != nil {
var disk int64
		// Check if disk is defined for the instance.
if typeInfo.disk > 0 {
disk = typeInfo.disk
} else {
disk = int64(np.GetDynamicNodePool().StorageDiskSize) * 1024 * 1024 * 1024 // Convert to bytes
}
rl := k8sV1.ResourceList{}
rl[k8sV1.ResourcePods] = *resource.NewQuantity(defaultPodAmountsLimit, resource.DecimalSI)
rl[k8sV1.ResourceCPU] = *resource.NewQuantity(typeInfo.cpu, resource.DecimalSI)
rl[k8sV1.ResourceMemory] = *resource.NewQuantity(typeInfo.memory, resource.DecimalSI)
rl[k8sV1.ResourceStorage] = *resource.NewQuantity(disk, resource.DecimalSI)
return rl
dnp := np.GetDynamicNodePool()
if dnp == nil {
return nil
}
return nil

typeInfo := nm.getTypeInfo(dnp.Provider.CloudProviderName, dnp)
if typeInfo == nil {
return nil
}

var disk int64
	// Check if disk is defined for the instance.
if typeInfo.disk > 0 {
disk = typeInfo.disk
} else {
disk = int64(np.GetDynamicNodePool().StorageDiskSize) * 1024 * 1024 * 1024 // Convert to bytes
}

rl := k8sV1.ResourceList{}
rl[k8sV1.ResourcePods] = *resource.NewQuantity(defaultPodAmountsLimit, resource.DecimalSI)
rl[k8sV1.ResourceCPU] = *resource.NewQuantity(typeInfo.cpu, resource.DecimalSI)
rl[k8sV1.ResourceMemory] = *resource.NewQuantity(typeInfo.memory, resource.DecimalSI)
rl[k8sV1.ResourceStorage] = *resource.NewQuantity(disk, resource.DecimalSI)

if typeInfo.nvidiaGpus > 0 {
rl["nvidia.com/gpu"] = *resource.NewQuantity(typeInfo.nvidiaGpus, resource.DecimalSI)
}

// If the machine spec contains a valid number of NvidiaGPUs, prefer that value over the cached
// one from [typeInfo].
if dnp.MachineSpec != nil && dnp.MachineSpec.NvidiaGpu > 0 {
rl["nvidia.com/gpu"] = *resource.NewQuantity(int64(dnp.MachineSpec.NvidiaGpu), resource.DecimalSI)
}
return rl
}

// Arch returns the architecture for the dynamic nodepool.
Expand Down
Loading