Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions docs/autoscaling/autoscaling.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,25 @@ As stated earlier, Claudie deploys Cluster Autoscaler and Autoscaler Adapter for
## Considerations

As Claudie just extends Cluster Autoscaler, it is important that you follow their [best practices](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-are-the-key-best-practices-for-running-cluster-autoscaler). Furthermore, as the number of nodes in autoscaled node pools can be volatile, you should carefully plan out how you will use the storage on such node pools. Longhorn support for Cluster Autoscaler is still in an experimental phase ([longhorn documentation](https://longhorn.io/docs/1.4.0/high-availability/k8s-cluster-autoscaler/)).

## GPUs

The custom Claudie-Provider for the [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) does not automatically determine whether the provided instance types have GPU capabilities. If you want autoscaling for a nodepool with GPUs, you must explicitly specify how many GPUs each node in the nodepool has.

```
- name: autoscaled
providerSpec:
name: aws
region: eu-central-1
zone: eu-central-1a
autoscaler:
min: 0
max: 20
# GPU machine type name.
serverType: g4dn.xlarge
machineSpec:
        # Explicitly specify how many GPUs the instance type provides.
nvidiaGpu: 1
# OS image name
image: ami-07eef52105e8a2059
```
5 changes: 3 additions & 2 deletions docs/input-manifest/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,9 @@ Dynamic nodepools are defined for cloud provider machines that Claudie is expect

Further describes the selected server type, if available by the cloud provider.

- `cpuCount`: specifies the number of cpu to be used by the `serverType`
- `memory`: specifies the memory in GB to be used by the `serverType`
- `cpuCount`: specifies the number of CPUs used by the `serverType`
- `memory`: specifies the memory in GB used by the `serverType`
- `nvidiaGpu`: specifies the number of nvidia GPUs used by the `serverType`

- `image`

Expand Down
13 changes: 9 additions & 4 deletions internal/api/manifest/manifest.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,15 @@ type Kubernetes struct {

// MachineSpec specifies further the configuration of the requested server type in DynamicNodePool.
type MachineSpec struct {
// CpuCount specifies the number of CPU cores to be used.
CpuCount int `validate:"required,gte=1" yaml:"cpuCount" json:"cpuCount"`
Memory int `validate:"required,gte=1" yaml:"memory" json:"memory"`
// CpuCount specifies the number of CPU cores the provided instance type will have.
// +optional
CpuCount int `validate:"required_with=Memory,gte=0" yaml:"cpuCount" json:"cpuCount"`
// Memory specifies the memory the provided instance type will have.
// +optional
Memory int `validate:"required_with=CpuCount,gte=0" yaml:"memory" json:"memory"`
	// NvidiaGpu specifies the number of NVIDIA GPUs the provided instance type will have.
// +optional
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu"`
}

// DynamicNodePool List of dynamically to-be-created nodepools of not yet existing machines, used for Kubernetes or loadbalancer clusters.
Expand Down Expand Up @@ -167,7 +173,6 @@ type DynamicNodePool struct {
Taints []k8sV1.Taint `validate:"omitempty" yaml:"taints" json:"taints"`
// MachineSpec further describe the properties of the selected server type.
MachineSpec *MachineSpec `validate:"omitempty" yaml:"machineSpec,omitempty" json:"machineSpec,omitempty"`
// Templates for setting up the Nodepool. (optional)
}

// Autoscaler configuration on per nodepool basis. Defines the number of nodes, autoscaler will scale up or down specific nodepool.
Expand Down
5 changes: 3 additions & 2 deletions internal/api/manifest/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,9 @@ func (ds *Manifest) CreateNodepools(pools []string, isControl bool) ([]*spec.Nod
var machineSpec *spec.MachineSpec
if nodePool.MachineSpec != nil {
machineSpec = &spec.MachineSpec{
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpu: int32(nodePool.MachineSpec.NvidiaGpu),
}
}

Expand Down
14 changes: 14 additions & 0 deletions internal/nodepools/nodepools_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package nodepools

import (
"sort"
"testing"

"github.com/berops/claudie/proto/pb/spec"
Expand Down Expand Up @@ -311,6 +312,19 @@ func TestLabelsTaintsAnnotationsToRemove(t *testing.T) {
for _, tt := range tests {
t.Run(tt.Name, func(t *testing.T) {
got := LabelsTaintsAnnotationsDiff(tt.Args.current, tt.Args.desired)

			// Sort the output so it is the same on every call.

for _, v := range got.LabelKeys {
sort.Strings(v)
}
for _, v := range got.AnnotationKeys {
sort.Strings(v)
}
for _, v := range got.TaintKeys {
sort.Slice(v, func(i, j int) bool { return v[i].Key < v[j].Key })
}

if diff := cmp.Diff(got, tt.Want, protocmp.Transform()); diff != "" {
t.Fatalf("labelsTaintsAnnotationsToRemove(%s) = %s", tt.Name, diff)
}
Expand Down
11 changes: 7 additions & 4 deletions manifests/claudie/crd/claudie.io_inputmanifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,16 @@ spec:
properties:
cpuCount:
description: CpuCount specifies the number of CPU cores
to be used.
the provided instance type will have.
type: integer
memory:
description: Memory specifies the memory the provided
instance type will have.
type: integer
nvidiaGpu:
description: Nvidia specifies the number of NVIDIA GPUs
the provided instance type will have.
type: integer
required:
- cpuCount
- memory
type: object
name:
description: Name of the nodepool. Each nodepool will have
Expand Down
85 changes: 85 additions & 0 deletions manifests/claudie/crd/claudie.io_settings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.18.0
labels:
app.kubernetes.io/part-of: claudie
name: settings.claudie.io
spec:
group: claudie.io
names:
kind: Setting
listKind: SettingList
plural: settings
singular: setting
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description:
Settings used for customization of deployed clusters via the
InputManifest.
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
properties:
envoy:
description: |-
Envoy configuration to be referenced by a Role
in a LoadBalancer cluster in the InputManifest.
properties:
cds:
description: |-
Specifies the cluster dynamic configuration which will replace
the default claudie provided configuration.

Be careful when replacing the default configuration as you may break
the 'settings' configurable options for the role definition in the
InputManifest.

If you need to change the default behaviour, it is advisable to start
with the default configuration provided by claudie, which matches the
configurable options in the InputManifest, and then make your own changes
from there.
type: string
lds:
description: |-
Specifies the dynamic listener configuration that will replace the
default configuration provided by claudie.

Be careful when replacing the default configuration as you may break
the 'settings' configurable options for the role definition in the
InputManifest.

If you need to change the default behaviour, it is advisable to start
with the default configuration provided by claudie, which matches the
configurable options in the InputManifest, and then make your own changes
from there.
type: string
type: object
type: object
required:
- metadata
- spec
type: object
served: true
storage: true
16 changes: 8 additions & 8 deletions manifests/claudie/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,18 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: ghcr.io/berops/claudie/ansibler
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/autoscaler-adapter
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/builder
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/claudie-operator
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/kube-eleven
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/kuber
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/manager
newTag: ae81a20-3587
newTag: 2634ffc-3590
- name: ghcr.io/berops/claudie/terraformer
newTag: ae81a20-3587
newTag: 2634ffc-3590
2 changes: 1 addition & 1 deletion manifests/testing-framework/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,4 @@ secretGenerator:

images:
- name: ghcr.io/berops/claudie/testing-framework
newTag: ae81a20-3587
newTag: 2634ffc-3590
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
2 changes: 1 addition & 1 deletion manifests/testing-framework/test-sets/autoscaling-1/3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ spec:
kubernetes:
clusters:
- name: autoscaling-cluster-test-set
version: 1.30.0
version: 1.31.0
network: 192.168.2.0/24
pools:
control:
Expand Down
13 changes: 11 additions & 2 deletions proto/pb/spec/nodepool.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion proto/spec/nodepool.proto
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ message DynamicNodePool {
message MachineSpec {
int32 cpuCount = 1;
int32 memory = 2;
int32 nvidiaGpu = 3;
}

// Autoscaler configuration on per node pool basis.
Expand All @@ -112,4 +113,4 @@ enum StaticNodepoolInfo {
STATIC_PROVIDER = 0;
STATIC_REGION = 1;
STATIC_ZONE = 2;
}
}
51 changes: 35 additions & 16 deletions services/autoscaler-adapter/node_manager/node_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ type typeInfo struct {
memory int64
// Size in bytes
disk int64
	// Number of NVIDIA GPUs.
nvidiaGpus int64
}

// NewNodeManager returns a NodeManager pointer with initialised caches about nodes.
Expand Down Expand Up @@ -63,23 +65,40 @@ func (nm *NodeManager) Refresh(nodepools []*spec.NodePool) error {

// GetCapacity returns a theoretical capacity for a new node from specified nodepool.
func (nm *NodeManager) GetCapacity(np *spec.NodePool) k8sV1.ResourceList {
typeInfo := nm.getTypeInfo(np.GetDynamicNodePool().Provider.CloudProviderName, np.GetDynamicNodePool())
if typeInfo != nil {
var disk int64
		// Check if disk is defined for the instance.
if typeInfo.disk > 0 {
disk = typeInfo.disk
} else {
disk = int64(np.GetDynamicNodePool().StorageDiskSize) * 1024 * 1024 * 1024 // Convert to bytes
}
rl := k8sV1.ResourceList{}
rl[k8sV1.ResourcePods] = *resource.NewQuantity(defaultPodAmountsLimit, resource.DecimalSI)
rl[k8sV1.ResourceCPU] = *resource.NewQuantity(typeInfo.cpu, resource.DecimalSI)
rl[k8sV1.ResourceMemory] = *resource.NewQuantity(typeInfo.memory, resource.DecimalSI)
rl[k8sV1.ResourceStorage] = *resource.NewQuantity(disk, resource.DecimalSI)
return rl
dnp := np.GetDynamicNodePool()
if dnp == nil {
return nil
}
return nil

typeInfo := nm.getTypeInfo(dnp.Provider.CloudProviderName, dnp)
if typeInfo == nil {
return nil
}

var disk int64
	// Check if disk is defined for the instance.
if typeInfo.disk > 0 {
disk = typeInfo.disk
} else {
disk = int64(np.GetDynamicNodePool().StorageDiskSize) * 1024 * 1024 * 1024 // Convert to bytes
}

rl := k8sV1.ResourceList{}
rl[k8sV1.ResourcePods] = *resource.NewQuantity(defaultPodAmountsLimit, resource.DecimalSI)
rl[k8sV1.ResourceCPU] = *resource.NewQuantity(typeInfo.cpu, resource.DecimalSI)
rl[k8sV1.ResourceMemory] = *resource.NewQuantity(typeInfo.memory, resource.DecimalSI)
rl[k8sV1.ResourceStorage] = *resource.NewQuantity(disk, resource.DecimalSI)

if typeInfo.nvidiaGpus > 0 {
rl["nvidia.com/gpu"] = *resource.NewQuantity(typeInfo.nvidiaGpus, resource.DecimalSI)
}

// If the machine spec contains a valid number of NvidiaGPUs, prefer that value over the cached
// one from [typeInfo].
if dnp.MachineSpec != nil && dnp.MachineSpec.NvidiaGpu > 0 {
rl["nvidia.com/gpu"] = *resource.NewQuantity(int64(dnp.MachineSpec.NvidiaGpu), resource.DecimalSI)
}
return rl
}

// Arch returns the architecture for the dynamic nodepool.
Expand Down
Loading