Merged
15 changes: 13 additions & 2 deletions Makefile
@@ -1,4 +1,4 @@
.PHONY: proto manager builder terraformer ansibler kubeEleven test database minio containerimgs crd crd-apply controller-gen kind-load-images
.PHONY: proto manager builder terraformer ansibler kubeEleven test database minio containerimgs crd crd-apply controller-gen kind-load-images kind-deploy

# Enforce same version of protoc
PROTOC_VERSION = "29.5"
@@ -84,12 +84,23 @@ containerimgs:
done
$(SED_INPLACE) "s/adapter:.*$$/adapter/" services/kuber/templates/cluster-autoscaler.goyaml

KIND_CLUSTER ?= kind
KIND_NAMESPACE ?= claudie
kind-load-images:
for service in $(SERVICES) ; do \
echo " --- loading $$service to kind cluster --- "; \
kind load docker-image ghcr.io/berops/claudie/$$service:$(REV); \
kind load docker-image --name $(KIND_CLUSTER) ghcr.io/berops/claudie/$$service:$(REV); \
done

kind-deploy: kind-load-images
@echo " --- updating deployments in $(KIND_NAMESPACE) namespace --- "
@for svc in ansibler builder claudie-operator kube-eleven kuber manager terraformer; do \
echo " --- updating $$svc deployment --- "; \
kubectl set image deployment/$$svc $$svc=ghcr.io/berops/claudie/$$svc:$(REV) -n $(KIND_NAMESPACE); \
done
@echo " --- waiting for rollouts to complete --- "
@kubectl rollout status deployment -n $(KIND_NAMESPACE)
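# Example invocation (hypothetical names; the defaults above are
# KIND_CLUSTER=kind and KIND_NAMESPACE=claudie):
#   make kind-deploy KIND_CLUSTER=dev KIND_NAMESPACE=claudie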

# Generate CustomResourceDefinition objects.
crd:
go tool controller-gen rbac:roleName=manager-role crd paths="./..." output:crd:artifacts:config=manifests/claudie/crd
2 changes: 1 addition & 1 deletion README.md
@@ -82,7 +82,7 @@ Before you begin, please make sure you have the following prerequisites installe
| --------------------------------------------------------------------------------- | ------------------ | ------------------ |------------------ | ------------------ |
| [AWS](https://docs.claudie.io/latest/input-manifest/providers/aws/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Azure](https://docs.claudie.io/latest/input-manifest/providers/azure/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [GCP](https://docs.claudie.io/latest/input-manifest/providers/gcp/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :x: |
| [GCP](https://docs.claudie.io/latest/input-manifest/providers/gcp/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [OCI](https://docs.claudie.io/latest/input-manifest/providers/oci/) | :heavy_check_mark: | :heavy_check_mark: |:heavy_check_mark: | :heavy_check_mark: |
| [Hetzner](https://docs.claudie.io/latest/input-manifest/providers/hetzner/) | :heavy_check_mark: | :heavy_check_mark: | N/A | :heavy_check_mark: |
| [Cloudflare](https://docs.claudie.io/latest/input-manifest/providers/cloudflare/) | N/A | :heavy_check_mark: |:heavy_check_mark: | N/A |
4 changes: 3 additions & 1 deletion docs/input-manifest/api-reference.md
@@ -246,7 +246,9 @@ Dynamic nodepools are defined for cloud provider machines that Claudie is expect

- `cpuCount`: specifies the number of CPUs used by the `serverType`
- `memory`: specifies the memory in GB used by the `serverType`
- `nvidiaGpu`: specifies the number of nvidia GPUs used by the `serverType`
- `nvidiaGpuCount`: specifies the number of NVIDIA GPUs used by the `serverType`
- `nvidiaGpuType`: specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs). Examples: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- `nvidiaGpu`: (deprecated) use `nvidiaGpuCount` instead

- `image`

69 changes: 68 additions & 1 deletion docs/input-manifest/gpu-example.md
@@ -3,7 +3,9 @@ from [Nvidia](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/lates
to deploy the `gpu-operator` into a Claudie-built Kubernetes cluster. If you decide to use a different
cloud provider, make sure you fulfill the prerequisites listed there before continuing.

In this example we will be using [AWS](providers/aws.md) as our provider, with the following config:
## AWS GPU Example

In this example we will be using [AWS](providers/aws.md) as our provider. AWS GPU instances (like `g4dn.xlarge`) come with GPUs attached, so no additional `machineSpec` configuration is needed:

```yaml
apiVersion: claudie.io/v1beta1
@@ -57,6 +59,71 @@ spec:
- gpu-aws
```

## GCP GPU Example

For [GCP](providers/gcp.md), you must explicitly specify the GPU type and count using the `machineSpec` block. GCP requires both `nvidiaGpuCount` and `nvidiaGpuType` to attach GPUs to instances:

```yaml
apiVersion: claudie.io/v1beta1
kind: InputManifest
metadata:
name: gcp-gpu-example
labels:
app.kubernetes.io/part-of: claudie
spec:
providers:
- name: gcp-1
providerType: gcp
secretRef:
name: gcp-secret
namespace: secrets

nodePools:
dynamic:
- name: control-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 1
serverType: e2-medium
image: ubuntu-2404-noble-amd64-v20251001

- name: gpu-gcp
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
# Use n1-standard machine types for GPU attachment
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
storageDiskSize: 50
# GPU configuration required for GCP
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4

kubernetes:
clusters:
- name: gpu-example
version: v1.31.0
network: 172.16.2.0/24
pools:
control:
- control-gcp
compute:
- gpu-gcp
```

!!! note "GCP GPU Requirements"
- The `nvidiaGpuType` field is required when `nvidiaGpuCount > 0` for GCP providers
- Available GPU types vary by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for availability
- Common GPU types: `nvidia-tesla-t4`, `nvidia-tesla-v100`, `nvidia-tesla-a100`, `nvidia-l4`
- GPU instances cannot be live migrated, so they will be terminated during maintenance events
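
After the GPU Operator (installed in the next section) is running, a quick way to confirm the GPUs are schedulable is a one-off smoke-test pod. This is a sketch, assuming the operator exposes the standard `nvidia.com/gpu` resource; the image tag is illustrative, so substitute any CUDA-enabled test image:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: cuda-vectoradd
      # Illustrative image; any CUDA-enabled test image works here.
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1
      resources:
        limits:
          nvidia.com/gpu: 1  # requests one GPU, scheduling the pod onto a GPU node
```

If the pod runs to completion, the GPUs on the `gpu-gcp` nodepool are schedulable.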

## Deploying the GPU Operator

After the `InputManifest` has been successfully built by Claudie, deploy the `gpu-operator` to the `gpu-example` Kubernetes cluster.

1. Create a namespace for the gpu-operator.
47 changes: 47 additions & 0 deletions docs/input-manifest/providers/gcp.md
@@ -69,6 +69,53 @@ If you wish to use GCP as your DNS provider where Claudie creates DNS records po
!!! warning "GCP is not my domain registrar"
If you haven't acquired a domain via GCP and wish to utilize GCP for hosting your zone, you can refer to [this guide](https://cloud.google.com/dns/docs/update-name-servers) on GCP nameservers. However, if you prefer not to use the entire domain, an alternative option is to delegate a subdomain to GCP.

## GPU Support

GCP requires explicit configuration to attach GPUs to compute instances. Unlike some other providers where GPU-enabled instance types automatically include GPUs, GCP uses a separate `guest_accelerator` mechanism that requires both GPU count and GPU type to be specified.

### Configuration

To use GPUs with GCP nodepools, you must specify both `nvidiaGpuCount` and `nvidiaGpuType` in the `machineSpec` block:

```yaml
nodePools:
dynamic:
- name: gpu-nodepool
providerSpec:
name: gcp-1
region: us-central1
zone: us-central1-a
count: 2
serverType: n1-standard-4
image: ubuntu-2404-noble-amd64-v20251001
machineSpec:
nvidiaGpuCount: 1
nvidiaGpuType: nvidia-tesla-t4
```

### Available GPU Types

Common NVIDIA GPU accelerator types available on GCP:

| GPU Type | Description |
|----------|-------------|
| `nvidia-tesla-t4` | NVIDIA Tesla T4 (cost-effective for inference) |
| `nvidia-tesla-v100` | NVIDIA Tesla V100 (high-performance training) |
| `nvidia-tesla-a100` | NVIDIA A100 (large-scale training and HPC) |
| `nvidia-l4` | NVIDIA L4 (successor to T4) |
| `nvidia-tesla-p100` | NVIDIA Tesla P100 |
| `nvidia-tesla-k80` | NVIDIA Tesla K80 (legacy) |

!!! note "GPU Availability"
GPU availability varies by zone. Check [GCP GPU regions and zones](https://cloud.google.com/compute/docs/gpus/gpu-regions-zones) for current availability in your desired region.

!!! warning "GPU Instance Limitations"
- GPU instances cannot be live migrated and will be terminated during maintenance events
- Use `n1-standard-*` or `n1-highmem-*` machine types with GPUs (not `e2-*` types)
- Some GPU types have minimum vCPU and memory requirements

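Manifests that still use the deprecated `nvidiaGpu` field keep working: when `nvidiaGpuCount` is unset, the value of `nvidiaGpu` is used as the count (see `CreateNodepools` in `internal/api/manifest/utils.go`). New manifests should use the explicit pair; a minimal before/after sketch:

```yaml
# Deprecated form (still accepted; treated as nvidiaGpuCount internally):
machineSpec:
  nvidiaGpu: 1

# Preferred form:
machineSpec:
  nvidiaGpuCount: 1
  nvidiaGpuType: nvidia-tesla-t4  # required on GCP whenever the count is > 0
```
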
For a complete GPU deployment example including the NVIDIA GPU Operator installation, see the [GPU Example](../gpu-example.md).

## Input manifest examples
### Single provider, multi region cluster example

12 changes: 10 additions & 2 deletions internal/api/manifest/manifest.go
@@ -135,9 +135,17 @@ type MachineSpec struct {
// Memory specifies the memory the provided instance type will have.
// +optional
Memory int `validate:"required_with=CpuCount,gte=0" yaml:"memory" json:"memory"`
// Nvidia specifies the number of NVIDIA GPUs the provided instance type will have.
// NvidiaGpuCount specifies the number of NVIDIA GPUs the provided instance type will have.
// +optional
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu"`
NvidiaGpuCount int `validate:"gte=0" yaml:"nvidiaGpuCount" json:"nvidiaGpuCount"`
// NvidiaGpuType specifies the NVIDIA GPU accelerator type (required for GCP when using GPUs).
// Examples: nvidia-tesla-k80, nvidia-tesla-v100, nvidia-tesla-a100, nvidia-l4
// +optional
NvidiaGpuType string `validate:"omitempty" yaml:"nvidiaGpuType" json:"nvidiaGpuType,omitempty"`
// NvidiaGpu is deprecated, use NvidiaGpuCount instead. Kept for backward compatibility.
// +optional
// Deprecated: Use NvidiaGpuCount instead.
NvidiaGpu int `validate:"gte=0" yaml:"nvidiaGpu" json:"nvidiaGpu,omitempty"`
}

// DynamicNodePool List of dynamically to-be-created nodepools of not yet existing machines, used for Kubernetes or loadbalancer clusters.
12 changes: 9 additions & 3 deletions internal/api/manifest/utils.go
@@ -293,10 +293,16 @@ func (ds *Manifest) CreateNodepools(pools []string, isControl bool) ([]*spec.Nod

var machineSpec *spec.MachineSpec
if nodePool.MachineSpec != nil {
// Use NvidiaGpuCount as primary, fall back to deprecated NvidiaGpu for backward compatibility
gpuCount := int32(nodePool.MachineSpec.NvidiaGpuCount)
if gpuCount == 0 && nodePool.MachineSpec.NvidiaGpu > 0 {
gpuCount = int32(nodePool.MachineSpec.NvidiaGpu)
}
machineSpec = &spec.MachineSpec{
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpu: int32(nodePool.MachineSpec.NvidiaGpu),
CpuCount: int32(nodePool.MachineSpec.CpuCount),
Memory: int32(nodePool.MachineSpec.Memory),
NvidiaGpuCount: gpuCount,
NvidiaGpuType: nodePool.MachineSpec.NvidiaGpuType,
}
}

36 changes: 36 additions & 0 deletions internal/api/manifest/validate_node_pool.go
@@ -125,6 +125,11 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return fmt.Errorf("max available count for a nodepool is 255")
}

// Validate GCP-specific GPU configuration
if err := d.validateGCPGpuConfig(m); err != nil {
return err
}

validate := validator.New()

if err := validate.RegisterValidation("external_net", validateExternalNet); err != nil {
@@ -137,6 +142,37 @@ func (d *DynamicNodePool) Validate(m *Manifest) error {
return nil
}

// validateGCPGpuConfig validates that GCP nodepools with GPUs have the required type specified.
// GCP requires the guest_accelerator block with both type and count to attach GPUs to instances.
func (d *DynamicNodePool) validateGCPGpuConfig(m *Manifest) error {
providerType, err := m.GetProviderType(d.ProviderSpec.Name)
if err != nil {
// Provider existence is validated in NodePool.Validate() at lines 28-31
// before DynamicNodePool.Validate() is called.
return nil
}

if providerType != "gcp" {
return nil
}

if d.MachineSpec == nil {
return nil
}

// Check both NvidiaGpuCount (new) and NvidiaGpu (deprecated) for backward compatibility
gpuCount := d.MachineSpec.NvidiaGpuCount
if gpuCount == 0 {
gpuCount = d.MachineSpec.NvidiaGpu
}

if gpuCount > 0 && d.MachineSpec.NvidiaGpuType == "" {
return fmt.Errorf("nvidiaGpuType is required for GCP when nvidiaGpuCount > 0")
}

return nil
}

func (s *StaticNodePool) Validate() error {
if err := validator.New().Struct(s); err != nil {
return prettyPrintValidationError(err)
109 changes: 109 additions & 0 deletions internal/api/manifest/validate_test.go
@@ -329,3 +329,112 @@ func TestOptionalZone(t *testing.T) {
// Nodepools with zone should still pass validation
r.NoError(testNodepoolWithZone.Validate(&Manifest{}))
}

// TestGCPGpuValidation tests that GCP nodepools with GPUs require nvidiaGpuType to be specified.
func TestGCPGpuValidation(t *testing.T) {
r := require.New(t)

// Create a manifest with a GCP provider
gcpManifest := &Manifest{
Providers: Provider{
GCP: []GCP{{
Name: "gcp-1",
Credentials: "fake-credentials",
GCPProject: "fake-project",
}},
},
}

// Create a manifest with a Hetzner provider (non-GCP)
hetznerManifest := &Manifest{
Providers: Provider{
Hetzner: []Hetzner{{
Name: "hetzner-1",
Credentials: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
}},
},
}

// Test case 1: GCP nodepool with GPU count but no type - should fail
gcpNodepoolGpuNoType := &DynamicNodePool{
Name: "gpu-np",
ServerType: "n1-standard-4",
Image: "ubuntu-2204",
Count: 1,
ProviderSpec: ProviderSpec{
Name: "gcp-1",
Region: "us-central1",
Zone: "us-central1-a",
},
MachineSpec: &MachineSpec{
NvidiaGpuCount: 1,
},
}
r.Error(gcpNodepoolGpuNoType.Validate(gcpManifest), "GCP nodepool with GPU count but no type should fail validation")

// Test case 2: GCP nodepool with GPU count and type - should pass
gcpNodepoolGpuWithType := &DynamicNodePool{
Name: "gpu-np",
ServerType: "n1-standard-4",
Image: "ubuntu-2204",
Count: 1,
ProviderSpec: ProviderSpec{
Name: "gcp-1",
Region: "us-central1",
Zone: "us-central1-a",
},
MachineSpec: &MachineSpec{
NvidiaGpuCount: 1,
NvidiaGpuType: "nvidia-tesla-t4",
},
}
r.NoError(gcpNodepoolGpuWithType.Validate(gcpManifest), "GCP nodepool with GPU count and type should pass validation")

// Test case 3: GCP nodepool without GPU - should pass
gcpNodepoolNoGpu := &DynamicNodePool{
Name: "regular-np",
ServerType: "e2-medium",
Image: "ubuntu-2204",
Count: 1,
ProviderSpec: ProviderSpec{
Name: "gcp-1",
Region: "us-central1",
Zone: "us-central1-a",
},
}
r.NoError(gcpNodepoolNoGpu.Validate(gcpManifest), "GCP nodepool without GPU should pass validation")

// Test case 4: Non-GCP nodepool with GPU count but no type - should pass (only GCP requires type)
hetznerNodepoolGpuNoType := &DynamicNodePool{
Name: "gpu-np",
ServerType: "cx21",
Image: "ubuntu-22.04",
Count: 1,
ProviderSpec: ProviderSpec{
Name: "hetzner-1",
Region: "fsn1",
Zone: "fsn1-dc14",
},
MachineSpec: &MachineSpec{
NvidiaGpuCount: 1,
},
}
r.NoError(hetznerNodepoolGpuNoType.Validate(hetznerManifest), "Non-GCP nodepool with GPU count but no type should pass validation")

// Test case 5: Non-GCP nodepool with deprecated nvidiaGpu field but no type - should pass (backward compatibility)
hetznerNodepoolDeprecatedGpu := &DynamicNodePool{
Name: "gpu-np-dep",
ServerType: "cx21",
Image: "ubuntu-22.04",
Count: 1,
ProviderSpec: ProviderSpec{
Name: "hetzner-1",
Region: "fsn1",
Zone: "fsn1-dc14",
},
MachineSpec: &MachineSpec{
NvidiaGpu: 1, // Using deprecated field
},
}
r.NoError(hetznerNodepoolDeprecatedGpu.Validate(hetznerManifest), "Non-GCP nodepool with deprecated nvidiaGpu but no type should pass validation")
}