Website: https://www.cast.ai
- Terraform 0.13+
A module to connect an EKS cluster to CAST AI.
Requires castai/castai and hashicorp/aws providers to be configured.
module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  aws_account_id      = var.aws_account_id
  aws_cluster_region  = var.cluster_region
  aws_cluster_name    = var.cluster_id
  aws_assume_role_arn = module.castai-eks-role-iam.role_arn

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
      // Node bootstrap script must be base64-encoded.
      init_script = base64encode(var.init_script)
      docker_config = jsonencode({
        "insecure-registries"      = ["registry.com:5000"],
        "max-concurrent-downloads" = 10
      })
      kubelet_config = jsonencode({
        "registryBurst" : 20,
        "registryPullQPS" : 10
      })
      container_runtime = "dockerd"
    }
  }

  node_templates = {
    spot_tmpl = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      should_taint     = true

      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
        custom-label-key-2 = "custom-label-value-2"
      }

      custom_taints = [
        {
          key   = "custom-taint-key-1"
          value = "custom-taint-value-1"
        },
        {
          key   = "custom-taint-key-2"
          value = "custom-taint-value-2"
        }
      ]

      constraints = {
        fallback_restore_rate_seconds = 1800
        spot                          = true
        use_spot_fallbacks            = true
        min_cpu                       = 4
        max_cpu                       = 100
        instance_families = {
          exclude = ["m5"]
        }
        compute_optimized_state = "disabled"
        storage_optimized_state = "disabled"
        is_gpu_only             = false
        architectures           = ["amd64"]
        gpu = {
          fractional_gpus = "enabled"
        }
      }

      gpu = {
        default_shared_clients_per_gpu = 9
        enable_time_sharing            = true
        sharing_configuration = [
          {
            gpu_name               = "A100"
            shared_clients_per_gpu = 11
          },
          {
            gpu_name               = "L4"
            shared_clients_per_gpu = 5
          },
          {
            gpu_name               = "T4"
            shared_clients_per_gpu = 3
          }
        ]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }

  workload_scaling_policies = {
    default = {
      apply_type        = "IMMEDIATE"
      management_option = "MANAGED"

      cpu = {
        function                 = "QUANTILE"
        args                     = ["0.9"]
        overhead                 = 0.15
        look_back_period_seconds = 172800
        min                      = 0.1
        max                      = 2.0
      }

      memory = {
        function                 = "MAX"
        overhead                 = 0.35
        look_back_period_seconds = 172800
        limit = {
          type = "NOLIMIT"
        }
      }

      assignment_rules = {
        rules = [
          {
            namespace = {
              names = ["default", "kube-system"]
            }
          },
          {
            workload = {
              gvk = ["Deployment", "StatefulSet"]
              labels_expressions = [
                {
                  key      = "region"
                  operator = "NotIn"
                  values   = ["eu-west-1", "eu-west-2"]
                },
                {
                  key      = "helm.sh/chart"
                  operator = "Exists"
                }
              ]
            }
          }
        ]
      }

      startup = {
        period_seconds = 300
      }

      predictive_scaling = {
        cpu = {
          enabled = true
        }
      }
    }
  }
}

Existing configuration:
module "castai-eks-cluster" {
// ...
subnets = module.vpc.private_subnets
dns_cluster_ip = "10.100.0.10"
instance_profile_role_arn = var.instance_profile_arn
ssh_public_key = var.ssh_public_key
override_security_groups = [
module.eks.node_security_group_id,
]
tags = {
"team" : "core"
}
}

New configuration:
module "castai-eks-cluster" {
// ...
// Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]
node_configurations = {
default = {
subnets = module.vpc.private_subnets
dns_cluster_ip = "10.100.0.10"
instance_profile_role_arn = var.instance_profile_arn
ssh_public_key = var.ssh_public_key
security_groups = [
module.eks.node_security_group_id,
]
tags = {
"team" : "core"
}
}
}
}
Existing configuration:
module "castai-eks-cluster" {
// ...
node_templates = {
// ...
}
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"spotInstances": {
"enabled": true,
"clouds": ["aws"],
"spotBackups": {
"enabled": true
},
"spotDiversityEnabled": false,
"spotDiversityPriceIncreaseLimitPercent": 20,
"spotInterruptionPredictions": {
"enabled": true,
"type": "AWSRebalanceRecommendations"
}
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}
EOT
}

New configuration:
module "castai-eks-cluster" {
// ...
node_templates = {
default_by_castai = {
name = "default-by-castai"
configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
is_default = true
should_taint = false
constraints = {
on_demand = true
spot = true
use_spot_fallbacks = true
enable_spot_diversity = false
spot_diversity_price_increase_limit_percent = 20
spot_interruption_predictions_enabled = true
spot_interruption_predictions_type = "aws-rebalance-recommendations"
}
}
}
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": true,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
}
}
EOT
}
Version 7.x.x changes:
- Removed `custom_label` attribute in `castai_node_template` resource. Use `custom_labels` instead.
Old configuration:
module "castai-eks-cluster" {
// ...
node_templates = {
spot_tmpl = {
custom_label = {
key = "custom-label-key-1"
value = "custom-label-value-1"
}
}
}
}

New configuration:
module "castai-eks-cluster" {
// ...
node_templates = {
spot_tmpl = {
custom_labels = {
custom-label-key-1 = "custom-label-value-1"
}
}
}
}

Version 8.x.x changes:
- Removed `compute_optimized` and `storage_optimized` attributes in `castai_node_template` resource, `constraints` object. Use `compute_optimized_state` and `storage_optimized_state` instead.
Old configuration:
module "castai-eks-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized = false
storage_optimized = true
}
}
}
}

New configuration:
module "castai-eks-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized_state = "disabled"
storage_optimized_state = "enabled"
}
}
}
}

Version 9.3.x changes:
- Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.
Old configuration:
module "castai-eks-cluster" {
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"nodeTemplatesPartialMatchingEnabled": false,
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT
}

New configuration:
module "castai-eks-cluster" {
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}

Version 14.x.x removes deprecated fields that were deprecated in CAST AI provider v7.9.3+. These settings have been moved to node_templates and autoscaler_settings for better configuration management.
- Removed: the entire `autoscaler_policies_json` variable and attribute
- Replacement: use the structured `autoscaler_settings` block instead
- `headroom` - replaced with low-priority placeholder workloads
- `headroom_spot` - replaced with low-priority placeholder workloads
- `node_constraints` - moved to `node_templates.constraints`
- `custom_instances_enabled` - moved to `node_templates`
- `spot_instances` - moved to `node_templates.constraints`
Old configuration (removed):
module "castai-eks-cluster" {
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT
}

New configuration:
module "castai-eks-cluster" {
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}

Old configuration (removed):
module "castai-eks-cluster" {
autoscaler_settings = {
unschedulable_pods = {
headroom = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
headroom_spot = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
}
}
}

New configuration:
module "castai-eks-cluster" {
autoscaler_settings = {
unschedulable_pods = {
enabled = true
}
}
}
# Deploy low-priority placeholder workloads instead
# See: https://docs.cast.ai/docs/autoscaler-faq#how-can-i-maintain-cluster-headroom

Old configuration (removed):
module "castai-eks-cluster" {
autoscaler_settings = {
unschedulable_pods = {
enabled = true
custom_instances_enabled = true
node_constraints = {
enabled = true
min_cpu_cores = 2
max_cpu_cores = 16
min_ram_mib = 4096
max_ram_mib = 32768
}
}
}
}

New configuration:
module "castai-eks-cluster" {
autoscaler_settings = {
unschedulable_pods = {
enabled = true
}
}
# Move constraints to node template
node_templates = {
default = {
configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
is_default = true
constraints = {
min_cpu = 2
max_cpu = 16
min_memory = 4096
max_memory = 32768
}
}
}
}

Old configuration (removed):
module "castai-eks-cluster" {
autoscaler_settings = {
spot_instances = {
enabled = true
max_reclaim_rate = 10
spot_diversity_enabled = true
spot_diversity_price_increase_limit = 20
spot_backups = {
enabled = true
spot_backup_restore_rate_seconds = 1800
}
spot_interruption_predictions = {
enabled = true
spot_interruption_predictions_type = "aws-rebalance-recommendations"
}
}
}
}

New configuration:
module "castai-eks-cluster" {
node_templates = {
default = {
configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
is_default = true
constraints = {
spot = true
use_spot_fallbacks = true
fallback_restore_rate_seconds = 1800
enable_spot_diversity = true
spot_diversity_price_increase_limit_percent = 20
spot_interruption_predictions_enabled = true
spot_interruption_predictions_type = "aws-rebalance-recommendations"
}
}
}
}

Before (v13.x.x):
module "castai-eks-cluster" {
source = "castai/eks-cluster/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
custom_instances_enabled = true
headroom = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
node_constraints = {
min_cpu_cores = 4
max_cpu_cores = 32
}
}
spot_instances = {
enabled = true
spot_backups = {
enabled = true
}
}
}
}

After (v14.x.x):
module "castai-eks-cluster" {
source = "castai/eks-cluster/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
}
}
node_templates = {
default = {
configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
is_default = true
constraints = {
min_cpu = 4
max_cpu = 32
spot = true
use_spot_fallbacks = true
}
}
}
}
# For headroom: Deploy low-priority placeholder workloads
# See: https://docs.cast.ai/docs/autoscaler-faq#how-can-i-maintain-cluster-headroom

| Old Field (Removed) | New Field (node_templates.constraints) |
|---|---|
| unschedulable_pods.node_constraints.min_cpu_cores | min_cpu |
| unschedulable_pods.node_constraints.max_cpu_cores | max_cpu |
| unschedulable_pods.node_constraints.min_ram_mib | min_memory |
| unschedulable_pods.node_constraints.max_ram_mib | max_memory |
| unschedulable_pods.custom_instances_enabled | Top-level in node_template |
| spot_instances.enabled | spot |
| spot_instances.spot_backups.enabled | use_spot_fallbacks |
| spot_instances.spot_backups.spot_backup_restore_rate_seconds | fallback_restore_rate_seconds |
| spot_instances.spot_diversity_enabled | enable_spot_diversity |
| spot_instances.spot_diversity_price_increase_limit | spot_diversity_price_increase_limit_percent |
| spot_instances.spot_interruption_predictions.enabled | spot_interruption_predictions_enabled |
| spot_instances.spot_interruption_predictions.spot_interruption_predictions_type | spot_interruption_predictions_type |
Usage examples are located in terraform provider repo
terraform-docs markdown table . --output-file README.md

| Name | Version |
|---|---|
| terraform | >= 0.13 |
| aws | >= 6.23.0 |
| castai | >= 8.3 |
| helm | >= 3.0.0 |
| null | >= 3.0 |
| Name | Version |
|---|---|
| aws | 6.22.0 |
| castai | 8.3.0 |
| helm | 3.1.1 |
| null | 3.2.4 |
| Name | Source | Version |
|---|---|---|
| castai_omni_cluster | github.com/castai/terraform-castai-omni-cluster | n/a |
| Name | Description | Type | Default | Required |
|---|---|---|---|---|
| agent_aws_access_key_id | AWS access key for CAST AI agent to fetch instance details. | string |
"" |
no |
| agent_aws_iam_service_account_role_arn | Arn of the role to be used by CAST AI agent to fetch instance details. Only readonly AmazonEC2ReadOnlyAccess is needed. | string |
"" |
no |
| agent_aws_secret_access_key | AWS access key secret for CAST AI agent to fetch instance details. | string |
"" |
no |
| agent_values | List of YAML formatted string with agent values | list(string) |
[] |
no |
| agent_version | Version of castai-agent helm chart. Default latest | string |
null |
no |
| ai_optimizer_values | List of YAML formatted string with ai-optimizer values | list(string) |
[] |
no |
| ai_optimizer_version | Version of castai-ai-optimizer helm chart. Default latest | string |
null |
no |
| api_url | URL of alternative CAST AI API to be used during development or testing | string |
"https://api.cast.ai" |
no |
| autoscaler_settings | Optional Autoscaler policy definitions to override current autoscaler settings | any |
null |
no |
| aws_account_id | ID of AWS account the cluster is located in. | string |
n/a | yes |
| aws_assume_role_arn | Arn of the role to be used by CAST AI for IAM access | string |
null |
no |
| aws_cluster_name | Name of the cluster to be connected to CAST AI. | string |
n/a | yes |
| aws_cluster_region | Region of the cluster to be connected to CAST AI. | string |
n/a | yes |
| castai_api_token | Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when wait_for_cluster_ready is set to true |
string |
"" |
no |
| castai_components_labels | Optional additional Kubernetes labels for CAST AI pods | map(any) |
{} |
no |
| cluster_controller_values | List of YAML formatted string with cluster-controller values | list(string) |
[] |
no |
| cluster_controller_version | Version of castai-cluster-controller helm chart. Default latest | string |
null |
no |
| default_node_configuration | ID of the default node configuration | string |
"" |
no |
| default_node_configuration_name | Name of the default node configuration | string |
"" |
no |
| delete_nodes_on_disconnect | Optionally delete Cast AI created nodes when the cluster is destroyed | bool |
false |
no |
| egressd_values | List of YAML formatted string with egressd values | list(string) |
[] |
no |
| egressd_version | Version of castai-egressd helm chart. Default latest | string |
null |
no |
| evictor_ext_values | List of YAML formatted string with evictor-ext values | list(string) |
[] |
no |
| evictor_ext_version | Version of castai-evictor-ext chart. Default latest | string |
null |
no |
| evictor_values | List of YAML formatted string with evictor values | list(string) |
[] |
no |
| evictor_version | Version of castai-evictor chart. Default latest | string |
null |
no |
| grpc_url | gRPC endpoint used by pod-pinner | string |
"grpc.cast.ai:443" |
no |
| install_ai_optimizer | Optional flag for installation of AI Optimizer (https://docs.cast.ai/docs/getting-started-ai) | bool |
false |
no |
| install_egressd | Optional flag for installation of Egressd (Network cost monitoring) (https://docs.cast.ai/docs/network-cost) | bool |
false |
no |
| install_live | Optional flag for installation of CAST AI Live (https://docs.cast.ai/docs/clm-getting-started). Default is true | bool |
true |
no |
| install_live_cni | Optional flag for installing CAST AI aws-vpc-cni fork for CAST AI Live. Default is true | bool |
true |
no |
| install_omni | Optional flag for installation of Omni product | bool |
false |
no |
| install_pod_mutator | Optional flag for installation of pod mutator | bool |
false |
no |
| install_security_agent | Optional flag for installation of security agent (Kvisor - https://docs.cast.ai/docs/kvisor) | bool |
false |
no |
| install_workload_autoscaler | Optional flag for installation of workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) | bool |
false |
no |
| kvisor_controller_extra_args | map(string) |
{ |
no | |
| kvisor_grpc_addr | CAST AI Kvisor optimized GRPC API address | string |
"kvisor.prod-master.cast.ai:443" |
no |
| kvisor_values | List of YAML formatted string with kvisor values, see example: https://github.com/castai/terraform-provider-castai/tree/master/examples/eks/eks_cluster_with_security/castai.tf | list(string) |
[] |
no |
| kvisor_version | Version of kvisor chart. Default latest | string |
null |
no |
| kvisor_wait | Wait for kvisor chart to finish release | bool |
true |
no |
| live_values | List of YAML formatted string with castai-live values | list(string) |
[] |
no |
| live_version | Version of castai-live helm chart. Default latest | string |
null |
no |
| node_configurations | Map of EKS node configurations to create | any |
{} |
no |
| node_templates | Map of node templates to create | any |
{} |
no |
| organization_id | DEPRECATED (required only for pod mutator v0.0.25 and older): CAST AI Organization ID | string |
"" |
no |
| pod_mutator_version | Version of castai-pod-mutator helm chart. Default latest | string |
null |
no |
| pod_pinner_values | List of YAML formatted string values for agent helm chart | list(string) |
[] |
no |
| pod_pinner_version | Version of pod-pinner helm chart. Default latest | string |
null |
no |
| self_managed | Whether CAST AI components' upgrades are managed by a customer; by default upgrades are managed CAST AI central system. WARNING: changing this after the module was created is not supported. | bool |
false |
no |
| spot_handler_values | List of YAML formatted string with spot-handler values | list(string) |
[] |
no |
| spot_handler_version | Version of castai-spot-handler helm chart. Default latest | string |
null |
no |
| wait_for_cluster_ready | Wait for cluster to be ready before finishing the module execution, this option requires castai_api_token to be set |
bool |
false |
no |
| workload_autoscaler_values | List of YAML formatted string with cluster-workload-autoscaler values | list(string) |
[] |
no |
| workload_autoscaler_version | Version of castai-workload-autoscaler helm chart. Default latest | string |
null |
no |
| workload_scaling_policies | Map of workload scaling policies to create | any |
{} |
no |
| Name | Description |
|---|---|
| castai_node_configurations | Map of node configurations ids by name |
| castai_node_templates | Map of node template by name |
| cluster_id | CAST AI cluster id, which can be used for accessing cluster data using API |
| organization_id | CAST.AI organization id of the cluster |