Terraform module for connecting an AWS EKS cluster to CAST AI

Website: https://www.cast.ai

Using the module

A module to connect an EKS cluster to CAST AI.

Requires castai/castai and hashicorp/aws providers to be configured.
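
A minimal provider setup could look like the following sketch (var.castai_api_token is an assumed variable name; authenticate the AWS provider however your environment requires):

terraform {
  required_providers {
    castai = {
      source  = "castai/castai"
      version = ">= 8.3"
    }
    aws = {
      source  = "hashicorp/aws"
      version = ">= 6.23.0"
    }
  }
}

provider "castai" {
  # API token created in the console.cast.ai API Access keys section.
  api_token = var.castai_api_token
}

provider "aws" {
  region = var.cluster_region
}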

module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  aws_account_id     = var.aws_account_id
  aws_cluster_region = var.cluster_region
  aws_cluster_name   = var.cluster_name

  aws_assume_role_arn      = module.castai-eks-role-iam.role_arn

  // The default node configuration will be used for all CAST AI provisioned nodes unless a specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups           = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
      init_script   = base64encode(var.init_script)
      docker_config = jsonencode({
        "insecure-registries"      = ["registry.com:5000"],
        "max-concurrent-downloads" = 10
      })
      kubelet_config = jsonencode({
        "registryBurst" : 20,
        "registryPullQPS" : 10
      })
      container_runtime = "dockerd"
    }
  }

  node_templates = {
    spot_tmpl = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]

      should_taint = true

      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
        custom-label-key-2 = "custom-label-value-2"
      }

      custom_taints = [
        {
          key   = "custom-taint-key-1"
          value = "custom-taint-value-1"
        },
        {
          key   = "custom-taint-key-2"
          value = "custom-taint-value-2"
        }
      ]

      constraints = {
        fallback_restore_rate_seconds = 1800
        spot                          = true
        use_spot_fallbacks            = true
        min_cpu                       = 4
        max_cpu                       = 100
        instance_families             = {
          exclude = ["m5"]
        }
        compute_optimized_state = "disabled"
        storage_optimized_state = "disabled"
        is_gpu_only             = false
        architectures           = ["amd64"]

        gpu = {
          fractional_gpus = "enabled"
        }
      }
      gpu = {
        default_shared_clients_per_gpu = 9
        enable_time_sharing            = true

        sharing_configuration = [
          {
            gpu_name = "A100"
            shared_clients_per_gpu = 11
          },
          {
            gpu_name = "L4"
            shared_clients_per_gpu = 5
          },
          {
            gpu_name = "T4"
            shared_clients_per_gpu = 3
          }
        ]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }

  workload_scaling_policies = {
    default = {
      apply_type        = "IMMEDIATE"
      management_option = "MANAGED"

      cpu = {
        function                 = "QUANTILE"
        args                     = ["0.9"]
        overhead                 = 0.15
        look_back_period_seconds = 172800
        min                      = 0.1
        max                      = 2.0
      }

      memory = {
        function                 = "MAX"
        overhead                 = 0.35
        look_back_period_seconds = 172800

        limit = {
          type = "NOLIMIT"
        }
      }

      assignment_rules = {
        rules = [
          {
            namespace = {
              names = ["default", "kube-system"]
            }
          },
          {
            workload = {
              gvk = ["Deployment", "StatefulSet"]
              labels_expressions = [
                {
                  key      = "region"
                  operator = "NotIn"
                  values   = ["eu-west-1", "eu-west-2"]
                },
                {
                  key      = "helm.sh/chart"
                  operator = "Exists"
                }
              ]
            }
          }
        ]
      }

      startup = {
        period_seconds = 300
      }

      predictive_scaling = {
        cpu = {
          enabled = true
        }
      }
    }
  }
}
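
The aws_assume_role_arn above comes from the companion castai/eks-role-iam/castai module, which creates the cross-account IAM role that CAST AI assumes. A rough sketch of wiring it up (the exact input set may differ between module versions, so consult that module's documentation; castai_user_arn is typically obtained from the provider's castai_eks_user_arn resource):

module "castai-eks-role-iam" {
  source = "castai/eks-role-iam/castai"

  aws_account_id     = var.aws_account_id
  aws_cluster_region = var.cluster_region
  aws_cluster_name   = var.cluster_name
  aws_cluster_vpc_id = module.vpc.vpc_id

  castai_user_arn = var.castai_user_arn # ARN of the CAST AI cross-account user
}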

Migrating from 2.x.x to 3.x.x

Existing configuration:

module "castai-eks-cluster" {
  // ...
  
  subnets                   = module.vpc.private_subnets
  dns_cluster_ip            = "10.100.0.10"
  instance_profile_role_arn = var.instance_profile_arn
  ssh_public_key            = var.ssh_public_key
  override_security_groups  = [
    module.eks.node_security_group_id,
  ]
  tags = {
    "team" : "core"
  }
}

New configuration:

module "castai-eks-cluster" {
  // ...
  
  // The default node configuration will be used for all CAST AI provisioned nodes unless a specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups           = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
    }
  }
}

Migrating from 5.x.x to 6.x.x

Existing configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    // ...
  }
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "spotInstances": {
            "enabled": true,
            "clouds": ["aws"],
            "spotBackups": {
                "enabled": true
            },
            "spotDiversityEnabled": false,
            "spotDiversityPriceIncreaseLimitPercent": 20,
            "spotInterruptionPredictions": {
              "enabled": true,
              "type": "AWSRebalanceRecommendations"
            }
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

New configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    default_by_castai = {
      name = "default-by-castai"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default   = true
      should_taint = false

      constraints = {
        on_demand          = true
        spot               = true
        use_spot_fallbacks = true

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        spot_interruption_predictions_enabled = true
        spot_interruption_predictions_type = "aws-rebalance-recommendations"
      }
    }
  }
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

Migrating from 6.x.x to 7.x.x

Version 7.x.x changes:

  • Removed the custom_label attribute from the castai_node_template resource. Use custom_labels instead.

Old configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      custom_label = {
        key = "custom-label-key-1"
        value = "custom-label-value-1"
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
      }
    }
  }
}

Migrating from 7.x.x to 8.x.x

Version 8.x.x changes:

  • Removed the compute_optimized and storage_optimized attributes from the castai_node_template resource's constraints object. Use compute_optimized_state and storage_optimized_state instead.

Old configuration:

module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        compute_optimized = false
        storage_optimized = true
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        compute_optimized_state = "disabled"
        storage_optimized_state = "enabled"
      }
    }
  }
}

Migrating from 9.x.x to 9.3.x

Version 9.3.x changes:

  • Deprecated the autoscaler_policies_json attribute. Use autoscaler_settings instead.

Old configuration:

module "castai-eks-cluster" {
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": false,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        },
        "nodeTemplatesPartialMatchingEnabled": false,
        "clusterLimits": {
            "cpu": {
                "maxCores": 20,
                "minCores": 1
            },
            "enabled": true
        }
    }
  EOT
}

New configuration:

module "castai-eks-cluster" {
  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Migrating from 13.x.x to 14.x.x

Version 14.x.x removes the fields deprecated in CAST AI provider v7.9.3+. These settings have moved to node_templates and autoscaler_settings for better configuration management.

Removed Fields

autoscaler_policies_json variable (deprecated since v9.3.x)

  • Removed: The entire autoscaler_policies_json variable and attribute
  • Replacement: Use the structured autoscaler_settings block instead

Fields removed from autoscaler_settings.unschedulable_pods:

  • headroom - replaced with low-priority placeholder workloads
  • headroom_spot - replaced with low-priority placeholder workloads
  • node_constraints - moved to node_templates.constraints
  • custom_instances_enabled - moved to node_templates

Entire blocks removed:

  • spot_instances - moved to node_templates.constraints

Migration Examples

Migrating from autoscaler_policies_json to autoscaler_settings

Old configuration (removed):

module "castai-eks-cluster" {
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": false,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        },
        "clusterLimits": {
            "cpu": {
                "maxCores": 20,
                "minCores": 1
            },
            "enabled": true
        }
    }
  EOT
}

New configuration:

module "castai-eks-cluster" {
  autoscaler_settings = {
    enabled = true

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Headroom Configuration

Old configuration (removed):

module "castai-eks-cluster" {
  autoscaler_settings = {
    unschedulable_pods = {
      headroom = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }
      headroom_spot = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  autoscaler_settings = {
    unschedulable_pods = {
      enabled = true
    }
  }
}

# Deploy low-priority placeholder workloads instead
# See: https://docs.cast.ai/docs/autoscaler-faq#how-can-i-maintain-cluster-headroom
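
To illustrate the placeholder approach, here is a minimal, hypothetical sketch using the hashicorp/kubernetes provider (the names and sizes are assumptions, not part of this module): a low-priority PriorityClass plus a pause Deployment whose resource requests reserve the headroom and whose pods are preempted first when real workloads need capacity.

# Hypothetical headroom placeholder; tune replicas and requests to the
# amount of spare capacity you want the autoscaler to keep provisioned.
resource "kubernetes_priority_class" "placeholder" {
  metadata {
    name = "headroom-placeholder" # assumed name
  }
  value          = -10 # lower than any real workload, so these pods are evicted first
  global_default = false
}

resource "kubernetes_deployment" "headroom" {
  metadata {
    name = "headroom-placeholder" # assumed name
  }
  spec {
    replicas = 2
    selector {
      match_labels = { app = "headroom-placeholder" }
    }
    template {
      metadata {
        labels = { app = "headroom-placeholder" }
      }
      spec {
        priority_class_name = kubernetes_priority_class.placeholder.metadata[0].name
        container {
          name  = "pause"
          image = "registry.k8s.io/pause:3.9"
          resources {
            requests = {
              cpu    = "500m"  # headroom reserved per replica
              memory = "512Mi"
            }
          }
        }
      }
    }
  }
}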

Node Constraints Configuration

Old configuration (removed):

module "castai-eks-cluster" {
  autoscaler_settings = {
    unschedulable_pods = {
      enabled                  = true
      custom_instances_enabled = true
      node_constraints = {
        enabled       = true
        min_cpu_cores = 2
        max_cpu_cores = 16
        min_ram_mib   = 4096
        max_ram_mib   = 32768
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  autoscaler_settings = {
    unschedulable_pods = {
      enabled = true
    }
  }

  # Move constraints to node template
  node_templates = {
    default = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default       = true

      constraints = {
        min_cpu    = 2
        max_cpu    = 16
        min_memory = 4096
        max_memory = 32768
      }
    }
  }
}

Spot Instance Configuration

Old configuration (removed):

module "castai-eks-cluster" {
  autoscaler_settings = {
    spot_instances = {
      enabled                             = true
      max_reclaim_rate                    = 10
      spot_diversity_enabled              = true
      spot_diversity_price_increase_limit = 20

      spot_backups = {
        enabled                          = true
        spot_backup_restore_rate_seconds = 1800
      }

      spot_interruption_predictions = {
        enabled                            = true
        spot_interruption_predictions_type = "aws-rebalance-recommendations"
      }
    }
  }
}

New configuration:

module "castai-eks-cluster" {
  node_templates = {
    default = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default       = true

      constraints = {
        spot                                          = true
        use_spot_fallbacks                            = true
        fallback_restore_rate_seconds                 = 1800
        enable_spot_diversity                         = true
        spot_diversity_price_increase_limit_percent   = 20
        spot_interruption_predictions_enabled         = true
        spot_interruption_predictions_type            = "aws-rebalance-recommendations"
      }
    }
  }
}

Complete Migration Example

Before (v13.x.x):

module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  autoscaler_settings = {
    enabled = true

    unschedulable_pods = {
      enabled                  = true
      custom_instances_enabled = true

      headroom = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }

      node_constraints = {
        min_cpu_cores = 4
        max_cpu_cores = 32
      }
    }

    spot_instances = {
      enabled = true
      spot_backups = {
        enabled = true
      }
    }
  }
}

After (v14.x.x):

module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  autoscaler_settings = {
    enabled = true

    unschedulable_pods = {
      enabled = true
    }
  }

  node_templates = {
    default = {
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default       = true

      constraints = {
        min_cpu            = 4
        max_cpu            = 32
        spot               = true
        use_spot_fallbacks = true
      }
    }
  }
}

# For headroom: Deploy low-priority placeholder workloads
# See: https://docs.cast.ai/docs/autoscaler-faq#how-can-i-maintain-cluster-headroom

Field Mapping Reference

| Old Field (Removed) | New Field (node_templates.constraints) |
|---------------------|----------------------------------------|
| unschedulable_pods.node_constraints.min_cpu_cores | min_cpu |
| unschedulable_pods.node_constraints.max_cpu_cores | max_cpu |
| unschedulable_pods.node_constraints.min_ram_mib | min_memory |
| unschedulable_pods.node_constraints.max_ram_mib | max_memory |
| unschedulable_pods.custom_instances_enabled | custom_instances_enabled (top-level in the node template) |
| spot_instances.enabled | spot |
| spot_instances.spot_backups.enabled | use_spot_fallbacks |
| spot_instances.spot_backups.spot_backup_restore_rate_seconds | fallback_restore_rate_seconds |
| spot_instances.spot_diversity_enabled | enable_spot_diversity |
| spot_instances.spot_diversity_price_increase_limit | spot_diversity_price_increase_limit_percent |
| spot_instances.spot_interruption_predictions.enabled | spot_interruption_predictions_enabled |
| spot_instances.spot_interruption_predictions.spot_interruption_predictions_type | spot_interruption_predictions_type |

Examples

Usage examples are located in the terraform-provider-castai repository: https://github.com/castai/terraform-provider-castai/tree/master/examples/eks

Generate docs

terraform-docs markdown table . --output-file README.md

Requirements

| Name | Version |
|------|---------|
| terraform | >= 0.13 |
| aws | >= 6.23.0 |
| castai | >= 8.3 |
| helm | >= 3.0.0 |
| null | >= 3.0 |

Providers

| Name | Version |
|------|---------|
| aws | 6.22.0 |
| castai | 8.3.0 |
| helm | 3.1.1 |
| null | 3.2.4 |

Modules

| Name | Source | Version |
|------|--------|---------|
| castai_omni_cluster | github.com/castai/terraform-castai-omni-cluster | n/a |

Resources

| Name | Type |
|------|------|
| castai_autoscaler.castai_autoscaler_policies | resource |
| castai_eks_cluster.my_castai_cluster | resource |
| castai_node_configuration.this | resource |
| castai_node_configuration_default.this | resource |
| castai_node_template.this | resource |
| castai_workload_scaling_policy.this | resource |
| helm_release.castai_agent | resource |
| helm_release.castai_ai_optimizer_proxy | resource |
| helm_release.castai_ai_optimizer_proxy_self_managed | resource |
| helm_release.castai_cluster_controller | resource |
| helm_release.castai_cluster_controller_self_managed | resource |
| helm_release.castai_egressd | resource |
| helm_release.castai_egressd_self_managed | resource |
| helm_release.castai_evictor | resource |
| helm_release.castai_evictor_ext | resource |
| helm_release.castai_evictor_self_managed | resource |
| helm_release.castai_kvisor | resource |
| helm_release.castai_kvisor_self_managed | resource |
| helm_release.castai_live | resource |
| helm_release.castai_live_self_managed | resource |
| helm_release.castai_pod_mutator | resource |
| helm_release.castai_pod_mutator_self_managed | resource |
| helm_release.castai_pod_pinner | resource |
| helm_release.castai_pod_pinner_self_managed | resource |
| helm_release.castai_spot_handler | resource |
| helm_release.castai_workload_autoscaler | resource |
| helm_release.castai_workload_autoscaler_self_managed | resource |
| null_resource.wait_for_cluster | resource |
| aws_eks_cluster.this | data source |
| aws_vpc.eks_vpc | data source |

Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| agent_aws_access_key_id | AWS access key for CAST AI agent to fetch instance details. | string | `""` | no |
| agent_aws_iam_service_account_role_arn | ARN of the role to be used by CAST AI agent to fetch instance details. Only read-only AmazonEC2ReadOnlyAccess is needed. | string | `""` | no |
| agent_aws_secret_access_key | AWS secret access key for CAST AI agent to fetch instance details. | string | `""` | no |
| agent_values | List of YAML formatted strings with agent values | list(string) | `[]` | no |
| agent_version | Version of castai-agent helm chart. Default latest | string | `null` | no |
| ai_optimizer_values | List of YAML formatted strings with ai-optimizer values | list(string) | `[]` | no |
| ai_optimizer_version | Version of castai-ai-optimizer helm chart. Default latest | string | `null` | no |
| api_url | URL of alternative CAST AI API to be used during development or testing | string | `"https://api.cast.ai"` | no |
| autoscaler_settings | Optional autoscaler policy definitions to override current autoscaler settings | any | `null` | no |
| aws_account_id | ID of AWS account the cluster is located in. | string | n/a | yes |
| aws_assume_role_arn | ARN of the role to be used by CAST AI for IAM access | string | `null` | no |
| aws_cluster_name | Name of the cluster to be connected to CAST AI. | string | n/a | yes |
| aws_cluster_region | Region of the cluster to be connected to CAST AI. | string | n/a | yes |
| castai_api_token | Optional CAST AI API token created in the console.cast.ai API Access keys section. Used only when wait_for_cluster_ready is set to true | string | `""` | no |
| castai_components_labels | Optional additional Kubernetes labels for CAST AI pods | map(any) | `{}` | no |
| cluster_controller_values | List of YAML formatted strings with cluster-controller values | list(string) | `[]` | no |
| cluster_controller_version | Version of castai-cluster-controller helm chart. Default latest | string | `null` | no |
| default_node_configuration | ID of the default node configuration | string | `""` | no |
| default_node_configuration_name | Name of the default node configuration | string | `""` | no |
| delete_nodes_on_disconnect | Optionally delete CAST AI created nodes when the cluster is destroyed | bool | `false` | no |
| egressd_values | List of YAML formatted strings with egressd values | list(string) | `[]` | no |
| egressd_version | Version of castai-egressd helm chart. Default latest | string | `null` | no |
| evictor_ext_values | List of YAML formatted strings with evictor-ext values | list(string) | `[]` | no |
| evictor_ext_version | Version of castai-evictor-ext chart. Default latest | string | `null` | no |
| evictor_values | List of YAML formatted strings with evictor values | list(string) | `[]` | no |
| evictor_version | Version of castai-evictor chart. Default latest | string | `null` | no |
| grpc_url | gRPC endpoint used by pod-pinner | string | `"grpc.cast.ai:443"` | no |
| install_ai_optimizer | Optional flag for installation of AI Optimizer (https://docs.cast.ai/docs/getting-started-ai) | bool | `false` | no |
| install_egressd | Optional flag for installation of Egressd, network cost monitoring (https://docs.cast.ai/docs/network-cost) | bool | `false` | no |
| install_live | Optional flag for installation of CAST AI Live (https://docs.cast.ai/docs/clm-getting-started). Default is true | bool | `true` | no |
| install_live_cni | Optional flag for installing the CAST AI aws-vpc-cni fork for CAST AI Live. Default is true | bool | `true` | no |
| install_omni | Optional flag for installation of the Omni product | bool | `false` | no |
| install_pod_mutator | Optional flag for installation of the pod mutator | bool | `false` | no |
| install_security_agent | Optional flag for installation of the security agent, Kvisor (https://docs.cast.ai/docs/kvisor) | bool | `false` | no |
| install_workload_autoscaler | Optional flag for installation of the workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) | bool | `false` | no |
| kvisor_controller_extra_args | ⚠️ DEPRECATED: use kvisor_values instead (see example: https://github.com/castai/terraform-provider-castai/tree/master/examples/eks/eks_cluster_with_security/castai.tf). Extra arguments for the kvisor controller. Optionally enable kvisor to lint Kubernetes YAML manifests, scan workload images and check if workloads pass CIS Kubernetes Benchmarks as well as NSA, WASP and PCI recommendations. | map(string) | `{"image-scan-enabled": "true", "kube-bench-enabled": "true", "kube-linter-enabled": "true"}` | no |
| kvisor_grpc_addr | CAST AI Kvisor optimized gRPC API address | string | `"kvisor.prod-master.cast.ai:443"` | no |
| kvisor_values | List of YAML formatted strings with kvisor values, see example: https://github.com/castai/terraform-provider-castai/tree/master/examples/eks/eks_cluster_with_security/castai.tf | list(string) | `[]` | no |
| kvisor_version | Version of kvisor chart. Default latest | string | `null` | no |
| kvisor_wait | Wait for kvisor chart to finish release | bool | `true` | no |
| live_values | List of YAML formatted strings with castai-live values | list(string) | `[]` | no |
| live_version | Version of castai-live helm chart. Default latest | string | `null` | no |
| node_configurations | Map of EKS node configurations to create | any | `{}` | no |
| node_templates | Map of node templates to create | any | `{}` | no |
| organization_id | DEPRECATED (required only for pod mutator v0.0.25 and older): CAST AI organization ID | string | `""` | no |
| pod_mutator_version | Version of castai-pod-mutator helm chart. Default latest | string | `null` | no |
| pod_pinner_values | List of YAML formatted string values for agent helm chart | list(string) | `[]` | no |
| pod_pinner_version | Version of pod-pinner helm chart. Default latest | string | `null` | no |
| self_managed | Whether CAST AI components' upgrades are managed by the customer; by default upgrades are managed by the CAST AI central system. WARNING: changing this after the module was created is not supported. | bool | `false` | no |
| spot_handler_values | List of YAML formatted strings with spot-handler values | list(string) | `[]` | no |
| spot_handler_version | Version of castai-spot-handler helm chart. Default latest | string | `null` | no |
| wait_for_cluster_ready | Wait for the cluster to be ready before finishing the module execution; this option requires castai_api_token to be set | bool | `false` | no |
| workload_autoscaler_values | List of YAML formatted strings with cluster-workload-autoscaler values | list(string) | `[]` | no |
| workload_autoscaler_version | Version of castai-workload-autoscaler helm chart. Default latest | string | `null` | no |
| workload_scaling_policies | Map of workload scaling policies to create | any | `{}` | no |

Outputs

| Name | Description |
|------|-------------|
| castai_node_configurations | Map of node configuration IDs by name |
| castai_node_templates | Map of node templates by name |
| cluster_id | CAST AI cluster ID, which can be used for accessing cluster data using the API |
| organization_id | CAST AI organization ID of the cluster |
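
For example, the outputs can be re-exported from the root module or fed into other modules (a minimal sketch):

output "castai_cluster_id" {
  description = "CAST AI cluster id"
  value       = module.castai-eks-cluster.cluster_id
}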
