From 4944f14b9fda7fb5ace122a1e4ce2245f8d7ce3d Mon Sep 17 00:00:00 2001 From: Filipe Augusto Lima de Souza Date: Mon, 16 Jun 2025 17:35:22 +0200 Subject: [PATCH 1/3] adding live-migration examples --- examples/eks/eks_live_migration/README.md | 53 +++++ examples/eks/eks_live_migration/calico.yaml | 16 ++ examples/eks/eks_live_migration/castai.tf | 147 ++++++++++++ examples/eks/eks_live_migration/deploy.yaml | 39 ++++ .../eks/eks_live_migration/eks-init-script.sh | 44 ++++ examples/eks/eks_live_migration/eks.tf | 217 ++++++++++++++++++ examples/eks/eks_live_migration/main.tf | 17 ++ .../eks/eks_live_migration/tf.vars.example | 3 + examples/eks/eks_live_migration/variables.tf | 33 +++ examples/gke/gke_live_migration/README.MD | 37 +++ examples/gke/gke_live_migration/castai.tf | 114 +++++++++ examples/gke/gke_live_migration/gke.tf | 45 ++++ examples/gke/gke_live_migration/init_cos.sh | 44 ++++ .../gke/gke_live_migration/init_ubuntu.sh | 45 ++++ .../gke/gke_live_migration/tf.vars.example | 5 + examples/gke/gke_live_migration/variables.tf | 56 +++++ examples/gke/gke_live_migration/version.tf | 17 ++ examples/gke/gke_live_migration/vpc.tf | 49 ++++ 18 files changed, 981 insertions(+) create mode 100644 examples/eks/eks_live_migration/README.md create mode 100644 examples/eks/eks_live_migration/calico.yaml create mode 100644 examples/eks/eks_live_migration/castai.tf create mode 100644 examples/eks/eks_live_migration/deploy.yaml create mode 100644 examples/eks/eks_live_migration/eks-init-script.sh create mode 100644 examples/eks/eks_live_migration/eks.tf create mode 100644 examples/eks/eks_live_migration/main.tf create mode 100644 examples/eks/eks_live_migration/tf.vars.example create mode 100644 examples/eks/eks_live_migration/variables.tf create mode 100644 examples/gke/gke_live_migration/README.MD create mode 100644 examples/gke/gke_live_migration/castai.tf create mode 100644 examples/gke/gke_live_migration/gke.tf create mode 100644 
examples/gke/gke_live_migration/init_cos.sh create mode 100644 examples/gke/gke_live_migration/init_ubuntu.sh create mode 100644 examples/gke/gke_live_migration/tf.vars.example create mode 100644 examples/gke/gke_live_migration/variables.tf create mode 100644 examples/gke/gke_live_migration/version.tf create mode 100644 examples/gke/gke_live_migration/vpc.tf diff --git a/examples/eks/eks_live_migration/README.md b/examples/eks/eks_live_migration/README.md new file mode 100644 index 000000000..d2a7b88ab --- /dev/null +++ b/examples/eks/eks_live_migration/README.md @@ -0,0 +1,53 @@ +# AWS Live Migration with Containerd + +This setup creates an EKS cluster and onboards it to CAST AI. Live binaries are then installed on nodes using a dedicated +Node Configuration. The included script that installs Live binaries on nodes works with Amazon Linux 2023. + +## How to create your env +1. Rename `tf.vars.example` to `tf.vars` +2. Update `tf.vars` file with your cluster name, cluster region and Cast AI API token. +3. Initialize tofu. Under example root folder run: + ```bash + tofu init + ``` +4. Verify: + ``` + tofu plan -var-file=tf.vars + ``` + +5. Run tofu apply: + ``` + tofu apply -var-file=tf.vars + ``` +6. To destroy resources created by this example: + ``` + tofu destroy -var-file=tf.vars + ``` + +## Troubleshooting +There are some known issues with the terraform setup, and known workarounds. + +### Cluster creation stuck / timeouts on node group creation +If cluster creation gets stuck on node group creation, and nodes are not healthy, it most probably means Calico installation did not trigger +at the right time. To fix it, just interrupt the tofu execution and run it again. + +### CAST AI onboarding stuck in connecting / pods don't have internet connection +Make sure Calico pods are running on all the nodes without errors and the CoreDNS addon is installed. 
+ +### Timeout on resources destruction +- Check if There are no hanging CAST AI EC2 instances left and blocking VPC deletion. +- If Calico uninstallation job is stuck for any reason, just delete it manually: + ```bash + k delete job -n tigera-operator tigera-operator-uninstall + ``` +### No AWS or tofu binaries + +#### Setup AWS CLI + - Follow the [installation guide](https://castai.atlassian.net/wiki/spaces/ENG/pages/2784493777/AWS) to install AWS CLI. + +#### Setup tofu + - For tofu run `brew install opentofu` + - export AWS profile so tofu can pick it up: `export AWS_PROFILE=` + +## Enjoy +Once cluster is created and onboarded, you can manually play with Live Migrations. diff --git a/examples/eks/eks_live_migration/calico.yaml b/examples/eks/eks_live_migration/calico.yaml new file mode 100644 index 000000000..113202e86 --- /dev/null +++ b/examples/eks/eks_live_migration/calico.yaml @@ -0,0 +1,16 @@ +installation: + calicoNetwork: + linuxDataplane: BPF + ipPools: + - cidr: 10.244.0.0/16 + blockSize: 26 + encapsulation: VXLAN + natOutgoing: Enabled + nodeSelector: "all()" + kubernetesProvider: "EKS" + registry: quay.io/ + cni: + type: Calico +kubernetesServiceEndpoint: + host: ${api_endpoint} + port: 443 \ No newline at end of file diff --git a/examples/eks/eks_live_migration/castai.tf b/examples/eks/eks_live_migration/castai.tf new file mode 100644 index 000000000..f483b3483 --- /dev/null +++ b/examples/eks/eks_live_migration/castai.tf @@ -0,0 +1,147 @@ +data "aws_caller_identity" "current" {} + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +resource "aws_eks_access_entry" "access_entry" { + cluster_name = module.eks.cluster_name + principal_arn = module.castai-eks-role-iam.instance_profile_role_arn + type = "EC2_LINUX" +} + +# Configure EKS cluster connection using CAST AI eks-cluster module. 
+resource "castai_eks_clusterid" "cluster_id" { + account_id = data.aws_caller_identity.current.account_id + region = var.region + cluster_name = var.cluster_name + depends_on = [module.eks, helm_release.calico, aws_eks_access_entry.access_entry] +} + +resource "castai_eks_user_arn" "castai_user_arn" { + cluster_id = castai_eks_clusterid.cluster_id.id +} + +# Create AWS IAM policies and a user to connect to CAST AI. +module "castai-eks-role-iam" { + source = "castai/eks-role-iam/castai" + + aws_account_id = data.aws_caller_identity.current.account_id + aws_cluster_region = var.region + aws_cluster_name = var.cluster_name + aws_cluster_vpc_id = module.vpc.vpc_id + + castai_user_arn = castai_eks_user_arn.castai_user_arn.arn + + create_iam_resources_per_cluster = true +} + +module "castai-eks-cluster" { + source = "castai/eks-cluster/castai" + + delete_nodes_on_disconnect = var.delete_nodes_on_disconnect + + aws_account_id = data.aws_caller_identity.current.account_id + aws_cluster_region = var.region + aws_cluster_name = module.eks.cluster_name + aws_assume_role_arn = module.castai-eks-role-iam.role_arn + api_url = var.castai_api_url + castai_api_token = var.castai_api_token + grpc_url = var.castai_grpc_url + wait_for_cluster_ready = true + + // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested. 
+ default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + subnets = module.vpc.private_subnets + instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn + security_groups = [ + module.eks.node_security_group_id, + ] + init_script = base64encode(file("eks-init-script.sh")) + container_runtime = "containerd" + eks_image_family = "al2023" + } + } + + node_templates = { + # Already contains live binaries on nodes + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-eks-cluster.castai_node_configurations["default"] + is_default = true + is_enabled = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + fallback_restore_rate_seconds = 1800 + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + + architectures = ["amd64"] + } + } + + # Same setup as default, but with the goal to forcefully bring nodes with Live binaries installed, based on the NT node selector + live-enabled = { + name = "live-enabled" + configuration_id = module.castai-eks-cluster.castai_node_configurations["default"] + is_enabled = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + fallback_restore_rate_seconds = 1800 + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + + architectures = ["amd64"] + } + } + } + + autoscaler_settings = { + enabled = true + node_templates_partial_matching_enabled = false + + unschedulable_pods = { + enabled = true + } + + node_downscaler = { + enabled = true + + empty_nodes = { + enabled = true + } + + evictor = { + aggressive_mode = false + cycle_interval = "5m10s" + dry_run = false + enabled = true + node_grace_period_minutes = 10 + scoped_mode = false + } + } + + cluster_limits = { + enabled = true + + cpu = { + max_cores = 100 + min_cores = 1 + } + } + } +} \ 
No newline at end of file diff --git a/examples/eks/eks_live_migration/deploy.yaml b/examples/eks/eks_live_migration/deploy.yaml new file mode 100644 index 000000000..9366ce176 --- /dev/null +++ b/examples/eks/eks_live_migration/deploy.yaml @@ -0,0 +1,39 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: dummy-live-enabled + name: dummy-live-enabled + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: dummy-live-enabled + strategy: {} + template: + metadata: + labels: + app: dummy-live-enabled + spec: + nodeSelector: + scheduling.cast.ai/node-template: "live-enabled" + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - dummy-live-enabled + topologyKey: "kubernetes.io/hostname" + containers: + - command: + - /bin/sh + - -c + - 'trap "exit 0" SIGTERM; i=0; while true; do echo "Count: $i"; i=$((i+1)); sleep 10; done' + image: busybox:1.37.0 + name: busybox + resources: {} + terminationGracePeriodSeconds: 0 diff --git a/examples/eks/eks_live_migration/eks-init-script.sh b/examples/eks/eks_live_migration/eks-init-script.sh new file mode 100644 index 000000000..90886ba70 --- /dev/null +++ b/examples/eks/eks_live_migration/eks-init-script.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -euo pipefail + +# Detect system architecture +ARCH=$(uname -m) +case "$ARCH" in + x86_64) ARCH="amd64" ;; + aarch64) ARCH="arm64" ;; + arm64) ARCH="arm64" ;; + amd64) ARCH="amd64" ;; + *) + echo "Warning: Unsupported architecture: $ARCH, defaulting to amd64" >&2 + ARCH="amd64" + ;; +esac + +CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.26.0 + +wget ${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz -O /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz +wget ${CRI_URL}/castai-cri-proxy_SHA256SUMS -O /var/tmp/proxy_SHA256SUMS +SHA256_AMD64_FROM_FILE=$(head -n 1 /var/tmp/proxy_SHA256SUMS | awk '{print $1}') 
+SHA256_ARM64_FROM_FILE=$(sed -n '2p' /var/tmp/proxy_SHA256SUMS | awk '{print $1}') +pushd /var/tmp +sha256sum --ignore-missing --check /var/tmp/proxy_SHA256SUMS +popd +tar -xvzf /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz -C /var/tmp/ cri-proxy +chmod +x /var/tmp/cri-proxy + +cat <<EOF >/var/tmp/pre-install.yaml +packages: + cri-proxy: + downloadURL: ${CRI_URL} + unpackDir: /usr/local/bin + customUnpackLocations: + cni-proxy: /opt/cni/bin/ + arch: + amd64: + fileName: castai-cri-proxy-linux-amd64.tar.gz + sha256sum: ${SHA256_AMD64_FROM_FILE} + arm64: + fileName: castai-cri-proxy-linux-arm64.tar.gz + sha256sum: ${SHA256_ARM64_FROM_FILE} +EOF +sudo /var/tmp/cri-proxy install --base-config=amazon-linux-2023 --config /var/tmp/pre-install.yaml --debug \ No newline at end of file diff --git a/examples/eks/eks_live_migration/eks.tf b/examples/eks/eks_live_migration/eks.tf new file mode 100644 index 000000000..e1eb2c432 --- /dev/null +++ b/examples/eks/eks_live_migration/eks.tf @@ -0,0 +1,217 @@ +data "aws_availability_zones" "available" {} + +data "aws_eks_cluster_auth" "eks_onboarded" { + name = module.eks.cluster_name +} + +# Get the kubernetes service endpoints in the default namespace for Calico installation +data "kubernetes_endpoints_v1" "kubernetes_service" { + metadata { + name = "kubernetes" + namespace = "default" + } + + depends_on = [module.eks.cluster_endpoint] +} + +provider "aws" { + region = var.region # Must match the region passed to the CAST AI resources and the CoreDNS addon command +} + +provider "kubernetes" { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } +} + +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + token = 
data.aws_eks_cluster_auth.eks_onboarded.token + } +} + +locals { + vpc_cidr = "10.0.0.0/16" + nfs_subnet = "10.0.99.0/24" + azs = slice(data.aws_availability_zones.available.names, 0, 3) + + tags = { + # repo_url = "http://gitlab.com/castai/IaC" + team = "live" + persist = "true" + terraform = "true" + } + + # Create a local value to store the first IP of the kubernetes endpoint -> to install Calico + all_endpoint_ips = flatten([ + for subset in data.kubernetes_endpoints_v1.kubernetes_service.subset : [ + for addresses in subset : [ + for ip in addresses: ip + ] + ] + ]) + kubernetes_endpoint_ip = length(local.all_endpoint_ips) > 0 ? local.all_endpoint_ips[0].ip : "" +} + +# Without that, pods on nodes with Calico don't have network access (internet, nor even node IPs) +resource "aws_security_group_rule" "calico-vxlan" { + security_group_id = module.eks.node_security_group_id + type = "ingress" + from_port = 4789 + to_port = 4789 + protocol = "udp" + cidr_blocks = [local.vpc_cidr] + description = "VXLAN calico" +} + +# trivy:ignore:aws-ec2-no-excessive-port-access +# trivy:ignore:aws-ec2-no-public-ingress-acl +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + + name = var.cluster_name + cidr = local.vpc_cidr + + azs = local.azs + private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] + public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] + + enable_nat_gateway = true + single_nat_gateway = true + one_nat_gateway_per_az = false + + public_subnet_tags = { + "kubernetes.io/role/elb" = 1 + } + + private_subnet_tags = { + "kubernetes.io/role/internal-elb" = 1 + } + + tags = local.tags +} + +# Security group for VPC endpoints +resource "aws_security_group" "vpc_endpoints_sg" { + name = "${var.cluster_name}-vpc-endpoints-sg" + description = "Security group for VPC endpoints" + vpc_id = module.vpc.vpc_id + + ingress { + description = "HTTPS from VPC" + from_port = 443 + to_port = 443 + 
protocol = "tcp" + cidr_blocks = [local.vpc_cidr] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } + + + tags = local.tags +} + +# Add rule to EKS node security group to allow communication with VPC endpoints +resource "aws_security_group_rule" "nodes_to_vpc_endpoints" { + security_group_id = module.eks.node_security_group_id + type = "egress" + from_port = 443 + to_port = 443 + protocol = "tcp" + source_security_group_id = aws_security_group.vpc_endpoints_sg.id + description = "Allow nodes to communicate with VPC endpoints" +} + +# trivy:ignore:aws-eks-no-public-cluster-access +# trivy:ignore:aws-eks-no-public-cluster-access-to-cidr +# trivy:ignore:aws-ec2-no-public-egress-sgr +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "~> 20.0" + + cluster_name = var.cluster_name + cluster_version = "1.32" + + cluster_endpoint_public_access = true + + enable_cluster_creator_admin_permissions = true + + enable_irsa = true + + #Do not enable default VPC CNI and kube-proxy as we will install calico + bootstrap_self_managed_addons = false + cluster_addons = {} + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + cluster_security_group_additional_rules = { + allow_all_vpc = { + type = "ingress" + protocol = "tcp" + from_port = 0 + to_port = 0 + cidr_blocks = [ + local.vpc_cidr + ] + } + } + + eks_managed_node_groups = { + stock_ami = { + name = "stock-ami" + ami_family = "AmazonLinux2023" + instance_types = ["c5a.large"] + privateNetworking = true + min_size = 2 + max_size = 4 + desired_size = 2 + + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + } + + tags = local.tags +} + +resource "helm_release" "calico" { + name = "calico" + + repository = "https://docs.tigera.io/calico/charts" + chart = "tigera-operator" + version = "3.29.3" + + namespace = "tigera-operator" + create_namespace = true + + values = [ 
+ templatefile("${path.module}/calico.yaml", { + # Trim any quotes and newlines from the IP address + api_endpoint = trimspace(local.kubernetes_endpoint_ip) + }) + ] + wait = false + + depends_on = [data.kubernetes_endpoints_v1.kubernetes_service] +} + +resource "null_resource" "deploy_non_blocking_coredns" { + provisioner "local-exec" { + command = "aws eks create-addon --cluster-name ${var.cluster_name} --region ${var.region} --addon-name coredns --addon-version v1.11.4-eksbuild.2" + } + depends_on = [module.eks] +} \ No newline at end of file diff --git a/examples/eks/eks_live_migration/main.tf b/examples/eks/eks_live_migration/main.tf new file mode 100644 index 000000000..336760cc2 --- /dev/null +++ b/examples/eks/eks_live_migration/main.tf @@ -0,0 +1,17 @@ +terraform { + required_version = ">= 0.13" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.95.0" + } + castai = { + source = "castai/castai" + version = "7.51.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + } +} \ No newline at end of file diff --git a/examples/eks/eks_live_migration/tf.vars.example b/examples/eks/eks_live_migration/tf.vars.example new file mode 100644 index 000000000..d1f6b5b0b --- /dev/null +++ b/examples/eks/eks_live_migration/tf.vars.example @@ -0,0 +1,3 @@ +cluster_name = +region = +castai_api_token = diff --git a/examples/eks/eks_live_migration/variables.tf b/examples/eks/eks_live_migration/variables.tf new file mode 100644 index 000000000..74efb56f6 --- /dev/null +++ b/examples/eks/eks_live_migration/variables.tf @@ -0,0 +1,33 @@ +variable "cluster_name" { + type = string + description = "Name of the cluster to create" +} + +variable "region" { + description = "AWS region where cluster will be created" + type = string +} + +variable "castai_api_token" { + type = string + description = "CAST AI api token" + sensitive = true +} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during 
development or testing" + default = "https://api.cast.ai" +} + +variable "castai_grpc_url" { + type = string + description = "CAST AI gRPC URL" + default = "grpc.cast.ai:443" +} + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} \ No newline at end of file diff --git a/examples/gke/gke_live_migration/README.MD b/examples/gke/gke_live_migration/README.MD new file mode 100644 index 000000000..b8fc7bfb0 --- /dev/null +++ b/examples/gke/gke_live_migration/README.MD @@ -0,0 +1,37 @@ +# Running LIVE on GKE + +## Disclaimer + +This is a temporary hack to enable early access to LIVE Migrations in Cast AI. We do not expect customers to run our installer in node configuration as we are doing in this example or having to make complicated setups. + +This example contains a full creation of a GKE cluster and onboarding on Cast AI with the needed node init script to enable live migration in NEW created nodes. + +## Network migration / TCP Migration + +Currently not supported in GKE yet, be aware of the type of workloads you try to migrate for customers, it will fail eventually if they use persistent TCP connections. + +# How to create your env +1. Rename `tf.vars.example` to `tf.vars` +2. Update `tf.vars` file with your project name, cluster name, cluster region and Cast AI API token. +3. Initialize tofu. Under example root folder run: +```bash +tofu init +``` +4. Run tofu apply: +``` +tofu apply -var-file=tf.vars +``` +5. 
To destroy resources created by this example: +``` +tofu destroy -var-file=tf.vars +``` + +Please refer to this guide if you run into any issues https://docs.cast.ai/docs/terraform-troubleshooting + +# Notes + +- Make sure to rebalance the cluster after it is fully onboarded, we replace the original image with a new image we know that contains a new version of containerd and is compatible with live. + +# Closing + +Enjoy, hack and provide feedback to live-team if you find any issue! diff --git a/examples/gke/gke_live_migration/castai.tf b/examples/gke/gke_live_migration/castai.tf new file mode 100644 index 000000000..0f9d6cf47 --- /dev/null +++ b/examples/gke/gke_live_migration/castai.tf @@ -0,0 +1,114 @@ +# 3. Connect GKE cluster to CAST AI in read-only mode. + +# Configure Data sources and providers required for CAST AI connection. + +locals { + init_script = var.gke_img_type == "COS_CONTAINERD" ? "init_cos.sh" : "init_ubuntu.sh" +} + +data "google_client_config" "default" {} + +provider "castai" { + api_url = var.castai_api_url + api_token = var.castai_api_token +} + +provider "helm" { + kubernetes { + host = "https://${module.gke.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(module.gke.ca_certificate) + } +} + +# Configure GKE cluster connection using CAST AI gke-cluster module. 
+module "castai-gke-iam" { + source = "castai/gke-iam/castai" + + project_id = var.project_id + gke_cluster_name = var.cluster_name +} + +module "castai-gke-cluster" { + source = "castai/gke-cluster/castai" + + api_url = var.castai_api_url + castai_api_token = var.castai_api_token + grpc_url = var.castai_grpc_url + wait_for_cluster_ready = true + project_id = var.project_id + gke_cluster_name = var.cluster_name + gke_cluster_location = module.gke.location + + gke_credentials = module.castai-gke-iam.private_key + delete_nodes_on_disconnect = true + + default_node_configuration = module.castai-gke-cluster.castai_node_configurations["default"] + + node_configurations = { + default = { + disk_cpu_ratio = 25 + subnets = [module.vpc.subnets_ids[0]] + # https://cloud.google.com/container-optimized-os/docs/release-notes/m121 + image = "projects/cos-cloud/global/images/cos-121-18867-90-59" + init_script = base64encode(file(local.init_script)) + } + } + + node_templates = { + default_by_castai = { + name = "default-by-castai" + configuration_id = module.castai-gke-cluster.castai_node_configurations["default"] + is_default = true + is_enabled = true + should_taint = false + + constraints = { + on_demand = true + spot = true + use_spot_fallbacks = true + + enable_spot_diversity = false + spot_diversity_price_increase_limit_percent = 20 + } + } + } + + autoscaler_settings = { + enabled = true + node_templates_partial_matching_enabled = false + + unschedulable_pods = { + enabled = true + } + + node_downscaler = { + enabled = true + + empty_nodes = { + enabled = true + } + + evictor = { + aggressive_mode = false + cycle_interval = "5m10s" + dry_run = false + enabled = true + node_grace_period_minutes = 10 + scoped_mode = false + } + } + + cluster_limits = { + enabled = true + + cpu = { + max_cores = 20 + min_cores = 1 + } + } + } + // depends_on helps terraform with creating proper dependencies graph in case of resource creation and in this case destroy + // module 
"castai-gke-cluster" has to be destroyed before module "castai-gke-iam" and "module.gke" + depends_on = [module.gke, module.castai-gke-iam] +} \ No newline at end of file diff --git a/examples/gke/gke_live_migration/gke.tf b/examples/gke/gke_live_migration/gke.tf new file mode 100644 index 000000000..cc20e3083 --- /dev/null +++ b/examples/gke/gke_live_migration/gke.tf @@ -0,0 +1,45 @@ +# 2. Create GKE cluster. + +module "gke" { + source = "terraform-google-modules/kubernetes-engine/google" + version = "~> 36.0" + project_id = var.project_id + name = var.cluster_name + region = var.cluster_region + zones = var.cluster_zones + network = module.vpc.network_name + subnetwork = module.vpc.subnets_names[0] + ip_range_pods = local.ip_range_pods + ip_range_services = local.ip_range_services + http_load_balancing = false + network_policy = false + horizontal_pod_autoscaling = true + filestore_csi_driver = false + enable_gcfs = var.gke_img_type == "COS_CONTAINERD" ? true : false + deletion_protection = false + + node_pools = [ + { + name = "default-node-pool" + machine_type = "e2-standard-2" + min_count = 0 + max_count = 10 + local_ssd_count = 0 + disk_size_gb = 100 + disk_type = "pd-standard" + image_type = var.gke_img_type + auto_repair = true + auto_upgrade = true + preemptible = false + initial_node_count = 2 # has to be >=2 to successfully deploy CAST AI controller + }, + ] + + node_pools_labels = { + all = {} + + default-node-pool = { + default-node-pool = true + } + } +} diff --git a/examples/gke/gke_live_migration/init_cos.sh b/examples/gke/gke_live_migration/init_cos.sh new file mode 100644 index 000000000..2cff50327 --- /dev/null +++ b/examples/gke/gke_live_migration/init_cos.sh @@ -0,0 +1,44 @@ +#!/bin/bash +set -euo pipefail + +# Detect system architecture +ARCH=$(uname -m) +case "$ARCH" in +x86_64) ARCH="amd64" ;; +aarch64) ARCH="arm64" ;; +arm64) ARCH="arm64" ;; +amd64) ARCH="amd64" ;; +*) + echo "Warning: Unsupported architecture: $ARCH, defaulting to amd64" 
>&2 + ARCH="amd64" + ;; +esac + +CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.27.0 + +wget ${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz -O /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz +wget ${CRI_URL}/castai-cri-proxy_SHA256SUMS -O /var/tmp/proxy_SHA256SUMS +SHA256_AMD64_FROM_FILE=$(head -n 1 /var/tmp/proxy_SHA256SUMS | awk '{print $1}') +SHA256_ARM64_FROM_FILE=$(sed -n '2p' /var/tmp/proxy_SHA256SUMS | awk '{print $1}') +pushd /var/tmp +sha256sum --ignore-missing --check /var/tmp/proxy_SHA256SUMS +popd +tar -xvzf /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz -C /home/kubernetes/bin/ cri-proxy +chmod +x /home/kubernetes/bin/cri-proxy + +cat <<EOF >/var/tmp/pre-install.yaml +packages: + cri-proxy: + downloadURL: ${CRI_URL} + unpackDir: /home/kubernetes/bin + arch: + amd64: + fileName: castai-cri-proxy-linux-amd64.tar.gz + sha256sum: ${SHA256_AMD64_FROM_FILE} + arm64: + fileName: castai-cri-proxy-linux-arm64.tar.gz + sha256sum: ${SHA256_ARM64_FROM_FILE} +EOF + +sudo /home/kubernetes/bin/cri-proxy install --base-config=gke-cos --config /var/tmp/pre-install.yaml --debug 2>&1 | sudo tee /var/tmp/LIVE_INSTALL_LOG >/dev/null + diff --git a/examples/gke/gke_live_migration/init_ubuntu.sh b/examples/gke/gke_live_migration/init_ubuntu.sh new file mode 100644 index 000000000..fd1926497 --- /dev/null +++ b/examples/gke/gke_live_migration/init_ubuntu.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -euo pipefail + +# Detect system architecture +ARCH=$(uname -m) +case "$ARCH" in +x86_64) ARCH="amd64" ;; +aarch64) ARCH="arm64" ;; +arm64) ARCH="arm64" ;; +amd64) ARCH="amd64" ;; +*) + echo "Warning: Unsupported architecture: $ARCH, defaulting to amd64" >&2 + ARCH="amd64" + ;; +esac + +CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.27.0 + +wget ${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz -O /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz +wget ${CRI_URL}/castai-cri-proxy_SHA256SUMS -O 
/var/tmp/proxy_SHA256SUMS +SHA256_AMD64_FROM_FILE=$(head -n 1 /var/tmp/proxy_SHA256SUMS | awk '{print $1}') +SHA256_ARM64_FROM_FILE=$(sed -n '2p' /var/tmp/proxy_SHA256SUMS | awk '{print $1}') +pushd /var/tmp +sha256sum --ignore-missing --check /var/tmp/proxy_SHA256SUMS +popd +tar -xvzf /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz -C /var/tmp/ cri-proxy +chmod +x /var/tmp/cri-proxy + +cat <<EOF >/var/tmp/pre-install.yaml +packages: + cri-proxy: + downloadURL: ${CRI_URL} + unpackDir: /usr/local/bin + customUnpackLocations: + cni-proxy: /home/kubernetes/bin/ + arch: + amd64: + fileName: castai-cri-proxy-linux-amd64.tar.gz + sha256sum: ${SHA256_AMD64_FROM_FILE} + arm64: + fileName: castai-cri-proxy-linux-arm64.tar.gz + sha256sum: ${SHA256_ARM64_FROM_FILE} +EOF +sudo /var/tmp/cri-proxy install --base-config=gke-ubuntu --config /var/tmp/pre-install.yaml --debug + diff --git a/examples/gke/gke_live_migration/tf.vars.example b/examples/gke/gke_live_migration/tf.vars.example new file mode 100644 index 000000000..46c4aaff0 --- /dev/null +++ b/examples/gke/gke_live_migration/tf.vars.example @@ -0,0 +1,5 @@ +cluster_name = "" +cluster_region = "" +cluster_zones = ["", ""] +castai_api_token = "" +project_id = "" diff --git a/examples/gke/gke_live_migration/variables.tf b/examples/gke/gke_live_migration/variables.tf new file mode 100644 index 000000000..88ffa8ba5 --- /dev/null +++ b/examples/gke/gke_live_migration/variables.tf @@ -0,0 +1,56 @@ +# GKE module variables. +variable "cluster_name" { + type = string + description = "GKE cluster name in GCP project." +} + +variable "cluster_region" { + type = string + description = "The region to create the cluster." +} + +variable "cluster_zones" { + type = list(string) + description = "The zones to create the cluster." +} + +variable "project_id" { + type = string + description = "GCP project ID in which GKE cluster would be created." 
+} + +variable "castai_api_url" { + type = string + description = "URL of alternative CAST AI API to be used during development or testing" + default = "https://api.cast.ai" +} + +# Variables required for connecting GKE cluster to CAST AI +variable "castai_api_token" { + type = string + description = "CAST AI API token created in console.cast.ai API Access keys section." +} + +variable "castai_grpc_url" { + type = string + description = "CAST AI gRPC URL" + default = "grpc.cast.ai:443" +} + +variable "delete_nodes_on_disconnect" { + type = bool + description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false." + default = true +} + +variable "tags" { + type = map(any) + description = "Optional tags for new cluster nodes. This parameter applies only to new nodes - tags for old nodes are not reconciled." + default = {} +} + +variable "gke_img_type" { + type = string + description = "defines the type of image cluster nodes should use." + default = "COS_CONTAINERD" +} diff --git a/examples/gke/gke_live_migration/version.tf b/examples/gke/gke_live_migration/version.tf new file mode 100644 index 000000000..502f0c51f --- /dev/null +++ b/examples/gke/gke_live_migration/version.tf @@ -0,0 +1,17 @@ +terraform { + required_providers { + castai = { + source = "castai/castai" + } + kubernetes = { + source = "hashicorp/kubernetes" + } + google = { + source = "hashicorp/google" + } + google-beta = { + source = "hashicorp/google-beta" + } + } + required_version = ">= 0.13" +} diff --git a/examples/gke/gke_live_migration/vpc.tf b/examples/gke/gke_live_migration/vpc.tf new file mode 100644 index 000000000..bb69f9ec7 --- /dev/null +++ b/examples/gke/gke_live_migration/vpc.tf @@ -0,0 +1,49 @@ +# 1. Create VPC. 
+ +locals { + ip_range_pods = "${var.cluster_name}-ip-range-pods" + ip_range_services = "${var.cluster_name}-ip-range-services" + ip_range_nodes = "${var.cluster_name}-ip-range-nodes" +} + +module "vpc" { + source = "terraform-google-modules/network/google" + version = "~> 10.0" + project_id = var.project_id + network_name = var.cluster_name + subnets = [ + { + subnet_name = local.ip_range_nodes + subnet_ip = "10.0.0.0/16" + subnet_region = var.cluster_region + subnet_private_access = "true" + }, + ] + + secondary_ranges = { + (local.ip_range_nodes) = [ + { + range_name = local.ip_range_pods + ip_cidr_range = "10.20.0.0/16" + }, + { + range_name = local.ip_range_services + ip_cidr_range = "10.30.0.0/24" + } + ] + } +} + +resource "google_compute_firewall" "allow_ssh" { + name = "allow-ssh-${var.cluster_name}" + network = module.vpc.network_name + project = var.project_id + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = ["0.0.0.0/0"] + direction = "INGRESS" +} From 90a92d23fa2ece7bc51068e9f1ebda893c3ec75f Mon Sep 17 00:00:00 2001 From: Filipe Augusto Lima de Souza Date: Mon, 16 Jun 2025 17:37:09 +0200 Subject: [PATCH 2/3] bump proxy to 0.27.0 --- examples/eks/eks_live_migration/eks-init-script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/eks/eks_live_migration/eks-init-script.sh b/examples/eks/eks_live_migration/eks-init-script.sh index 90886ba70..4cda81149 100644 --- a/examples/eks/eks_live_migration/eks-init-script.sh +++ b/examples/eks/eks_live_migration/eks-init-script.sh @@ -14,7 +14,7 @@ case "$ARCH" in ;; esac -CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.26.0 +CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.27.0 wget ${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz -O /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz wget ${CRI_URL}/castai-cri-proxy_SHA256SUMS -O /var/tmp/proxy_SHA256SUMS From 
2ed5441c5d230438f75d8ee4eb3f6d20125a9ea1 Mon Sep 17 00:00:00 2001
From: Filipe Augusto Lima de Souza
Date: Tue, 17 Jun 2025 09:44:32 +0200
Subject: [PATCH 3/3] make format-tf

---
 examples/eks/eks_live_migration/castai.tf    | 10 +++---
 examples/eks/eks_live_migration/eks.tf       | 36 ++++++++++----------
 examples/eks/eks_live_migration/variables.tf | 14 ++++----
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/examples/eks/eks_live_migration/castai.tf b/examples/eks/eks_live_migration/castai.tf
index f483b3483..c6871e207 100644
--- a/examples/eks/eks_live_migration/castai.tf
+++ b/examples/eks/eks_live_migration/castai.tf
@@ -16,7 +16,7 @@ resource "castai_eks_clusterid" "cluster_id" {
   account_id   = data.aws_caller_identity.current.account_id
   region       = var.region
   cluster_name = var.cluster_name
   # Explicit ordering: wait for the EKS module, the Calico release and the access entry.
   depends_on   = [module.eks, helm_release.calico, aws_eks_access_entry.access_entry]
 }
 
 resource "castai_eks_user_arn" "castai_user_arn" {
@@ -56,14 +56,14 @@ module "castai-eks-cluster" {
 
   node_configurations = {
     default = {
       subnets              = module.vpc.private_subnets
       instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn
       security_groups = [
         module.eks.node_security_group_id,
       ]
       # The init script is read from eks-init-script.sh and passed base64-encoded.
       init_script       = base64encode(file("eks-init-script.sh"))
       container_runtime = "containerd"
       eks_image_family  = "al2023"
     }
   }
 
diff --git a/examples/eks/eks_live_migration/eks.tf b/examples/eks/eks_live_migration/eks.tf
index e1eb2c432..f339b1d52 100644
--- a/examples/eks/eks_live_migration/eks.tf
+++ b/examples/eks/eks_live_migration/eks.tf
@@ -19,13 +19,13 @@ provider "aws" {
 }
 
 # Kubernetes provider pointed at the EKS endpoint; the exec block runs "aws eks get-token" for credentials.
 provider "kubernetes" {
   host                   = module.eks.cluster_endpoint
   cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
 
   exec {
     api_version = "client.authentication.k8s.io/v1beta1"
     command     = "aws"
     args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
   }
 }
 
@@ -44,8 +44,8 @@ locals {
 
   tags = {
     # repo_url = "http://gitlab.com/castai/IaC"
     team      = "live"
     persist   = "true"
     terraform = "true"
   }
 
@@ -53,7 +53,7 @@ locals {
   # Flattens the nested subset values of the kubernetes_service endpoints data source into one list.
   all_endpoint_ips = flatten([
     for subset in data.kubernetes_endpoints_v1.kubernetes_service.subset : [
       for addresses in subset : [
-        for ip in addresses: ip
+        for ip in addresses : ip
       ]
     ]
   ])
@@ -62,13 +62,13 @@ locals {
 
 # Without that, pods on nodes with Calico don't have network access (internet, nor even node IPs)
 # The rule admits UDP 4789 on the node security group from local.vpc_cidr.
 resource "aws_security_group_rule" "calico-vxlan" {
   security_group_id = module.eks.node_security_group_id
   type              = "ingress"
   from_port         = 4789
   to_port           = 4789
   protocol          = "udp"
   cidr_blocks       = [local.vpc_cidr]
   description       = "VXLAN calico"
 }
 
 # trivy:ignore:aws-ec2-no-excessive-port-access
@@ -171,13 +171,13 @@ module "eks" {
 
   eks_managed_node_groups = {
     stock_ami = {
       # NOTE(review): ami_family and privateNetworking look like eksctl-style keys; confirm the "eks" module reads them.
       name              = "stock-ami"
       ami_family        = "AmazonLinux2023"
       instance_types    = ["c5a.large"]
       privateNetworking = true
       min_size          = 2
       max_size          = 4
       desired_size      = 2
 
       iam_role_additional_policies = {
         AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
diff --git a/examples/eks/eks_live_migration/variables.tf b/examples/eks/eks_live_migration/variables.tf
index 74efb56f6..f329c252c 100644
--- a/examples/eks/eks_live_migration/variables.tf
+++ b/examples/eks/eks_live_migration/variables.tf
@@ -5,25 +5,25 @@ variable "cluster_name" {
 
 variable "region" {
   description = "AWS region where cluster will be created"
   type        = string
 }
 
 # Declared sensitive, so Terraform redacts the token in its plan/apply output.
 variable "castai_api_token" {
   type        = string
   description = "CAST AI api token"
   sensitive   = true
 }
 
 variable "castai_api_url" {
   type        = string
   description = "URL of alternative CAST AI API to be used during development or testing"
   default     = "https://api.cast.ai"
 }
 
 variable "castai_grpc_url" {
   type        = string
   description = "CAST AI gRPC URL"
   default     = "grpc.cast.ai:443"
 }
 
 variable "delete_nodes_on_disconnect" {