Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions examples/eks/eks_live_migration/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# AWS Live Migration with Containerd

This setup creates an EKS cluster and onboards it to CAST AI. Live binaries are then installed on nodes using a dedicated
Node Configuration. The included install script installs the Live binaries on nodes and works with Amazon Linux 2023.

## How to create your env
1. Rename `tf.vars.example` to `tf.vars`
2. Update the `tf.vars` file with your project name, cluster name, cluster region and CAST AI API token.
3. Initialize tofu. Under example root folder run:
```bash
tofu init
```
4. Verify:
```
tofu plan -var-file=tf.vars
```

5. Run tofu apply:
```
tofu apply -var-file=tf.vars
```
6. To destroy resources created by this example:
```
tofu destroy -var-file=tf.vars
```

## Troubleshooting
There are some known issues with the terraform setup, and known workarounds.

### Cluster creation stuck / timeouts on node group creation
If cluster creation gets stuck on node group creation and nodes are not healthy, it most probably means the Calico installation did not trigger
at the right time. To fix it, interrupt the tofu execution and re-run it.

### CAST AI onboarding stuck in connecting / pods don't have internet connection
Make sure Calico pods are running on all the nodes without errors and Core DNS addon is installed.

### Timeout on resources destruction
- Check that there are no hanging CAST AI EC2 instances left blocking VPC deletion.
- If Calico uninstallation job is stuck for any reason, just delete it manually:
```bash
k delete job -n tigera-operator tigera-operator-uninstall
```
### No AWS or tofu binaries

#### Setup AWS CLI
- Follow the [installation guide](https://castai.atlassian.net/wiki/spaces/ENG/pages/2784493777/AWS) to install AWS CLI.

#### Setup tofu
- For tofu run `brew install opentofu`
- export AWS profile so tofu can pick it up: `export AWS_PROFILE=<ProfileName>`

## Enjoy
Once cluster is created and onboarded, you can manually play with Live Migrations.
16 changes: 16 additions & 0 deletions examples/eks/eks_live_migration/calico.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Helm values for the Tigera operator (Calico) chart, templated by Terraform
# (${api_endpoint} is substituted before install).
installation:
  calicoNetwork:
    # eBPF dataplane instead of the default iptables dataplane.
    linuxDataplane: BPF
    ipPools:
      - cidr: 10.244.0.0/16
        blockSize: 26
        encapsulation: VXLAN
        natOutgoing: Enabled
        # Apply this pool to every node.
        nodeSelector: "all()"
  kubernetesProvider: "EKS"
  registry: quay.io/
  cni:
    type: Calico
# Direct API server endpoint for the BPF dataplane, which bypasses kube-proxy
# and therefore needs the real control-plane address.
kubernetesServiceEndpoint:
  host: ${api_endpoint}
  port: 443
147 changes: 147 additions & 0 deletions examples/eks/eks_live_migration/castai.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Identity (account ID) of the AWS credentials running this Terraform.
data "aws_caller_identity" "current" {}

# CAST AI provider, authenticated with the API token and URL supplied via variables.
provider "castai" {
  api_url   = var.castai_api_url
  api_token = var.castai_api_token
}

# Grant the CAST AI instance profile role access to the EKS cluster so
# CAST-provisioned EC2 nodes can join it.
resource "aws_eks_access_entry" "access_entry" {
  cluster_name  = module.eks.cluster_name
  principal_arn = module.castai-eks-role-iam.instance_profile_role_arn
  type          = "EC2_LINUX"
}

# Configure EKS cluster connection using CAST AI eks-cluster module.
# Registers the cluster with CAST AI and yields its CAST AI cluster ID.
resource "castai_eks_clusterid" "cluster_id" {
  account_id   = data.aws_caller_identity.current.account_id
  region       = var.region
  cluster_name = var.cluster_name
  # Only onboard after the cluster, the Calico CNI and the access entry exist.
  depends_on = [module.eks, helm_release.calico, aws_eks_access_entry.access_entry]
}

# Cross-account IAM user ARN that CAST AI uses when connecting to this cluster.
resource "castai_eks_user_arn" "castai_user_arn" {
  cluster_id = castai_eks_clusterid.cluster_id.id
}

# Create AWS IAM policies and a user to connect to CAST AI.
# Provides the assume role and the instance profile used by CAST-provisioned nodes.
module "castai-eks-role-iam" {
  source = "castai/eks-role-iam/castai"

  aws_account_id     = data.aws_caller_identity.current.account_id
  aws_cluster_region = var.region
  aws_cluster_name   = var.cluster_name
  aws_cluster_vpc_id = module.vpc.vpc_id

  castai_user_arn = castai_eks_user_arn.castai_user_arn.arn

  # Scope the IAM resources to this single cluster.
  create_iam_resources_per_cluster = true
}

# Full CAST AI onboarding of the EKS cluster: node configurations, node
# templates and autoscaler policies.
module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  delete_nodes_on_disconnect = var.delete_nodes_on_disconnect

  aws_account_id         = data.aws_caller_identity.current.account_id
  aws_cluster_region     = var.region
  aws_cluster_name       = module.eks.cluster_name
  aws_assume_role_arn    = module.castai-eks-role-iam.role_arn
  api_url                = var.castai_api_url
  castai_api_token       = var.castai_api_token
  grpc_url               = var.castai_grpc_url
  wait_for_cluster_ready = true

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets              = module.vpc.private_subnets
      instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn
      security_groups = [
        module.eks.node_security_group_id,
      ]
      # Installs the Live binaries at node bootstrap (Amazon Linux 2023 only).
      init_script       = base64encode(file("eks-init-script.sh"))
      container_runtime = "containerd"
      eks_image_family  = "al2023"
    }
  }

  node_templates = {
    # Already contains live binaries on nodes
    default_by_castai = {
      name             = "default-by-castai"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default       = true
      is_enabled       = true
      should_taint     = false

      constraints = {
        on_demand                     = true
        spot                          = true
        use_spot_fallbacks            = true
        fallback_restore_rate_seconds = 1800

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        architectures = ["amd64"]
      }
    }

    # Same setup as default, but with the goal to forcefully bring nodes with Live binaries installed, based on the NT node selector
    live-enabled = {
      name             = "live-enabled"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_enabled       = true
      should_taint     = false

      constraints = {
        on_demand                     = true
        spot                          = true
        use_spot_fallbacks            = true
        fallback_restore_rate_seconds = 1800

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        architectures = ["amd64"]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode = false
        # NOTE(review): "5s10s" is an odd duration — presumably "5s" or "10s"
        # was intended; confirm the expected evictor cycle interval.
        cycle_interval            = "5s10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 100
        min_cores = 1
      }
    }
  }
}
39 changes: 39 additions & 0 deletions examples/eks/eks_live_migration/deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Dummy workload pinned to the "live-enabled" CAST AI node template, used to
# force provisioning of nodes with the Live binaries installed.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: dummy-live-enabled
  name: dummy-live-enabled
  namespace: default
spec:
  replicas: 2
  selector:
    matchLabels:
      app: dummy-live-enabled
  strategy: {}
  template:
    metadata:
      labels:
        app: dummy-live-enabled
    spec:
      # Schedule only onto nodes created from the "live-enabled" node template.
      nodeSelector:
        scheduling.cast.ai/node-template: "live-enabled"
      # Require each replica to land on a different node, so multiple
      # live-enabled nodes get provisioned.
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - dummy-live-enabled
              topologyKey: "kubernetes.io/hostname"
      containers:
        # Busybox loop that prints a counter every 10s and exits cleanly on SIGTERM.
        - command:
            - /bin/sh
            - -c
            - 'trap "exit 0" SIGTERM; i=0; while true; do echo "Count: $i"; i=$((i+1)); sleep 10; done'
          image: busybox:1.37.0
          name: busybox
          resources: {}
      # NOTE(review): a grace period of 0 kills the pod immediately, so the
      # SIGTERM trap above never runs — confirm this is intentional.
      terminationGracePeriodSeconds: 0
44 changes: 44 additions & 0 deletions examples/eks/eks_live_migration/eks-init-script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Node init script: downloads, verifies and installs the CAST AI CRI proxy
# (Live binaries) on an Amazon Linux 2023 EKS node.
set -euo pipefail

# Map the kernel architecture to the release artifact naming (amd64/arm64).
ARCH=$(uname -m)
case "$ARCH" in
x86_64 | amd64) ARCH="amd64" ;;
aarch64 | arm64) ARCH="arm64" ;;
*)
    echo "Warning: Unsupported architecture: $ARCH, defaulting to amd64" >&2
    ARCH="amd64"
    ;;
esac

CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.27.0

# Fetch the release tarball for this architecture plus the published checksums.
wget "${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz" -O "/var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz"
wget "${CRI_URL}/castai-cri-proxy_SHA256SUMS" -O /var/tmp/proxy_SHA256SUMS

# Select each checksum by the artifact file name instead of by line position,
# so a reordering of the SHA256SUMS file cannot silently pick the wrong hash.
SHA256_AMD64_FROM_FILE=$(awk '/castai-cri-proxy-linux-amd64\.tar\.gz/ {print $1; exit}' /var/tmp/proxy_SHA256SUMS)
SHA256_ARM64_FROM_FILE=$(awk '/castai-cri-proxy-linux-arm64\.tar\.gz/ {print $1; exit}' /var/tmp/proxy_SHA256SUMS)

# Verify the downloaded tarball against the published checksums; sha256sum is
# run from /var/tmp because the SHA256SUMS entries use bare file names.
pushd /var/tmp
sha256sum --ignore-missing --check /var/tmp/proxy_SHA256SUMS
popd

# Unpack only the cri-proxy binary and make it executable.
tar -xvzf "/var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz" -C /var/tmp/ cri-proxy
chmod +x /var/tmp/cri-proxy

# Pre-install config consumed by `cri-proxy install`: package download source,
# unpack destinations and the per-architecture checksums extracted above.
# NOTE(review): the "cni-proxy" key under customUnpackLocations differs from
# the "cri-proxy" package name — confirm it matches the tarball contents.
cat <<EOF >/var/tmp/pre-install.yaml
packages:
  cri-proxy:
    downloadURL: ${CRI_URL}
    unpackDir: /usr/local/bin
    customUnpackLocations:
      cni-proxy: /opt/cni/bin/
    arch:
      amd64:
        fileName: castai-cri-proxy-linux-amd64.tar.gz
        sha256sum: ${SHA256_AMD64_FROM_FILE}
      arm64:
        fileName: castai-cri-proxy-linux-arm64.tar.gz
        sha256sum: ${SHA256_ARM64_FROM_FILE}
EOF
sudo /var/tmp/cri-proxy install --base-config=amazon-linux-2023 --config /var/tmp/pre-install.yaml --debug
Loading
Loading