Skip to content

Commit 4944f14

Browse files
committed
adding live-migration examples
1 parent e4ce0e7 commit 4944f14

File tree

18 files changed

+981
-0
lines changed

18 files changed

+981
-0
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# AWS Live Migration with Containerd
2+
3+
This setup creates an EKS cluster and onboards it to CAST AI. Live binaries are then installed on nodes using a dedicated
4+
Node Configuration. The included installation script works with Amazon Linux 2023.
5+
6+
## How to create your env
7+
1. Rename `tf.vars.example` to `tf.vars`
8+
2. Update `tf.vars` file with your project name, cluster name, cluster region and Cast AI API token.
9+
3. Initialize tofu. Under example root folder run:
10+
```bash
11+
tofu init
12+
```
13+
4. Verify:
14+
```
15+
tofu plan -var-file=tf.vars
16+
```
17+
18+
5. Run tofu apply:
19+
```
20+
tofu apply -var-file=tf.vars
21+
```
22+
6. To destroy resources created by this example:
23+
```
24+
tofu destroy -var-file=tf.vars
25+
```
26+
27+
## Troubleshooting
28+
There are some known issues with the Terraform setup, along with known workarounds.
29+
30+
### Cluster creation stuck / timeouts on node group creation
31+
If cluster creation gets stuck on node group creation, and nodes are not healthy, it most probably means the Calico installation did not trigger
32+
at the right time. To fix it, interrupt the tofu execution and re-run it.
33+
34+
### CAST AI onboarding stuck in connecting / pods don't have internet connection
35+
Make sure Calico pods are running on all the nodes without errors and Core DNS addon is installed.
36+
37+
### Timeout on resources destruction
38+
- Check that there are no hanging CAST AI EC2 instances left blocking VPC deletion.
39+
- If Calico uninstallation job is stuck for any reason, just delete it manually:
40+
```bash
41+
k delete job -n tigera-operator tigera-operator-uninstall
42+
```
43+
### No AWS or tofu binaries
44+
45+
#### Setup AWS CLI
46+
- Follow the [installation guide](https://castai.atlassian.net/wiki/spaces/ENG/pages/2784493777/AWS) to install AWS CLI.
47+
48+
#### Setup tofu
49+
- For tofu run `brew install opentofu`
50+
- export AWS profile so tofu can pick it up: `export AWS_PROFILE=<ProfileName>`
51+
52+
## Enjoy
53+
Once cluster is created and onboarded, you can manually play with Live Migrations.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
installation:
2+
calicoNetwork:
3+
linuxDataplane: BPF
4+
ipPools:
5+
- cidr: 10.244.0.0/16
6+
blockSize: 26
7+
encapsulation: VXLAN
8+
natOutgoing: Enabled
9+
nodeSelector: "all()"
10+
kubernetesProvider: "EKS"
11+
registry: quay.io/
12+
cni:
13+
type: Calico
14+
kubernetesServiceEndpoint:
15+
host: ${api_endpoint}
16+
port: 443
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
data "aws_caller_identity" "current" {}
2+
3+
provider "castai" {
4+
api_url = var.castai_api_url
5+
api_token = var.castai_api_token
6+
}
7+
8+
resource "aws_eks_access_entry" "access_entry" {
9+
cluster_name = module.eks.cluster_name
10+
principal_arn = module.castai-eks-role-iam.instance_profile_role_arn
11+
type = "EC2_LINUX"
12+
}
13+
14+
# Configure EKS cluster connection using CAST AI eks-cluster module.
resource "castai_eks_clusterid" "cluster_id" {
  account_id   = data.aws_caller_identity.current.account_id
  region       = var.region
  cluster_name = var.cluster_name
  # Register only after the cluster, the Calico CNI and the access entry exist.
  depends_on = [module.eks, helm_release.calico, aws_eks_access_entry.access_entry]
}

# Cross-account CAST AI user ARN, consumed by the IAM module below.
resource "castai_eks_user_arn" "castai_user_arn" {
  cluster_id = castai_eks_clusterid.cluster_id.id
}
25+
26+
# Create AWS IAM policies and a user to connect to CAST AI.
module "castai-eks-role-iam" {
  source = "castai/eks-role-iam/castai"

  aws_account_id     = data.aws_caller_identity.current.account_id
  aws_cluster_region = var.region
  aws_cluster_name   = var.cluster_name
  aws_cluster_vpc_id = module.vpc.vpc_id

  castai_user_arn = castai_eks_user_arn.castai_user_arn.arn

  # Scope the IAM resources to this cluster instead of the whole account.
  create_iam_resources_per_cluster = true
}
39+
40+
# Onboard the EKS cluster to CAST AI and configure node templates + autoscaler.
module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  delete_nodes_on_disconnect = var.delete_nodes_on_disconnect

  aws_account_id      = data.aws_caller_identity.current.account_id
  aws_cluster_region  = var.region
  aws_cluster_name    = module.eks.cluster_name
  aws_assume_role_arn = module.castai-eks-role-iam.role_arn
  api_url             = var.castai_api_url
  castai_api_token    = var.castai_api_token
  grpc_url            = var.castai_grpc_url

  wait_for_cluster_ready = true

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets              = module.vpc.private_subnets
      instance_profile_arn = module.castai-eks-role-iam.instance_profile_arn
      security_groups = [
        module.eks.node_security_group_id,
      ]
      # Installs the Live (cri-proxy) binaries on every node at boot.
      init_script       = base64encode(file("eks-init-script.sh"))
      container_runtime = "containerd"
      eks_image_family  = "al2023"
    }
  }

  node_templates = {
    # Already contains live binaries on nodes
    default_by_castai = {
      name             = "default-by-castai"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_default       = true
      is_enabled       = true
      should_taint     = false

      constraints = {
        on_demand                     = true
        spot                          = true
        use_spot_fallbacks            = true
        fallback_restore_rate_seconds = 1800

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        architectures = ["amd64"]
      }
    }

    # Same setup as default, but with the goal to forcefully bring nodes with Live binaries installed, based on the NT node selector
    live-enabled = {
      name             = "live-enabled"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      is_enabled       = true
      should_taint     = false

      constraints = {
        on_demand                     = true
        spot                          = true
        use_spot_fallbacks            = true
        fallback_restore_rate_seconds = 1800

        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        architectures = ["amd64"]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode = false
        # Was "5s10s", an apparent typo: Go duration parsing would read it
        # as 15s. "5m10s" matches the interval used in CAST AI examples.
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 100
        min_cores = 1
      }
    }
  }
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
labels:
5+
app: dummy-live-enabled
6+
name: dummy-live-enabled
7+
namespace: default
8+
spec:
9+
replicas: 2
10+
selector:
11+
matchLabels:
12+
app: dummy-live-enabled
13+
strategy: {}
14+
template:
15+
metadata:
16+
labels:
17+
app: dummy-live-enabled
18+
spec:
19+
nodeSelector:
20+
scheduling.cast.ai/node-template: "live-enabled"
21+
affinity:
22+
podAntiAffinity:
23+
requiredDuringSchedulingIgnoredDuringExecution:
24+
- labelSelector:
25+
matchExpressions:
26+
- key: app
27+
operator: In
28+
values:
29+
- dummy-live-enabled
30+
topologyKey: "kubernetes.io/hostname"
31+
containers:
32+
- command:
33+
- /bin/sh
34+
- -c
35+
- 'trap "exit 0" SIGTERM; i=0; while true; do echo "Count: $i"; i=$((i+1)); sleep 10; done'
36+
image: busybox:1.37.0
37+
name: busybox
38+
resources: {}
39+
terminationGracePeriodSeconds: 0
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/bin/bash
2+
set -euo pipefail
3+
4+
# Detect system architecture
5+
ARCH=$(uname -m)
6+
case "$ARCH" in
7+
x86_64) ARCH="amd64" ;;
8+
aarch64) ARCH="arm64" ;;
9+
arm64) ARCH="arm64" ;;
10+
amd64) ARCH="amd64" ;;
11+
*)
12+
echo "Warning: Unsupported architecture: $ARCH, defaulting to amd64" >&2
13+
ARCH="amd64"
14+
;;
15+
esac
16+
17+
CRI_URL=https://storage.googleapis.com/castai-node-components/castai-cri-proxy/releases/0.26.0
18+
19+
wget ${CRI_URL}/castai-cri-proxy-linux-${ARCH}.tar.gz -O /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz
20+
wget ${CRI_URL}/castai-cri-proxy_SHA256SUMS -O /var/tmp/proxy_SHA256SUMS
21+
SHA256_AMD64_FROM_FILE=$(head -n 1 /var/tmp/proxy_SHA256SUMS | awk '{print $1}')
22+
SHA256_ARM64_FROM_FILE=$(sed -n '2p' /var/tmp/proxy_SHA256SUMS | awk '{print $1}')
23+
pushd /var/tmp
24+
sha256sum --ignore-missing --check /var/tmp/proxy_SHA256SUMS
25+
popd
26+
tar -xvzf /var/tmp/castai-cri-proxy-linux-${ARCH}.tar.gz -C /var/tmp/ cri-proxy
27+
chmod +x /var/tmp/cri-proxy
28+
29+
cat <<EOF >/var/tmp/pre-install.yaml
30+
packages:
31+
cri-proxy:
32+
downloadURL: ${CRI_URL}
33+
unpackDir: /usr/local/bin
34+
customUnpackLocations:
35+
cni-proxy: /opt/cni/bin/
36+
arch:
37+
amd64:
38+
fileName: castai-cri-proxy-linux-amd64.tar.gz
39+
sha256sum: ${SHA256_AMD64_FROM_FILE}
40+
arm64:
41+
fileName: castai-cri-proxy-linux-arm64.tar.gz
42+
sha256sum: ${SHA256_ARM64_FROM_FILE}
43+
EOF
44+
sudo /var/tmp/cri-proxy install --base-config=amazon-linux-2023 --config /var/tmp/pre-install.yaml --debug

0 commit comments

Comments
 (0)