Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

/scratch
/crio
patch-dir

.*sw*
3 changes: 3 additions & 0 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
@build:
packer build sysbox-eks.pkr.hcl

# Run Packer against the incremental template (sysbox-eks-incremental.pkr.hcl).
@build-incremental:
packer build sysbox-eks-incremental.pkr.hcl

@build-crio:
docker build -t sysbox-eks-ami-crio . -f crio.Dockerfile
docker run \
Expand Down
48 changes: 43 additions & 5 deletions bootstrap.sh.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
--- bootstrap.sh 2024-04-08 15:32:39
+++ patched_bootstrap.sh 2024-04-08 15:36:38
--- bootstrap.sh 2025-02-11 13:09:53
+++ patched_bootstrap.sh 2025-02-11 13:23:06
@@ -150,7 +150,7 @@
API_RETRY_ATTEMPTS="${API_RETRY_ATTEMPTS:-3}"
DOCKER_CONFIG_JSON="${DOCKER_CONFIG_JSON:-}"
Expand All @@ -9,7 +9,45 @@
CONTAINER_RUNTIME="${CONTAINER_RUNTIME:-$DEFAULT_CONTAINER_RUNTIME}"
# from >= 1.27, the cloud-provider will be external
CLOUD_PROVIDER="external"
@@ -426,17 +426,28 @@
@@ -295,11 +295,15 @@
--region=${AWS_DEFAULT_REGION} \
--name=${CLUSTER_NAME}

+ # Switch to JSON output to avoid "NoneType" flush bug in text mode.
+ # Then parse the required fields with jq, output them on a single line.
aws eks describe-cluster \
--region=${AWS_DEFAULT_REGION} \
--name=${CLUSTER_NAME} \
- --output=text \
- --query 'cluster.{certificateAuthorityData: certificateAuthority.data, endpoint: endpoint, serviceIpv4Cidr: kubernetesNetworkConfig.serviceIpv4Cidr, serviceIpv6Cidr: kubernetesNetworkConfig.serviceIpv6Cidr, clusterIpFamily: kubernetesNetworkConfig.ipFamily}' > $DESCRIBE_CLUSTER_RESULT || rc=$?
+ --output=json \
+ | jq -r '.cluster | "\( .certificateAuthority.data ) \( .endpoint ) \( .kubernetesNetworkConfig.serviceIpv4Cidr ) \( .kubernetesNetworkConfig.serviceIpv6Cidr ) \( .kubernetesNetworkConfig.ipFamily )"' \
+ > $DESCRIBE_CLUSTER_RESULT || rc=$?
+
if [[ $rc -eq 0 ]]; then
break
fi
@@ -310,13 +314,14 @@
sleep_sec="$(( $(( 5 << $((1+$attempt)) )) + $jitter))"
sleep $sleep_sec
done
- B64_CLUSTER_CA=$(cat $DESCRIBE_CLUSTER_RESULT | awk '{print $1}')
- APISERVER_ENDPOINT=$(cat $DESCRIBE_CLUSTER_RESULT | awk '{print $3}')
- SERVICE_IPV4_CIDR=$(cat $DESCRIBE_CLUSTER_RESULT | awk '{print $4}')
- SERVICE_IPV6_CIDR=$(cat $DESCRIBE_CLUSTER_RESULT | awk '{print $5}')

+ # Our jq line puts five fields on one line. Re-map them accordingly.
+ B64_CLUSTER_CA="$(awk '{print $1}' $DESCRIBE_CLUSTER_RESULT)"
+ APISERVER_ENDPOINT="$(awk '{print $2}' $DESCRIBE_CLUSTER_RESULT)"
+ SERVICE_IPV4_CIDR="$(awk '{print $3}' $DESCRIBE_CLUSTER_RESULT)"
+ SERVICE_IPV6_CIDR="$(awk '{print $4}' $DESCRIBE_CLUSTER_RESULT)"
if [[ -z "${IP_FAMILY}" ]]; then
- IP_FAMILY=$(cat $DESCRIBE_CLUSTER_RESULT | awk '{print $2}')
+ IP_FAMILY="$(awk '{print $5}' $DESCRIBE_CLUSTER_RESULT)"
fi
fi

@@ -434,17 +439,28 @@
systemctl restart docker
snap set kubelet-eks \
container-runtime=docker
Expand All @@ -25,7 +63,7 @@
+ --file /etc/crio/crio.conf \
+ --selector 'crio.image.pause_image' \
+ "${PAUSE_CONTAINER}"

elif [[ "$CONTAINER_RUNTIME" = "nvidia-container-runtime" ]]; then
- echo "Container runtime is ${CONTAINER_RUNTIME}"
- # update config.toml file
Expand All @@ -37,7 +75,7 @@
+ # see https://github.com/NVIDIA/k8s-device-plugin
+ cp /usr/local/share/eks/nvidia-runtime-config.toml /etc/containerd/config.toml
+ systemctl restart containerd

else
- echo "Container runtime ${CONTAINER_RUNTIME} is not supported."
- exit 1
Expand Down
3 changes: 3 additions & 0 deletions cloud-init-local.service.d/10-wait-for-net-device.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Drop-in for cloud-init-local.service: the service requires, and is ordered
# after, the dev-ec2imds.device unit.
[Unit]
Requires=dev-ec2imds.device
After=dev-ec2imds.device
126 changes: 126 additions & 0 deletions sysbox-eks-incremental.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Ubuntu release of the image, written as "{codename}-{major}.{minor}".
# Used in the AMI description and split into the UbuntuRelease/UbuntuVersion tags.
variable "ubuntu_version" {
  # Declared explicitly for consistency with the other version variables.
  type    = string
  default = "jammy-22.04"

  validation {
    condition     = can(regex("^\\w+-\\d+\\.\\d+$", var.ubuntu_version))
    error_message = "Invalid Ubuntu version: expected '{name}-{major}.{minor}'."
  }
}

# Sysbox release as a three-part "{major}.{minor}.{patch}" version string.
# Recorded on the resulting AMI through the SysboxVersion tag.
variable "sysbox_version" {
  default = "0.6.5"
  type    = string

  validation {
    error_message = "Invalid Sysbox version: expected '{major}.{minor}.{patch}'."
    condition     = can(regex("^\\d+\\.\\d+\\.\\d+$", var.sysbox_version))
  }
}

# Kubernetes version as a two-part "{major}.{minor}" string.
# Appears in the AMI description and in the K8sVersion tag.
variable "k8s_version" {
  default = "1.29"
  type    = string

  validation {
    error_message = "Invalid K8s version: expected '{major}.{minor}'."
    condition     = can(regex("^\\d+\\.\\d+$", var.k8s_version))
  }
}

# Driver version string. The default equals the "nvidia-<version>" component
# of the AMI names in this file. No format validation is applied.
variable "cuda_driver_version" {
  default = "560.35.05"
  type    = string
}

# Plugin requirements for this template.
packer {
  required_plugins {
    # Backs the "git-commit" data source declared in this file.
    git = {
      source  = "github.com/ethanmdavidson/git"
      version = ">= 0.5.0"
    }

    # Backs the "amazon-ebs" source; pinned to one exact release.
    amazon = {
      source  = "github.com/hashicorp/amazon"
      version = "= 1.0.9"
    }
  }
}

# Git commit data source, used with its default settings.
data "git-commit" "current" {}

# Short build identifier: the first four characters of the commit hash, a
# dash, then the first branch name reported for that commit with every "/"
# replaced by "-".
#
# `branches` may be an empty list (for example on a detached-HEAD checkout),
# and element() raises an error on an empty list, so fall back to the literal
# "detached" instead of failing template evaluation.
local "git_branch" {
  expression = "${substr(data.git-commit.current.hash, 0, 4)}-${replace(try(element(data.git-commit.current.branches, 0), "detached"), "/", "-")}"
}

# Name of the AMI produced by this template.
#
# The version components are interpolated from the input variables rather than
# repeated as literals, so the name cannot drift from the values recorded in
# the AMI tags and description. With the default variable values this expands
# to exactly:
#   latch-bio/sysbox-eks_0.6.5/k8s_1.29/jammy-22.04-amd64-server/nvidia-560.35.05/kvm-support-b744
local "ami_name" {
  expression = "latch-bio/sysbox-eks_${var.sysbox_version}/k8s_${var.k8s_version}/${var.ubuntu_version}-amd64-server/nvidia-${var.cuda_driver_version}/kvm-support-b744"
}

# amazon-ebs source: builds a new AMI named local.ami_name, starting from an
# existing sysbox-eks AMI selected by source_ami_filter.
source "amazon-ebs" "ubuntu-eks" {
  # Build instance placement and SSH access.
  region                  = "us-west-2"
  instance_type           = "t2.micro"
  ssh_username            = "ubuntu"
  ssh_handshake_attempts  = 100
  temporary_key_pair_type = "ed25519"

  # Base image: selected by its exact name among AMIs of the listed owner.
  source_ami_filter {
    owners = ["812206152185"]
    filters = {
      name = "latch-bio/sysbox-eks_0.6.5/k8s_1.29/ubuntu-jammy-22.04-amd64-server/nvidia-560.35.05/latch-92cf-aidan-latest-gpu-drivers"
    }
  }

  # 30 GiB gp3 volume at /dev/sda1, removed when the instance terminates.
  launch_block_device_mappings {
    device_name           = "/dev/sda1"
    volume_type           = "gp3"
    volume_size           = 30
    delete_on_termination = true
  }

  # Identity of the resulting image.
  ami_name        = local.ami_name
  ami_description = "Latch Bio, Sysbox EKS Node (k8s_${var.k8s_version}) with NVIDIA GPU support, on Ubuntu ${var.ubuntu_version}, amd64 image."

  tags = {
    # Platform details; the Ubuntu codename and version come from the two
    # halves of var.ubuntu_version.
    Linux         = "Ubuntu"
    UbuntuRelease = split("-", var.ubuntu_version)[0]
    UbuntuVersion = split("-", var.ubuntu_version)[1]
    Arch          = "amd64"
    K8sVersion    = var.k8s_version
    SysboxVersion = var.sysbox_version

    # Provenance of the base image, written as "{{ ... }}" template
    # placeholders rather than HCL expressions.
    BaseImageID        = "{{ .SourceAMI }}"
    BaseImageOwnerID   = "{{ .SourceAMIOwner }}"
    BaseImageOwnerName = "{{ .SourceAMIOwnerName }}"
    BaseImageName      = "{{ .SourceAMIName }}"
  }
}

# Incremental build: runs one shell provisioner that configures KVM support
# on the instance launched from source.amazon-ebs.ubuntu-eks.
build {
  name    = "sysbox-eks-incremental"
  sources = ["source.amazon-ebs.ubuntu-eks"]

  provisioner "shell" {
    inline_shebang = "/usr/bin/env bash"
    inline = [
      # Stop at the first failing command, including failures inside a pipeline.
      "set -o pipefail -o errexit",

      "echo '>>> Configuring KVM support'",

      # Load the kvm module now and append it to /etc/modules.
      "sudo modprobe kvm",
      "echo 'kvm' | sudo tee -a /etc/modules",

      # Add /dev/kvm to crio.runtime.allowed_devices in crio.conf, then
      # restart CRI-O.
      "sudo dasel put string --parser toml --file /etc/crio/crio.conf --selector 'crio.runtime.allowed_devices.[]' --multiple /dev/kvm",
      "sudo systemctl restart crio",

      # Set /dev/kvm to mode 0666 through a udev rule so containers can read
      # and write it, then reload and re-trigger udev.
      "echo 'KERNEL==\"kvm\", MODE=\"0666\"' | sudo tee /etc/udev/rules.d/99-kvm-permissions.rules > /dev/null",
      "sudo udevadm control --reload-rules",
      "sudo udevadm trigger"
    ]
  }
}
66 changes: 62 additions & 4 deletions sysbox-eks.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ variable "ubuntu_version" {

variable "sysbox_version" {
type = string
default = "0.6.5"
default = "0.6.7"

validation {
condition = can(regex("^\\d+\\.\\d+\\.\\d+$", var.sysbox_version))
Expand Down Expand Up @@ -83,7 +83,7 @@ source "amazon-ebs" "ubuntu-eks" {

source_ami_filter {
filters = {
name = "ubuntu-eks/k8s_${var.k8s_version}/images/hvm-ssd/ubuntu-${var.ubuntu_version}-amd64-server-20241204"
name = "ubuntu-eks/k8s_${var.k8s_version}/images/hvm-ssd/ubuntu-${var.ubuntu_version}-amd64-server-20250730"
}
owners = ["099720109477"]
}
Expand All @@ -96,7 +96,7 @@ source "amazon-ebs" "ubuntu-eks" {
}

region = "us-west-2"
instance_type = "t2.micro"
instance_type = "t3.large"
ssh_username = "ubuntu"
temporary_key_pair_type = "ed25519"
ssh_handshake_attempts = 100
Expand Down Expand Up @@ -333,7 +333,8 @@ build {
"sudo touch /etc/crio/crio.conf",

# todo(maximsmol): do this only when K8s is configured without systemd cgroups (from sysbox todos)
"sudo dasel put string --parser toml --file /etc/crio/crio.conf --selector 'crio.runtime.cgroup_manager' 'cgroupfs'",
# note(aidan): removed this with sysbox 0.6.7 and put back to 'systemd'
"sudo dasel put string --parser toml --file /etc/crio/crio.conf --selector 'crio.runtime.cgroup_manager' 'systemd'",
"sudo dasel put string --parser toml --file /etc/crio/crio.conf --selector 'crio.runtime.conmon_cgroup' 'pod'",

# use containerd/Docker's default capabilities: https://github.com/moby/moby/blob/faf84d7f0a1f2e6badff6f720a3e1e559c356fff/oci/caps/defaults.go
Expand Down Expand Up @@ -483,4 +484,61 @@ build {
"sudo dasel put string --parser json --file /etc/kubernetes/kubelet/kubelet-config.json --selector 'memorySwap.swapBehavior' 'UnlimitedSwap'",
]
}

# Appends the line "label ::/0 100" to /etc/gai.conf.
provisioner "shell" {
  inline_shebang = "/usr/bin/env bash"
  inline = [
    # Stop at the first failing command, including failures inside a pipeline.
    "set -o pipefail -o errexit",

    "echo '>>> Configuring IPv6 prioritization'",
    "echo 'label ::/0 100' | sudo tee -a /etc/gai.conf"
  ]
}

# Uploads the cloud-init-local drop-in to the ubuntu user's home directory;
# a later shell provisioner moves it into /etc.
provisioner "file" {
  destination = "/home/ubuntu/10-wait-for-net-device.conf"
  source      = "cloud-init-local.service.d/10-wait-for-net-device.conf"
}

# Uploads the ec2imds udev rule to the ubuntu user's home directory;
# a later shell provisioner moves it into /etc.
provisioner "file" {
  destination = "/home/ubuntu/10-ec2imds.rules"
  source      = "udev/10-ec2imds.rules"
}

# Installs the two files uploaded to /home/ubuntu into their system locations:
#   - the cloud-init-local.service drop-in, under /etc/systemd/system
#   - the ec2imds udev rule, under /etc/udev/rules.d
# and then reloads systemd's unit configuration.
#
# The uploads live in the SSH user's home directory, and `mv` would carry
# their existing owner and mode into /etc. `install` is used instead so the
# system copies are always root:root with mode 0644; the staged copies are
# removed afterwards, so the end state of /home/ubuntu matches a move.
provisioner "shell" {
  inline_shebang = "/usr/bin/env bash"
  inline = [
    "set -o pipefail -o errexit",
    "",
    "echo '>>> Installing cloud-init network device wait configuration'",
    "sudo mkdir -p /etc/systemd/system/cloud-init-local.service.d",
    "sudo install -o root -g root -m 0644 /home/ubuntu/10-wait-for-net-device.conf /etc/systemd/system/cloud-init-local.service.d/",
    "rm -f /home/ubuntu/10-wait-for-net-device.conf",
    "",
    "sudo mkdir -p /etc/udev/rules.d",
    "sudo install -o root -g root -m 0644 /home/ubuntu/10-ec2imds.rules /etc/udev/rules.d/",
    "rm -f /home/ubuntu/10-ec2imds.rules",
    "",
    "sudo systemctl daemon-reload"
  ]
}

# Configures KVM support: kernel module, CRI-O allowed devices, and the
# permissions of /dev/kvm.
provisioner "shell" {
  inline_shebang = "/usr/bin/env bash"
  inline = [
    # Stop at the first failing command, including failures inside a pipeline.
    "set -o pipefail -o errexit",

    "echo '>>> Configuring KVM support'",

    # Load the kvm module now and append it to /etc/modules.
    "sudo modprobe kvm",
    "echo 'kvm' | sudo tee -a /etc/modules",

    # Add /dev/kvm to crio.runtime.allowed_devices in crio.conf, then
    # restart CRI-O.
    "sudo dasel put string --parser toml --file /etc/crio/crio.conf --selector 'crio.runtime.allowed_devices.[]' --multiple /dev/kvm",
    "sudo systemctl restart crio",

    # Set /dev/kvm to mode 0666 through a udev rule so containers can read
    # and write it, then reload and re-trigger udev.
    "echo 'KERNEL==\"kvm\", MODE=\"0666\"' | sudo tee /etc/udev/rules.d/99-kvm-permissions.rules > /dev/null",
    "sudo udevadm control --reload-rules",
    "sudo udevadm trigger"
  ]
}
}
7 changes: 7 additions & 0 deletions udev/10-ec2imds.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# cloud-init-local must wait for at least one network interface device to exist
# before attempting to download EC2 instance metadata.
#
# These udev rules implement this policy along with
# /etc/systemd/system/cloud-init-local.service.d/10-wait-for-net-device.conf

# For every event other than a removal, on a network device that is not "lo"
# and matches driver "ena" or "vif": tag the device for systemd and add the
# alias /dev/ec2imds to it.
ACTION!="remove", SUBSYSTEM=="net", KERNEL!="lo", DRIVERS=="ena|vif", TAG+="systemd", ENV{SYSTEMD_ALIAS}+="/dev/ec2imds"
Loading