Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
24def6b
Upgrade EKS Cluster version to 1.32
sirutBuasai Feb 11, 2025
87d67ea
upgrade only pytorch
sirutBuasai Feb 11, 2025
f35fdeb
fix list
sirutBuasai Feb 11, 2025
30ffe78
update addon
sirutBuasai Feb 11, 2025
c86b81a
fix addon upgrade
sirutBuasai Feb 11, 2025
27d7e49
fix addon aws-node
sirutBuasai Feb 11, 2025
0386d2e
update vpc-cni
sirutBuasai Feb 11, 2025
5baae91
fix update cni
sirutBuasai Feb 11, 2025
b3ac863
upgrade to 1.26 first
sirutBuasai Feb 12, 2025
03c75fb
change upgrade nodegroup log parsing
sirutBuasai Feb 12, 2025
46f30dd
fix upgrade
sirutBuasai Feb 12, 2025
30d0fc1
do upgrade list addons
sirutBuasai Feb 12, 2025
f98cdbc
upgrade cluster with name
sirutBuasai Feb 12, 2025
4315844
update jq command
sirutBuasai Feb 12, 2025
2aad2ca
jq fix
sirutBuasai Feb 12, 2025
e5ba31b
upgrade all
sirutBuasai Feb 12, 2025
e71e657
upgrade mainline
sirutBuasai Feb 12, 2025
534200f
test eks
sirutBuasai Feb 13, 2025
e65dd46
test only eks
sirutBuasai Feb 13, 2025
1b63b8d
temp add iam
sirutBuasai Feb 13, 2025
3c54a4a
no build staging
sirutBuasai Feb 13, 2025
260592e
temp check status
sirutBuasai Feb 13, 2025
c847aaf
tmp create tbac
sirutBuasai Feb 13, 2025
771954e
install autoscalar
sirutBuasai Feb 13, 2025
3b683cd
delete PR cluster
sirutBuasai Feb 13, 2025
5972c6d
upgrade kubeflow and kustomize
sirutBuasai Feb 13, 2025
8ff7bcc
upgrade eks utils
sirutBuasai Feb 13, 2025
60f2d47
delete
sirutBuasai Feb 13, 2025
29fc9aa
create
sirutBuasai Feb 13, 2025
ecdfe54
create cluster
sirutBuasai Feb 13, 2025
d754dba
test eks
sirutBuasai Feb 13, 2025
e4f1dd7
run inf
sirutBuasai Feb 13, 2025
98575ac
run inf
sirutBuasai Feb 13, 2025
7dcdd74
delete cluster
sirutBuasai Feb 14, 2025
1d971d5
use g5 for gpu nodegroup
sirutBuasai Feb 18, 2025
23446f3
revert toml
sirutBuasai Feb 19, 2025
f6f570d
temp delete PR cluster
sirutBuasai Feb 19, 2025
1d8d79f
upgrade nodegroup
sirutBuasai Feb 19, 2025
fa15d87
delete nodegroups
sirutBuasai Feb 20, 2025
9ee81bb
create cluster
sirutBuasai Feb 21, 2025
4070d69
buidl test 2.5
sirutBuasai Feb 21, 2025
8180328
delete cluster
sirutBuasai Feb 21, 2025
4290643
Merge branch 'master' into eks-upgrade
sirutBuasai Feb 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions eks_infrastructure/build_param.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"operation": "new_operation",
"operation": "delete",
"contexts": [
"MAINLINE"
],
"eks_clusters": [
"dlc-pytorch",
"dlc-tensorflow"
],
"eks_version": "1.25",
"cluster_autoscalar_image_version": "v1.25.0"
"eks_version": "1.32",
"cluster_autoscalar_image_version": "v1.32.0"
}
4 changes: 2 additions & 2 deletions eks_infrastructure/cluster-autoscalar-autodiscover.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ spec:
spec:
serviceAccountName: cluster-autoscaler
containers:
- image: k8s.gcr.io/autoscaling/cluster-autoscaler:<VERSION>
- image: registry.k8s.io/autoscaling/cluster-autoscaler:<VERSION>
name: cluster-autoscaler
resources:
limits:
Expand Down Expand Up @@ -166,4 +166,4 @@ spec:
volumes:
- name: ssl-certs
hostPath:
path: "/etc/ssl/certs/ca-bundle.crt"
path: "/etc/ssl/certs/ca-bundle.crt"
40 changes: 38 additions & 2 deletions eks_infrastructure/create_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ function create_eks_cluster() {
function create_node_group() {

STATIC_NODEGROUP_INSTANCE_TYPE="m5.large"
GPU_NODEGROUP_INSTANCE_TYPE="p3.16xlarge"
GPU_NODEGROUP_INSTANCE_TYPE="g5.24xlarge"
INF_NODEGROUP_INSTANCE_TYPE="inf1.xlarge"
GRAVITON_NODEGROUP_INSTANCE_TYPE="c6g.4xlarge"

# static nodegroup
eksctl create nodegroup \
Expand Down Expand Up @@ -78,6 +79,36 @@ function create_node_group() {
--ssh-access \
--ssh-public-key "${3}"

# dynamic graviton nodegroup
eksctl create nodegroup \
--name ${1}-graviton-nodegroup-${2/./-} \
--cluster ${1} \
--node-type ${GRAVITON_NODEGROUP_INSTANCE_TYPE} \
--nodes-min 0 \
--nodes-max 100 \
--node-volume-size 80 \
--node-labels "test_type=graviton" \
--tags "k8s.io/cluster-autoscaler/node-template/label/test_type=graviton" \
--asg-access \
--managed=true \
--ssh-access \
--ssh-public-key "${3}"
}

#Function to upgrade core k8s components
function update_eksctl_utils() {
  # Run `eksctl update addon` for every addon that `eksctl get addon`
  # reports on the given cluster.
  #
  # Arguments:
  #   $1 - EKS cluster name
  #   $2 - AWS region of the cluster
  local cluster_name="${1}"
  local region="${2}"
  local addon_names
  local addon

  # List addons from the same cluster/region that is updated below, rather
  # than from the global CLUSTER variable, so the list and the update can
  # never target different clusters.
  addon_names=$(eksctl get addon --cluster "${cluster_name}" --region "${region}" -o json | jq -r '.[].Name')

  if [ -z "${addon_names}" ]; then
    echo "No addons present in the EKS cluster ${cluster_name}"
    return 0
  fi

  # Intentionally unquoted: jq prints one addon name per line and the loop
  # relies on word splitting to iterate over them.
  for addon in ${addon_names}; do
    eksctl update addon \
      --name "${addon}" \
      --cluster "${cluster_name}" \
      --region "${region}"
  done
}

# Attach IAM policy to nodegroup IAM role
Expand Down Expand Up @@ -148,6 +179,10 @@ function add_tags_asg() {
ResourceId=${asg_name},ResourceType=auto-scaling-group,Key=k8s.io/cluster-autoscaler/node-template/resources/hugepages-2Mi,Value=256Mi,PropagateAtLaunch=true
fi

if [[ ${nodegroup_name} == *"graviton"* ]]; then
aws autoscaling create-or-update-tags \
--tags ResourceId=${asg_name},ResourceType=auto-scaling-group,Key=k8s.io/cluster-autoscaler/node-template/label/test_type,Value=graviton,PropagateAtLaunch=true
fi
done

}
Expand Down Expand Up @@ -185,4 +220,5 @@ create_eks_cluster ${CLUSTER} ${EKS_VERSION} ${AWS_REGION}
create_node_group ${CLUSTER} ${EKS_VERSION} ${EC2_KEY_PAIR_NAME}
add_tags_asg ${CLUSTER} ${AWS_REGION}
add_iam_permissions_nodegroup ${CLUSTER} ${AWS_REGION}
create_namespaces
create_namespaces
update_eksctl_utils ${CLUSTER} ${AWS_REGION}
10 changes: 5 additions & 5 deletions eks_infrastructure/env_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
set -ex

# The URLs/versions below are based on EKS v1.32.0 and must be updated whenever the EKS version is upgraded.
KUBECTL_CLIENT="https://s3.us-west-2.amazonaws.com/amazon-eks/1.25.6/2023-01-30/bin/linux/amd64/kubectl"
EKSCTL_CLIENT="https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz"
AWS_IAM_AUTHENTICATOR="https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.6.2/aws-iam-authenticator_0.6.2_linux_amd64"
KUBECTL_CLIENT="https://s3.us-west-2.amazonaws.com/amazon-eks/1.32.0/2024-12-20/bin/linux/amd64/kubectl"
EKSCTL_CLIENT="https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz"
AWS_IAM_AUTHENTICATOR="https://github.com/kubernetes-sigs/aws-iam-authenticator/releases/download/v0.6.29/aws-iam-authenticator_0.6.29_linux_amd64"

LATEST_KUBECTL_CLIENT_VERSION=1.25
LATEST_KUBECTL_CLIENT_VERSION=1.32

function install_kubectl_client() {
curl --silent --location ${KUBECTL_CLIENT} -o /usr/local/bin/kubectl
Expand Down Expand Up @@ -38,7 +38,7 @@ else
fi

#kubectl version
kubectl version --short --client
kubectl version --client

# install eksctl
if ! [ -x "$(command -v eksctl)" ]; then
Expand Down
11 changes: 10 additions & 1 deletion eks_infrastructure/helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,27 @@ function create_cluster() {
# 4. Upgrade core k8s components
# 5. Scale cluster autoscalar back to 1
function upgrade_cluster() {
  # Upgrade every "<cluster>-<context>" EKS cluster built from the
  # EKS_CLUSTERS and CONTEXTS arrays, then fail the run if any nodegroup
  # upgrade was recorded in the error log.
  #
  # Reads globals: CONTEXTS, EKS_CLUSTERS, EKS_VERSION,
  #                CLUSTER_AUTOSCALAR_IMAGE_VERSION
  # Sets globals:  ERROR_LOG, TARGET, CONTEXT, CLUSTER, CLUSTER_NAME

  # Initialize error log and ensure log does not exist
  ERROR_LOG="failed_nodegroups.log"
  rm -f "${ERROR_LOG}"

  TARGET="CLUSTER"
  for CONTEXT in "${CONTEXTS[@]}"; do
    for CLUSTER in "${EKS_CLUSTERS[@]}"; do
      CLUSTER_NAME=${CLUSTER}-${CONTEXT}
      if check_cluster_status "${CLUSTER_NAME}"; then
        # Positional order expected by upgrade_operation.sh:
        # target, cluster name, EKS version, error log, autoscaler image version.
        # Invoked exactly once per cluster; the error log must be the 4th argument.
        ./upgrade_operation.sh "${TARGET}" "${CLUSTER_NAME}" "${EKS_VERSION}" "${ERROR_LOG}" "${CLUSTER_AUTOSCALAR_IMAGE_VERSION}"
      else
        echo "EKS Cluster :: ${CLUSTER_NAME} :: does not exists. Skipping upgrade operation."
      fi
    done
  done

  # The log file only exists if something was written to it after the
  # cleanup above, so its presence is treated as failure.
  if [ -f "${ERROR_LOG}" ]; then
    echo "The following nodegroups failed to upgrade."
    cat "${ERROR_LOG}"
    exit 1
  fi
}

# Upgrade nodegroup operation function
Expand Down
28 changes: 13 additions & 15 deletions eks_infrastructure/new_operation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,18 @@ function create_graviton_node_group() {

#Function to upgrade core k8s components
function update_eksctl_utils() {
  # Run `eksctl update addon` for every addon that `eksctl get addon`
  # reports on the given cluster. This replaces the individual
  # `eksctl utils update-kube-proxy/aws-node/coredns` calls.
  #
  # Arguments:
  #   $1 - EKS cluster name
  #   $2 - AWS region of the cluster
  local cluster_name="${1}"
  local region="${2}"
  local addon_names
  local addon

  # List addons from the same cluster/region that is updated below, rather
  # than from the global CLUSTER variable, so the list and the update can
  # never target different clusters.
  addon_names=$(eksctl get addon --cluster "${cluster_name}" --region "${region}" -o json | jq -r '.[].Name')

  if [ -z "${addon_names}" ]; then
    echo "No addons present in the EKS cluster ${cluster_name}"
    return 0
  fi

  # Intentionally unquoted: jq prints one addon name per line and the loop
  # relies on word splitting to iterate over them.
  for addon in ${addon_names}; do
    eksctl update addon \
      --name "${addon}" \
      --cluster "${cluster_name}" \
      --region "${region}"
  done
}

if [ $# -ne 2 ]; then
Expand Down Expand Up @@ -155,4 +153,4 @@ fi
# update_eksctl_utils ${CLUSTER} ${AWS_REGION}
create_graviton_node_group ${CLUSTER} ${EKS_VERSION} ${EC2_KEY_PAIR_NAME}
add_tags_asg ${CLUSTER} ${AWS_REGION}
add_iam_permissions_nodegroup ${CLUSTER} ${AWS_REGION}
add_iam_permissions_nodegroup ${CLUSTER} ${AWS_REGION}
35 changes: 15 additions & 20 deletions eks_infrastructure/upgrade_operation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,18 @@ function upgrade_nodegroups() {

#Function to upgrade core k8s components
function update_eksctl_utils() {
  # Run `eksctl update addon` for every addon that `eksctl get addon`
  # reports on the given cluster. This replaces the individual
  # `eksctl utils update-kube-proxy/aws-node/coredns` calls.
  #
  # Arguments:
  #   $1 - EKS cluster name
  #   $2 - AWS region of the cluster
  local cluster_name="${1}"
  local region="${2}"
  local addon_names
  local addon

  # List addons from the same cluster/region that is updated below, rather
  # than from the global CLUSTER variable, so the list and the update can
  # never target different clusters.
  addon_names=$(eksctl get addon --cluster "${cluster_name}" --region "${region}" -o json | jq -r '.[].Name')

  if [ -z "${addon_names}" ]; then
    echo "No addons present in the EKS cluster ${cluster_name}"
    return 0
  fi

  # Intentionally unquoted: jq prints one addon name per line and the loop
  # relies on word splitting to iterate over them.
  for addon in ${addon_names}; do
    eksctl update addon \
      --name "${addon}" \
      --cluster "${cluster_name}" \
      --region "${region}"
  done
}

if [ $# -lt 3 ]; then
Expand All @@ -95,11 +93,8 @@ fi
TARGET=${1}
CLUSTER=${2}
EKS_VERSION=${3}
if [ "${TARGET}" = "CLUSTER" ]; then
CLUSTER_AUTOSCALAR_IMAGE_VERSION=${4}
elif [ "${TARGET}" = "NODEGROUP" ]; then
ERROR_LOG=${4}
fi
ERROR_LOG=${4}
CLUSTER_AUTOSCALAR_IMAGE_VERSION=${5}

if [ -n "${EKS_CLUSTER_MANAGER_ROLE}" ]; then
update_kubeconfig ${CLUSTER} ${EKS_CLUSTER_MANAGER_ROLE} ${AWS_REGION}
Expand All @@ -110,7 +105,7 @@ if [ "${TARGET}" = "CLUSTER" ]; then
scale_cluster_autoscalar 0
upgrade_autoscalar_image ${CLUSTER_AUTOSCALAR_IMAGE_VERSION}
upgrade_eks_control_plane ${CLUSTER} ${EKS_VERSION}
upgrade_nodegroups ${CLUSTER} ${EKS_VERSION} ${AWS_REGION}
upgrade_nodegroups ${CLUSTER} ${EKS_VERSION} ${AWS_REGION} ${ERROR_LOG}
update_eksctl_utils ${CLUSTER} ${AWS_REGION}
#scale back to 1
scale_cluster_autoscalar 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
#!/bin/bash
#/ Usage:
#/ Usage:
#/ cluster name is required. operation is optional
#/ ./install_kubeflow.sh eks_cluster_name [operation]

set -ex

# Function to install kustomize
install_kustomize(){
KUSTOMIZE_VERSION="v4.5.7"
KUSTOMIZE_VERSION="v5.5.0"
KUSTOMIZE_URL="https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2F${KUSTOMIZE_VERSION}/kustomize_${KUSTOMIZE_VERSION}_linux_amd64.tar.gz"

if ! command -v kustomize &> /dev/null
Expand Down Expand Up @@ -42,17 +42,17 @@ uninstall_kubeflow(){

# Function to create directory and download kubeflow components
setup_kubeflow(){
  # Recreate an empty working directory for the cluster under ${HOME} and
  # clone the pinned Kubeflow manifests release into it.
  #
  # Arguments:
  #   $1 - EKS cluster name (required)
  # Sets globals: KUBEFLOW_VERSION, DIRECTORY
  # Side effect:  the current working directory becomes ${DIRECTORY}
  KUBEFLOW_VERSION="v1.9.1"
  local EKS_CLUSTER_NAME=$1

  # Refuse an empty name: DIRECTORY would collapse to "${HOME}/" and the
  # cleanup below would delete the whole home directory.
  if [ -z "${EKS_CLUSTER_NAME}" ]; then
    echo "setup_kubeflow: EKS cluster name is required" >&2
    return 1
  fi

  DIRECTORY="${HOME}/${EKS_CLUSTER_NAME}"

  # Start from a clean directory so a previous checkout cannot interfere.
  if [ -d "${DIRECTORY}" ]; then
    rm -rf "${DIRECTORY}"
  fi

  mkdir "${DIRECTORY}"
  cd "${DIRECTORY}"

  # clones manifests from kubeflow github into a folder named manifests
  git clone -b "${KUBEFLOW_VERSION}" --single-branch https://github.com/kubeflow/manifests.git
}
Expand Down
16 changes: 8 additions & 8 deletions test/test_utils/eks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@
LOGGER.addHandler(logging.StreamHandler(sys.stderr))


EKS_VERSION = "1.20.4"
EKSCTL_VERSION = "0.53.0"
KUBETAIL_VERSION = "1.6.7"
EKS_VERSION = "1.32.0"
EKSCTL_VERSION = "0.203.0"
KUBETAIL_VERSION = "1.6.20"


def get_aws_secret_yml_path():
Expand Down Expand Up @@ -152,7 +152,7 @@ def setup_eksctl():

platform = run("uname -s", echo=True).stdout.strip()
eksctl_download_command = (
f"curl --silent --location https://github.com/weaveworks/eksctl/releases/download/"
f"curl --silent --location https://github.com/eksctl-io/eksctl/releases/download/v"
f"{EKSCTL_VERSION}/eksctl_{platform}_amd64.tar.gz | tar xz -C /tmp"
)
run(eksctl_download_command, echo=True)
Expand All @@ -169,7 +169,7 @@ def eks_setup():

# Run a quick check that the binaries are available in the PATH by listing the 'version'
run_out = run(
"eksctl version && kubectl version --short --client && aws-iam-authenticator version",
"eksctl version && kubectl version --client && aws-iam-authenticator version",
warn=True,
)

Expand All @@ -182,12 +182,12 @@ def eks_setup():

kubectl_download_command = (
f"curl --silent --location https://amazon-eks.s3-us-west-2.amazonaws.com/"
f"{EKS_VERSION}/2021-04-12/bin/{platform.lower()}/amd64/kubectl -o /usr/local/bin/kubectl"
f"{EKS_VERSION}/2024-12-20/bin/{platform.lower()}/amd64/kubectl -o /usr/local/bin/kubectl"
)

aws_iam_authenticator_download_command = (
f"curl --silent --location https://amazon-eks.s3-us-west-2.amazonaws.com/"
f"{EKS_VERSION}/2021-04-12/bin/{platform.lower()}/amd64/aws-iam-authenticator "
f"{EKS_VERSION}/2024-12-20/bin/{platform.lower()}/amd64/aws-iam-authenticator "
f"-o /usr/local/bin/aws-iam-authenticator"
)

Expand All @@ -210,7 +210,7 @@ def eks_setup():

# Run a quick check that the binaries are available in the PATH by listing the 'version'
run("eksctl version", echo=True)
run("kubectl version --short --client", echo=True)
run("kubectl version --client", echo=True)
run("aws-iam-authenticator version", echo=True)


Expand Down