Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions demo/clusters/eks/create-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash

# Absolute path of the directory that contains this script.
CURRENT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
# dirname removes the final ".." component, so this resolves to the directory
# three levels above CURRENT_DIR.
PROJECT_DIR="$(cd -- "$(dirname -- "${CURRENT_DIR}/../../../..")" > /dev/null 2>&1 && pwd)"

# From here on: stop at the first failing command and trace every command.
set -e -x
# A pipeline fails as soon as any of its stages fails.
set -o pipefail

# We extract information from versions.mk
#
# Prints the value assigned to a make variable in ${PROJECT_DIR}/versions.mk.
# Globals:   PROJECT_DIR (read)
# Arguments: $1 - name of the make variable, assigned with ':=' or '?='
# Outputs:   the text following the last '= ' of the matching line(s), with
#            surrounding whitespace removed and inner runs of whitespace
#            collapsed to single spaces
# Returns:   grep's non-zero status when versions.mk cannot be read or no
#            assignment of the variable is found
function from_versions_mk() {
  local makevar=$1
  local value
  local -a words

  # 'local' is kept separate from the assignment so that a grep failure is
  # reported to the caller instead of being replaced by the status of 'local'.
  # The path is quoted so a checkout path containing whitespace still works.
  value=$(grep -E "^\s*${makevar}\s+[\?:]= " "${PROJECT_DIR}/versions.mk") || return

  # Split into words to normalise whitespace without glob-expanding the value.
  read -r -a words <<< "${value##*= }"
  printf '%s\n' "${words[*]}"
}
# Driver name read from versions.mk; used below to derive the default
# cluster name.
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

# Settings. Each one keeps a non-empty value already present in the
# environment and otherwise takes the default given here.
: "${AWS_REGION:=us-east-1}"
: "${CLUSTER_NAME:=${DRIVER_NAME}-cluster}"
: "${EKS_VERSION:=1.33}"
: "${INSTANCE_TYPE:=g6e.4xlarge}"

# Exported so that envsubst can substitute them into the eksctl template.
export AWS_REGION
export CLUSTER_NAME
export EKS_VERSION
export INSTANCE_TYPE

# Availability zones for the cluster: the query skips zone ID use1-az3 and
# keeps at most the first three results; sed prefixes every output line with
# " - ". The value is assigned first and exported afterwards so that a failing
# aws/sed pipeline stops the script (set -e / pipefail) instead of being
# hidden behind the exit status of 'export'.
EKS_CP_AZS=$(aws ec2 describe-availability-zones \
  --region "${AWS_REGION}" \
  --filters "Name=opt-in-status,Values=opt-in-not-required" \
  --query "AvailabilityZones[?ZoneId!='use1-az3'].[ZoneName][:3]" \
  --output text | sed 's/ /, /g; s/^/ - /')
export EKS_CP_AZS

## Create eksctl configuration file
# The template is read from this script's own directory so the script can be
# started from any working directory; the rendered file is written to the
# current working directory.
cluster_config="${CLUSTER_NAME}-${AWS_REGION}.yaml"
envsubst < "${CURRENT_DIR}/eksctl.yaml" > "${cluster_config}"

## Create EKS cluster using eksctl
eksctl create cluster -f "${cluster_config}" --install-nvidia-plugin=false

## Setup EKS cluster credentials
aws eks update-kubeconfig --name "${CLUSTER_NAME}" --region "${AWS_REGION}"
24 changes: 24 additions & 0 deletions demo/clusters/eks/delete-cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Absolute path of the directory that contains this script.
CURRENT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
# dirname removes the final ".." component, so this resolves to the directory
# three levels above CURRENT_DIR.
PROJECT_DIR="$(cd -- "$(dirname -- "${CURRENT_DIR}/../../../..")" > /dev/null 2>&1 && pwd)"

# From here on: stop at the first failing command and trace every command.
set -e -x
# A pipeline fails as soon as any of its stages fails.
set -o pipefail

# We extract information from versions.mk
#
# Prints the value assigned to a make variable in ${PROJECT_DIR}/versions.mk.
# Globals:   PROJECT_DIR (read)
# Arguments: $1 - name of the make variable, assigned with ':=' or '?='
# Outputs:   the text following the last '= ' of the matching line(s), with
#            surrounding whitespace removed and inner runs of whitespace
#            collapsed to single spaces
# Returns:   grep's non-zero status when versions.mk cannot be read or no
#            assignment of the variable is found
function from_versions_mk() {
  local makevar=$1
  local value
  local -a words

  # 'local' is kept separate from the assignment so that a grep failure is
  # reported to the caller instead of being replaced by the status of 'local'.
  # The path is quoted so a checkout path containing whitespace still works.
  value=$(grep -E "^\s*${makevar}\s+[\?:]= " "${PROJECT_DIR}/versions.mk") || return

  # Split into words to normalise whitespace without glob-expanding the value.
  read -r -a words <<< "${value##*= }"
  printf '%s\n' "${words[*]}"
}
# Driver name read from versions.mk; used below to derive the default
# cluster name.
DRIVER_NAME=$(from_versions_mk "DRIVER_NAME")

# Settings. Each one keeps a non-empty value already present in the
# environment and otherwise takes the default given here.
: "${CLUSTER_NAME:=${DRIVER_NAME}-cluster}"
: "${AWS_REGION:=us-east-1}"

export CLUSTER_NAME
export AWS_REGION

## Delete EKS cluster using eksctl
# Both values are quoted so each is always passed to eksctl as exactly one
# argument, whatever characters it contains.
eksctl delete cluster --name "${CLUSTER_NAME}" --region "${AWS_REGION}" --wait
63 changes: 63 additions & 0 deletions demo/clusters/eks/eksctl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# eksctl ClusterConfig template. The ${...} placeholders are filled in by
# create-cluster.sh, which pipes this file through envsubst after exporting
# the environment variables of the same names.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
name: ${CLUSTER_NAME}
region: ${AWS_REGION}
version: "${EKS_VERSION}"
availabilityZones:
# Replaced by the " - <zone name>" lines that create-cluster.sh builds from
# the output of `aws ec2 describe-availability-zones`.
${EKS_CP_AZS}
managedNodeGroups:
# Node group labelled role=system and tainted CriticalAddonsOnly:NoSchedule.
- name: system
amiFamily: AmazonLinux2023
instanceType: m6g.large
privateNetworking: true
desiredCapacity: 2
minSize: 2
maxSize: 4
labels:
role: system
taints:
- key: CriticalAddonsOnly
effect: NoSchedule
updateConfig:
maxUnavailable: 1
# Node group using the instance type from INSTANCE_TYPE; labelled role=gpu
# and tainted nvidia.com/gpu:NoSchedule.
- name: gpu
amiFamily: AmazonLinux2023
instanceType: ${INSTANCE_TYPE}
privateNetworking: true
desiredCapacity: 1
minSize: 1
maxSize: 4
labels:
role: gpu
nvidia.com/gpu.present: "true"
taints:
- key: nvidia.com/gpu
effect: NoSchedule
updateConfig:
maxUnavailable: 1
# NodeConfig document supplied as the bootstrap override for this node group;
# it sets the kubelet feature gate DynamicResourceAllocation to true.
overrideBootstrapCommand: |
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
kubelet:
config:
featureGates:
DynamicResourceAllocation: true
cloudWatch:
clusterLogging:
# NOTE(review): "*" presumably selects every available log type; confirm
# against the eksctl documentation.
enableTypes: ["*"]
addons:
- name: vpc-cni
version: latest
# The coredns addon is given a nodeSelector of role=system, the label carried
# by the "system" node group above.
- name: coredns
version: latest
configurationValues: |-
nodeSelector:
role: system
- name: eks-pod-identity-agent
version: latest
- name: kube-proxy
version: latest
iam:
withOIDC: true
16 changes: 16 additions & 0 deletions demo/clusters/eks/install-dra-driver-gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Installs or upgrades the nvidia-dra-driver-gpu Helm release, in the
# nvidia-dra-driver-gpu namespace, from the chart under
# ${PROJECT_DIR}/deployments/helm.

# Absolute path of the directory that contains this script.
CURRENT_DIR="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
# dirname removes the final ".." component, so this resolves to the directory
# three levels above CURRENT_DIR.
PROJECT_DIR="$(cd -- "$( dirname -- "${CURRENT_DIR}/../../../.." )" &> /dev/null && pwd)"

# The chart path is quoted so that a checkout path containing whitespace is
# still passed to helm as a single argument.
# Controller: tolerates the CriticalAddonsOnly taint and selects nodes
# labelled role=system. Kubelet plugin: tolerates the nvidia.com/gpu taint.
helm upgrade -i --create-namespace --namespace nvidia-dra-driver-gpu nvidia-dra-driver-gpu \
  "${PROJECT_DIR}/deployments/helm/nvidia-dra-driver-gpu" \
  --set image.pullPolicy=Always \
  --set gpuResourcesEnabledOverride=true \
  --set controller.tolerations\[0\].key="CriticalAddonsOnly" \
  --set controller.tolerations\[0\].operator=Exists \
  --set controller.tolerations\[0\].effect=NoSchedule \
  --set controller.nodeSelector.role=system \
  --set controller.affinity=null \
  --set kubeletPlugin.tolerations\[0\].key="nvidia.com/gpu" \
  --set kubeletPlugin.tolerations\[0\].operator=Exists \
  --set kubeletPlugin.tolerations\[0\].effect=NoSchedule