run.sh
#!/usr/bin/env bash
# Ensure script exits on any error
set -e
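# (Not part of the original flow; a minimal guard.) Only the Azure CLI is needed
# locally, since kubectl and helm run remotely via `az aks command invoke`.
command -v az >/dev/null 2>&1 || { echo "Azure CLI (az) is required but was not found." >&2; exit 1; }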
# Required parameters
RESOURCE_GROUP="rg-pvt-aks-h100"
LOCATION="eastus2"
DEPLOYMENT_NAME="aks-h100-deployment"
TEMPLATE_FILE="main.bicep"
# Optional: update these with actual values or pass via environment/CLI
PARAM_REGISTRY_NAME="gbbpvt"
PARAM_CLUSTER_NAME="pvt-aks-h100"
# Create the resource group if it doesn't exist
az group create \
  --name "$RESOURCE_GROUP" \
  --location "$LOCATION"
# Deploy the Bicep template (main.bicep, which uses modules)
az deployment group create \
  --resource-group "$RESOURCE_GROUP" \
  --name "$DEPLOYMENT_NAME" \
  --template-file "$TEMPLATE_FILE" \
  --parameters \
    registryName="$PARAM_REGISTRY_NAME" \
    clusterName="$PARAM_CLUSTER_NAME" \
    resourceGroupName="$RESOURCE_GROUP"
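# Optional: inspect the deployment outputs. This assumes main.bicep defines
# outputs; the template is not shown here, so output names may differ.
az deployment group show \
  --resource-group "$RESOURCE_GROUP" \
  --name "$DEPLOYMENT_NAME" \
  --query properties.outputs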
# Done
echo "Deployment complete."
echo "Installing NVIDIA GPU Operator..."
# Install the NVIDIA GPU Operator via Helm, executed on the cluster with `az aks command invoke`
HELM_REPO_URL="https://helm.ngc.nvidia.com/nvidia"
HELM_INSTALL_CMD="helm install gpu-operator nvidia/gpu-operator -n gpu-operator --create-namespace --set operator.runtimeClass=nvidia-container-runtime"
az aks command invoke \
  --resource-group "$RESOURCE_GROUP" \
  --name "$PARAM_CLUSTER_NAME" \
  --command "helm repo add nvidia $HELM_REPO_URL && helm repo update && $HELM_INSTALL_CMD"
# Check GPU Operator pod status (a one-shot listing; it does not wait for readiness)
CMD="kubectl get pods -n gpu-operator"
az aks command invoke \
  --resource-group "$RESOURCE_GROUP" \
  --name "$PARAM_CLUSTER_NAME" \
  --command "$CMD"
# Check capacity on the GPU nodes (labelled accelerator=nvidia) for allocatable GPUs
CMD='kubectl get nodes -l accelerator=nvidia -o jsonpath="{.items[*].status.capacity}"'
az aks command invoke \
  --resource-group "$RESOURCE_GROUP" \
  --name "$PARAM_CLUSTER_NAME" \
  --command "$CMD"
# Test: run a pod that executes nvidia-smi and print its logs
# (assumes pod-check-nvidia-smi.yaml defines a pod named nvidia-gpu-test in the default namespace)
CMD="kubectl apply -f pod-check-nvidia-smi.yaml -n default && kubectl logs nvidia-gpu-test -n default"
az aks command invoke \
  --resource-group "$RESOURCE_GROUP" \
  --name "$PARAM_CLUSTER_NAME" \
  --command "$CMD" \
  --file pod-check-nvidia-smi.yaml
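# For reference only: a minimal sketch of what pod-check-nvidia-smi.yaml is assumed
# to contain. The real manifest ships alongside this script and may differ; the only
# details inferred from above are the pod name (nvidia-gpu-test) and that it runs
# nvidia-smi on a GPU node.
#
#   apiVersion: v1
#   kind: Pod
#   metadata:
#     name: nvidia-gpu-test
#   spec:
#     restartPolicy: Never
#     containers:
#       - name: nvidia-smi
#         image: nvidia/cuda:12.2.0-base-ubuntu22.04
#         command: ["nvidia-smi"]
#         resources:
#           limits:
#             nvidia.com/gpu: 1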