#!/usr/bin/env bash
# Installs NFD + NVIDIA GPU Operator + CRs so nvidia.com/gpu resources become schedulable.
# All samples in this repo are CPU-only by default (tinyllama OCI modelcar) so this
# phase is opt-in.
#
# INSTALL_GPU modes:
# 0 (default) — skip. Samples run CPU-only.
#   auto        — install only when no node reports nvidia.com/gpu allocatable
#                 (i.e. the driver stack is not yet running)
# 1 — force install
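#
# Usage examples (illustrative invocations from this directory):
#   INSTALL_GPU=auto ./run.sh   # install only when needed
#   INSTALL_GPU=1 ./run.sh      # force install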
set -Eeuo pipefail
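# -E: ERR trap inherited by functions/subshells; -e: exit on error;
# -u: unset variables are errors; -o pipefail: a pipeline fails if any stage fails.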
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=../lib/common.sh
source "${SCRIPT_DIR}/../lib/common.sh"
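# common.sh supplies log/warn/die plus the apply_manifest,
# wait_for_csv_succeeded, and wait_for_crd helpers used below.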
: "${INSTALL_GPU:=0}"
case "$INSTALL_GPU" in
0)
log "INSTALL_GPU=0 — skipping"
exit 0
;;
auto)
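    # Sum nvidia.com/gpu across every node's .status.allocatable;
    # nodes without the resource count as 0.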
    allocatable=$(oc get nodes -o json 2>/dev/null \
      | jq '[.items[] | (.status.allocatable["nvidia.com/gpu"] // "0" | tonumber)] | add // 0' \
      || echo 0)  # tolerate a transient oc/API failure under set -e -o pipefail
if [[ "${allocatable:-0}" -gt 0 ]]; then
log "INSTALL_GPU=auto — cluster already has ${allocatable} nvidia.com/gpu allocatable; skipping"
exit 0
fi
log "INSTALL_GPU=auto — no GPU allocatable, proceeding with install"
;;
1)
log "INSTALL_GPU=1 — forcing install"
;;
*)
die "INSTALL_GPU must be 0, 1, or auto (got '$INSTALL_GPU')"
;;
esac
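# Install order matters: NFD runs first so GPU nodes get PCI feature labels
# (e.g. feature.node.kubernetes.io/pci-10de.present=true), which the GPU
# Operator uses to place its driver daemonsets.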
apply_manifest "${SCRIPT_DIR}/namespaces.yaml"
apply_manifest "${SCRIPT_DIR}/nfd-operator.yaml"
wait_for_csv_succeeded openshift-nfd nfd 900
wait_for_crd nodefeaturediscoveries.nfd.openshift.io 300
apply_manifest "${SCRIPT_DIR}/nodefeaturediscovery.yaml"
apply_manifest "${SCRIPT_DIR}/gpu-operator.yaml"
wait_for_csv_succeeded nvidia-gpu-operator gpu-operator-certified 900
wait_for_crd clusterpolicies.nvidia.com 300
apply_manifest "${SCRIPT_DIR}/clusterpolicy.yaml"
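# The ClusterPolicy CR makes the operator roll out the driver, container
# toolkit, and device-plugin daemonsets; nvidia.com/gpu becomes allocatable
# once the device plugin registers with the kubelet on each GPU node.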
log "waiting up to 20 min for driver rollout — nvidia.com/gpu to become allocatable"
deadline=$(( $(date +%s) + 1200 ))
while (( $(date +%s) < deadline )); do
  total=$(oc get nodes -o json 2>/dev/null \
    | jq '[.items[] | (.status.allocatable["nvidia.com/gpu"] // "0" | tonumber)] | add // 0' \
    || echo 0)  # nodes may reboot during driver rollout; don't die on an API blip
if [[ "${total:-0}" -gt 0 ]]; then
log "nvidia.com/gpu=${total} allocatable cluster-wide"
break
fi
sleep 30
done
if [[ "${total:-0}" -eq 0 ]]; then
warn "no GPU allocatable after 20 min — check pods in nvidia-gpu-operator namespace"
warn " oc get pods -n nvidia-gpu-operator"
fi
log "05-gpu: done"