forked from kubernetes-sigs/dra-driver-nvidia-gpu
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcompute-domain-daemon.tmpl.yaml
More file actions
83 lines (83 loc) · 2.65 KB
/
compute-domain-daemon.tmpl.yaml
File metadata and controls
83 lines (83 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
---
# DaemonSet template for the compute-domain daemon container.
#
# The {{ .Field }} placeholders are substituted as plain text before the
# result is parsed as YAML. Every templated *value* is therefore wrapped in
# double quotes so that a legal Kubernetes name which happens to look like a
# YAML number, boolean or null (for example a namespace called `123`, `no`
# or `null`) is still decoded as a string.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  namespace: "{{ .Namespace }}"
  generateName: "{{ .GenerateName }}"
  finalizers:
    - "{{ .Finalizer }}"
  labels:
    app.kubernetes.io/name: "{{ .AppLabelValue }}"
    {{ .ComputeDomainLabelKey }}: "{{ .ComputeDomainLabelValue }}"
spec:
  # The same key/value pair is used for the pod selector, the pod labels and
  # the node selector below.
  selector:
    matchLabels:
      {{ .ComputeDomainLabelKey }}: "{{ .ComputeDomainLabelValue }}"
  template:
    metadata:
      labels:
        {{ .ComputeDomainLabelKey }}: "{{ .ComputeDomainLabelValue }}"
    spec:
      hostNetwork: true
      # Only schedule onto nodes that carry the compute-domain label.
      nodeSelector:
        {{ .ComputeDomainLabelKey }}: "{{ .ComputeDomainLabelValue }}"
      containers:
        - name: compute-domain-daemon
          image: "{{ .ImageName }}"
          command: [sh, -c]
          # Entry-point script:
          #   1. If `nvidia-smi -q` reports ClusterUUID/CliqueId as "N/A", or
          #      an all-zero ClusterUUID, create the marker file
          #      /etc/nvidia-imex-null and idle without starting the IMEX
          #      daemon (the probes below treat the marker as success).
          #   2. Otherwise print the nodes config, start nvidia-imex and
          #      follow its log file.
          # Long-running commands are backgrounded and followed by `wait` so
          # that the TERM trap can run and the container exits with status 0.
          args:
            - |-
              trap 'exit 0' TERM
              set -e
              if nvidia-smi -q | grep -E "ClusterUUID|CliqueId" | grep -q "N/A" || \
                 nvidia-smi -q | grep -E "ClusterUUID" | grep -q "00000000-0000-0000-0000-000000000000"; then
                echo "ClusterUUID and CliqueId are NOT set for GPUs on this node."
                echo "The IMEX daemon will not be started."
                echo "Sleeping forever..."
                touch /etc/nvidia-imex-null
                tail -f /dev/null & wait
              fi
              # Emit nodes config for facilitating debug.
              echo "/etc/nvidia-imex/nodes_config.cfg:"
              cat /etc/nvidia-imex/nodes_config.cfg
              /usr/bin/nvidia-imex -c /etc/nvidia-imex/config.cfg
              tail -n +1 -f /var/log/nvidia-imex.log & wait
          resources:
            # Refers to the pod-level entry of the same name under
            # `resourceClaims` at the bottom of this spec.
            claims:
              - name: compute-domain-daemon
          # Both probes succeed immediately when the marker file exists (IMEX
          # intentionally not started); otherwise they require
          # `nvidia-imex-ctl -q` to print exactly "READY".
          startupProbe:
            exec:
              command:
                - "sh"
                - "-c"
                - |-
                  if [ -f /etc/nvidia-imex-null ]; then
                    exit 0
                  fi
                  test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
            initialDelaySeconds: 1
            periodSeconds: 1
          livenessProbe:
            exec:
              command:
                - "sh"
                - "-c"
                - |-
                  if [ -f /etc/nvidia-imex-null ]; then
                    exit 0
                  fi
                  test "$(nvidia-imex-ctl -q -i 127.0.0.1 50005)" = "READY"
            initialDelaySeconds: 10
            periodSeconds: 5
      # Tolerate all node taints (every key, for each of the three effects).
      # See https://github.com/NVIDIA/k8s-dra-driver-gpu/issues/305
      tolerations:
        - operator: "Exists"
          effect: "NoSchedule"
        - operator: "Exists"
          effect: "NoExecute"
        - operator: "Exists"
          effect: "PreferNoSchedule"
      resourceClaims:
        - name: compute-domain-daemon
          resourceClaimTemplateName: "{{ .ResourceClaimTemplateName }}"