forked from NVIDIA/k8s-nim-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmust-gather.sh
More file actions
executable file
·180 lines (149 loc) · 6.71 KB
/
must-gather.sh
File metadata and controls
executable file
·180 lines (149 loc) · 6.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env bash
# Copyright 2025 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###############################################################################
# NVIDIA NIM & NeMo Must-Gather Script
#
# This script collects logs and specs from:
# - GPU node status and descriptions
# - Kubernetes version info
# - Storage Info (StorageClass, PVC and PVs)
# - NIM Operator
# - NIMPipeline/NIMService/NIMCache CRs, Pods and Ingress
# - NIM Model Manifest ConfigMaps
# - NeMo microservices CRs, Pods and Ingress (optional)
#
# Usage:
# export OPERATOR_NAMESPACE=<namespace where NIM Operator is installed>
# export NIM_NAMESPACE=<namespace where NIMService/NIMCache are deployed>
# export NEMO_NAMESPACE=<namespace where NeMo microservices are deployed> # Optional
#
# ./must-gather.sh
#
# Output will be saved to:
# ${ARTIFACT_DIR:-/tmp/nim-nemo-must-gather_<timestamp>}
###############################################################################
# Shell options: fail on unset variables, stop on unhandled command failures,
# and trace every executed command.
set -o nounset -o errexit -o xtrace

# Select the cluster CLI: the first of kubectl / oc whose `version`
# subcommand succeeds is stored in K and used for every later call.
K=""
for cli in kubectl oc; do
  if "$cli" version > /dev/null 2>&1; then
    K="$cli"
    break
  fi
done
if [[ -z "$K" ]]; then
  echo "FATAL: neither 'kubectl' nor 'oc' appear to be working. Exiting..."
  exit 1
fi

# Keep a caller-provided ARTIFACT_DIR; otherwise fall back to a directory
# under /tmp named after the current date and time (minute resolution).
if [[ -z "${ARTIFACT_DIR:-}" ]]; then
  ARTIFACT_DIR="/tmp/nim-nemo-must-gather_$(date +%Y%m%d_%H%M)"
fi
export ARTIFACT_DIR
mkdir -p "$ARTIFACT_DIR"

# stdout keeps flowing to the caller and is also copied into must-gather.log;
# stderr (which includes the command trace) goes only to its own log file.
exec 1> >(tee "$ARTIFACT_DIR/must-gather.log")
exec 2> "$ARTIFACT_DIR/must-gather.stderr.log"
######################################
# CLUSTER INFO
######################################
# Version of the cluster plus status and description of GPU nodes.
# Every collection step is best-effort: a failing call does not stop the run.
cluster_dir="$ARTIFACT_DIR/cluster"
gpu_node_selector="nvidia.com/gpu.present=true"
mkdir -p "$cluster_dir"

echo "Gathering Kubernetes version info"
$K version -o yaml > "$cluster_dir/k8s_version.yaml" || true

echo "Gathering GPU node status"
$K get nodes -l "$gpu_node_selector" -o wide > "$cluster_dir/gpu_nodes.status" || true

echo "Gathering GPU node descriptions"
$K describe nodes -l "$gpu_node_selector" > "$cluster_dir/gpu_nodes.descr" || true
######################################
# NIM OPERATOR PODS
######################################
# OPERATOR_NAMESPACE is mandatory. The error is echoed to stdout on purpose:
# stderr has been redirected to a log file earlier in this script, so stdout
# is the stream the user still sees.
if [[ -z "${OPERATOR_NAMESPACE:-}" ]]; then
  echo "FATAL: OPERATOR_NAMESPACE env variable not set"
  exit 1
fi

mkdir -p "$ARTIFACT_DIR/operator"
echo "Gathering NIM Operator pods from $OPERATOR_NAMESPACE"

# The pod list used to come from an empty command substitution, so this loop
# never ran and no operator logs were collected. List the pods of the
# operator namespace instead.
# NOTE(review): no label selector is applied because the operator's pod
# labels are not visible here; add `-l <selector>` once confirmed if the
# namespace is shared with other workloads.
for pod in $($K get pods -n "$OPERATOR_NAMESPACE" -oname); do
  # `-oname` yields "pod/<name>"; keep only <name> for the file names.
  pod_name=${pod##*/}
  # Best-effort: a pod that vanished or has no logs must not abort the run.
  $K logs "$pod" -n "$OPERATOR_NAMESPACE" --all-containers --prefix > "$ARTIFACT_DIR/operator/${pod_name}.log" || true
  $K describe "$pod" -n "$OPERATOR_NAMESPACE" > "$ARTIFACT_DIR/operator/${pod_name}.descr" || true
done
######################################
# STORAGE CLASSES, PVs, PVCs
######################################
# NIM_NAMESPACE is first expanded in this section. Validate it here so that a
# missing value produces the FATAL message on stdout instead of an
# "unbound variable" abort from `set -o nounset`, which would only be
# written to the stderr log.
if [[ -z "${NIM_NAMESPACE:-}" ]]; then
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
fi

echo "Gathering storage class, PVC and PV information"
mkdir -p "$ARTIFACT_DIR/storage"
# StorageClasses and PersistentVolumes are cluster-scoped; collection is
# best-effort, so a failing call does not stop the run.
$K get storageclass -oyaml > "$ARTIFACT_DIR/storage/storageclasses.yaml" || true
$K get pv -oyaml > "$ARTIFACT_DIR/storage/persistentvolumes.yaml" || true

echo "Gathering PVCs from NIM_NAMESPACE: $NIM_NAMESPACE"
mkdir -p "$ARTIFACT_DIR/storage/nim"
$K get pvc -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nim/pvcs.yaml" || true

# NEMO_NAMESPACE is optional; its PVCs are collected only when it is set.
if [[ -n "${NEMO_NAMESPACE:-}" ]]; then
  echo "Gathering PVCs from NEMO_NAMESPACE: $NEMO_NAMESPACE"
  mkdir -p "$ARTIFACT_DIR/storage/nemo"
  $K get pvc -n "$NEMO_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nemo/pvcs.yaml" || true
fi
######################################
# NIM SERVICE & CACHE
######################################
# Everything in this section needs NIM_NAMESPACE; stop with a message on
# stdout when it is missing or empty.
[[ -n "${NIM_NAMESPACE:-}" ]] || {
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
}

mkdir -p "$ARTIFACT_DIR/nim"
echo "Gathering NIMPipeline, NIMService and NIMCache CRs from $NIM_NAMESPACE"
# One YAML dump per custom resource kind, written to nim/<kind>.yaml.
# Each dump is best-effort so a missing CRD does not stop the run.
for nim_kind in nimcaches nimpipelines nimservices; do
  $K get "${nim_kind}.apps.nvidia.com" -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/nim/${nim_kind}.yaml" || true
done
echo "Gathering ConfigMaps in $NIM_NAMESPACE owned by NIMCache"
mkdir -p "$ARTIFACT_DIR/nim/configmaps"
# A single API call lists every ConfigMap as
#   <name><TAB><owner kinds, space separated>
# so ownership is read from the structured ownerReferences field rather than
# by grepping YAML text. This sees all owner references (not just those in a
# fixed line window), cannot be fooled by ConfigMap data that happens to
# contain "kind: NIMCache", and avoids one extra `get` per ConfigMap.
while IFS=$'\t' read -r cm_name owner_kinds; do
  [[ -n "$cm_name" ]] || continue
  # Pad with spaces so only a whole-word "NIMCache" kind matches.
  if [[ " $owner_kinds " == *" NIMCache "* ]]; then
    # stdin is detached so the CLI cannot consume the list being read by the
    # loop; the dump itself is best-effort.
    $K get configmap "$cm_name" -n "$NIM_NAMESPACE" -oyaml \
      > "$ARTIFACT_DIR/nim/configmaps/${cm_name}.yaml" < /dev/null || true
  fi
done < <($K get configmaps -n "$NIM_NAMESPACE" \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.ownerReferences[*].kind}{"\n"}{end}')
echo "Gathering NIMService pods from $NIM_NAMESPACE"
# Pods are picked by label: part of nim-service and managed by the operator.
nim_pod_selector="app.kubernetes.io/part-of=nim-service,app.kubernetes.io/managed-by=k8s-nim-operator"
for nim_pod in $($K get pods -n "$NIM_NAMESPACE" -l "$nim_pod_selector" -oname); do
  # `-oname` yields "pod/<name>"; output files are named after <name>.
  nim_pod_base="$ARTIFACT_DIR/nim/$(basename "$nim_pod")"
  # Logs and description are best-effort per pod.
  $K logs "$nim_pod" -n "$NIM_NAMESPACE" --all-containers --prefix > "${nim_pod_base}.log" || true
  $K describe "$nim_pod" -n "$NIM_NAMESPACE" > "${nim_pod_base}.descr" || true
done

echo "Gathering Ingress configuration from $NIM_NAMESPACE"
nim_ingress_dir="$ARTIFACT_DIR/nim/ingress"
mkdir -p "$nim_ingress_dir"
$K get ingress -n "$NIM_NAMESPACE" -oyaml > "$nim_ingress_dir/ingress.yaml" || true
######################################
# NEMO MICROSERVICES
######################################
# Optional section: runs only when NEMO_NAMESPACE is set and non-empty.
if [[ -z "${NEMO_NAMESPACE:-}" ]]; then
  echo "Skipping NeMo microservice collection. NEMO_NAMESPACE not set."
else
  nemo_dir="$ARTIFACT_DIR/nemo"
  mkdir -p "$nemo_dir"

  echo "Gathering NeMo CRs from $NEMO_NAMESPACE"
  # Custom resource kinds to dump; each goes to nemo/<kind>.yaml.
  RESOURCES=(
    nemocustomizers.apps.nvidia.com
    nemodatastores.apps.nvidia.com
    nemoentitystores.apps.nvidia.com
    nemoevaluators.apps.nvidia.com
    nemoguardrails.apps.nvidia.com
  )
  for nemo_kind in "${RESOURCES[@]}"; do
    # Best-effort: a kind that is not installed must not stop the run.
    $K get "$nemo_kind" -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/${nemo_kind}.yaml" || true
  done

  echo "Gathering NeMo microservice pods from $NEMO_NAMESPACE"
  # Pods are picked by the managed-by label of the operator.
  for nemo_pod in $($K get pods -n "$NEMO_NAMESPACE" -l "app.kubernetes.io/managed-by=k8s-nim-operator" -oname); do
    # `-oname` yields "pod/<name>"; output files are named after <name>.
    nemo_pod_base="$nemo_dir/$(basename "$nemo_pod")"
    $K logs "$nemo_pod" -n "$NEMO_NAMESPACE" --all-containers --prefix > "${nemo_pod_base}.log" || true
    $K describe "$nemo_pod" -n "$NEMO_NAMESPACE" > "${nemo_pod_base}.descr" || true
  done

  echo "Gathering Ingress configuration from $NEMO_NAMESPACE"
  mkdir -p "$nemo_dir/ingress"
  $K get ingress -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/ingress/ingress.yaml" || true
fi

# Final pointer to where everything was written.
echo "Must gather logs collected successfully and saved to: $ARTIFACT_DIR"