Skip to content

Commit 35dcc9b

Browse files
shivamerlavarunrsekar
authored and committed
Add must-gather script for diag collection of NIM operator and operands (#496)
Signed-off-by: Shiva Krishna, Merla <smerla@nvidia.com>

Add Copyright, Usage guide and dump model manifests

Signed-off-by: Shiva Krishna, Merla <smerla@nvidia.com>

Collect storage information along with ingress

Signed-off-by: Shiva Krishna, Merla <smerla@nvidia.com>

Update usage

Signed-off-by: Shiva Krishna, Merla <smerla@nvidia.com>
1 parent 5ec8359 commit 35dcc9b

1 file changed

Lines changed: 180 additions & 0 deletions

File tree

hack/must-gather.sh

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright 2025 NVIDIA CORPORATION
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
###############################################################################
18+
# NVIDIA NIM & NeMo Must-Gather Script
19+
#
20+
# This script collects logs and specs from:
21+
# - GPU node status and descriptions
22+
# - Kubernetes version info
23+
# - Storage Info (StorageClass, PVC and PVs)
24+
# - NIM Operator
25+
# - NIMPipeline/NIMService/NIMCache CRs, Pods and Ingress
26+
# - NIM Model Manifest ConfigMaps
27+
# - NeMo microservices CRs, Pods and Ingress (optional)
28+
#
29+
# Usage:
30+
# export OPERATOR_NAMESPACE=<namespace where NIM Operator is installed>
31+
# export NIM_NAMESPACE=<namespace where NIMService/NIMCache are deployed>
32+
# export NEMO_NAMESPACE=<namespace where NeMo microservices are deployed> # Optional
33+
#
34+
# ./must-gather.sh
35+
#
36+
# Output will be saved to:
37+
# ${ARTIFACT_DIR:-/tmp/nim-nemo-must-gather_<timestamp>}
38+
###############################################################################
39+
40+
# Shell options for the whole run:
#   -e  exit on the first unhandled command failure (errexit)
#   -u  treat expansion of an unset variable as an error (nounset)
#   -x  print each command before it is executed (xtrace)
set -eux
43+
44+
# Select the cluster CLI used by the rest of the script and store it in K.
# Candidates are probed in order (kubectl first, then oc) by running
# `<cli> version` with all output discarded; the first one that succeeds wins.
K=""
for cli_candidate in kubectl oc; do
  if $cli_candidate version > /dev/null 2>&1; then
    K=$cli_candidate
    break
  fi
done

# No candidate answered: nothing can be collected, so stop here.
if [[ -z "$K" ]]; then
  echo "FATAL: neither 'kubectl' nor 'oc' appear to be working. Exiting..."
  exit 1
fi
52+
53+
# Resolve the output directory: keep a caller-supplied, non-empty ARTIFACT_DIR;
# otherwise default to a directory under /tmp stamped with the current date
# and time (minute resolution). The variable is exported either way.
if [[ -z "${ARTIFACT_DIR:-}" ]]; then
  ARTIFACT_DIR="/tmp/nim-nemo-must-gather_$(date +%Y%m%d_%H%M)"
fi
export ARTIFACT_DIR
mkdir -p "$ARTIFACT_DIR"

# stdout keeps flowing to the original stdout and is also copied into
# must-gather.log through tee.
exec 1> >(tee "$ARTIFACT_DIR/must-gather.log")
# stderr is written only to must-gather.stderr.log from this point on.
exec 2> "$ARTIFACT_DIR/must-gather.stderr.log"
58+
59+
######################################
60+
# CLUSTER INFO
61+
######################################
62+
# Cluster-level context: CLI version output plus the status and description of
# every node labelled nvidia.com/gpu.present=true. Each command is best-effort
# (`|| true`) so a failed query does not abort the run under errexit.
cluster_dir="$ARTIFACT_DIR/cluster"
gpu_node_selector="nvidia.com/gpu.present=true"
mkdir -p "$cluster_dir"

echo "Gathering Kubernetes version info"
$K version -o yaml > "$cluster_dir/k8s_version.yaml" || true

echo "Gathering GPU node status"
$K get nodes -l "$gpu_node_selector" -o wide > "$cluster_dir/gpu_nodes.status" || true

echo "Gathering GPU node descriptions"
$K describe nodes -l "$gpu_node_selector" > "$cluster_dir/gpu_nodes.descr" || true
72+
73+
74+
######################################
75+
# NIM OPERATOR PODS
76+
######################################
77+
# Collect logs and descriptions of the NIM Operator pods.
# Requires OPERATOR_NAMESPACE; the script stops with a FATAL message without it.
if [[ -z "${OPERATOR_NAMESPACE:-}" ]]; then
  echo "FATAL: OPERATOR_NAMESPACE env variable not set"
  exit 1
fi

mkdir -p "$ARTIFACT_DIR/operator"

echo "Gathering NIM Operator pods from $OPERATOR_NAMESPACE"
# The pod list must come from an actual query: an empty word list makes this
# loop a silent no-op and no operator diagnostics are written.
# NOTE(review): no label selector is applied, so every pod in
# OPERATOR_NAMESPACE is collected. If the namespace is shared, narrow this
# with `-l <operator pod label>` -- confirm the label against the deployment.
for pod in $($K get pods -n "$OPERATOR_NAMESPACE" -oname); do
  # `-oname` yields "pod/<name>"; keep only <name> for the output file names.
  pod_name=$(basename "$pod")
  # Best-effort: a pod that cannot be read must not abort the whole collection.
  $K logs "$pod" -n "$OPERATOR_NAMESPACE" --all-containers --prefix > "$ARTIFACT_DIR/operator/${pod_name}.log" || true
  $K describe "$pod" -n "$OPERATOR_NAMESPACE" > "$ARTIFACT_DIR/operator/${pod_name}.descr" || true
done
90+
91+
######################################
92+
# STORAGE CLASSES, PVs, PVCs
93+
######################################
94+
# Collect storage information: cluster-scoped StorageClasses and
# PersistentVolumes, then the PVCs of the NIM namespace and, when
# NEMO_NAMESPACE is set, of the NeMo namespace. Every query is best-effort.
echo "Gathering storage class, PVC and PV information"
mkdir -p "$ARTIFACT_DIR/storage"

$K get storageclass -oyaml > "$ARTIFACT_DIR/storage/storageclasses.yaml" || true
$K get pv -oyaml > "$ARTIFACT_DIR/storage/persistentvolumes.yaml" || true

# NIM_NAMESPACE is expanded below. With `set -o nounset` active, an unset value
# would abort with an "unbound variable" error, so validate it before first
# use and report the same FATAL message used elsewhere in this script.
if [[ -z "${NIM_NAMESPACE:-}" ]]; then
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
fi

echo "Gathering PVCs from NIM_NAMESPACE: $NIM_NAMESPACE"
mkdir -p "$ARTIFACT_DIR/storage/nim"
$K get pvc -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nim/pvcs.yaml" || true

# NeMo is optional: only query its PVCs when a namespace was provided.
if [[ -n "${NEMO_NAMESPACE:-}" ]]; then
  echo "Gathering PVCs from NEMO_NAMESPACE: $NEMO_NAMESPACE"
  mkdir -p "$ARTIFACT_DIR/storage/nemo"
  $K get pvc -n "$NEMO_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nemo/pvcs.yaml" || true
fi
109+
110+
######################################
111+
# NIM SERVICE & CACHE
112+
######################################
113+
# NIM collection needs a target namespace; stop with a FATAL message without it.
if [[ -z "${NIM_NAMESPACE:-}" ]]; then
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
fi

nim_dir="$ARTIFACT_DIR/nim"
mkdir -p "$nim_dir"

echo "Gathering NIMPipeline, NIMService and NIMCache CRs from $NIM_NAMESPACE"
# Dump each custom resource kind (apps.nvidia.com group) to <kind>.yaml.
# Best-effort: a failed query does not stop the remaining ones.
for nim_kind in nimcaches nimpipelines nimservices; do
  $K get "${nim_kind}.apps.nvidia.com" -n "$NIM_NAMESPACE" -oyaml > "$nim_dir/${nim_kind}.yaml" || true
done
124+
125+
# Dump every ConfigMap in NIM_NAMESPACE that lists a NIMCache among its
# ownerReferences, one file per ConfigMap under nim/configmaps/.
echo "Gathering ConfigMaps in $NIM_NAMESPACE owned by NIMCache"
mkdir -p "$ARTIFACT_DIR/nim/configmaps"

# A single list call emits one line per ConfigMap in the form
#   <name><TAB><owner kinds, space separated>
# The owner kinds come from the structured ownerReferences field, so the match
# does not depend on YAML field order or on how many owners there are, and
# ConfigMaps that are not owned by a NIMCache are never fetched individually.
while IFS=$'\t' read -r cm_name owner_kinds; do
  # Skip blank lines (e.g. when the namespace holds no ConfigMaps).
  [[ -n "$cm_name" ]] || continue
  # Pad with spaces so only the exact kind "NIMCache" matches.
  [[ " $owner_kinds " == *" NIMCache "* ]] || continue
  # Best-effort dump; stdin is detached so the command cannot consume the
  # list that the loop is still reading.
  $K get configmap "$cm_name" -n "$NIM_NAMESPACE" -oyaml \
    > "$ARTIFACT_DIR/nim/configmaps/${cm_name}.yaml" < /dev/null || true
done < <($K get configmaps -n "$NIM_NAMESPACE" \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.ownerReferences[*].kind}{"\n"}{end}')
135+
136+
# Logs and descriptions for the NIMService pods, selected by the part-of and
# managed-by labels below, followed by the namespace's Ingress objects.
echo "Gathering NIMService pods from $NIM_NAMESPACE"
nim_pod_selector="app.kubernetes.io/part-of=nim-service,app.kubernetes.io/managed-by=k8s-nim-operator"
for pod in $($K get pods -n "$NIM_NAMESPACE" -l "$nim_pod_selector" -oname); do
  # `-oname` prints "pod/<name>"; strip everything up to the last slash.
  pod_name=${pod##*/}
  nim_pod_prefix="$ARTIFACT_DIR/nim/${pod_name}"
  # Best-effort: keep going even if a pod's logs or description are unavailable.
  $K logs "$pod" -n "$NIM_NAMESPACE" --all-containers --prefix > "${nim_pod_prefix}.log" || true
  $K describe "$pod" -n "$NIM_NAMESPACE" > "${nim_pod_prefix}.descr" || true
done

echo "Gathering Ingress configuration from $NIM_NAMESPACE"
nim_ingress_dir="$ARTIFACT_DIR/nim/ingress"
mkdir -p "$nim_ingress_dir"
$K get ingress -n "$NIM_NAMESPACE" -oyaml > "$nim_ingress_dir/ingress.yaml" || true
146+
147+
######################################
148+
# NEMO MICROSERVICES
149+
######################################
150+
# NeMo microservices are optional: everything below is skipped unless
# NEMO_NAMESPACE is set to a non-empty value.
if [[ -z "${NEMO_NAMESPACE:-}" ]]; then
  echo "Skipping NeMo microservice collection. NEMO_NAMESPACE not set."
else
  nemo_dir="$ARTIFACT_DIR/nemo"
  mkdir -p "$nemo_dir"

  echo "Gathering NeMo CRs from $NEMO_NAMESPACE"
  # One YAML dump per NeMo custom resource kind, named after the full
  # resource name (<kind>.apps.nvidia.com.yaml). Best-effort per kind.
  for nemo_kind in \
    nemocustomizers \
    nemodatastores \
    nemoentitystores \
    nemoevaluators \
    nemoguardrails; do
    nemo_resource="${nemo_kind}.apps.nvidia.com"
    $K get "$nemo_resource" -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/${nemo_resource}.yaml" || true
  done

  echo "Gathering NeMo microservice pods from $NEMO_NAMESPACE"
  # Pods are selected by the managed-by=k8s-nim-operator label.
  for pod in $($K get pods -n "$NEMO_NAMESPACE" -l "app.kubernetes.io/managed-by=k8s-nim-operator" -oname); do
    # `-oname` prints "pod/<name>"; keep only <name> for the file names.
    pod_name=${pod##*/}
    $K logs "$pod" -n "$NEMO_NAMESPACE" --all-containers --prefix > "$nemo_dir/${pod_name}.log" || true
    $K describe "$pod" -n "$NEMO_NAMESPACE" > "$nemo_dir/${pod_name}.descr" || true
  done

  echo "Gathering Ingress configuration from $NEMO_NAMESPACE"
  mkdir -p "$nemo_dir/ingress"
  $K get ingress -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/ingress/ingress.yaml" || true
fi
179+
180+
# Final status line telling the user where the collected artifacts were saved.
printf '%s\n' "Must gather logs collected successfully and saved to: $ARTIFACT_DIR"

0 commit comments

Comments
 (0)