|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +# Copyright 2025 NVIDIA CORPORATION |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +############################################################################### |
| 18 | +# NVIDIA NIM & NeMo Must-Gather Script |
| 19 | +# |
| 20 | +# This script collects logs and specs from: |
| 21 | +# - GPU node status and descriptions |
| 22 | +# - Kubernetes version info |
| 23 | +# - Storage Info (StorageClass, PVC and PVs) |
| 24 | +# - NIM Operator |
| 25 | +# - NIMPipeline/NIMService/NIMCache CRs, Pods and Ingress |
| 26 | +# - NIM Model Manifest ConfigMaps |
| 27 | +# - NeMo microservices CRs, Pods and Ingress (optional) |
| 28 | +# |
| 29 | +# Usage: |
| 30 | +# export OPERATOR_NAMESPACE=<namespace where NIM Operator is installed> |
| 31 | +# export NIM_NAMESPACE=<namespace where NIMService/NIMCache are deployed> |
| 32 | +# export NEMO_NAMESPACE=<namespace where NeMo microservices are deployed> # Optional |
| 33 | +# |
| 34 | +# ./must-gather.sh |
| 35 | +# |
| 36 | +# Output will be saved to: |
| 37 | +# ${ARTIFACT_DIR:-/tmp/nim-nemo-must-gather_<timestamp>} |
| 38 | +############################################################################### |
| 39 | + |
# Strict mode for the whole run: referencing an unset variable is an error,
# an unhandled failing command aborts the script, and every command is traced.
set -eux
| 43 | + |
# Select the cluster CLI stored in K: try kubectl first, then oc. A candidate
# is accepted only when its `version` subcommand exits successfully.
K=
for cli in kubectl oc; do
  if "$cli" version > /dev/null 2>&1; then
    K=$cli
    break
  fi
done

# Without a working CLI nothing below can run, so stop immediately.
if [[ -z "$K" ]]; then
  echo "FATAL: neither 'kubectl' nor 'oc' appear to be working. Exiting..."
  exit 1
fi
| 52 | + |
# Output directory: keep a caller-supplied ARTIFACT_DIR, otherwise fall back to
# a timestamped directory under /tmp. The variable is exported either way.
: "${ARTIFACT_DIR:=/tmp/nim-nemo-must-gather_$(date +%Y%m%d_%H%M)}"
export ARTIFACT_DIR
mkdir -p "$ARTIFACT_DIR"

# From here on, stdout is passed through tee (shown to the caller and copied
# to must-gather.log) while stderr is written only to must-gather.stderr.log.
exec > >(tee "$ARTIFACT_DIR/must-gather.log") 2> "$ARTIFACT_DIR/must-gather.stderr.log"
| 58 | + |
######################################
# CLUSTER INFO
######################################
# Everything in this section is written below $ARTIFACT_DIR/cluster. Each query
# is best-effort: a failing command must not stop the rest of the collection.
cluster_dir="$ARTIFACT_DIR/cluster"
gpu_node_selector="nvidia.com/gpu.present=true"
mkdir -p "$cluster_dir"

echo "Gathering Kubernetes version info"
$K version -o yaml > "$cluster_dir/k8s_version.yaml" || true

# Only nodes matching the GPU label selector are listed and described.
echo "Gathering GPU node status"
$K get nodes -l "$gpu_node_selector" -o wide > "$cluster_dir/gpu_nodes.status" || true

echo "Gathering GPU node descriptions"
$K describe nodes -l "$gpu_node_selector" > "$cluster_dir/gpu_nodes.descr" || true
| 72 | + |
| 73 | + |
######################################
# NIM OPERATOR PODS
######################################
# OPERATOR_NAMESPACE is mandatory; stop with a clear message when it is
# missing or empty.
if [[ -z "${OPERATOR_NAMESPACE:-}" ]]; then
  echo "FATAL: OPERATOR_NAMESPACE env variable not set"
  exit 1
fi

mkdir -p "$ARTIFACT_DIR/operator"

# Label selector used to find the operator pods. The pod query used to be an
# empty command substitution, so the loop below never ran and no operator logs
# or descriptions were collected.
# NOTE(review): the default assumes the operator pods carry the label
# app.kubernetes.io/name=k8s-nim-operator — confirm against the operator
# deployment. Set OPERATOR_POD_SELECTOR to override it.
operator_pod_selector="${OPERATOR_POD_SELECTOR:-app.kubernetes.io/name=k8s-nim-operator}"

echo "Gathering NIM Operator pods from $OPERATOR_NAMESPACE"
for pod in $($K get pods -n "$OPERATOR_NAMESPACE" -l "$operator_pod_selector" -oname); do
  # "-oname" yields "pod/<name>"; keep only <name> for the output file names.
  pod_name=$(basename "$pod")
  # Best-effort: a pod whose logs or description cannot be read is skipped.
  $K logs "$pod" -n "$OPERATOR_NAMESPACE" --all-containers --prefix > "$ARTIFACT_DIR/operator/${pod_name}.log" || true
  $K describe "$pod" -n "$OPERATOR_NAMESPACE" > "$ARTIFACT_DIR/operator/${pod_name}.descr" || true
done
| 90 | + |
######################################
# STORAGE CLASSES, PVs, PVCs
######################################
# NIM_NAMESPACE is expanded in this section, so it must be validated here.
# Under `set -o nounset` an unset value would abort the script on expansion,
# and because stderr is redirected to a log file the user would see no
# explanation on the terminal.
if [[ -z "${NIM_NAMESPACE:-}" ]]; then
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
fi

echo "Gathering storage class, PVC and PV information"
mkdir -p "$ARTIFACT_DIR/storage"

# Cluster-scoped storage objects. Every query is best-effort.
$K get storageclass -oyaml > "$ARTIFACT_DIR/storage/storageclasses.yaml" || true
$K get pv -oyaml > "$ARTIFACT_DIR/storage/persistentvolumes.yaml" || true

echo "Gathering PVCs from NIM_NAMESPACE: $NIM_NAMESPACE"
mkdir -p "$ARTIFACT_DIR/storage/nim"
$K get pvc -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nim/pvcs.yaml" || true

# NEMO_NAMESPACE is optional; its PVCs are collected only when it is set.
if [[ -n "${NEMO_NAMESPACE:-}" ]]; then
  echo "Gathering PVCs from NEMO_NAMESPACE: $NEMO_NAMESPACE"
  mkdir -p "$ARTIFACT_DIR/storage/nemo"
  $K get pvc -n "$NEMO_NAMESPACE" -oyaml > "$ARTIFACT_DIR/storage/nemo/pvcs.yaml" || true
fi
| 109 | + |
######################################
# NIM SERVICE & CACHE
######################################
# NIM_NAMESPACE is mandatory; bail out when it is missing or empty.
[[ -n "${NIM_NAMESPACE:-}" ]] || {
  echo "FATAL: NIM_NAMESPACE env variable not set"
  exit 1
}

mkdir -p "$ARTIFACT_DIR/nim"

echo "Gathering NIMPipeline, NIMService and NIMCache CRs from $NIM_NAMESPACE"
# One YAML dump per custom resource kind, named after the kind. A failing
# query is tolerated so the remaining kinds are still collected.
for nim_kind in nimcaches nimpipelines nimservices; do
  $K get "${nim_kind}.apps.nvidia.com" -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/nim/${nim_kind}.yaml" || true
done
| 124 | + |
echo "Gathering ConfigMaps in $NIM_NAMESPACE owned by NIMCache"
mkdir -p "$ARTIFACT_DIR/nim/configmaps"

# List every ConfigMap once as "<name><TAB><owner kinds, space separated>".
# This replaces fetching each ConfigMap twice and grepping a fixed number of
# YAML lines after "ownerReferences:", which could miss a NIMCache owner that
# is not the first entry.
cm_owner_query='{range .items[*]}{.metadata.name}{"\t"}{.metadata.ownerReferences[*].kind}{"\n"}{end}'

while IFS=$'\t' read -r cm_name cm_owner_kinds; do
  # Keep only ConfigMaps that have an ownerReference of kind NIMCache. The
  # padding spaces make this a whole-word match on the kind list.
  [[ " ${cm_owner_kinds} " == *" NIMCache "* ]] || continue
  # Best-effort: a ConfigMap that cannot be read is skipped.
  $K get configmap "$cm_name" -n "$NIM_NAMESPACE" -oyaml > "$ARTIFACT_DIR/nim/configmaps/${cm_name}.yaml" || true
done < <($K get configmaps -n "$NIM_NAMESPACE" -o jsonpath="$cm_owner_query")
| 135 | + |
echo "Gathering NIMService pods from $NIM_NAMESPACE"
# Pods are selected by both labels: part-of nim-service and managed by the
# NIM operator.
nim_pod_selector="app.kubernetes.io/part-of=nim-service,app.kubernetes.io/managed-by=k8s-nim-operator"
for nim_pod in $($K get pods -n "$NIM_NAMESPACE" -l "$nim_pod_selector" -oname); do
  # "-oname" yields "pod/<name>"; strip everything up to the last slash.
  nim_pod_name=${nim_pod##*/}
  # Logs and description are best-effort per pod.
  $K logs "$nim_pod" -n "$NIM_NAMESPACE" --all-containers --prefix > "$ARTIFACT_DIR/nim/${nim_pod_name}.log" || true
  $K describe "$nim_pod" -n "$NIM_NAMESPACE" > "$ARTIFACT_DIR/nim/${nim_pod_name}.descr" || true
done

echo "Gathering Ingress configuration from $NIM_NAMESPACE"
nim_ingress_dir="$ARTIFACT_DIR/nim/ingress"
mkdir -p "$nim_ingress_dir"
$K get ingress -n "$NIM_NAMESPACE" -oyaml > "$nim_ingress_dir/ingress.yaml" || true
| 146 | + |
######################################
# NEMO MICROSERVICES
######################################
# NeMo collection is optional and runs only when NEMO_NAMESPACE is set and
# non-empty.
if [[ -z "${NEMO_NAMESPACE:-}" ]]; then
  echo "Skipping NeMo microservice collection. NEMO_NAMESPACE not set."
else
  nemo_dir="$ARTIFACT_DIR/nemo"
  mkdir -p "$nemo_dir"

  echo "Gathering NeMo CRs from $NEMO_NAMESPACE"
  # One YAML dump per CR kind; the file is named after the full resource name.
  for nemo_kind in customizers datastores entitystores evaluators guardrails; do
    nemo_res="nemo${nemo_kind}.apps.nvidia.com"
    $K get "$nemo_res" -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/${nemo_res}.yaml" || true
  done

  echo "Gathering NeMo microservice pods from $NEMO_NAMESPACE"
  for nemo_pod in $($K get pods -n "$NEMO_NAMESPACE" -l "app.kubernetes.io/managed-by=k8s-nim-operator" -oname); do
    # "-oname" yields "pod/<name>"; strip everything up to the last slash.
    nemo_pod_name=${nemo_pod##*/}
    # Logs and description are best-effort per pod.
    $K logs "$nemo_pod" -n "$NEMO_NAMESPACE" --all-containers --prefix > "$nemo_dir/${nemo_pod_name}.log" || true
    $K describe "$nemo_pod" -n "$NEMO_NAMESPACE" > "$nemo_dir/${nemo_pod_name}.descr" || true
  done

  echo "Gathering Ingress configuration from $NEMO_NAMESPACE"
  mkdir -p "$nemo_dir/ingress"
  $K get ingress -n "$NEMO_NAMESPACE" -oyaml > "$nemo_dir/ingress/ingress.yaml" || true
fi

# Final pointer to where the collected artifacts were written.
echo "Must gather logs collected successfully and saved to: $ARTIFACT_DIR"
0 commit comments