@@ -123,28 +123,44 @@ verify_external_metrics() {
123123 if [[ " ${has_data} " != " true" ]]; then
124124 log_error " External metric dcgm_gpu_memory_used has no data after 4 minutes"
125125
126- # Diagnostic 1: Raw external metrics API response
127- log_warn " --- Raw external metrics API response ---"
126+ # Diagnostic 1: Raw external metrics API responses for BOTH metrics
127+ log_warn " --- External API: dcgm_gpu_memory_used ---"
128128 kubectl --context=" ${KUBE_CTX} " get --raw \
129- " /apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>&1 | head -20 || true
129+ " /apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>&1 || true
130+ log_warn " --- External API: dcgm_gpu_utilization ---"
131+ kubectl --context=" ${KUBE_CTX} " get --raw \
132+ " /apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_utilization" 2>&1 || true
130133
131- # Diagnostic 2: Query Prometheus directly for the underlying metric
134+ # Diagnostic 2: Run the exact PromQL the adapter would execute
132135 local prom_svc=" http://kube-prometheus-prometheus.monitoring.svc:9090"
133- log_warn " --- Prometheus direct query: DCGM_FI_DEV_FB_USED ---"
134- kubectl --context=" ${KUBE_CTX} " -n monitoring run prom-check --rm -i --restart=Never \
135- --image=curlimages/curl:latest -- \
136- curl -sf " ${prom_svc} /api/v1/query?query=DCGM_FI_DEV_FB_USED" 2> /dev/null \
137- | jq ' .data.result | length' || true
138-
139- # Diagnostic 3: Check prometheus-adapter ConfigMap for externalRules
140- log_warn " --- prometheus-adapter ConfigMap (externalRules) ---"
136+ log_warn " --- Prometheus: exact adapter PromQL ---"
137+ kubectl --context=" ${KUBE_CTX} " -n monitoring run prom-diag --rm -i --restart=Never \
138+ --image=curlimages/curl:latest -- sh -c "
139+ echo 'Raw series count:';
140+ curl -sf '${prom_svc} /api/v1/query?query=DCGM_FI_DEV_FB_USED' | head -c 500;
141+ echo;
142+ echo 'Adapter metricsQuery result:';
143+ curl -sf '${prom_svc} /api/v1/query?query=avg(avg_over_time(DCGM_FI_DEV_FB_USED%5B2m%5D))' | head -c 500;
144+ echo;
145+ " 2> /dev/null || true
146+
147+ # Diagnostic 3: Full externalRules from ConfigMap (not truncated)
148+ log_warn " --- prometheus-adapter externalRules (full) ---"
141149 kubectl --context=" ${KUBE_CTX} " -n monitoring get configmap -l app.kubernetes.io/name=prometheus-adapter \
142150 -o jsonpath=' {.items[0].data.config\.yaml}' 2> /dev/null \
143- | grep -A5 ' externalRules' || echo " No externalRules found in ConfigMap"
151+ | python3 -c " import sys,yaml; cfg=yaml.safe_load(sys.stdin); print(yaml.dump(cfg.get('externalRules', 'MISSING')))" 2> /dev/null \
152+ || kubectl --context=" ${KUBE_CTX} " -n monitoring get configmap -l app.kubernetes.io/name=prometheus-adapter \
153+ -o jsonpath=' {.items[0].data.config\.yaml}' 2> /dev/null | grep -A20 ' externalRules' || echo " No externalRules found"
154+
155+ # Diagnostic 4: Adapter deployment args (verify metricsRelistInterval)
156+ log_warn " --- prometheus-adapter container args ---"
157+ kubectl --context=" ${KUBE_CTX} " -n monitoring get deployment -l app.kubernetes.io/name=prometheus-adapter \
158+ -o jsonpath=' {.items[0].spec.template.spec.containers[0].args}' 2> /dev/null || true
159+ echo
144160
145- # Diagnostic 4 : prometheus-adapter logs (last 20 lines)
161+ # Diagnostic 5 : prometheus-adapter logs (last 30 lines)
146162 log_warn " --- prometheus-adapter logs (tail) ---"
147- kubectl --context=" ${KUBE_CTX} " -n monitoring logs deployment/prometheus-adapter --tail=20 2> /dev/null || true
163+ kubectl --context=" ${KUBE_CTX} " -n monitoring logs deployment/prometheus-adapter --tail=30 2> /dev/null || true
148164
149165 exit 1
150166 fi
0 commit comments