Skip to content

Commit c67e021

Browse files
committed
fix: improve external metrics diagnostics for CI debugging
Previous diagnostics were truncated: `grep -A5` showed only the first rule. The script now dumps the external metrics API responses for both metrics, the result of the adapter's exact PromQL run directly against Prometheus, the full externalRules from the ConfigMap (parsed with python3's yaml module, falling back to grep), the adapter container args (to verify metricsRelistInterval), and more adapter log lines.
1 parent b771fbb commit c67e021

File tree

1 file changed

+31
-15
lines changed

1 file changed

+31
-15
lines changed

kwok/scripts/validate-cluster-autoscaling.sh

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -123,28 +123,44 @@ verify_external_metrics() {
123123
if [[ "${has_data}" != "true" ]]; then
124124
log_error "External metric dcgm_gpu_memory_used has no data after 4 minutes"
125125

126-
# Diagnostic 1: Raw external metrics API response
127-
log_warn "--- Raw external metrics API response ---"
126+
# Diagnostic 1: Raw external metrics API responses for BOTH metrics
127+
log_warn "--- External API: dcgm_gpu_memory_used ---"
128128
kubectl --context="${KUBE_CTX}" get --raw \
129-
"/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>&1 | head -20 || true
129+
"/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>&1 || true
130+
log_warn "--- External API: dcgm_gpu_utilization ---"
131+
kubectl --context="${KUBE_CTX}" get --raw \
132+
"/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_utilization" 2>&1 || true
130133

131-
# Diagnostic 2: Query Prometheus directly for the underlying metric
134+
# Diagnostic 2: Run the exact PromQL the adapter would execute
132135
local prom_svc="http://kube-prometheus-prometheus.monitoring.svc:9090"
133-
log_warn "--- Prometheus direct query: DCGM_FI_DEV_FB_USED ---"
134-
kubectl --context="${KUBE_CTX}" -n monitoring run prom-check --rm -i --restart=Never \
135-
--image=curlimages/curl:latest -- \
136-
curl -sf "${prom_svc}/api/v1/query?query=DCGM_FI_DEV_FB_USED" 2>/dev/null \
137-
| jq '.data.result | length' || true
138-
139-
# Diagnostic 3: Check prometheus-adapter ConfigMap for externalRules
140-
log_warn "--- prometheus-adapter ConfigMap (externalRules) ---"
136+
log_warn "--- Prometheus: exact adapter PromQL ---"
137+
kubectl --context="${KUBE_CTX}" -n monitoring run prom-diag --rm -i --restart=Never \
138+
--image=curlimages/curl:latest -- sh -c "
139+
echo 'Raw series count:';
140+
curl -sf '${prom_svc}/api/v1/query?query=DCGM_FI_DEV_FB_USED' | head -c 500;
141+
echo;
142+
echo 'Adapter metricsQuery result:';
143+
curl -sf '${prom_svc}/api/v1/query?query=avg(avg_over_time(DCGM_FI_DEV_FB_USED%5B2m%5D))' | head -c 500;
144+
echo;
145+
" 2>/dev/null || true
146+
147+
# Diagnostic 3: Full externalRules from ConfigMap (not truncated)
148+
log_warn "--- prometheus-adapter externalRules (full) ---"
141149
kubectl --context="${KUBE_CTX}" -n monitoring get configmap -l app.kubernetes.io/name=prometheus-adapter \
142150
-o jsonpath='{.items[0].data.config\.yaml}' 2>/dev/null \
143-
| grep -A5 'externalRules' || echo "No externalRules found in ConfigMap"
151+
| python3 -c "import sys,yaml; cfg=yaml.safe_load(sys.stdin); print(yaml.dump(cfg.get('externalRules', 'MISSING')))" 2>/dev/null \
152+
|| kubectl --context="${KUBE_CTX}" -n monitoring get configmap -l app.kubernetes.io/name=prometheus-adapter \
153+
-o jsonpath='{.items[0].data.config\.yaml}' 2>/dev/null | grep -A20 'externalRules' || echo "No externalRules found"
154+
155+
# Diagnostic 4: Adapter deployment args (verify metricsRelistInterval)
156+
log_warn "--- prometheus-adapter container args ---"
157+
kubectl --context="${KUBE_CTX}" -n monitoring get deployment -l app.kubernetes.io/name=prometheus-adapter \
158+
-o jsonpath='{.items[0].spec.template.spec.containers[0].args}' 2>/dev/null || true
159+
echo
144160

145-
# Diagnostic 4: prometheus-adapter logs (last 20 lines)
161+
# Diagnostic 5: prometheus-adapter logs (last 30 lines)
146162
log_warn "--- prometheus-adapter logs (tail) ---"
147-
kubectl --context="${KUBE_CTX}" -n monitoring logs deployment/prometheus-adapter --tail=20 2>/dev/null || true
163+
kubectl --context="${KUBE_CTX}" -n monitoring logs deployment/prometheus-adapter --tail=30 2>/dev/null || true
148164

149165
exit 1
150166
fi

0 commit comments

Comments
 (0)