Skip to content

Commit 6137c0b

Browse files
authored
feat(evidence): split ai_service_metrics and fix imagePullPolicy for local images (#463)
1 parent a0a1dc9 commit 6137c0b

File tree

12 files changed

+1046
-141
lines changed

12 files changed

+1046
-141
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,27 @@ jobs:
160160
overQuotaWeight: 1
161161
EOF
162162
163+
# Create DRA ResourceClaim for GPU allocation.
164+
# Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled.
165+
# The kai.scheduler/queue label is required for KAI scheduler to manage the claim.
166+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
167+
apiVersion: resource.k8s.io/v1
168+
kind: ResourceClaim
169+
metadata:
170+
name: vllm-smoke-gpu-claim
171+
namespace: dynamo-system
172+
labels:
173+
kai.scheduler/queue: dynamo
174+
spec:
175+
devices:
176+
requests:
177+
- name: gpu
178+
exactly:
179+
deviceClassName: gpu.nvidia.com
180+
allocationMode: ExactCount
181+
count: 1
182+
EOF
183+
163184
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
164185
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
165186
@@ -318,6 +339,10 @@ jobs:
318339
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
319340
-f tests/manifests/dynamo-vllm-smoke-test.yaml \
320341
-n dynamo-system --ignore-not-found 2>/dev/null || true
342+
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \
343+
vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true
344+
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \
345+
dynamo --ignore-not-found 2>/dev/null || true
321346
322347
- name: GPU Test Cleanup
323348
if: always()
Lines changed: 177 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,118 +1,224 @@
1-
# AI Service Metrics (Prometheus ServiceMonitor Discovery)
1+
# AI Service Metrics (Prometheus Discovery)
22

3-
**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3`
4-
**Generated:** 2026-03-24 14:06:00 UTC
53
**Kubernetes Version:** v1.35
64
**Platform:** linux/amd64
5+
**Validated on:** EKS / p5.48xlarge / NVIDIA H100 80GB HBM3
76

87
---
98

109
Demonstrates that Prometheus discovers and collects metrics from AI workloads
11-
that expose them in Prometheus exposition format, using the ServiceMonitor CRD
12-
for automatic target discovery.
10+
that expose them in Prometheus exposition format, using PodMonitor and
11+
ServiceMonitor CRDs for automatic target discovery across both inference and
12+
training workloads.
1313

14-
## vLLM Inference Workload
14+
## Inference: Dynamo Platform (PodMonitor)
1515

16-
A vLLM inference server (serving Qwen/Qwen3-0.6B on GPU via DRA ResourceClaim)
17-
exposes application-level metrics in Prometheus format at `:8000/metrics`.
18-
A ServiceMonitor enables Prometheus to automatically discover and scrape the endpoint.
16+
**Cluster:** `aicr-cuj2` (EKS, inference)
17+
**Generated:** 2026-03-25 10:18:30 UTC
1918

20-
**vLLM workload pod**
19+
The Dynamo operator auto-creates PodMonitors for worker and frontend pods.
20+
The Dynamo vLLM runtime exposes both Dynamo-specific and embedded vLLM metrics
21+
on port 9090 (`system` port) in Prometheus format.
22+
23+
### Dynamo Workload Pods
24+
25+
**Dynamo workload pods**
2126
```
22-
$ kubectl get pods -n vllm-metrics-test -o wide
23-
NAME READY STATUS RESTARTS AGE
24-
vllm-server 1/1 Running 0 5m
27+
$ kubectl get pods -n dynamo-workload -o wide
28+
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
29+
vllm-agg-0-frontend-qqrff 1/1 Running 0 3m29s 10.0.159.241 ip-10-0-184-187.ec2.internal <none> <none>
30+
vllm-agg-0-vllmdecodeworker-95ths 1/1 Running 0 3m29s 10.0.214.229 ip-10-0-180-136.ec2.internal <none> <none>
2531
```
2632

27-
**vLLM metrics endpoint (sampled after 10 inference requests)**
33+
### Worker Metrics Endpoint
34+
35+
**Worker metrics (sampled after 10 inference requests)**
2836
```
29-
$ kubectl exec -n vllm-metrics-test vllm-server -- python3 -c "..." | grep vllm:
30-
vllm:request_success_total{engine="0",finished_reason="length",model_name="Qwen/Qwen3-0.6B"} 10.0
31-
vllm:prompt_tokens_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 80.0
32-
vllm:generation_tokens_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 500.0
33-
vllm:time_to_first_token_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 10.0
34-
vllm:time_to_first_token_seconds_sum{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.205
35-
vllm:inter_token_latency_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 490.0
36-
vllm:inter_token_latency_seconds_sum{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.864
37-
vllm:e2e_request_latency_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 10.0
38-
vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
39-
vllm:prefix_cache_queries_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 80.0
40-
vllm:num_requests_running{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
41-
vllm:num_requests_waiting{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
37+
dynamo_component_request_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 11230
38+
dynamo_component_request_duration_seconds_sum{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 0.984
39+
dynamo_component_request_duration_seconds_count{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10
40+
dynamo_component_requests_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10
41+
dynamo_component_response_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 31826
42+
dynamo_component_uptime_seconds 223.250
43+
vllm:engine_sleep_state{engine="0",model_name="Qwen/Qwen3-0.6B",sleep_state="awake"} 1.0
44+
vllm:prefix_cache_queries_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 50.0
4245
```
4346

44-
## ServiceMonitor
47+
### PodMonitors (Auto-Created by Dynamo Operator)
4548

46-
**ServiceMonitor for vLLM**
49+
**Dynamo PodMonitors**
50+
```
51+
$ kubectl get podmonitors -n dynamo-system
52+
NAME AGE
53+
dynamo-frontend 11d
54+
dynamo-planner 11d
55+
dynamo-worker 11d
4756
```
48-
$ kubectl get servicemonitor vllm-inference -n vllm-metrics-test -o yaml
57+
58+
**Worker PodMonitor spec**
59+
```
60+
$ kubectl get podmonitor dynamo-worker -n dynamo-system -o yaml
4961
apiVersion: monitoring.coreos.com/v1
50-
kind: ServiceMonitor
62+
kind: PodMonitor
5163
metadata:
52-
labels:
53-
release: prometheus
54-
name: vllm-inference
55-
namespace: vllm-metrics-test
64+
name: dynamo-worker
65+
namespace: dynamo-system
5666
spec:
57-
endpoints:
58-
- interval: 15s
67+
namespaceSelector:
68+
any: true
69+
podMetricsEndpoints:
70+
- interval: 5s
5971
path: /metrics
60-
port: http
72+
port: system
6173
selector:
6274
matchLabels:
63-
app: vllm-inference
75+
nvidia.com/dynamo-component-type: worker
76+
nvidia.com/metrics-enabled: "true"
77+
```
78+
79+
### Prometheus Target Discovery
80+
81+
**Prometheus scrape targets (active)**
82+
```
83+
{
84+
"job": "dynamo-system/dynamo-frontend",
85+
"endpoint": "http://10.0.159.241:8000/metrics",
86+
"health": "up",
87+
"lastScrape": "2026-03-25T10:19:21.101766071Z"
88+
}
89+
{
90+
"job": "dynamo-system/dynamo-worker",
91+
"endpoint": "http://10.0.214.229:9090/metrics",
92+
"health": "up",
93+
"lastScrape": "2026-03-25T10:19:22.70334816Z"
94+
}
95+
```
96+
97+
### Dynamo Metrics in Prometheus
98+
99+
**Dynamo metrics queried from Prometheus (after 10 inference requests)**
100+
```
101+
dynamo_component_requests_total{endpoint="generate"} = 10
102+
dynamo_component_request_bytes_total{endpoint="generate"} = 11230
103+
dynamo_component_response_bytes_total{endpoint="generate"} = 31826
104+
dynamo_component_request_duration_seconds_count{endpoint="generate"} = 10
105+
dynamo_component_request_duration_seconds_sum{endpoint="generate"} = 0.984
106+
dynamo_component_uptime_seconds = 223.250
107+
dynamo_frontend_input_sequence_tokens_sum = 50
108+
dynamo_frontend_input_sequence_tokens_count = 10
109+
dynamo_frontend_inter_token_latency_seconds_sum = 0.866
110+
dynamo_frontend_inter_token_latency_seconds_count = 490
111+
dynamo_frontend_model_context_length = 40960
112+
dynamo_frontend_model_total_kv_blocks = 37710
113+
```
114+
115+
**Result: PASS** — Prometheus discovers Dynamo inference workloads (frontend + worker) via operator-managed PodMonitors and actively scrapes their Prometheus-format metrics endpoints. Application-level AI inference metrics (request count, request duration, inter-token latency, token throughput, KV cache capacity) are collected and queryable.
116+
117+
---
118+
119+
## Training: PyTorch Workload (ServiceMonitor)
120+
121+
**Cluster:** `aicr-cuj1` (EKS, training)
122+
**Generated:** 2026-03-25 11:03:00 UTC
123+
124+
A PyTorch training workload runs a GPU training loop and exposes training-level
125+
metrics (step count, loss, throughput, GPU memory) on port 8080 in Prometheus
126+
format, discovered via ServiceMonitor.
127+
128+
### Training Workload Pod
129+
130+
**Training pod**
131+
```
132+
$ kubectl get pods -n trainer-metrics-test -o wide
133+
NAME READY STATUS RESTARTS AGE
134+
pytorch-training-job 1/1 Running 0 2m
64135
```
65136

66-
**Service endpoint**
137+
### Training Metrics Endpoint
138+
139+
**Training metrics (after 100 training steps)**
67140
```
68-
$ kubectl get endpoints vllm-inference -n vllm-metrics-test
69-
NAME ENDPOINTS AGE
70-
vllm-inference 10.0.170.78:8000 5m
141+
# HELP training_step_total Total training steps completed
142+
# TYPE training_step_total counter
143+
training_step_total 100
144+
# HELP training_loss Current training loss
145+
# TYPE training_loss gauge
146+
training_loss 1.334257
147+
# HELP training_throughput_samples_per_sec Training throughput
148+
# TYPE training_throughput_samples_per_sec gauge
149+
training_throughput_samples_per_sec 549228.55
150+
# HELP training_gpu_memory_used_bytes GPU memory used
151+
# TYPE training_gpu_memory_used_bytes gauge
152+
training_gpu_memory_used_bytes 79213568
153+
# HELP training_gpu_memory_total_bytes GPU memory total
154+
# TYPE training_gpu_memory_total_bytes gauge
155+
training_gpu_memory_total_bytes 85017624576
71156
```
72157

73-
## Prometheus Target Discovery
158+
### ServiceMonitor
159+
160+
**Training ServiceMonitor**
161+
```
162+
$ kubectl get servicemonitor pytorch-training -n trainer-metrics-test -o yaml
163+
apiVersion: monitoring.coreos.com/v1
164+
kind: ServiceMonitor
165+
metadata:
166+
labels:
167+
release: kube-prometheus-stack
168+
name: pytorch-training
169+
namespace: trainer-metrics-test
170+
spec:
171+
endpoints:
172+
- interval: 15s
173+
path: /metrics
174+
port: metrics
175+
selector:
176+
matchLabels:
177+
app: pytorch-training
178+
```
74179

75-
Prometheus automatically discovers the vLLM workload as a scrape target via
76-
the ServiceMonitor and actively collects metrics.
180+
### Prometheus Target Discovery
77181

78182
**Prometheus scrape target (active)**
79183
```
80-
$ kubectl exec -n monitoring prometheus-kube-prometheus-prometheus-0 -- \
81-
wget -qO- 'http://localhost:9090/api/v1/targets?state=active' | \
82-
jq '.data.activeTargets[] | select(.labels.job=="vllm-inference")'
83184
{
84-
"job": "vllm-inference",
85-
"endpoint": "http://10.0.170.78:8000/metrics",
185+
"job": "pytorch-training-metrics",
186+
"endpoint": "http://10.0.212.201:8080/metrics",
86187
"health": "up",
87-
"lastScrape": "2026-03-24T14:06:50.899967845Z"
188+
"lastScrape": "2026-03-25T11:03:49.310258779Z"
88189
}
89190
```
90191

91-
## vLLM Metrics in Prometheus
92-
93-
Prometheus collects vLLM application-level inference metrics including request
94-
throughput, token counts, latency distributions, and KV cache utilization.
192+
### Training Metrics in Prometheus
95193

96-
**vLLM metrics queried from Prometheus (after 10 inference requests)**
194+
**Training metrics queried from Prometheus**
97195
```
98-
$ kubectl exec -n monitoring prometheus-kube-prometheus-prometheus-0 -- \
99-
wget -qO- 'http://localhost:9090/api/v1/query?query={job="vllm-inference",__name__=~"vllm:.*"}'
100-
vllm:request_success_total{model_name="Qwen/Qwen3-0.6B"} 10
101-
vllm:prompt_tokens_total{model_name="Qwen/Qwen3-0.6B"} 80
102-
vllm:generation_tokens_total{model_name="Qwen/Qwen3-0.6B"} 500
103-
vllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B"} 10
104-
vllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B"} 0.205
105-
vllm:inter_token_latency_seconds_count{model_name="Qwen/Qwen3-0.6B"} 490
106-
vllm:inter_token_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B"} 0.864
107-
vllm:prefix_cache_queries_total{model_name="Qwen/Qwen3-0.6B"} 80
108-
vllm:iteration_tokens_total_sum{model_name="Qwen/Qwen3-0.6B"} 580
196+
training_step_total = 100
197+
training_loss = 1.334257
198+
training_throughput_samples_per_sec = 549228.55
199+
training_gpu_memory_used_bytes = 79213568
200+
training_gpu_memory_total_bytes = 85017624576
109201
```
110202

111-
**Result: PASS** — Prometheus discovers the vLLM inference workload via ServiceMonitor and actively scrapes its Prometheus-format metrics endpoint. Application-level AI inference metrics (request success count, prompt/generation token throughput, time-to-first-token latency, inter-token latency, KV cache usage, prefix cache queries) are collected and queryable in Prometheus.
203+
**Result: PASS** — Prometheus discovers the PyTorch training workload via ServiceMonitor and actively scrapes its Prometheus-format metrics endpoint. Training-level metrics (step count, loss, throughput, GPU memory) are collected and queryable.
204+
205+
---
206+
207+
## Summary
208+
209+
| Workload | Discovery | Metrics Port | Metrics Type | Result |
210+
|----------|-----------|-------------|--------------|--------|
211+
| **Dynamo vLLM** (inference) | PodMonitor (auto-created) | 9090 (worker), 8000 (frontend) | `dynamo_component_*`, `dynamo_frontend_*`, `vllm:*` | **PASS** |
212+
| **PyTorch training** (training) | ServiceMonitor | 8080 (HTTP) | `training_step_total`, `training_loss`, `training_throughput_*`, `training_gpu_memory_*` | **PASS** |
112213

113214
## Cleanup
114215

115-
**Delete test namespace**
216+
**Delete inference workload**
217+
```
218+
$ kubectl delete ns dynamo-workload
219+
```
220+
221+
**Delete training workload**
116222
```
117-
$ kubectl delete ns vllm-metrics-test
223+
$ kubectl delete ns trainer-metrics-test
118224
```

pkg/evidence/collector.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var ValidFeatures = []string{
3939
"gang-scheduling",
4040
"secure-access",
4141
"accelerator-metrics",
42+
"ai-service-metrics",
4243
"inference-gateway",
4344
"robust-operator",
4445
"pod-autoscaling",
@@ -50,7 +51,8 @@ var featureToScript = map[string]string{
5051
"dra-support": "dra",
5152
"gang-scheduling": "gang",
5253
"secure-access": "secure",
53-
"accelerator-metrics": "metrics",
54+
"accelerator-metrics": "accelerator-metrics",
55+
"ai-service-metrics": "service-metrics",
5456
"inference-gateway": "gateway",
5557
"robust-operator": "operator",
5658
"pod-autoscaling": "hpa",
@@ -59,13 +61,14 @@ var featureToScript = map[string]string{
5961

6062
// featureAliases maps short names to canonical feature names for convenience.
6163
var featureAliases = map[string]string{
62-
"dra": "dra-support",
63-
"gang": "gang-scheduling",
64-
"secure": "secure-access",
65-
"metrics": "accelerator-metrics",
66-
"gateway": "inference-gateway",
67-
"operator": "robust-operator",
68-
"hpa": "pod-autoscaling",
64+
"dra": "dra-support",
65+
"gang": "gang-scheduling",
66+
"secure": "secure-access",
67+
"metrics": "accelerator-metrics",
68+
"service-metrics": "ai-service-metrics",
69+
"gateway": "inference-gateway",
70+
"operator": "robust-operator",
71+
"hpa": "pod-autoscaling",
6972
}
7073

7174
// ResolveFeature returns the canonical feature name, resolving aliases.
@@ -103,7 +106,8 @@ var FeatureDescriptions = map[string]string{
103106
"dra-support": "DRA GPU allocation test",
104107
"gang-scheduling": "Gang scheduling co-scheduling test",
105108
"secure-access": "Secure accelerator access verification",
106-
"accelerator-metrics": "Accelerator & AI service metrics",
109+
"accelerator-metrics": "Accelerator metrics (DCGM exporter)",
110+
"ai-service-metrics": "AI service metrics (Prometheus ServiceMonitor discovery)",
107111
"inference-gateway": "Inference API gateway conditions",
108112
"robust-operator": "Robust AI operator + webhook test",
109113
"pod-autoscaling": "HPA pod autoscaling (scale-up + scale-down)",

0 commit comments

Comments
 (0)