Merged
25 changes: 25 additions & 0 deletions .github/workflows/gpu-h100-inference-test.yaml
@@ -160,6 +160,27 @@ jobs:
overQuotaWeight: 1
EOF

# Create DRA ResourceClaim for GPU allocation.
# Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled.
# The kai.scheduler/queue label is required for KAI scheduler to manage the claim.
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
  name: vllm-smoke-gpu-claim
  namespace: dynamo-system
  labels:
    kai.scheduler/queue: dynamo
spec:
  devices:
    requests:
    - name: gpu
      exactly:
        deviceClassName: gpu.nvidia.com
        allocationMode: ExactCount
        count: 1
EOF

kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
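The smoke-test manifest itself is not shown in this diff, but a workload consumes such a claim by referencing it from `spec.resourceClaims`. A minimal sketch (pod name, container name, and image are illustrative assumptions; only `resourceClaimName` comes from the ResourceClaim above):

```
apiVersion: v1
kind: Pod
metadata:
  name: vllm-smoke            # illustrative; the real name lives in the manifest
  namespace: dynamo-system
spec:
  resourceClaims:
  - name: gpu                 # local handle for the claim
    resourceClaimName: vllm-smoke-gpu-claim
  containers:
  - name: vllm                # illustrative container name and image
    image: vllm/vllm-openai
    resources:
      claims:
      - name: gpu             # references spec.resourceClaims[].name
```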

@@ -318,6 +339,10 @@ jobs:
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
-f tests/manifests/dynamo-vllm-smoke-test.yaml \
-n dynamo-system --ignore-not-found 2>/dev/null || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \
vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \
dynamo --ignore-not-found 2>/dev/null || true

- name: GPU Test Cleanup
if: always()
248 changes: 177 additions & 71 deletions docs/conformance/cncf/evidence/ai-service-metrics.md
@@ -1,118 +1,224 @@
-# AI Service Metrics (Prometheus ServiceMonitor Discovery)
+# AI Service Metrics (Prometheus Discovery)

-**Cluster:** `EKS / p5.48xlarge / NVIDIA-H100-80GB-HBM3`
-**Generated:** 2026-03-24 14:06:00 UTC
**Kubernetes Version:** v1.35
**Platform:** linux/amd64
+**Validated on:** EKS / p5.48xlarge / NVIDIA H100 80GB HBM3

---

Demonstrates that Prometheus discovers and collects metrics from AI workloads
-that expose them in Prometheus exposition format, using the ServiceMonitor CRD
-for automatic target discovery.
+that expose them in Prometheus exposition format, using PodMonitor and
+ServiceMonitor CRDs for automatic target discovery across both inference and
+training workloads.

-## vLLM Inference Workload
+## Inference: Dynamo Platform (PodMonitor)

-A vLLM inference server (serving Qwen/Qwen3-0.6B on GPU via DRA ResourceClaim)
-exposes application-level metrics in Prometheus format at `:8000/metrics`.
-A ServiceMonitor enables Prometheus to automatically discover and scrape the endpoint.
+**Cluster:** `aicr-cuj2` (EKS, inference)
+**Generated:** 2026-03-25 10:18:30 UTC

-**vLLM workload pod**
+The Dynamo operator auto-creates PodMonitors for worker and frontend pods.
+The Dynamo vLLM runtime exposes both Dynamo-specific and embedded vLLM metrics
+on port 9090 (`system` port) in Prometheus format.

### Dynamo Workload Pods

**Dynamo workload pods**
```
-$ kubectl get pods -n vllm-metrics-test -o wide
-NAME          READY   STATUS    RESTARTS   AGE
-vllm-server   1/1     Running   0          5m
+$ kubectl get pods -n dynamo-workload -o wide
+NAME                                READY   STATUS    RESTARTS   AGE     IP             NODE                           NOMINATED NODE   READINESS GATES
+vllm-agg-0-frontend-qqrff           1/1     Running   0          3m29s   10.0.159.241   ip-10-0-184-187.ec2.internal   <none>           <none>
+vllm-agg-0-vllmdecodeworker-95ths   1/1     Running   0          3m29s   10.0.214.229   ip-10-0-180-136.ec2.internal   <none>           <none>
```

-**vLLM metrics endpoint (sampled after 10 inference requests)**
+### Worker Metrics Endpoint

**Worker metrics (sampled after 10 inference requests)**
```
-$ kubectl exec -n vllm-metrics-test vllm-server -- python3 -c "..." | grep vllm:
vllm:request_success_total{engine="0",finished_reason="length",model_name="Qwen/Qwen3-0.6B"} 10.0
vllm:prompt_tokens_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 80.0
vllm:generation_tokens_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 500.0
vllm:time_to_first_token_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 10.0
vllm:time_to_first_token_seconds_sum{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.205
vllm:inter_token_latency_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 490.0
vllm:inter_token_latency_seconds_sum{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.864
vllm:e2e_request_latency_seconds_count{engine="0",model_name="Qwen/Qwen3-0.6B"} 10.0
vllm:kv_cache_usage_perc{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
vllm:prefix_cache_queries_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 80.0
vllm:num_requests_running{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
vllm:num_requests_waiting{engine="0",model_name="Qwen/Qwen3-0.6B"} 0.0
dynamo_component_request_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 11230
dynamo_component_request_duration_seconds_sum{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 0.984
dynamo_component_request_duration_seconds_count{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10
dynamo_component_requests_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 10
dynamo_component_response_bytes_total{dynamo_component="backend",dynamo_endpoint="generate",model="Qwen/Qwen3-0.6B"} 31826
dynamo_component_uptime_seconds 223.250
vllm:engine_sleep_state{engine="0",model_name="Qwen/Qwen3-0.6B",sleep_state="awake"} 1.0
vllm:prefix_cache_queries_total{engine="0",model_name="Qwen/Qwen3-0.6B"} 50.0
```

-## ServiceMonitor
+### PodMonitors (Auto-Created by Dynamo Operator)

-**ServiceMonitor for vLLM**
+**Dynamo PodMonitors**
```
$ kubectl get podmonitors -n dynamo-system
NAME              AGE
dynamo-frontend   11d
dynamo-planner    11d
dynamo-worker     11d
```

**Worker PodMonitor spec**
```
-$ kubectl get servicemonitor vllm-inference -n vllm-metrics-test -o yaml
+$ kubectl get podmonitor dynamo-worker -n dynamo-system -o yaml
apiVersion: monitoring.coreos.com/v1
-kind: ServiceMonitor
+kind: PodMonitor
metadata:
-  labels:
-    release: prometheus
-  name: vllm-inference
-  namespace: vllm-metrics-test
+  name: dynamo-worker
+  namespace: dynamo-system
spec:
-  endpoints:
-  - interval: 15s
+  namespaceSelector:
+    any: true
+  podMetricsEndpoints:
+  - interval: 5s
    path: /metrics
-    port: http
+    port: system
  selector:
    matchLabels:
-      app: vllm-inference
+      nvidia.com/dynamo-component-type: worker
+      nvidia.com/metrics-enabled: "true"
```

### Prometheus Target Discovery

**Prometheus scrape targets (active)**
```
{
  "job": "dynamo-system/dynamo-frontend",
  "endpoint": "http://10.0.159.241:8000/metrics",
  "health": "up",
  "lastScrape": "2026-03-25T10:19:21.101766071Z"
}
{
  "job": "dynamo-system/dynamo-worker",
  "endpoint": "http://10.0.214.229:9090/metrics",
  "health": "up",
  "lastScrape": "2026-03-25T10:19:22.70334816Z"
}
```

### Dynamo Metrics in Prometheus

**Dynamo metrics queried from Prometheus (after 10 inference requests)**
```
dynamo_component_requests_total{endpoint="generate"} = 10
dynamo_component_request_bytes_total{endpoint="generate"} = 11230
dynamo_component_response_bytes_total{endpoint="generate"} = 31826
dynamo_component_request_duration_seconds_count{endpoint="generate"} = 10
dynamo_component_request_duration_seconds_sum{endpoint="generate"} = 0.984
dynamo_component_uptime_seconds = 223.250
dynamo_frontend_input_sequence_tokens_sum = 50
dynamo_frontend_input_sequence_tokens_count = 10
dynamo_frontend_inter_token_latency_seconds_sum = 0.866
dynamo_frontend_inter_token_latency_seconds_count = 490
dynamo_frontend_model_context_length = 40960
dynamo_frontend_model_total_kv_blocks = 37710
```

**Result: PASS** — Prometheus discovers Dynamo inference workloads (frontend + worker) via operator-managed PodMonitors and actively scrapes their Prometheus-format metrics endpoints. Application-level AI inference metrics (request count, request duration, inter-token latency, token throughput, KV cache utilization) are collected and queryable.

---

## Training: PyTorch Workload (ServiceMonitor)

**Cluster:** `aicr-cuj1` (EKS, training)
**Generated:** 2026-03-25 11:03:00 UTC

A PyTorch training workload runs a GPU training loop and exposes training-level
metrics (step count, loss, throughput, GPU memory) on port 8080 in Prometheus
format, discovered via ServiceMonitor.

### Training Workload Pod

**Training pod**
```
$ kubectl get pods -n trainer-metrics-test -o wide
NAME                   READY   STATUS    RESTARTS   AGE
pytorch-training-job   1/1     Running   0          2m
```

-**Service endpoint**
+### Training Metrics Endpoint

**Training metrics (after 100 training steps)**
```
-$ kubectl get endpoints vllm-inference -n vllm-metrics-test
-NAME             ENDPOINTS          AGE
-vllm-inference   10.0.170.78:8000   5m
# HELP training_step_total Total training steps completed
# TYPE training_step_total counter
training_step_total 100
# HELP training_loss Current training loss
# TYPE training_loss gauge
training_loss 1.334257
# HELP training_throughput_samples_per_sec Training throughput
# TYPE training_throughput_samples_per_sec gauge
training_throughput_samples_per_sec 549228.55
# HELP training_gpu_memory_used_bytes GPU memory used
# TYPE training_gpu_memory_used_bytes gauge
training_gpu_memory_used_bytes 79213568
# HELP training_gpu_memory_total_bytes GPU memory total
# TYPE training_gpu_memory_total_bytes gauge
training_gpu_memory_total_bytes 85017624576
```

-## Prometheus Target Discovery
+### ServiceMonitor

**Training ServiceMonitor**
```
$ kubectl get servicemonitor pytorch-training -n trainer-metrics-test -o yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    release: kube-prometheus-stack
  name: pytorch-training
  namespace: trainer-metrics-test
spec:
  endpoints:
  - interval: 15s
    path: /metrics
    port: metrics
  selector:
    matchLabels:
      app: pytorch-training
```
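The ServiceMonitor selects a Service by label, but the Service itself is not shown in the evidence. A sketch of what it presumably looks like, inferred from the `matchLabels` and `port: metrics` above (the name, selector, and port numbers are assumptions):

```
apiVersion: v1
kind: Service
metadata:
  name: pytorch-training          # assumed name
  namespace: trainer-metrics-test
  labels:
    app: pytorch-training         # matched by the ServiceMonitor selector
spec:
  selector:
    app: pytorch-training         # assumed pod label
  ports:
  - name: metrics                 # must match ServiceMonitor `port: metrics`
    port: 8080
    targetPort: 8080
```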

-Prometheus automatically discovers the vLLM workload as a scrape target via
-the ServiceMonitor and actively collects metrics.
+### Prometheus Target Discovery

**Prometheus scrape target (active)**
```
-$ kubectl exec -n monitoring prometheus-kube-prometheus-prometheus-0 -- \
-  wget -qO- 'http://localhost:9090/api/v1/targets?state=active' | \
-  jq '.data.activeTargets[] | select(.labels.job=="vllm-inference")'
{
-  "job": "vllm-inference",
-  "endpoint": "http://10.0.170.78:8000/metrics",
+  "job": "pytorch-training-metrics",
+  "endpoint": "http://10.0.212.201:8080/metrics",
  "health": "up",
-  "lastScrape": "2026-03-24T14:06:50.899967845Z"
+  "lastScrape": "2026-03-25T11:03:49.310258779Z"
}
```

-## vLLM Metrics in Prometheus
-
-Prometheus collects vLLM application-level inference metrics including request
-throughput, token counts, latency distributions, and KV cache utilization.
+### Training Metrics in Prometheus

-**vLLM metrics queried from Prometheus (after 10 inference requests)**
+**Training metrics queried from Prometheus**
```
-$ kubectl exec -n monitoring prometheus-kube-prometheus-prometheus-0 -- \
-  wget -qO- 'http://localhost:9090/api/v1/query?query={job="vllm-inference",__name__=~"vllm:.*"}'
-vllm:request_success_total{model_name="Qwen/Qwen3-0.6B"} 10
-vllm:prompt_tokens_total{model_name="Qwen/Qwen3-0.6B"} 80
-vllm:generation_tokens_total{model_name="Qwen/Qwen3-0.6B"} 500
-vllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B"} 10
-vllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B"} 0.205
-vllm:inter_token_latency_seconds_count{model_name="Qwen/Qwen3-0.6B"} 490
-vllm:inter_token_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B"} 0.864
-vllm:prefix_cache_queries_total{model_name="Qwen/Qwen3-0.6B"} 80
-vllm:iteration_tokens_total_sum{model_name="Qwen/Qwen3-0.6B"} 580
+training_step_total = 100
+training_loss = 1.334257
+training_throughput_samples_per_sec = 549228.55
+training_gpu_memory_used_bytes = 79213568
+training_gpu_memory_total_bytes = 85017624576
```

-**Result: PASS** — Prometheus discovers the vLLM inference workload via ServiceMonitor and actively scrapes its Prometheus-format metrics endpoint. Application-level AI inference metrics (request success count, prompt/generation token throughput, time-to-first-token latency, inter-token latency, KV cache usage, prefix cache queries) are collected and queryable in Prometheus.
+**Result: PASS** — Prometheus discovers the PyTorch training workload via ServiceMonitor and actively scrapes its Prometheus-format metrics endpoint. Training-level metrics (step count, loss, throughput, GPU memory) are collected and queryable.

---

## Summary

| Workload | Discovery | Metrics Port | Metrics Type | Result |
|----------|-----------|--------------|--------------|--------|
| **Dynamo vLLM** (inference) | PodMonitor (auto-created) | 9090 (HTTP) | `dynamo_component_*`, `dynamo_frontend_*`, `vllm:*` | **PASS** |
| **PyTorch** (training) | ServiceMonitor | 8080 (HTTP) | `training_step_total`, `training_loss`, `training_throughput_*`, `training_gpu_memory_*` | **PASS** |

## Cleanup

-**Delete test namespace**
+**Delete inference workload**
```
$ kubectl delete ns dynamo-workload
```

**Delete training workload**
```
-$ kubectl delete ns vllm-metrics-test
+$ kubectl delete ns trainer-metrics-test
```
22 changes: 13 additions & 9 deletions pkg/evidence/collector.go
@@ -39,6 +39,7 @@ var ValidFeatures = []string{
	"gang-scheduling",
	"secure-access",
	"accelerator-metrics",
+	"ai-service-metrics",
	"inference-gateway",
	"robust-operator",
	"pod-autoscaling",
@@ -50,7 +51,8 @@ var featureToScript = map[string]string{
	"dra-support": "dra",
	"gang-scheduling": "gang",
	"secure-access": "secure",
-	"accelerator-metrics": "metrics",
+	"accelerator-metrics": "accelerator-metrics",
+	"ai-service-metrics": "service-metrics",
	"inference-gateway": "gateway",
	"robust-operator": "operator",
	"pod-autoscaling": "hpa",
@@ -59,13 +61,14 @@

// featureAliases maps short names to canonical feature names for convenience.
var featureAliases = map[string]string{
"dra": "dra-support",
"gang": "gang-scheduling",
"secure": "secure-access",
"metrics": "accelerator-metrics",
"gateway": "inference-gateway",
"operator": "robust-operator",
"hpa": "pod-autoscaling",
"dra": "dra-support",
"gang": "gang-scheduling",
"secure": "secure-access",
"metrics": "accelerator-metrics",
"service-metrics": "ai-service-metrics",
"gateway": "inference-gateway",
"operator": "robust-operator",
"hpa": "pod-autoscaling",
}

// ResolveFeature returns the canonical feature name, resolving aliases.
@@ -103,7 +106,8 @@ var FeatureDescriptions = map[string]string{
	"dra-support": "DRA GPU allocation test",
	"gang-scheduling": "Gang scheduling co-scheduling test",
	"secure-access": "Secure accelerator access verification",
-	"accelerator-metrics": "Accelerator & AI service metrics",
+	"accelerator-metrics": "Accelerator metrics (DCGM exporter)",
+	"ai-service-metrics": "AI service metrics (Prometheus ServiceMonitor discovery)",
	"inference-gateway": "Inference API gateway conditions",
	"robust-operator": "Robust AI operator + webhook test",
	"pod-autoscaling": "HPA pod autoscaling (scale-up + scale-down)",