Skip to content

Commit 3b9c189

Browse files
committed
feat(conformance): enrich metrics, gateway, operator, and HPA evidence
1 parent b2c352e commit 3b9c189

File tree

8 files changed

+391
-155
lines changed

8 files changed

+391
-155
lines changed

pkg/validator/checks/conformance/ai_service_metrics_check.go

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package conformance
1717
import (
1818
"encoding/json"
1919
"fmt"
20+
"strings"
2021

2122
"github.com/NVIDIA/aicr/pkg/errors"
2223
"github.com/NVIDIA/aicr/pkg/validator/checks"
@@ -60,16 +61,20 @@ func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL str
6061
}
6162

6263
var promResp struct {
63-
Data struct {
64+
Status string `json:"status"`
65+
Data struct {
6466
Result []json.RawMessage `json:"result"`
6567
} `json:"data"`
6668
}
6769
if err := json.Unmarshal(body, &promResp); err != nil {
6870
return errors.Wrap(errors.ErrCodeInternal, "failed to parse Prometheus response", err)
6971
}
7072

71-
recordArtifact(ctx, "Prometheus Query: DCGM_FI_DEV_GPU_UTIL",
72-
fmt.Sprintf("Endpoint: %s\nTime series count: %d", queryURL, len(promResp.Data.Result)))
73+
recordRawTextArtifact(ctx, "Prometheus Query: DCGM_FI_DEV_GPU_UTIL",
74+
fmt.Sprintf("curl -sf '%s'", queryURL),
75+
fmt.Sprintf("Status: %s\nTime series count: %d", valueOrUnknown(promResp.Status), len(promResp.Data.Result)))
76+
recordChunkedTextArtifact(ctx, "Prometheus query response (GPU util)",
77+
fmt.Sprintf("curl -sf '%s'", queryURL), string(body))
7378

7479
if len(promResp.Data.Result) == 0 {
7580
return errors.New(errors.ErrCodeNotFound,
@@ -83,34 +88,42 @@ func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL str
8388
return errors.New(errors.ErrCodeInternal, "discovery REST client is not available")
8489
}
8590
result := restClient.Get().AbsPath(rawURL).Do(ctx.Context)
86-
var statusCode int
87-
result.StatusCode(&statusCode)
8891
if cmErr := result.Error(); cmErr != nil {
89-
recordArtifact(ctx, "Custom Metrics API",
90-
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nStatus: unavailable\nError: %v",
91-
rawURL, statusCode, cmErr))
92+
recordRawTextArtifact(ctx, "Custom Metrics API",
93+
"kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1",
94+
fmt.Sprintf("Status: unavailable\nError: %v", cmErr))
9295
return errors.Wrap(errors.ErrCodeNotFound,
9396
"custom metrics API not available", cmErr)
9497
}
95-
96-
groupVersion := "unknown"
97-
resourceCount := 0
98-
discoveryBody, rawErr := result.Raw()
99-
if rawErr == nil {
100-
var discovery struct {
101-
GroupVersion string `json:"groupVersion"`
102-
Resources []json.RawMessage `json:"resources"`
103-
}
104-
if json.Unmarshal(discoveryBody, &discovery) == nil {
105-
if discovery.GroupVersion != "" {
106-
groupVersion = discovery.GroupVersion
107-
}
108-
resourceCount = len(discovery.Resources)
109-
}
98+
var statusCode int
99+
result.StatusCode(&statusCode)
100+
rawBody, rawErr := result.Raw()
101+
if rawErr != nil {
102+
return errors.Wrap(errors.ErrCodeInternal, "failed to read custom metrics API response", rawErr)
103+
}
104+
var customMetricsResp struct {
105+
GroupVersion string `json:"groupVersion"`
106+
Resources []struct {
107+
Name string `json:"name"`
108+
Namespaced bool `json:"namespaced"`
109+
} `json:"resources"`
110+
}
111+
if err := json.Unmarshal(rawBody, &customMetricsResp); err != nil {
112+
return errors.Wrap(errors.ErrCodeInternal, "failed to parse custom metrics API response", err)
113+
}
114+
var resources strings.Builder
115+
limit := len(customMetricsResp.Resources)
116+
if limit > 20 {
117+
limit = 20
118+
}
119+
for i := 0; i < limit; i++ {
120+
r := customMetricsResp.Resources[i]
121+
fmt.Fprintf(&resources, "- %s (namespaced=%t)\n", r.Name, r.Namespaced)
110122
}
111-
recordArtifact(ctx, "Custom Metrics API",
112-
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nGroupVersion: %s\nAPI Resources: %d\nStatus: available",
113-
rawURL, statusCode, groupVersion, resourceCount))
123+
recordRawTextArtifact(ctx, "Custom Metrics API",
124+
"kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1",
125+
fmt.Sprintf("HTTP Status: %d\nGroupVersion: %s\nResource count: %d\n\nResources:\n%s",
126+
statusCode, valueOrUnknown(customMetricsResp.GroupVersion), len(customMetricsResp.Resources), resources.String()))
114127

115128
return nil
116129
}

pkg/validator/checks/conformance/dra_support_check.go

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -192,21 +192,3 @@ func CheckDRASupport(ctx *checks.ValidationContext) error {
192192

193193
return nil
194194
}
195-
196-
func valueOrUnknown(v string) string {
197-
if strings.TrimSpace(v) == "" {
198-
return "unknown"
199-
}
200-
return v
201-
}
202-
203-
func podReadyCount(pod corev1.Pod) string {
204-
var ready, total int
205-
for _, cs := range pod.Status.ContainerStatuses {
206-
total++
207-
if cs.Ready {
208-
ready++
209-
}
210-
}
211-
return fmt.Sprintf("%d/%d", ready, total)
212-
}

pkg/validator/checks/conformance/helpers.go

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,21 @@ func httpGet(ctx context.Context, url string) ([]byte, error) {
7474
return io.ReadAll(resp.Body)
7575
}
7676

77+
// checkCondition verifies a status condition on an unstructured object.
78+
func checkCondition(obj *unstructured.Unstructured, condType, expectedStatus string) error {
79+
obs, err := getConditionObservation(obj, condType)
80+
if err != nil {
81+
return err
82+
}
83+
if obs.Status == expectedStatus {
84+
return nil
85+
}
86+
return errors.New(errors.ErrCodeInternal,
87+
fmt.Sprintf("condition %s=%v (want %s)", condType, obs.Status, expectedStatus))
88+
}
89+
7790
type conditionObservation struct {
91+
Type string
7892
Status string
7993
Reason string
8094
Message string
@@ -91,29 +105,29 @@ func getConditionObservation(obj *unstructured.Unstructured, condType string) (*
91105
if !ok {
92106
continue
93107
}
94-
condName, _ := cond["type"].(string)
95-
if condName != condType {
108+
109+
kind, _, _ := unstructured.NestedString(cond, "type")
110+
if kind != condType {
96111
continue
97112
}
98113

99-
status, _ := cond["status"].(string)
114+
status, foundStatus, _ := unstructured.NestedString(cond, "status")
115+
if !foundStatus {
116+
if v, ok := cond["status"]; ok {
117+
status = fmt.Sprintf("%v", v)
118+
}
119+
}
120+
reason, _, _ := unstructured.NestedString(cond, "reason")
121+
message, _, _ := unstructured.NestedString(cond, "message")
100122
return &conditionObservation{
101-
Status: status,
102-
Reason: stringFieldOrDefault(cond, "reason", "not-reported"),
103-
Message: stringFieldOrDefault(cond, "message", "not-reported"),
123+
Type: condType,
124+
Status: valueOrUnknown(status),
125+
Reason: valueOrUnknown(reason),
126+
Message: valueOrUnknown(message),
104127
}, nil
105128
}
106129

107-
return nil, errors.New(errors.ErrCodeNotFound,
108-
fmt.Sprintf("condition %s not found", condType))
109-
}
110-
111-
func stringFieldOrDefault(obj map[string]interface{}, key, fallback string) string {
112-
v, _ := obj[key].(string)
113-
if v == "" {
114-
return fallback
115-
}
116-
return v
130+
return nil, errors.New(errors.ErrCodeNotFound, fmt.Sprintf("condition %s not found", condType))
117131
}
118132

119133
// verifyDeploymentAvailable checks that a Deployment has at least one available replica.
@@ -262,6 +276,24 @@ func firstContainerImage(containers []corev1.Container) string {
262276
return "unknown"
263277
}
264278

279+
func valueOrUnknown(v string) string {
280+
if strings.TrimSpace(v) == "" {
281+
return "unknown"
282+
}
283+
return v
284+
}
285+
286+
func podReadyCount(pod corev1.Pod) string {
287+
var ready, total int
288+
for _, cs := range pod.Status.ContainerStatuses {
289+
total++
290+
if cs.Ready {
291+
ready++
292+
}
293+
}
294+
return fmt.Sprintf("%d/%d", ready, total)
295+
}
296+
265297
// truncateLines limits text to at most n lines, appending a truncation marker if needed.
266298
func truncateLines(text string, n int) string {
267299
lines := strings.SplitN(text, "\n", n+1)

pkg/validator/checks/conformance/inference_gateway_check.go

Lines changed: 80 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,11 @@ var httpRouteGVR = schema.GroupVersionResource{
3131
}
3232

3333
type gatewayDataPlaneReport struct {
34-
ListenerAttachedRoutes []string
35-
AttachedHTTPRoutes int
36-
MatchingEndpointSlices int
37-
ReadyEndpoints int
34+
ListenerCount int
35+
AttachedHTTPRoutes int
36+
TotalHTTPRoutes int
37+
MatchingEndpointSlice int
38+
ReadyEndpoints int
3839
}
3940

4041
func init() {
@@ -61,6 +62,8 @@ func CheckInferenceGateway(ctx *checks.ValidationContext) error {
6162
return err
6263
}
6364

65+
collectGatewayControlPlaneArtifacts(ctx)
66+
6467
// 1. GatewayClass "kgateway" accepted
6568
gcGVR := schema.GroupVersionResource{
6669
Group: "gateway.networking.k8s.io", Version: "v1", Resource: "gatewayclasses",
@@ -74,13 +77,15 @@ func CheckInferenceGateway(ctx *checks.ValidationContext) error {
7477
return errors.Wrap(errors.ErrCodeInternal, "GatewayClass not accepted", condErr)
7578
}
7679
if gcCond.Status != "True" {
77-
return errors.Wrap(errors.ErrCodeInternal, "GatewayClass not accepted",
78-
errors.New(errors.ErrCodeInternal,
79-
fmt.Sprintf("condition Accepted=%s (want True)", gcCond.Status)))
80+
return errors.New(errors.ErrCodeInternal,
81+
fmt.Sprintf("GatewayClass not accepted: status=%s reason=%s message=%s",
82+
gcCond.Status, gcCond.Reason, gcCond.Message))
8083
}
81-
recordArtifact(ctx, "GatewayClass Status",
82-
fmt.Sprintf("Name: %s\nAccepted: %s\nReason: %s\nMessage: %s",
83-
gc.GetName(), gcCond.Status, gcCond.Reason, gcCond.Message))
84+
controllerName, _, _ := unstructured.NestedString(gc.Object, "spec", "controllerName")
85+
recordRawTextArtifact(ctx, "GatewayClass",
86+
"kubectl get gatewayclass kgateway -o yaml",
87+
fmt.Sprintf("Name: %s\nControllerName: %s\nAccepted: %s\nReason: %s\nMessage: %s",
88+
gc.GetName(), valueOrUnknown(controllerName), gcCond.Status, gcCond.Reason, gcCond.Message))
8489

8590
// 2. Gateway "inference-gateway" programmed
8691
gwGVR := schema.GroupVersionResource{
@@ -96,13 +101,21 @@ func CheckInferenceGateway(ctx *checks.ValidationContext) error {
96101
return errors.Wrap(errors.ErrCodeInternal, "Gateway not programmed", condErr)
97102
}
98103
if gwCond.Status != "True" {
99-
return errors.Wrap(errors.ErrCodeInternal, "Gateway not programmed",
100-
errors.New(errors.ErrCodeInternal,
101-
fmt.Sprintf("condition Programmed=%s (want True)", gwCond.Status)))
104+
return errors.New(errors.ErrCodeInternal,
105+
fmt.Sprintf("Gateway not programmed: status=%s reason=%s message=%s",
106+
gwCond.Status, gwCond.Reason, gwCond.Message))
107+
}
108+
addresses, found, _ := unstructured.NestedSlice(gw.Object, "status", "addresses")
109+
addressCount := 0
110+
if found {
111+
addressCount = len(addresses)
102112
}
103-
recordArtifact(ctx, "Gateway Status",
104-
fmt.Sprintf("Name: %s\nNamespace: %s\nProgrammed: %s\nReason: %s\nMessage: %s",
105-
gw.GetName(), gw.GetNamespace(), gwCond.Status, gwCond.Reason, gwCond.Message))
113+
recordRawTextArtifact(ctx, "Gateways",
114+
"kubectl get gateways -A",
115+
fmt.Sprintf("Name: %s/%s\nProgrammed: %s\nReason: %s\nMessage: %s\nAddressCount: %d",
116+
gw.GetNamespace(), gw.GetName(), gwCond.Status, gwCond.Reason, gwCond.Message, addressCount))
117+
recordObjectYAMLArtifact(ctx, "Gateway details",
118+
"kubectl get gateway inference-gateway -n kgateway-system -o yaml", gw.Object)
106119

107120
// 3. Required CRDs exist
108121
crdGVR := schema.GroupVersionResource{
@@ -115,28 +128,25 @@ func CheckInferenceGateway(ctx *checks.ValidationContext) error {
115128
}
116129
var crdSummary strings.Builder
117130
for _, crdName := range requiredCRDs {
118-
_, crdErr := dynClient.Resource(crdGVR).Get(ctx.Context, crdName, metav1.GetOptions{})
119-
if crdErr != nil {
131+
_, err := dynClient.Resource(crdGVR).Get(ctx.Context, crdName, metav1.GetOptions{})
132+
if err != nil {
120133
return errors.Wrap(errors.ErrCodeNotFound,
121-
fmt.Sprintf("CRD %s not found", crdName), crdErr)
134+
fmt.Sprintf("CRD %s not found", crdName), err)
122135
}
123136
fmt.Fprintf(&crdSummary, " %s: present\n", crdName)
124137
}
125-
recordArtifact(ctx, "Required CRDs", crdSummary.String())
138+
recordRawTextArtifact(ctx, "Required CRDs", "", crdSummary.String())
126139

127140
// 4. Gateway data-plane readiness (behavioral validation).
128-
dpReport, err := validateGatewayDataPlane(ctx)
141+
report, err := validateGatewayDataPlane(ctx)
129142
if err != nil {
130143
return err
131144
}
132-
133-
listenerSummary := "none"
134-
if len(dpReport.ListenerAttachedRoutes) > 0 {
135-
listenerSummary = strings.Join(dpReport.ListenerAttachedRoutes, ", ")
136-
}
137-
recordArtifact(ctx, "Gateway Data Plane",
138-
fmt.Sprintf("Listeners: %s\nAttached HTTPRoutes: %d\nMatching EndpointSlices: %d\nReady Endpoints: %d",
139-
listenerSummary, dpReport.AttachedHTTPRoutes, dpReport.MatchingEndpointSlices, dpReport.ReadyEndpoints))
145+
recordRawTextArtifact(ctx, "Gateway Data Plane",
146+
"kubectl get endpointslices -n kgateway-system",
147+
fmt.Sprintf("Listeners: %d\nAttached HTTPRoutes: %d\nHTTPRoutes (all): %d\nMatching EndpointSlices: %d\nReady endpoints: %d",
148+
report.ListenerCount, report.AttachedHTTPRoutes, report.TotalHTTPRoutes,
149+
report.MatchingEndpointSlice, report.ReadyEndpoints))
140150
return nil
141151
}
142152

@@ -164,12 +174,12 @@ func validateGatewayDataPlane(ctx *checks.ValidationContext) (*gatewayDataPlaneR
164174
if gwErr == nil {
165175
listeners, found, _ := unstructured.NestedSlice(gw.Object, "status", "listeners")
166176
if found {
177+
report.ListenerCount = len(listeners)
167178
for _, l := range listeners {
168179
if lMap, ok := l.(map[string]interface{}); ok {
169180
name, _, _ := unstructured.NestedString(lMap, "name")
170181
attached, _, _ := unstructured.NestedInt64(lMap, "attachedRoutes")
171-
report.ListenerAttachedRoutes = append(report.ListenerAttachedRoutes,
172-
fmt.Sprintf("%s=%d", name, attached))
182+
report.AttachedHTTPRoutes += int(attached)
173183
slog.Info("gateway listener status", "listener", name, "attachedRoutes", attached)
174184
}
175185
}
@@ -180,6 +190,7 @@ func validateGatewayDataPlane(ctx *checks.ValidationContext) (*gatewayDataPlaneR
180190
httpRouteList, listErr := dynClient.Resource(httpRouteGVR).Namespace("").List(
181191
ctx.Context, metav1.ListOptions{})
182192
if listErr == nil {
193+
report.TotalHTTPRoutes = len(httpRouteList.Items)
183194
var attached int
184195
for _, route := range httpRouteList.Items {
185196
parentRefs, found, _ := unstructured.NestedSlice(route.Object, "spec", "parentRefs")
@@ -215,7 +226,7 @@ func validateGatewayDataPlane(ctx *checks.ValidationContext) (*gatewayDataPlaneR
215226
if !strings.Contains(svcName, "inference-gateway") {
216227
continue
217228
}
218-
report.MatchingEndpointSlices++
229+
report.MatchingEndpointSlice++
219230
for _, ep := range slice.Endpoints {
220231
if ep.Conditions.Ready != nil && *ep.Conditions.Ready {
221232
report.ReadyEndpoints++
@@ -230,3 +241,40 @@ func validateGatewayDataPlane(ctx *checks.ValidationContext) (*gatewayDataPlaneR
230241

231242
return report, nil
232243
}
244+
245+
func collectGatewayControlPlaneArtifacts(ctx *checks.ValidationContext) {
246+
if ctx.Clientset == nil {
247+
return
248+
}
249+
250+
deploys, deployErr := ctx.Clientset.AppsV1().Deployments("kgateway-system").List(
251+
ctx.Context, metav1.ListOptions{})
252+
if deployErr != nil {
253+
recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system",
254+
fmt.Sprintf("failed to list deployments: %v", deployErr))
255+
} else {
256+
var deploymentSummary strings.Builder
257+
for _, d := range deploys.Items {
258+
expected := int32(1)
259+
if d.Spec.Replicas != nil {
260+
expected = *d.Spec.Replicas
261+
}
262+
fmt.Fprintf(&deploymentSummary, "%-40s available=%d/%d image=%s\n",
263+
d.Name, d.Status.AvailableReplicas, expected, firstContainerImage(d.Spec.Template.Spec.Containers))
264+
}
265+
recordRawTextArtifact(ctx, "kgateway deployments", "kubectl get deploy -n kgateway-system", deploymentSummary.String())
266+
}
267+
268+
pods, podErr := ctx.Clientset.CoreV1().Pods("kgateway-system").List(ctx.Context, metav1.ListOptions{})
269+
if podErr != nil {
270+
recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system",
271+
fmt.Sprintf("failed to list pods: %v", podErr))
272+
return
273+
}
274+
var podSummary strings.Builder
275+
for _, pod := range pods.Items {
276+
fmt.Fprintf(&podSummary, "%-48s ready=%s phase=%s node=%s\n",
277+
pod.Name, podReadyCount(pod), pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName))
278+
}
279+
recordRawTextArtifact(ctx, "kgateway pods", "kubectl get pods -n kgateway-system", podSummary.String())
280+
}

0 commit comments

Comments
 (0)