Skip to content

Commit ea16a33

Browse files
authored
feat(conformance): enrich evidence with observed cluster state (#213)
1 parent 1f1758a commit ea16a33

17 files changed

+1759
-312
lines changed

pkg/defaults/timeouts.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -228,6 +228,14 @@ const (
228228
ArtifactMaxPerCheck = 20
229229
)
230230

231+
// HTTP response limits for conformance checks.
232+
const (
233+
// HTTPResponseBodyLimit is the maximum size in bytes for HTTP response bodies
234+
// read by conformance checks (e.g., Prometheus metric scrapes). Prevents
235+
// unbounded reads from in-cluster services.
236+
HTTPResponseBodyLimit = 1 * 1024 * 1024 // 1 MiB
237+
)
238+
231239
// Job configuration constants.
232240
const (
233241
// JobTTLAfterFinished is the time-to-live for completed Jobs.

pkg/validator/checks/conformance/ai_service_metrics_check.go

Lines changed: 39 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ package conformance
1717
import (
1818
"encoding/json"
1919
"fmt"
20+
"strings"
2021

2122
"github.com/NVIDIA/aicr/pkg/errors"
2223
"github.com/NVIDIA/aicr/pkg/validator/checks"
@@ -60,16 +61,20 @@ func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL str
6061
}
6162

6263
var promResp struct {
63-
Data struct {
64+
Status string `json:"status"`
65+
Data struct {
6466
Result []json.RawMessage `json:"result"`
6567
} `json:"data"`
6668
}
6769
if err := json.Unmarshal(body, &promResp); err != nil {
6870
return errors.Wrap(errors.ErrCodeInternal, "failed to parse Prometheus response", err)
6971
}
7072

71-
recordArtifact(ctx, "Prometheus Query: DCGM_FI_DEV_GPU_UTIL",
72-
fmt.Sprintf("Endpoint: %s\nTime series count: %d", queryURL, len(promResp.Data.Result)))
73+
recordRawTextArtifact(ctx, "Prometheus Query: DCGM_FI_DEV_GPU_UTIL",
74+
fmt.Sprintf("curl -sf '%s'", queryURL),
75+
fmt.Sprintf("Status: %s\nTime series count: %d", valueOrUnknown(promResp.Status), len(promResp.Data.Result)))
76+
recordChunkedTextArtifact(ctx, "Prometheus query response (GPU util)",
77+
fmt.Sprintf("curl -sf '%s'", queryURL), string(body))
7378

7479
if len(promResp.Data.Result) == 0 {
7580
return errors.New(errors.ErrCodeNotFound,
@@ -83,34 +88,42 @@ func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL str
8388
return errors.New(errors.ErrCodeInternal, "discovery REST client is not available")
8489
}
8590
result := restClient.Get().AbsPath(rawURL).Do(ctx.Context)
86-
var statusCode int
87-
result.StatusCode(&statusCode)
8891
if cmErr := result.Error(); cmErr != nil {
89-
recordArtifact(ctx, "Custom Metrics API",
90-
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nStatus: unavailable\nError: %v",
91-
rawURL, statusCode, cmErr))
92+
recordRawTextArtifact(ctx, "Custom Metrics API",
93+
"kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1",
94+
fmt.Sprintf("Status: unavailable\nError: %v", cmErr))
9295
return errors.Wrap(errors.ErrCodeNotFound,
9396
"custom metrics API not available", cmErr)
9497
}
95-
96-
groupVersion := "unknown"
97-
resourceCount := 0
98-
discoveryBody, rawErr := result.Raw()
99-
if rawErr == nil {
100-
var discovery struct {
101-
GroupVersion string `json:"groupVersion"`
102-
Resources []json.RawMessage `json:"resources"`
103-
}
104-
if json.Unmarshal(discoveryBody, &discovery) == nil {
105-
if discovery.GroupVersion != "" {
106-
groupVersion = discovery.GroupVersion
107-
}
108-
resourceCount = len(discovery.Resources)
109-
}
98+
var statusCode int
99+
result.StatusCode(&statusCode)
100+
rawBody, rawErr := result.Raw()
101+
if rawErr != nil {
102+
return errors.Wrap(errors.ErrCodeInternal, "failed to read custom metrics API response", rawErr)
103+
}
104+
var customMetricsResp struct {
105+
GroupVersion string `json:"groupVersion"`
106+
Resources []struct {
107+
Name string `json:"name"`
108+
Namespaced bool `json:"namespaced"`
109+
} `json:"resources"`
110+
}
111+
if err := json.Unmarshal(rawBody, &customMetricsResp); err != nil {
112+
return errors.Wrap(errors.ErrCodeInternal, "failed to parse custom metrics API response", err)
113+
}
114+
var resources strings.Builder
115+
limit := len(customMetricsResp.Resources)
116+
if limit > 20 {
117+
limit = 20
118+
}
119+
for i := 0; i < limit; i++ {
120+
r := customMetricsResp.Resources[i]
121+
fmt.Fprintf(&resources, "- %s (namespaced=%t)\n", r.Name, r.Namespaced)
110122
}
111-
recordArtifact(ctx, "Custom Metrics API",
112-
fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nGroupVersion: %s\nAPI Resources: %d\nStatus: available",
113-
rawURL, statusCode, groupVersion, resourceCount))
123+
recordRawTextArtifact(ctx, "Custom Metrics API",
124+
"kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1",
125+
fmt.Sprintf("HTTP Status: %d\nGroupVersion: %s\nResource count: %d\n\nResources:\n%s",
126+
statusCode, valueOrUnknown(customMetricsResp.GroupVersion), len(customMetricsResp.Resources), resources.String()))
114127

115128
return nil
116129
}

0 commit comments

Comments
 (0)