Skip to content

Commit 4bf95bd

Browse files
committed
feat(validator): add Go-based CNCF AI conformance checks for aicr validate --phase conformance
Implement 11 conformance checks mapping to CNCF AI Conformance v1.35 MUST requirements. Each check runs inside a K8s Job with in-cluster access, replacing hundreds of lines of bash assertions with structured, testable Go. Checks: platform-health, gpu-operator-health, dra-support, secure-accelerator-access, accelerator-metrics, ai-service-metrics, inference-gateway, gang-scheduling, robust-controller, pod-autoscaling, cluster-autoscaling. Infrastructure: wire buildTestPattern for conformance phase, add DynamicClient to ValidationContext for CRD reads, add 10 read-only RBAC rules, register conformance package in main.go, update 4 recipe overlays with per-intent check lists.
1 parent d474757 commit 4bf95bd

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+4483
-5
lines changed

cmd/aicr/main.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ import (
1919

2020
// Import check packages for side-effect registration.
2121
// Each package's init() function registers its validators.
22+
_ "github.com/NVIDIA/aicr/pkg/validator/checks/conformance"
2223
_ "github.com/NVIDIA/aicr/pkg/validator/checks/deployment"
2324
_ "github.com/NVIDIA/aicr/pkg/validator/checks/readiness"
2425
)

pkg/validator/agent/rbac.go

Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,65 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
118118
Resources: []string{"pods", "services", "nodes"},
119119
Verbs: []string{"get", "list"},
120120
},
121+
// Conformance: cluster-wide core resources (platform-health, robust-controller)
122+
{
123+
APIGroups: []string{""},
124+
Resources: []string{"namespaces", "endpoints"},
125+
Verbs: []string{"get", "list"},
126+
},
127+
// Conformance: CRD discovery (inference-gateway, dra-support, gang-scheduling, robust-controller)
128+
{
129+
APIGroups: []string{"apiextensions.k8s.io"},
130+
Resources: []string{"customresourcedefinitions"},
131+
Verbs: []string{"get", "list"},
132+
},
133+
// Conformance: DRA support validation (resource.k8s.io/v1 — GA)
134+
{
135+
APIGroups: []string{"resource.k8s.io"},
136+
Resources: []string{"resourceslices", "resourceclaims"},
137+
Verbs: []string{"get", "list"},
138+
},
139+
// Conformance: GPU operator ClusterPolicy
140+
{
141+
APIGroups: []string{"nvidia.com"},
142+
Resources: []string{"clusterpolicies"},
143+
Verbs: []string{"get", "list"},
144+
},
145+
// Conformance: Gateway API validation
146+
{
147+
APIGroups: []string{"gateway.networking.k8s.io"},
148+
Resources: []string{"gatewayclasses", "gateways"},
149+
Verbs: []string{"get", "list"},
150+
},
151+
// Conformance: Gang scheduling (KAI scheduler)
152+
{
153+
APIGroups: []string{"scheduling.run.ai"},
154+
Resources: []string{"queues", "podgroups"},
155+
Verbs: []string{"get", "list"},
156+
},
157+
// Conformance: Cluster autoscaling (Karpenter)
158+
{
159+
APIGroups: []string{"karpenter.sh"},
160+
Resources: []string{"nodepools"},
161+
Verbs: []string{"get", "list"},
162+
},
163+
// Conformance: Aggregated metrics APIs (pod-autoscaling, ai-service-metrics)
164+
{
165+
APIGroups: []string{"custom.metrics.k8s.io"},
166+
Resources: []string{"*"},
167+
Verbs: []string{"get", "list"},
168+
},
169+
{
170+
APIGroups: []string{"external.metrics.k8s.io"},
171+
Resources: []string{"*"},
172+
Verbs: []string{"get", "list"},
173+
},
174+
// Conformance: Robust controller — webhook configurations
175+
{
176+
APIGroups: []string{"admissionregistration.k8s.io"},
177+
Resources: []string{"validatingwebhookconfigurations"},
178+
Verbs: []string{"get", "list"},
179+
},
121180
},
122181
}
123182

Lines changed: 67 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,67 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package conformance
16+
17+
import (
18+
"fmt"
19+
"strings"
20+
21+
"github.com/NVIDIA/aicr/pkg/errors"
22+
"github.com/NVIDIA/aicr/pkg/validator/checks"
23+
)
24+
25+
// requiredDCGMMetrics lists the DCGM metric names that must appear in the
// exporter's output to satisfy CNCF AI Conformance requirement #4
// (accelerator metrics). The per-entry descriptions below come from the
// HELP text used in this package's unit-test fixture.
26+
var requiredDCGMMetrics = []string{
27+
"DCGM_FI_DEV_GPU_UTIL", // GPU utilization
28+
"DCGM_FI_DEV_FB_USED", // framebuffer memory used
29+
"DCGM_FI_DEV_GPU_TEMP", // GPU temperature
30+
"DCGM_FI_DEV_POWER_USAGE", // power draw
31+
}
32+
33+
// dcgmExporterURL is the default metrics endpoint queried by
// CheckAcceleratorMetrics: host nvidia-dcgm-exporter.gpu-operator.svc,
// port 9400, path /metrics, addressed by in-cluster DNS name.
// NOTE(review): the host and namespace are hard-coded; clusters that deploy
// the exporter under a different name or namespace would presumably need a
// different URL — confirm against the supported recipes.
const dcgmExporterURL = "http://nvidia-dcgm-exporter.gpu-operator.svc:9400/metrics"
34+
35+
// init registers the "accelerator-metrics" check for the "conformance" phase
// with the checks registry, so importing this package (for example as a
// blank import) is enough to make the check available.
func init() {
36+
checks.RegisterCheck(&checks.Check{
37+
Name: "accelerator-metrics",
38+
Description: "Verify DCGM exporter exposes required GPU metrics (utilization, memory, temperature, power)",
39+
Phase: "conformance",
40+
Func: CheckAcceleratorMetrics,
41+
// TestName is the name of the Go test function associated with this
// check; presumably the Job runner selects the test by this name — confirm.
TestName: "TestAcceleratorMetrics",
42+
})
43+
}
44+
45+
// CheckAcceleratorMetrics validates CNCF requirement #4: Accelerator Metrics.
46+
// It queries the DCGM exporter metrics endpoint (dcgmExporterURL) directly via
47+
// in-cluster DNS and verifies that every metric in requiredDCGMMetrics is present.
//
// It returns nil on success; otherwise it returns the error produced by
// checkAcceleratorMetricsWithURL (endpoint unreachable, or metrics missing).
48+
func CheckAcceleratorMetrics(ctx *checks.ValidationContext) error {
49+
return checkAcceleratorMetricsWithURL(ctx, dcgmExporterURL)
50+
}
51+
52+
// checkAcceleratorMetricsWithURL is the testable implementation that accepts a configurable URL.
//
// It fetches url with httpGet using ctx.Context and checks the response body
// for every name in requiredDCGMMetrics.
//
// Returns:
//   - an ErrCodeUnavailable error wrapping the httpGet failure when the fetch fails;
//   - an ErrCodeNotFound error listing the missing metric names, comma-separated,
//     when containsAllMetrics reports any;
//   - nil when no required metric is reported missing.
53+
func checkAcceleratorMetricsWithURL(ctx *checks.ValidationContext, url string) error {
54+
// Every httpGet failure is reported as "unreachable". The unit test expects
// an HTTP 500 response to surface through this path as well; httpGet is
// defined elsewhere, so confirm how it treats non-2xx statuses.
body, err := httpGet(ctx.Context, url)
55+
if err != nil {
56+
return errors.Wrap(errors.ErrCodeUnavailable,
57+
"DCGM exporter metrics endpoint unreachable", err)
58+
}
59+
60+
// containsAllMetrics yields the required names it did not find in the text;
// an empty result means all required metrics are present.
missing := containsAllMetrics(string(body), requiredDCGMMetrics)
61+
if len(missing) > 0 {
62+
return errors.New(errors.ErrCodeNotFound,
63+
fmt.Sprintf("DCGM metrics missing: %s", strings.Join(missing, ", ")))
64+
}
65+
66+
return nil
67+
}
Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,48 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package conformance
16+
17+
import (
18+
"testing"
19+
20+
"github.com/NVIDIA/aicr/pkg/validator/checks"
21+
)
22+
23+
// TestAcceleratorMetrics is the integration entry point for the
// accelerator-metrics check; its name matches the TestName registered for
// that check. It is skipped in -short mode, when checks.NewTestRunner fails
// (treated as "not running inside the validation Job"), and when the recipe
// does not enable the check. Otherwise it runs CheckAcceleratorMetrics with
// the runner's context and reports any error through t.Errorf.
func TestAcceleratorMetrics(t *testing.T) {
24+
if testing.Short() {
25+
t.Skip("Skipping integration test in short mode")
26+
}
27+
28+
// A runner construction error is taken to mean the test is not executing
// in the Job environment, so the test is skipped rather than failed.
runner, err := checks.NewTestRunner(t)
29+
if err != nil {
30+
t.Skipf("Not in Job environment: %v", err)
31+
}
32+
defer runner.Cancel()
33+
34+
// Only run when the recipe lists this check under the conformance phase.
if !runner.HasCheck("conformance", "accelerator-metrics") {
35+
t.Skip("Check accelerator-metrics not enabled in recipe")
36+
}
37+
38+
t.Logf("Running check: accelerator-metrics")
39+
40+
ctx := runner.Context()
41+
err = CheckAcceleratorMetrics(ctx)
42+
43+
if err != nil {
44+
t.Errorf("Check failed: %v", err)
45+
} else {
46+
t.Logf("Check passed: accelerator-metrics")
47+
}
48+
}
Lines changed: 184 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,184 @@
1+
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package conformance
16+
17+
import (
18+
"context"
19+
"fmt"
20+
"net/http"
21+
"net/http/httptest"
22+
"strings"
23+
"testing"
24+
25+
"github.com/NVIDIA/aicr/pkg/validator/checks"
26+
)
27+
28+
// TestCheckAcceleratorMetrics is a table-driven unit test for
// checkAcceleratorMetricsWithURL. Each case either serves a canned response
// from a local httptest server or points at an address where no server was
// started, then asserts whether an error is returned and, when one is
// expected, that its message contains errContains.
func TestCheckAcceleratorMetrics(t *testing.T) {
29+
// allMetrics is a metrics payload that mentions every name in
// requiredDCGMMetrics (HELP, TYPE and one sample line per metric).
allMetrics := `# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization
30+
# TYPE DCGM_FI_DEV_GPU_UTIL gauge
31+
DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-abc"} 42
32+
# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used
33+
# TYPE DCGM_FI_DEV_FB_USED gauge
34+
DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-abc"} 1024
35+
# HELP DCGM_FI_DEV_GPU_TEMP GPU temperature
36+
# TYPE DCGM_FI_DEV_GPU_TEMP gauge
37+
DCGM_FI_DEV_GPU_TEMP{gpu="0",UUID="GPU-abc"} 65
38+
# HELP DCGM_FI_DEV_POWER_USAGE Power draw
39+
# TYPE DCGM_FI_DEV_POWER_USAGE gauge
40+
DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-abc"} 200
41+
`
42+
43+
// handler == nil means "do not start a server"; errContains is only
// checked when wantErr is true and the string is non-empty.
tests := []struct {
44+
name string
45+
handler http.HandlerFunc
46+
wantErr bool
47+
errContains string
48+
}{
49+
{
50+
name: "all metrics present",
51+
handler: func(w http.ResponseWriter, r *http.Request) {
52+
fmt.Fprint(w, allMetrics)
53+
},
54+
wantErr: false,
55+
},
56+
{
57+
name: "missing one metric",
58+
handler: func(w http.ResponseWriter, r *http.Request) {
59+
// Only 3 of the 4 required metrics are served; DCGM_FI_DEV_POWER_USAGE is omitted.
60+
fmt.Fprint(w, `DCGM_FI_DEV_GPU_UTIL{gpu="0"} 42
61+
DCGM_FI_DEV_FB_USED{gpu="0"} 1024
62+
DCGM_FI_DEV_GPU_TEMP{gpu="0"} 65
63+
`)
64+
},
65+
wantErr: true,
66+
errContains: "DCGM_FI_DEV_POWER_USAGE",
67+
},
68+
{
69+
name: "missing all metrics",
70+
handler: func(w http.ResponseWriter, r *http.Request) {
71+
fmt.Fprint(w, "# no metrics here\n")
72+
},
73+
wantErr: true,
74+
errContains: "DCGM metrics missing",
75+
},
76+
{
77+
name: "server returns 500",
78+
handler: func(w http.ResponseWriter, r *http.Request) {
79+
w.WriteHeader(http.StatusInternalServerError)
80+
},
81+
wantErr: true,
82+
// NOTE(review): "HTTP 500" depends on the error text produced by httpGet,
// which is defined outside this file — confirm the wording there.
errContains: "HTTP 500",
83+
},
84+
{
85+
name: "server unreachable",
86+
handler: nil, // No server started
87+
wantErr: true,
88+
errContains: "DCGM exporter metrics endpoint unreachable",
89+
},
90+
}
91+
92+
for _, tt := range tests {
93+
t.Run(tt.name, func(t *testing.T) {
94+
var url string
95+
if tt.handler != nil {
96+
// The deferred Close runs when this subtest's function returns.
server := httptest.NewServer(tt.handler)
97+
defer server.Close()
98+
url = server.URL + "/metrics"
99+
} else {
100+
// Use an unreachable URL: loopback port 1 is expected to have no
// listener, so the request should fail to connect.
101+
url = "http://127.0.0.1:1/metrics"
102+
}
103+
104+
// Only the Context field is populated; it is the only field that
// checkAcceleratorMetricsWithURL reads from the validation context.
ctx := &checks.ValidationContext{
105+
Context: context.Background(),
106+
}
107+
108+
err := checkAcceleratorMetricsWithURL(ctx, url)
109+
110+
if (err != nil) != tt.wantErr {
111+
t.Errorf("checkAcceleratorMetricsWithURL() error = %v, wantErr %v", err, tt.wantErr)
112+
return
113+
}
114+
115+
if tt.wantErr && err != nil && tt.errContains != "" {
116+
if !strings.Contains(err.Error(), tt.errContains) {
117+
t.Errorf("checkAcceleratorMetricsWithURL() error = %v, should contain %q", err, tt.errContains)
118+
}
119+
}
120+
})
121+
}
122+
}
123+
124+
// TestCheckAcceleratorMetricsRegistration verifies that the package's init
// registered the "accelerator-metrics" check, that its Phase is
// "conformance", and that its Func is set.
func TestCheckAcceleratorMetricsRegistration(t *testing.T) {
125+
check, ok := checks.GetCheck("accelerator-metrics")
126+
if !ok {
127+
// Fatal: the remaining assertions dereference check.
t.Fatal("accelerator-metrics check not registered")
128+
}
129+
if check.Phase != "conformance" {
130+
t.Errorf("Phase = %v, want conformance", check.Phase)
131+
}
132+
if check.Func == nil {
133+
t.Fatal("Func is nil")
134+
}
135+
}
136+
137+
// TestContainsAllMetrics is a table-driven unit test for containsAllMetrics.
// Each case supplies a text and a list of required metric names and expects
// the returned slice to hold exactly the required names absent from the
// text, in the same order as they appear in required.
func TestContainsAllMetrics(t *testing.T) {
138+
tests := []struct {
139+
name string
140+
text string
141+
required []string
142+
want []string
143+
}{
144+
{
145+
name: "all present",
146+
text: "DCGM_FI_DEV_GPU_UTIL 42\nDCGM_FI_DEV_FB_USED 1024",
147+
required: []string{"DCGM_FI_DEV_GPU_UTIL", "DCGM_FI_DEV_FB_USED"},
148+
want: nil,
149+
},
150+
{
151+
name: "one missing",
152+
text: "DCGM_FI_DEV_GPU_UTIL 42",
153+
required: []string{"DCGM_FI_DEV_GPU_UTIL", "DCGM_FI_DEV_FB_USED"},
154+
want: []string{"DCGM_FI_DEV_FB_USED"},
155+
},
156+
{
157+
name: "all missing",
158+
text: "no metrics here",
159+
required: []string{"DCGM_FI_DEV_GPU_UTIL", "DCGM_FI_DEV_FB_USED"},
160+
want: []string{"DCGM_FI_DEV_GPU_UTIL", "DCGM_FI_DEV_FB_USED"},
161+
},
162+
{
163+
name: "empty text",
164+
text: "",
165+
required: []string{"DCGM_FI_DEV_GPU_UTIL"},
166+
want: []string{"DCGM_FI_DEV_GPU_UTIL"},
167+
},
168+
}
169+
170+
for _, tt := range tests {
171+
t.Run(tt.name, func(t *testing.T) {
172+
got := containsAllMetrics(tt.text, tt.required)
173+
// Lengths are compared first, so a nil result and an empty slice are
// treated as equal and the element loop below cannot index out of range.
if len(got) != len(tt.want) {
174+
t.Errorf("containsAllMetrics() = %v, want %v", got, tt.want)
175+
return
176+
}
177+
for i := range got {
178+
if got[i] != tt.want[i] {
179+
t.Errorf("containsAllMetrics()[%d] = %v, want %v", i, got[i], tt.want[i])
180+
}
181+
}
182+
})
183+
}
184+
}

0 commit comments

Comments
 (0)