Skip to content

Commit a93f3d6

Browse files
committed
feat(validator): make secure-accelerator-access check self-contained
Rewrite CheckSecureAcceleratorAccess to programmatically create DRA test resources instead of expecting pre-deployed pods. The check now creates a namespace, ResourceClaim, and GPU test pod, waits for completion, validates DRA access patterns, and cleans up.

- Create dra-test namespace, ResourceClaim, and Pod programmatically
- Poll for pod terminal state with 5-minute timeout (image pull)
- Validate: resourceClaims present, no device plugin, no hostPath, ResourceClaim exists, pod succeeded
- Cleanup: delete pod and claim (skip namespace to avoid finalizer hangs)
- Add DRATestPodTimeout constant to pkg/defaults
- Expand ClusterRole RBAC: create/delete for namespaces, pods, resourceclaims
- Update unit tests with fake client reactors for self-contained flow

Verified on live EKS cluster with H100 GPU: TestSecureAcceleratorAccess PASS (3.60s)
1 parent 709fade commit a93f3d6

File tree

4 files changed

+251
-166
lines changed

4 files changed

+251
-166
lines changed

pkg/defaults/timeouts.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ const (
137137
ComponentRenderTimeout = 60 * time.Second
138138
)
139139

140+
// DRA test timeouts for conformance validation.
const (
	// DRATestPodTimeout bounds how long the conformance check waits for the
	// DRA test pod to reach a terminal phase. The pod itself only runs a
	// quick CUDA device check, but the budget also covers image pull on a
	// cold node.
	DRATestPodTimeout = 5 * time.Minute
)
146+
140147
// Pod operation timeouts for validation and agent operations.
141148
const (
142149
// PodWaitTimeout is the maximum time to wait for pod operations to complete.

pkg/validator/agent/rbac.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -116,13 +116,13 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
116116
{
117117
APIGroups: []string{""},
118118
Resources: []string{"pods", "services", "nodes"},
119-
Verbs: []string{"get", "list"},
119+
Verbs: []string{"get", "list", "create", "delete"},
120120
},
121121
// Conformance: cluster-wide core resources (platform-health, robust-controller)
122122
{
123123
APIGroups: []string{""},
124124
Resources: []string{"namespaces", "endpoints"},
125-
Verbs: []string{"get", "list"},
125+
Verbs: []string{"get", "list", "create", "delete"},
126126
},
127127
// Conformance: CRD discovery (inference-gateway, dra-support, gang-scheduling, robust-controller)
128128
{
@@ -134,7 +134,7 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
134134
{
135135
APIGroups: []string{"resource.k8s.io"},
136136
Resources: []string{"resourceslices", "resourceclaims"},
137-
Verbs: []string{"get", "list"},
137+
Verbs: []string{"get", "list", "create", "delete"},
138138
},
139139
// Conformance: GPU operator ClusterPolicy
140140
{

pkg/validator/checks/conformance/secure_access_check.go

Lines changed: 193 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,27 @@
1515
package conformance
1616

1717
import (
18+
"context"
1819
"fmt"
1920
"strings"
2021

22+
"github.com/NVIDIA/aicr/pkg/defaults"
2123
"github.com/NVIDIA/aicr/pkg/errors"
24+
"github.com/NVIDIA/aicr/pkg/k8s"
2225
"github.com/NVIDIA/aicr/pkg/validator/checks"
2326
corev1 "k8s.io/api/core/v1"
2427
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2529
"k8s.io/apimachinery/pkg/runtime/schema"
30+
"k8s.io/apimachinery/pkg/util/wait"
31+
"k8s.io/client-go/dynamic"
32+
"k8s.io/client-go/kubernetes"
33+
)
34+
35+
const (
	// draTestNamespace is the namespace the check creates for its test resources.
	draTestNamespace = "dra-test"
	// draTestPodName is the name of the GPU test pod the check deploys.
	draTestPodName = "dra-gpu-test"
	// draClaimName is the name of the ResourceClaim requesting one GPU device.
	draClaimName = "gpu-claim"
)
2740

2841
func init() {
@@ -36,29 +49,113 @@ func init() {
3649
}
3750

3851
// CheckSecureAcceleratorAccess validates CNCF requirement #3: Secure Accelerator Access.
39-
// Verifies that a DRA-based GPU workload uses proper access patterns:
52+
// Creates a DRA-based GPU test pod, waits for completion, and verifies proper access patterns:
4053
// resourceClaims instead of device plugin, no hostPath to GPU devices,
4154
// and ResourceClaim is allocated.
4255
func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
4356
if ctx.Clientset == nil {
4457
return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
4558
}
4659

47-
// 1. Get the DRA test pod (deployed by workflow before aicr validate runs)
48-
pod, err := ctx.Clientset.CoreV1().Pods("dra-test").Get(
49-
ctx.Context, "dra-gpu-test", metav1.GetOptions{})
60+
dynClient, err := getDynamicClient(ctx)
61+
if err != nil {
62+
return err
63+
}
64+
65+
// Deploy DRA test resources and ensure cleanup.
66+
if err = deployDRATestResources(ctx.Context, ctx.Clientset, dynClient); err != nil {
67+
return err
68+
}
69+
defer cleanupDRATestResources(ctx.Context, ctx.Clientset, dynClient)
70+
71+
// Wait for test pod to reach terminal state.
72+
pod, err := waitForDRATestPod(ctx.Context, ctx.Clientset)
5073
if err != nil {
51-
return errors.Wrap(errors.ErrCodeNotFound,
52-
"DRA test pod not found (deploy dra-gpu-test.yaml first)", err)
74+
return err
75+
}
76+
77+
// Validate DRA access patterns on the completed pod.
78+
return validateDRAPatterns(ctx.Context, dynClient, pod)
79+
}
80+
81+
// deployDRATestResources creates the namespace, ResourceClaim, and Pod for the DRA test.
82+
func deployDRATestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface) error {
83+
// 1. Create namespace (idempotent).
84+
ns := &corev1.Namespace{
85+
ObjectMeta: metav1.ObjectMeta{Name: draTestNamespace},
86+
}
87+
if _, err := clientset.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}); k8s.IgnoreAlreadyExists(err) != nil {
88+
return errors.Wrap(errors.ErrCodeInternal, "failed to create namespace", err)
89+
}
90+
91+
// 2. Delete existing ResourceClaim if present, then create fresh.
92+
claimGVR := schema.GroupVersionResource{
93+
Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
94+
}
95+
if err := k8s.IgnoreNotFound(dynClient.Resource(claimGVR).Namespace(draTestNamespace).Delete(
96+
ctx, draClaimName, metav1.DeleteOptions{})); err != nil {
97+
return errors.Wrap(errors.ErrCodeInternal, "failed to delete existing ResourceClaim", err)
98+
}
99+
100+
claim := buildResourceClaim()
101+
if _, err := dynClient.Resource(claimGVR).Namespace(draTestNamespace).Create(
102+
ctx, claim, metav1.CreateOptions{}); err != nil {
103+
return errors.Wrap(errors.ErrCodeInternal, "failed to create ResourceClaim", err)
104+
}
105+
106+
// 3. Delete existing Pod if present, then create fresh.
107+
if err := k8s.IgnoreNotFound(clientset.CoreV1().Pods(draTestNamespace).Delete(
108+
ctx, draTestPodName, metav1.DeleteOptions{})); err != nil {
109+
return errors.Wrap(errors.ErrCodeInternal, "failed to delete existing pod", err)
53110
}
54111

55-
// 2. Pod uses resourceClaims (DRA pattern)
112+
pod := buildDRATestPod()
113+
if _, err := clientset.CoreV1().Pods(draTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
114+
return errors.Wrap(errors.ErrCodeInternal, "failed to create DRA test pod", err)
115+
}
116+
117+
return nil
118+
}
119+
120+
// waitForDRATestPod polls until the DRA test pod reaches a terminal state.
121+
func waitForDRATestPod(ctx context.Context, clientset kubernetes.Interface) (*corev1.Pod, error) {
122+
var resultPod *corev1.Pod
123+
124+
waitCtx, cancel := context.WithTimeout(ctx, defaults.DRATestPodTimeout)
125+
defer cancel()
126+
127+
err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
128+
func(ctx context.Context) (bool, error) {
129+
pod, err := clientset.CoreV1().Pods(draTestNamespace).Get(
130+
ctx, draTestPodName, metav1.GetOptions{})
131+
if err != nil {
132+
return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", err)
133+
}
134+
switch pod.Status.Phase { //nolint:exhaustive // only terminal states matter
135+
case corev1.PodSucceeded, corev1.PodFailed:
136+
resultPod = pod
137+
return true, nil
138+
default:
139+
return false, nil
140+
}
141+
},
142+
)
143+
if err != nil {
144+
return nil, errors.Wrap(errors.ErrCodeTimeout, "DRA test pod did not complete", err)
145+
}
146+
147+
return resultPod, nil
148+
}
149+
150+
// validateDRAPatterns verifies the completed pod uses proper DRA access patterns.
151+
func validateDRAPatterns(ctx context.Context, dynClient dynamic.Interface, pod *corev1.Pod) error {
152+
// 1. Pod uses resourceClaims (DRA pattern).
56153
if len(pod.Spec.ResourceClaims) == 0 {
57154
return errors.New(errors.ErrCodeInternal,
58155
"pod does not use DRA resourceClaims")
59156
}
60157

61-
// 3. No nvidia.com/gpu in resources.limits (device plugin pattern)
158+
// 2. No nvidia.com/gpu in resources.limits (device plugin pattern).
62159
for _, c := range pod.Spec.Containers {
63160
if c.Resources.Limits != nil {
64161
if _, hasGPU := c.Resources.Limits["nvidia.com/gpu"]; hasGPU {
@@ -68,31 +165,25 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
68165
}
69166
}
70167

71-
// 4. No hostPath volumes to /dev/nvidia*
168+
// 3. No hostPath volumes to /dev/nvidia*.
72169
for _, vol := range pod.Spec.Volumes {
73170
if vol.HostPath != nil && strings.Contains(vol.HostPath.Path, "/dev/nvidia") {
74171
return errors.New(errors.ErrCodeInternal,
75172
fmt.Sprintf("pod has hostPath volume to %s", vol.HostPath.Path))
76173
}
77174
}
78175

79-
// 5. ResourceClaim exists
80-
dynClient, err := getDynamicClient(ctx)
81-
if err != nil {
82-
return err
83-
}
84-
gvr := schema.GroupVersionResource{
176+
// 4. ResourceClaim exists.
177+
claimGVR := schema.GroupVersionResource{
85178
Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
86179
}
87-
_, err = dynClient.Resource(gvr).Namespace("dra-test").Get(
88-
ctx.Context, "gpu-claim", metav1.GetOptions{})
180+
_, err := dynClient.Resource(claimGVR).Namespace(draTestNamespace).Get(
181+
ctx, draClaimName, metav1.GetOptions{})
89182
if err != nil {
90183
return errors.Wrap(errors.ErrCodeNotFound, "ResourceClaim gpu-claim not found", err)
91184
}
92185

93-
// 6. Pod completed successfully — proves DRA allocation worked.
94-
// Note: status.allocation may be cleared after pod completion, so we verify
95-
// success via the pod phase rather than the claim's allocation status.
186+
// 5. Pod completed successfully — proves DRA allocation worked.
96187
if pod.Status.Phase != corev1.PodSucceeded {
97188
return errors.New(errors.ErrCodeInternal,
98189
fmt.Sprintf("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed",
@@ -101,3 +192,85 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
101192

102193
return nil
103194
}
195+
196+
// cleanupDRATestResources removes test resources. Errors are logged but not returned
197+
// since cleanup failures should not mask test results.
198+
// Note: the namespace is intentionally NOT deleted — it's harmless to leave and
199+
// namespace deletion can hang on DRA finalizers.
200+
func cleanupDRATestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface) {
201+
claimGVR := schema.GroupVersionResource{
202+
Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
203+
}
204+
205+
// Delete pod first (releases claim reservation), then claim.
206+
_ = k8s.IgnoreNotFound(clientset.CoreV1().Pods(draTestNamespace).Delete(
207+
ctx, draTestPodName, metav1.DeleteOptions{}))
208+
_ = k8s.IgnoreNotFound(dynClient.Resource(claimGVR).Namespace(draTestNamespace).Delete(
209+
ctx, draClaimName, metav1.DeleteOptions{}))
210+
}
211+
212+
// buildDRATestPod returns the Pod spec for the DRA GPU allocation test.
213+
func buildDRATestPod() *corev1.Pod {
214+
return &corev1.Pod{
215+
ObjectMeta: metav1.ObjectMeta{
216+
Name: draTestPodName,
217+
Namespace: draTestNamespace,
218+
},
219+
Spec: corev1.PodSpec{
220+
RestartPolicy: corev1.RestartPolicyNever,
221+
Tolerations: []corev1.Toleration{
222+
{Operator: corev1.TolerationOpExists},
223+
},
224+
ResourceClaims: []corev1.PodResourceClaim{
225+
{
226+
Name: "gpu",
227+
ResourceClaimName: strPtr(draClaimName),
228+
},
229+
},
230+
Containers: []corev1.Container{
231+
{
232+
Name: "gpu-test",
233+
Image: "nvidia/cuda:12.9.0-base-ubuntu24.04",
234+
Command: []string{"bash", "-c", "ls /dev/nvidia* && echo 'DRA GPU allocation successful'"},
235+
Resources: corev1.ResourceRequirements{
236+
Claims: []corev1.ResourceClaim{
237+
{Name: "gpu"},
238+
},
239+
},
240+
},
241+
},
242+
},
243+
}
244+
}
245+
246+
// buildResourceClaim returns the unstructured ResourceClaim for the DRA test.
247+
func buildResourceClaim() *unstructured.Unstructured {
248+
return &unstructured.Unstructured{
249+
Object: map[string]interface{}{
250+
"apiVersion": "resource.k8s.io/v1",
251+
"kind": "ResourceClaim",
252+
"metadata": map[string]interface{}{
253+
"name": draClaimName,
254+
"namespace": draTestNamespace,
255+
},
256+
"spec": map[string]interface{}{
257+
"devices": map[string]interface{}{
258+
"requests": []interface{}{
259+
map[string]interface{}{
260+
"name": "gpu",
261+
"exactly": map[string]interface{}{
262+
"deviceClassName": "gpu.nvidia.com",
263+
"allocationMode": "ExactCount",
264+
"count": int64(1),
265+
},
266+
},
267+
},
268+
},
269+
},
270+
},
271+
}
272+
}
273+
274+
// strPtr returns a pointer to a copy of the supplied string, as required by
// optional string fields in the Kubernetes API types.
func strPtr(s string) *string {
	v := s
	return &v
}

0 commit comments

Comments
 (0)