Skip to content

Commit c6dbeb7

Browse files
committed
feat(validator): make secure-accelerator-access check self-contained
Rewrite CheckSecureAcceleratorAccess to programmatically create DRA test resources instead of expecting pre-deployed pods. The check now creates a namespace, ResourceClaim, and GPU test pod, waits for completion, validates DRA access patterns, and cleans up.

- Create dra-test namespace, ResourceClaim, and Pod programmatically
- Poll for pod terminal state with 5-minute timeout (image pull)
- Validate: resourceClaims present, no device plugin, no hostPath, ResourceClaim exists, pod succeeded
- Cleanup: delete pod and claim (skip namespace to avoid finalizer hangs)
- Add DRATestPodTimeout constant to pkg/defaults
- Expand ClusterRole RBAC: create/delete for namespaces, pods, resourceclaims
- Update unit tests with fake client reactors for self-contained flow

Verified on live EKS cluster with H100 GPU: TestSecureAcceleratorAccess PASS (3.60s)
1 parent 709fade commit c6dbeb7

File tree

6 files changed

+317
-171
lines changed

6 files changed

+317
-171
lines changed

pkg/defaults/timeouts.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,13 @@ const (
137137
ComponentRenderTimeout = 60 * time.Second
138138
)
139139

140+
// DRA test timeouts for conformance validation.
const (
	// DRATestPodTimeout is the timeout for the DRA test pod to complete.
	// The pod runs a simple CUDA device check but may need time for image pull,
	// so this is deliberately generous (5 minutes).
	DRATestPodTimeout = 5 * time.Minute
)
146+
140147
// Pod operation timeouts for validation and agent operations.
141148
const (
142149
// PodWaitTimeout is the maximum time to wait for pod operations to complete.

pkg/validator/agent/rbac.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,17 @@ func (d *Deployer) ensureClusterRole(ctx context.Context) error {
136136
Resources: []string{"resourceslices", "resourceclaims"},
137137
Verbs: []string{"get", "list"},
138138
},
139+
// Conformance: secure-accelerator-access — DRA test pod lifecycle
140+
{
141+
APIGroups: []string{""},
142+
Resources: []string{"namespaces", "pods"},
143+
Verbs: []string{"create", "delete"},
144+
},
145+
{
146+
APIGroups: []string{"resource.k8s.io"},
147+
Resources: []string{"resourceclaims"},
148+
Verbs: []string{"create", "delete"},
149+
},
139150
// Conformance: GPU operator ClusterPolicy
140151
{
141152
APIGroups: []string{"nvidia.com"},

pkg/validator/checks/conformance/secure_access_check.go

Lines changed: 232 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,54 @@
1515
package conformance
1616

1717
import (
18+
"context"
19+
"crypto/rand"
20+
"encoding/hex"
1821
"fmt"
1922
"strings"
2023

24+
"github.com/NVIDIA/aicr/pkg/defaults"
2125
"github.com/NVIDIA/aicr/pkg/errors"
26+
"github.com/NVIDIA/aicr/pkg/k8s"
2227
"github.com/NVIDIA/aicr/pkg/validator/checks"
2328
corev1 "k8s.io/api/core/v1"
29+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2430
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2532
"k8s.io/apimachinery/pkg/runtime/schema"
33+
"k8s.io/apimachinery/pkg/util/wait"
34+
"k8s.io/client-go/dynamic"
35+
"k8s.io/client-go/kubernetes"
2636
)
2737

38+
// Fixed namespace and name prefixes for the resources created by the
// secure-accelerator-access conformance check.
const (
	// draTestNamespace is the namespace all DRA test resources are created in.
	draTestNamespace = "dra-test"
	// draTestPrefix prefixes the per-run test pod name.
	draTestPrefix = "dra-gpu-test-"
	// draClaimPrefix prefixes the per-run ResourceClaim name.
	draClaimPrefix = "gpu-claim-"
)
43+
44+
// draTestRun holds per-invocation resource names to avoid collisions.
type draTestRun struct {
	podName   string // unique test pod name (draTestPrefix + random hex suffix)
	claimName string // unique ResourceClaim name (draClaimPrefix + random hex suffix)
}
49+
50+
func newDRATestRun() (*draTestRun, error) {
51+
b := make([]byte, 4)
52+
if _, err := rand.Read(b); err != nil {
53+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to generate random suffix", err)
54+
}
55+
suffix := hex.EncodeToString(b)
56+
return &draTestRun{
57+
podName: draTestPrefix + suffix,
58+
claimName: draClaimPrefix + suffix,
59+
}, nil
60+
}
61+
62+
// claimGVR identifies the resource.k8s.io/v1 ResourceClaim resource for
// dynamic-client access.
var claimGVR = schema.GroupVersionResource{
	Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
}
65+
2866
func init() {
2967
checks.RegisterCheck(&checks.Check{
3068
Name: "secure-accelerator-access",
@@ -36,29 +74,109 @@ func init() {
3674
}
3775

3876
// CheckSecureAcceleratorAccess validates CNCF requirement #3: Secure Accelerator Access.
39-
// Verifies that a DRA-based GPU workload uses proper access patterns:
40-
// resourceClaims instead of device plugin, no hostPath to GPU devices,
41-
// and ResourceClaim is allocated.
77+
// Creates a DRA-based GPU test pod with unique names, waits for completion, and verifies
78+
// proper access patterns: resourceClaims instead of device plugin, no hostPath to GPU
79+
// devices, and ResourceClaim is allocated.
4280
func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
4381
if ctx.Clientset == nil {
4482
return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
4583
}
4684

47-
// 1. Get the DRA test pod (deployed by workflow before aicr validate runs)
48-
pod, err := ctx.Clientset.CoreV1().Pods("dra-test").Get(
49-
ctx.Context, "dra-gpu-test", metav1.GetOptions{})
85+
dynClient, err := getDynamicClient(ctx)
5086
if err != nil {
51-
return errors.Wrap(errors.ErrCodeNotFound,
52-
"DRA test pod not found (deploy dra-gpu-test.yaml first)", err)
87+
return err
88+
}
89+
90+
run, err := newDRATestRun()
91+
if err != nil {
92+
return err
93+
}
94+
95+
// Deploy DRA test resources and ensure cleanup.
96+
if err = deployDRATestResources(ctx.Context, ctx.Clientset, dynClient, run); err != nil {
97+
return err
5398
}
99+
defer cleanupDRATestResources(ctx.Context, ctx.Clientset, dynClient, run)
54100

55-
// 2. Pod uses resourceClaims (DRA pattern)
101+
// Wait for test pod to reach terminal state.
102+
pod, err := waitForDRATestPod(ctx.Context, ctx.Clientset, run)
103+
if err != nil {
104+
return err
105+
}
106+
107+
// Validate DRA access patterns on the completed pod.
108+
return validateDRAPatterns(ctx.Context, dynClient, pod, run)
109+
}
110+
111+
// deployDRATestResources creates the namespace, ResourceClaim, and Pod for the DRA test.
112+
func deployDRATestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *draTestRun) error {
113+
// 1. Create namespace (idempotent).
114+
ns := &corev1.Namespace{
115+
ObjectMeta: metav1.ObjectMeta{Name: draTestNamespace},
116+
}
117+
if _, err := clientset.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}); k8s.IgnoreAlreadyExists(err) != nil {
118+
return errors.Wrap(errors.ErrCodeInternal, "failed to create namespace", err)
119+
}
120+
121+
// 2. Create ResourceClaim with unique name.
122+
claim := buildResourceClaim(run)
123+
if _, err := dynClient.Resource(claimGVR).Namespace(draTestNamespace).Create(
124+
ctx, claim, metav1.CreateOptions{}); err != nil {
125+
return errors.Wrap(errors.ErrCodeInternal, "failed to create ResourceClaim", err)
126+
}
127+
128+
// 3. Create Pod with unique name.
129+
pod := buildDRATestPod(run)
130+
if _, err := clientset.CoreV1().Pods(draTestNamespace).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
131+
return errors.Wrap(errors.ErrCodeInternal, "failed to create DRA test pod", err)
132+
}
133+
134+
return nil
135+
}
136+
137+
// waitForDRATestPod polls until the DRA test pod reaches a terminal state.
138+
func waitForDRATestPod(ctx context.Context, clientset kubernetes.Interface, run *draTestRun) (*corev1.Pod, error) {
139+
var resultPod *corev1.Pod
140+
141+
waitCtx, cancel := context.WithTimeout(ctx, defaults.DRATestPodTimeout)
142+
defer cancel()
143+
144+
err := wait.PollUntilContextCancel(waitCtx, defaults.PodPollInterval, true,
145+
func(ctx context.Context) (bool, error) {
146+
pod, err := clientset.CoreV1().Pods(draTestNamespace).Get(
147+
ctx, run.podName, metav1.GetOptions{})
148+
if err != nil {
149+
return false, errors.Wrap(errors.ErrCodeInternal, "failed to get DRA test pod", err)
150+
}
151+
switch pod.Status.Phase { //nolint:exhaustive // only terminal states matter
152+
case corev1.PodSucceeded, corev1.PodFailed:
153+
resultPod = pod
154+
return true, nil
155+
default:
156+
return false, nil
157+
}
158+
},
159+
)
160+
if err != nil {
161+
// Distinguish timeout from other poll errors (RBAC, NotFound, etc).
162+
if ctx.Err() != nil || waitCtx.Err() != nil {
163+
return nil, errors.Wrap(errors.ErrCodeTimeout, "DRA test pod did not complete in time", err)
164+
}
165+
return nil, errors.Wrap(errors.ErrCodeInternal, "DRA test pod polling failed", err)
166+
}
167+
168+
return resultPod, nil
169+
}
170+
171+
// validateDRAPatterns verifies the completed pod uses proper DRA access patterns.
172+
func validateDRAPatterns(ctx context.Context, dynClient dynamic.Interface, pod *corev1.Pod, run *draTestRun) error {
173+
// 1. Pod uses resourceClaims (DRA pattern).
56174
if len(pod.Spec.ResourceClaims) == 0 {
57175
return errors.New(errors.ErrCodeInternal,
58176
"pod does not use DRA resourceClaims")
59177
}
60178

61-
// 3. No nvidia.com/gpu in resources.limits (device plugin pattern)
179+
// 2. No nvidia.com/gpu in resources.limits (device plugin pattern).
62180
for _, c := range pod.Spec.Containers {
63181
if c.Resources.Limits != nil {
64182
if _, hasGPU := c.Resources.Limits["nvidia.com/gpu"]; hasGPU {
@@ -68,31 +186,22 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
68186
}
69187
}
70188

71-
// 4. No hostPath volumes to /dev/nvidia*
189+
// 3. No hostPath volumes to /dev/nvidia*.
72190
for _, vol := range pod.Spec.Volumes {
73191
if vol.HostPath != nil && strings.Contains(vol.HostPath.Path, "/dev/nvidia") {
74192
return errors.New(errors.ErrCodeInternal,
75193
fmt.Sprintf("pod has hostPath volume to %s", vol.HostPath.Path))
76194
}
77195
}
78196

79-
// 5. ResourceClaim exists
80-
dynClient, err := getDynamicClient(ctx)
81-
if err != nil {
82-
return err
83-
}
84-
gvr := schema.GroupVersionResource{
85-
Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
86-
}
87-
_, err = dynClient.Resource(gvr).Namespace("dra-test").Get(
88-
ctx.Context, "gpu-claim", metav1.GetOptions{})
89-
if err != nil {
90-
return errors.Wrap(errors.ErrCodeNotFound, "ResourceClaim gpu-claim not found", err)
197+
// 4. ResourceClaim exists.
198+
if _, err := dynClient.Resource(claimGVR).Namespace(draTestNamespace).Get(
199+
ctx, run.claimName, metav1.GetOptions{}); err != nil {
200+
return errors.Wrap(errors.ErrCodeNotFound,
201+
fmt.Sprintf("ResourceClaim %s not found", run.claimName), err)
91202
}
92203

93-
// 6. Pod completed successfully — proves DRA allocation worked.
94-
// Note: status.allocation may be cleared after pod completion, so we verify
95-
// success via the pod phase rather than the claim's allocation status.
204+
// 5. Pod completed successfully — proves DRA allocation worked.
96205
if pod.Status.Phase != corev1.PodSucceeded {
97206
return errors.New(errors.ErrCodeInternal,
98207
fmt.Sprintf("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed",
@@ -101,3 +210,100 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
101210

102211
return nil
103212
}
213+
214+
// cleanupDRATestResources removes test resources. Best-effort: errors are ignored
215+
// since cleanup failures should not mask test results.
216+
// The namespace is intentionally NOT deleted — it's harmless to leave and
217+
// namespace deletion can hang on DRA finalizers.
218+
func cleanupDRATestResources(ctx context.Context, clientset kubernetes.Interface, dynClient dynamic.Interface, run *draTestRun) {
219+
// Delete pod first (releases claim reservation), then claim.
220+
_ = k8s.IgnoreNotFound(clientset.CoreV1().Pods(draTestNamespace).Delete(
221+
ctx, run.podName, metav1.DeleteOptions{}))
222+
waitForDeletion(ctx, func() error {
223+
_, err := clientset.CoreV1().Pods(draTestNamespace).Get(ctx, run.podName, metav1.GetOptions{})
224+
return err
225+
})
226+
_ = k8s.IgnoreNotFound(dynClient.Resource(claimGVR).Namespace(draTestNamespace).Delete(
227+
ctx, run.claimName, metav1.DeleteOptions{}))
228+
}
229+
230+
// waitForDeletion polls until a resource is gone (NotFound) or the context expires.
231+
func waitForDeletion(ctx context.Context, getFunc func() error) {
232+
pollCtx, cancel := context.WithTimeout(ctx, defaults.K8sCleanupTimeout)
233+
defer cancel()
234+
_ = wait.PollUntilContextCancel(pollCtx, defaults.PodPollInterval, true,
235+
func(ctx context.Context) (bool, error) {
236+
err := getFunc()
237+
if k8serrors.IsNotFound(err) {
238+
return true, nil
239+
}
240+
return false, nil
241+
},
242+
)
243+
}
244+
245+
// buildDRATestPod returns the Pod spec for the DRA GPU allocation test.
// The pod requests its GPU exclusively through a ResourceClaim reference —
// no nvidia.com/gpu resource limit and no hostPath volumes — which is the
// access pattern the check validates.
func buildDRATestPod(run *draTestRun) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      run.podName,
			Namespace: draTestNamespace,
		},
		Spec: corev1.PodSpec{
			// Run once to completion; never restart.
			RestartPolicy: corev1.RestartPolicyNever,
			// Tolerate all taints so the pod can schedule onto GPU nodes.
			Tolerations: []corev1.Toleration{
				{Operator: corev1.TolerationOpExists},
			},
			// Reference the pre-created, per-run ResourceClaim by name.
			ResourceClaims: []corev1.PodResourceClaim{
				{
					Name:              "gpu",
					ResourceClaimName: strPtr(run.claimName),
				},
			},
			Containers: []corev1.Container{
				{
					Name:  "gpu-test",
					Image: "nvidia/cuda:12.9.0-base-ubuntu24.04",
					// Exits 0 only if NVIDIA device nodes are visible in-container.
					Command: []string{"bash", "-c", "ls /dev/nvidia* && echo 'DRA GPU allocation successful'"},
					Resources: corev1.ResourceRequirements{
						// Claims (not Limits) — the DRA request path.
						Claims: []corev1.ResourceClaim{
							{Name: "gpu"},
						},
					},
				},
			},
		},
	}
}
278+
279+
// buildResourceClaim returns the unstructured ResourceClaim for the DRA test,
// created via the dynamic client (no typed resource.k8s.io client is used here).
func buildResourceClaim(run *draTestRun) *unstructured.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "resource.k8s.io/v1",
			"kind":       "ResourceClaim",
			"metadata": map[string]interface{}{
				"name":      run.claimName,
				"namespace": draTestNamespace,
			},
			"spec": map[string]interface{}{
				"devices": map[string]interface{}{
					"requests": []interface{}{
						map[string]interface{}{
							"name": "gpu",
							// Request exactly one device from the NVIDIA GPU device class.
							"exactly": map[string]interface{}{
								"deviceClassName": "gpu.nvidia.com",
								"allocationMode":  "ExactCount",
								// int64 so the unstructured converter round-trips the count cleanly.
								"count": int64(1),
							},
						},
					},
				},
			},
		},
	}
}
306+
307+
// strPtr returns a pointer to a copy of the given string, for populating
// optional (*string) fields in Kubernetes API types.
func strPtr(s string) *string {
	v := s
	return &v
}

0 commit comments

Comments
 (0)