1515package conformance
1616
1717import (
18+ "context"
19+ "crypto/rand"
20+ "encoding/hex"
1821 "fmt"
1922 "strings"
2023
24+ "github.com/NVIDIA/aicr/pkg/defaults"
2125 "github.com/NVIDIA/aicr/pkg/errors"
26+ "github.com/NVIDIA/aicr/pkg/k8s"
2227 "github.com/NVIDIA/aicr/pkg/validator/checks"
2328 corev1 "k8s.io/api/core/v1"
29+ k8serrors "k8s.io/apimachinery/pkg/api/errors"
2430 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2532 "k8s.io/apimachinery/pkg/runtime/schema"
33+ "k8s.io/apimachinery/pkg/util/wait"
34+ "k8s.io/client-go/dynamic"
35+ "k8s.io/client-go/kubernetes"
2636)
2737
const (
	// draTestNamespace is the namespace all DRA test resources are created in.
	draTestNamespace = "dra-test"
	// draTestPrefix prefixes the test pod name; a random suffix is appended per run.
	draTestPrefix = "dra-gpu-test-"
	// draClaimPrefix prefixes the ResourceClaim name; a random suffix is appended per run.
	draClaimPrefix = "gpu-claim-"
)
43+
// draTestRun holds per-invocation resource names to avoid collisions.
type draTestRun struct {
	podName   string // DRA test pod name (draTestPrefix + random suffix)
	claimName string // ResourceClaim name (draClaimPrefix + same suffix)
}
49+
50+ func newDRATestRun () (* draTestRun , error ) {
51+ b := make ([]byte , 4 )
52+ if _ , err := rand .Read (b ); err != nil {
53+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to generate random suffix" , err )
54+ }
55+ suffix := hex .EncodeToString (b )
56+ return & draTestRun {
57+ podName : draTestPrefix + suffix ,
58+ claimName : draClaimPrefix + suffix ,
59+ }, nil
60+ }
61+
// claimGVR identifies ResourceClaim objects (resource.k8s.io/v1) for all
// dynamic-client operations in this file.
var claimGVR = schema.GroupVersionResource{
	Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims",
}
65+
2866func init () {
2967 checks .RegisterCheck (& checks.Check {
3068 Name : "secure-accelerator-access" ,
@@ -36,29 +74,109 @@ func init() {
3674}
3775
// CheckSecureAcceleratorAccess validates CNCF requirement #3: Secure Accelerator Access.
// Creates a DRA-based GPU test pod with unique names, waits for completion, and verifies
// proper access patterns: resourceClaims instead of device plugin, no hostPath to GPU
// devices, and ResourceClaim is allocated.
func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
	if ctx.Clientset == nil {
		return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
	}

	dynClient, err := getDynamicClient(ctx)
	if err != nil {
		return err
	}

	// Unique per-run names so repeated or concurrent validations do not collide.
	run, err := newDRATestRun()
	if err != nil {
		return err
	}

	// Deploy DRA test resources and ensure cleanup. The defer is registered
	// only after a fully successful deploy.
	if err = deployDRATestResources(ctx.Context, ctx.Clientset, dynClient, run); err != nil {
		return err
	}
	defer cleanupDRATestResources(ctx.Context, ctx.Clientset, dynClient, run)

	// Wait for test pod to reach terminal state (Succeeded or Failed).
	pod, err := waitForDRATestPod(ctx.Context, ctx.Clientset, run)
	if err != nil {
		return err
	}

	// Validate DRA access patterns on the completed pod.
	return validateDRAPatterns(ctx.Context, dynClient, pod, run)
}
110+
111+ // deployDRATestResources creates the namespace, ResourceClaim, and Pod for the DRA test.
112+ func deployDRATestResources (ctx context.Context , clientset kubernetes.Interface , dynClient dynamic.Interface , run * draTestRun ) error {
113+ // 1. Create namespace (idempotent).
114+ ns := & corev1.Namespace {
115+ ObjectMeta : metav1.ObjectMeta {Name : draTestNamespace },
116+ }
117+ if _ , err := clientset .CoreV1 ().Namespaces ().Create (ctx , ns , metav1.CreateOptions {}); k8s .IgnoreAlreadyExists (err ) != nil {
118+ return errors .Wrap (errors .ErrCodeInternal , "failed to create namespace" , err )
119+ }
120+
121+ // 2. Create ResourceClaim with unique name.
122+ claim := buildResourceClaim (run )
123+ if _ , err := dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Create (
124+ ctx , claim , metav1.CreateOptions {}); err != nil {
125+ return errors .Wrap (errors .ErrCodeInternal , "failed to create ResourceClaim" , err )
126+ }
127+
128+ // 3. Create Pod with unique name.
129+ pod := buildDRATestPod (run )
130+ if _ , err := clientset .CoreV1 ().Pods (draTestNamespace ).Create (ctx , pod , metav1.CreateOptions {}); err != nil {
131+ return errors .Wrap (errors .ErrCodeInternal , "failed to create DRA test pod" , err )
132+ }
133+
134+ return nil
135+ }
136+
137+ // waitForDRATestPod polls until the DRA test pod reaches a terminal state.
138+ func waitForDRATestPod (ctx context.Context , clientset kubernetes.Interface , run * draTestRun ) (* corev1.Pod , error ) {
139+ var resultPod * corev1.Pod
140+
141+ waitCtx , cancel := context .WithTimeout (ctx , defaults .DRATestPodTimeout )
142+ defer cancel ()
143+
144+ err := wait .PollUntilContextCancel (waitCtx , defaults .PodPollInterval , true ,
145+ func (ctx context.Context ) (bool , error ) {
146+ pod , err := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
147+ ctx , run .podName , metav1.GetOptions {})
148+ if err != nil {
149+ return false , errors .Wrap (errors .ErrCodeInternal , "failed to get DRA test pod" , err )
150+ }
151+ switch pod .Status .Phase { //nolint:exhaustive // only terminal states matter
152+ case corev1 .PodSucceeded , corev1 .PodFailed :
153+ resultPod = pod
154+ return true , nil
155+ default :
156+ return false , nil
157+ }
158+ },
159+ )
160+ if err != nil {
161+ // Distinguish timeout from other poll errors (RBAC, NotFound, etc).
162+ if ctx .Err () != nil || waitCtx .Err () != nil {
163+ return nil , errors .Wrap (errors .ErrCodeTimeout , "DRA test pod did not complete in time" , err )
164+ }
165+ return nil , errors .Wrap (errors .ErrCodeInternal , "DRA test pod polling failed" , err )
166+ }
167+
168+ return resultPod , nil
169+ }
170+
171+ // validateDRAPatterns verifies the completed pod uses proper DRA access patterns.
172+ func validateDRAPatterns (ctx context.Context , dynClient dynamic.Interface , pod * corev1.Pod , run * draTestRun ) error {
173+ // 1. Pod uses resourceClaims (DRA pattern).
56174 if len (pod .Spec .ResourceClaims ) == 0 {
57175 return errors .New (errors .ErrCodeInternal ,
58176 "pod does not use DRA resourceClaims" )
59177 }
60178
61- // 3 . No nvidia.com/gpu in resources.limits (device plugin pattern)
179+ // 2 . No nvidia.com/gpu in resources.limits (device plugin pattern).
62180 for _ , c := range pod .Spec .Containers {
63181 if c .Resources .Limits != nil {
64182 if _ , hasGPU := c .Resources .Limits ["nvidia.com/gpu" ]; hasGPU {
@@ -68,31 +186,22 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
68186 }
69187 }
70188
71- // 4 . No hostPath volumes to /dev/nvidia*
189+ // 3 . No hostPath volumes to /dev/nvidia*.
72190 for _ , vol := range pod .Spec .Volumes {
73191 if vol .HostPath != nil && strings .Contains (vol .HostPath .Path , "/dev/nvidia" ) {
74192 return errors .New (errors .ErrCodeInternal ,
75193 fmt .Sprintf ("pod has hostPath volume to %s" , vol .HostPath .Path ))
76194 }
77195 }
78196
79- // 5. ResourceClaim exists
80- dynClient , err := getDynamicClient (ctx )
81- if err != nil {
82- return err
83- }
84- gvr := schema.GroupVersionResource {
85- Group : "resource.k8s.io" , Version : "v1" , Resource : "resourceclaims" ,
86- }
87- _ , err = dynClient .Resource (gvr ).Namespace ("dra-test" ).Get (
88- ctx .Context , "gpu-claim" , metav1.GetOptions {})
89- if err != nil {
90- return errors .Wrap (errors .ErrCodeNotFound , "ResourceClaim gpu-claim not found" , err )
197+ // 4. ResourceClaim exists.
198+ if _ , err := dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Get (
199+ ctx , run .claimName , metav1.GetOptions {}); err != nil {
200+ return errors .Wrap (errors .ErrCodeNotFound ,
201+ fmt .Sprintf ("ResourceClaim %s not found" , run .claimName ), err )
91202 }
92203
93- // 6. Pod completed successfully — proves DRA allocation worked.
94- // Note: status.allocation may be cleared after pod completion, so we verify
95- // success via the pod phase rather than the claim's allocation status.
204+ // 5. Pod completed successfully — proves DRA allocation worked.
96205 if pod .Status .Phase != corev1 .PodSucceeded {
97206 return errors .New (errors .ErrCodeInternal ,
98207 fmt .Sprintf ("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed" ,
@@ -101,3 +210,100 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
101210
102211 return nil
103212}
213+
214+ // cleanupDRATestResources removes test resources. Best-effort: errors are ignored
215+ // since cleanup failures should not mask test results.
216+ // The namespace is intentionally NOT deleted — it's harmless to leave and
217+ // namespace deletion can hang on DRA finalizers.
218+ func cleanupDRATestResources (ctx context.Context , clientset kubernetes.Interface , dynClient dynamic.Interface , run * draTestRun ) {
219+ // Delete pod first (releases claim reservation), then claim.
220+ _ = k8s .IgnoreNotFound (clientset .CoreV1 ().Pods (draTestNamespace ).Delete (
221+ ctx , run .podName , metav1.DeleteOptions {}))
222+ waitForDeletion (ctx , func () error {
223+ _ , err := clientset .CoreV1 ().Pods (draTestNamespace ).Get (ctx , run .podName , metav1.GetOptions {})
224+ return err
225+ })
226+ _ = k8s .IgnoreNotFound (dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Delete (
227+ ctx , run .claimName , metav1.DeleteOptions {}))
228+ }
229+
230+ // waitForDeletion polls until a resource is gone (NotFound) or the context expires.
231+ func waitForDeletion (ctx context.Context , getFunc func () error ) {
232+ pollCtx , cancel := context .WithTimeout (ctx , defaults .K8sCleanupTimeout )
233+ defer cancel ()
234+ _ = wait .PollUntilContextCancel (pollCtx , defaults .PodPollInterval , true ,
235+ func (ctx context.Context ) (bool , error ) {
236+ err := getFunc ()
237+ if k8serrors .IsNotFound (err ) {
238+ return true , nil
239+ }
240+ return false , nil
241+ },
242+ )
243+ }
244+
// buildDRATestPod returns the Pod spec for the DRA GPU allocation test.
// The pod requests its GPU exclusively through a pod-level resourceClaim —
// no nvidia.com/gpu resource limits and no hostPath volumes — and runs a
// one-shot command that succeeds only if /dev/nvidia* devices are visible.
func buildDRATestPod(run *draTestRun) *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      run.podName,
			Namespace: draTestNamespace,
		},
		Spec: corev1.PodSpec{
			// Run to completion exactly once; the terminal phase is the verdict.
			RestartPolicy: corev1.RestartPolicyNever,
			// Operator=Exists with no key tolerates all taints, so the pod
			// can schedule onto (typically tainted) GPU nodes.
			Tolerations: []corev1.Toleration{
				{Operator: corev1.TolerationOpExists},
			},
			// Pod-level claim referencing the pre-created ResourceClaim.
			ResourceClaims: []corev1.PodResourceClaim{
				{
					Name:              "gpu",
					ResourceClaimName: strPtr(run.claimName),
				},
			},
			Containers: []corev1.Container{
				{
					Name:  "gpu-test",
					Image: "nvidia/cuda:12.9.0-base-ubuntu24.04",
					// ls exits non-zero (failing the pod) if no /dev/nvidia*
					// device node was injected by the DRA driver.
					Command: []string{"bash", "-c", "ls /dev/nvidia* && echo 'DRA GPU allocation successful'"},
					Resources: corev1.ResourceRequirements{
						// Binds the container to the pod-level claim above.
						Claims: []corev1.ResourceClaim{
							{Name: "gpu"},
						},
					},
				},
			},
		},
	}
}
278+
// buildResourceClaim returns the unstructured ResourceClaim for the DRA test:
// a request for exactly one device from the gpu.nvidia.com device class.
// Built as unstructured because the claim is created through the dynamic client.
func buildResourceClaim(run *draTestRun) *unstructured.Unstructured {
	return &unstructured.Unstructured{
		Object: map[string]interface{}{
			"apiVersion": "resource.k8s.io/v1",
			"kind":       "ResourceClaim",
			"metadata": map[string]interface{}{
				"name":      run.claimName,
				"namespace": draTestNamespace,
			},
			"spec": map[string]interface{}{
				"devices": map[string]interface{}{
					"requests": []interface{}{
						map[string]interface{}{
							"name": "gpu",
							"exactly": map[string]interface{}{
								"deviceClassName": "gpu.nvidia.com",
								"allocationMode":  "ExactCount",
								// int64, not int: unstructured object deep-copy
								// only accepts JSON-compatible number types.
								"count": int64(1),
							},
						},
					},
				},
			},
		},
	}
}
306+
// strPtr returns a pointer to a string equal to s, for optional string fields.
func strPtr(s string) *string {
	out := s
	return &out
}
0 commit comments