1515package conformance
1616
1717import (
18+ "context"
1819 "fmt"
1920 "strings"
2021
22+ "github.com/NVIDIA/aicr/pkg/defaults"
2123 "github.com/NVIDIA/aicr/pkg/errors"
24+ "github.com/NVIDIA/aicr/pkg/k8s"
2225 "github.com/NVIDIA/aicr/pkg/validator/checks"
2326 corev1 "k8s.io/api/core/v1"
2427 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2529 "k8s.io/apimachinery/pkg/runtime/schema"
30+ "k8s.io/apimachinery/pkg/util/wait"
31+ "k8s.io/client-go/dynamic"
32+ "k8s.io/client-go/kubernetes"
33+ )
34+
35+ const (
36+ draTestNamespace = "dra-test"
37+ draTestPodName = "dra-gpu-test"
38+ draClaimName = "gpu-claim"
2639)
2740
2841func init () {
@@ -36,29 +49,113 @@ func init() {
3649}
3750
3851// CheckSecureAcceleratorAccess validates CNCF requirement #3: Secure Accelerator Access.
39- // Verifies that a DRA-based GPU workload uses proper access patterns:
52+ // Creates a DRA-based GPU test pod, waits for completion, and verifies proper access patterns:
4053// resourceClaims instead of device plugin, no hostPath to GPU devices,
4154// and ResourceClaim is allocated.
4255func CheckSecureAcceleratorAccess (ctx * checks.ValidationContext ) error {
4356 if ctx .Clientset == nil {
4457 return errors .New (errors .ErrCodeInvalidRequest , "kubernetes client is not available" )
4558 }
4659
47- // 1. Get the DRA test pod (deployed by workflow before aicr validate runs)
48- pod , err := ctx .Clientset .CoreV1 ().Pods ("dra-test" ).Get (
49- ctx .Context , "dra-gpu-test" , metav1.GetOptions {})
60+ dynClient , err := getDynamicClient (ctx )
61+ if err != nil {
62+ return err
63+ }
64+
65+ // Deploy DRA test resources and ensure cleanup.
66+ if err = deployDRATestResources (ctx .Context , ctx .Clientset , dynClient ); err != nil {
67+ return err
68+ }
69+ defer cleanupDRATestResources (ctx .Context , ctx .Clientset , dynClient )
70+
71+ // Wait for test pod to reach terminal state.
72+ pod , err := waitForDRATestPod (ctx .Context , ctx .Clientset )
5073 if err != nil {
51- return errors .Wrap (errors .ErrCodeNotFound ,
52- "DRA test pod not found (deploy dra-gpu-test.yaml first)" , err )
74+ return err
75+ }
76+
77+ // Validate DRA access patterns on the completed pod.
78+ return validateDRAPatterns (ctx .Context , dynClient , pod )
79+ }
80+
81+ // deployDRATestResources creates the namespace, ResourceClaim, and Pod for the DRA test.
82+ func deployDRATestResources (ctx context.Context , clientset kubernetes.Interface , dynClient dynamic.Interface ) error {
83+ // 1. Create namespace (idempotent).
84+ ns := & corev1.Namespace {
85+ ObjectMeta : metav1.ObjectMeta {Name : draTestNamespace },
86+ }
87+ if _ , err := clientset .CoreV1 ().Namespaces ().Create (ctx , ns , metav1.CreateOptions {}); k8s .IgnoreAlreadyExists (err ) != nil {
88+ return errors .Wrap (errors .ErrCodeInternal , "failed to create namespace" , err )
89+ }
90+
91+ // 2. Delete existing ResourceClaim if present, then create fresh.
92+ claimGVR := schema.GroupVersionResource {
93+ Group : "resource.k8s.io" , Version : "v1" , Resource : "resourceclaims" ,
94+ }
95+ if err := k8s .IgnoreNotFound (dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Delete (
96+ ctx , draClaimName , metav1.DeleteOptions {})); err != nil {
97+ return errors .Wrap (errors .ErrCodeInternal , "failed to delete existing ResourceClaim" , err )
98+ }
99+
100+ claim := buildResourceClaim ()
101+ if _ , err := dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Create (
102+ ctx , claim , metav1.CreateOptions {}); err != nil {
103+ return errors .Wrap (errors .ErrCodeInternal , "failed to create ResourceClaim" , err )
104+ }
105+
106+ // 3. Delete existing Pod if present, then create fresh.
107+ if err := k8s .IgnoreNotFound (clientset .CoreV1 ().Pods (draTestNamespace ).Delete (
108+ ctx , draTestPodName , metav1.DeleteOptions {})); err != nil {
109+ return errors .Wrap (errors .ErrCodeInternal , "failed to delete existing pod" , err )
53110 }
54111
55- // 2. Pod uses resourceClaims (DRA pattern)
112+ pod := buildDRATestPod ()
113+ if _ , err := clientset .CoreV1 ().Pods (draTestNamespace ).Create (ctx , pod , metav1.CreateOptions {}); err != nil {
114+ return errors .Wrap (errors .ErrCodeInternal , "failed to create DRA test pod" , err )
115+ }
116+
117+ return nil
118+ }
119+
120+ // waitForDRATestPod polls until the DRA test pod reaches a terminal state.
121+ func waitForDRATestPod (ctx context.Context , clientset kubernetes.Interface ) (* corev1.Pod , error ) {
122+ var resultPod * corev1.Pod
123+
124+ waitCtx , cancel := context .WithTimeout (ctx , defaults .DRATestPodTimeout )
125+ defer cancel ()
126+
127+ err := wait .PollUntilContextCancel (waitCtx , defaults .PodPollInterval , true ,
128+ func (ctx context.Context ) (bool , error ) {
129+ pod , err := clientset .CoreV1 ().Pods (draTestNamespace ).Get (
130+ ctx , draTestPodName , metav1.GetOptions {})
131+ if err != nil {
132+ return false , errors .Wrap (errors .ErrCodeInternal , "failed to get DRA test pod" , err )
133+ }
134+ switch pod .Status .Phase { //nolint:exhaustive // only terminal states matter
135+ case corev1 .PodSucceeded , corev1 .PodFailed :
136+ resultPod = pod
137+ return true , nil
138+ default :
139+ return false , nil
140+ }
141+ },
142+ )
143+ if err != nil {
144+ return nil , errors .Wrap (errors .ErrCodeTimeout , "DRA test pod did not complete" , err )
145+ }
146+
147+ return resultPod , nil
148+ }
149+
150+ // validateDRAPatterns verifies the completed pod uses proper DRA access patterns.
151+ func validateDRAPatterns (ctx context.Context , dynClient dynamic.Interface , pod * corev1.Pod ) error {
152+ // 1. Pod uses resourceClaims (DRA pattern).
56153 if len (pod .Spec .ResourceClaims ) == 0 {
57154 return errors .New (errors .ErrCodeInternal ,
58155 "pod does not use DRA resourceClaims" )
59156 }
60157
61- // 3 . No nvidia.com/gpu in resources.limits (device plugin pattern)
158+ // 2 . No nvidia.com/gpu in resources.limits (device plugin pattern).
62159 for _ , c := range pod .Spec .Containers {
63160 if c .Resources .Limits != nil {
64161 if _ , hasGPU := c .Resources .Limits ["nvidia.com/gpu" ]; hasGPU {
@@ -68,31 +165,25 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
68165 }
69166 }
70167
71- // 4 . No hostPath volumes to /dev/nvidia*
168+ // 3 . No hostPath volumes to /dev/nvidia*.
72169 for _ , vol := range pod .Spec .Volumes {
73170 if vol .HostPath != nil && strings .Contains (vol .HostPath .Path , "/dev/nvidia" ) {
74171 return errors .New (errors .ErrCodeInternal ,
75172 fmt .Sprintf ("pod has hostPath volume to %s" , vol .HostPath .Path ))
76173 }
77174 }
78175
79- // 5. ResourceClaim exists
80- dynClient , err := getDynamicClient (ctx )
81- if err != nil {
82- return err
83- }
84- gvr := schema.GroupVersionResource {
176+ // 4. ResourceClaim exists.
177+ claimGVR := schema.GroupVersionResource {
85178 Group : "resource.k8s.io" , Version : "v1" , Resource : "resourceclaims" ,
86179 }
87- _ , err = dynClient .Resource (gvr ).Namespace ("dra-test" ).Get (
88- ctx . Context , "gpu-claim" , metav1.GetOptions {})
180+ _ , err : = dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Get (
181+ ctx , draClaimName , metav1.GetOptions {})
89182 if err != nil {
90183 return errors .Wrap (errors .ErrCodeNotFound , "ResourceClaim gpu-claim not found" , err )
91184 }
92185
93- // 6. Pod completed successfully — proves DRA allocation worked.
94- // Note: status.allocation may be cleared after pod completion, so we verify
95- // success via the pod phase rather than the claim's allocation status.
186+ // 5. Pod completed successfully — proves DRA allocation worked.
96187 if pod .Status .Phase != corev1 .PodSucceeded {
97188 return errors .New (errors .ErrCodeInternal ,
98189 fmt .Sprintf ("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed" ,
@@ -101,3 +192,85 @@ func CheckSecureAcceleratorAccess(ctx *checks.ValidationContext) error {
101192
102193 return nil
103194}
195+
196+ // cleanupDRATestResources removes test resources. Errors are logged but not returned
197+ // since cleanup failures should not mask test results.
198+ // Note: the namespace is intentionally NOT deleted — it's harmless to leave and
199+ // namespace deletion can hang on DRA finalizers.
200+ func cleanupDRATestResources (ctx context.Context , clientset kubernetes.Interface , dynClient dynamic.Interface ) {
201+ claimGVR := schema.GroupVersionResource {
202+ Group : "resource.k8s.io" , Version : "v1" , Resource : "resourceclaims" ,
203+ }
204+
205+ // Delete pod first (releases claim reservation), then claim.
206+ _ = k8s .IgnoreNotFound (clientset .CoreV1 ().Pods (draTestNamespace ).Delete (
207+ ctx , draTestPodName , metav1.DeleteOptions {}))
208+ _ = k8s .IgnoreNotFound (dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Delete (
209+ ctx , draClaimName , metav1.DeleteOptions {}))
210+ }
211+
212+ // buildDRATestPod returns the Pod spec for the DRA GPU allocation test.
213+ func buildDRATestPod () * corev1.Pod {
214+ return & corev1.Pod {
215+ ObjectMeta : metav1.ObjectMeta {
216+ Name : draTestPodName ,
217+ Namespace : draTestNamespace ,
218+ },
219+ Spec : corev1.PodSpec {
220+ RestartPolicy : corev1 .RestartPolicyNever ,
221+ Tolerations : []corev1.Toleration {
222+ {Operator : corev1 .TolerationOpExists },
223+ },
224+ ResourceClaims : []corev1.PodResourceClaim {
225+ {
226+ Name : "gpu" ,
227+ ResourceClaimName : strPtr (draClaimName ),
228+ },
229+ },
230+ Containers : []corev1.Container {
231+ {
232+ Name : "gpu-test" ,
233+ Image : "nvidia/cuda:12.9.0-base-ubuntu24.04" ,
234+ Command : []string {"bash" , "-c" , "ls /dev/nvidia* && echo 'DRA GPU allocation successful'" },
235+ Resources : corev1.ResourceRequirements {
236+ Claims : []corev1.ResourceClaim {
237+ {Name : "gpu" },
238+ },
239+ },
240+ },
241+ },
242+ },
243+ }
244+ }
245+
246+ // buildResourceClaim returns the unstructured ResourceClaim for the DRA test.
247+ func buildResourceClaim () * unstructured.Unstructured {
248+ return & unstructured.Unstructured {
249+ Object : map [string ]interface {}{
250+ "apiVersion" : "resource.k8s.io/v1" ,
251+ "kind" : "ResourceClaim" ,
252+ "metadata" : map [string ]interface {}{
253+ "name" : draClaimName ,
254+ "namespace" : draTestNamespace ,
255+ },
256+ "spec" : map [string ]interface {}{
257+ "devices" : map [string ]interface {}{
258+ "requests" : []interface {}{
259+ map [string ]interface {}{
260+ "name" : "gpu" ,
261+ "exactly" : map [string ]interface {}{
262+ "deviceClassName" : "gpu.nvidia.com" ,
263+ "allocationMode" : "ExactCount" ,
264+ "count" : int64 (1 ),
265+ },
266+ },
267+ },
268+ },
269+ },
270+ },
271+ }
272+ }
273+
// strPtr returns a pointer to a copy of s, for populating optional
// *string fields in API objects.
func strPtr(s string) *string {
	v := s
	return &v
}