@@ -16,10 +16,13 @@ package conformance
1616
1717import (
1818 "fmt"
19+ "strings"
1920
2021 "github.com/NVIDIA/aicr/pkg/errors"
2122 "github.com/NVIDIA/aicr/pkg/validator/checks"
23+ corev1 "k8s.io/api/core/v1"
2224 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+ "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2326 "k8s.io/apimachinery/pkg/runtime/schema"
2427)
2528
@@ -46,37 +49,61 @@ func CheckDRASupport(ctx *checks.ValidationContext) error {
4649 return errors .New (errors .ErrCodeInvalidRequest , "kubernetes client is not available" )
4750 }
4851
49- // 1. DRA driver controller Deployment available
52+ // 1. DRA API resources are discoverable.
53+ resources , err := ctx .Clientset .Discovery ().ServerResourcesForGroupVersion ("resource.k8s.io/v1" )
54+ if err != nil {
55+ return errors .Wrap (errors .ErrCodeNotFound , "resource.k8s.io/v1 API resources not available" , err )
56+ }
57+ var apiResources strings.Builder
58+ for _ , r := range resources .APIResources {
59+ fmt .Fprintf (& apiResources , "%-26s %-22s namespaced=%t\n " , r .Name , r .Kind , r .Namespaced )
60+ }
61+ recordRawTextArtifact (ctx , "DRA API resources" ,
62+ "kubectl api-resources --api-group=resource.k8s.io" , apiResources .String ())
63+
64+ // 2. DRA driver pods inventory.
65+ pods , err := ctx .Clientset .CoreV1 ().Pods ("nvidia-dra-driver" ).List (ctx .Context , metav1.ListOptions {})
66+ if err != nil {
67+ return errors .Wrap (errors .ErrCodeInternal , "failed to list DRA driver pods" , err )
68+ }
69+ var driverPods strings.Builder
70+ for _ , pod := range pods .Items {
71+ fmt .Fprintf (& driverPods , "%-48s ready=%s phase=%s node=%s\n " ,
72+ pod .Name , podReadyCount (pod ), pod .Status .Phase , pod .Spec .NodeName )
73+ }
74+ recordRawTextArtifact (ctx , "DRA driver pods" , "kubectl get pods -n nvidia-dra-driver -o wide" , driverPods .String ())
75+
76+ // 3. DRA driver controller Deployment available.
5077 deploy , deployErr := getDeploymentIfAvailable (ctx , "nvidia-dra-driver" , "nvidia-dra-driver-gpu-controller" )
78+ if deployErr != nil {
79+ return errors .Wrap (errors .ErrCodeNotFound , "DRA driver controller check failed" , deployErr )
80+ }
5181 if deploy != nil {
5282 expected := int32 (1 )
5383 if deploy .Spec .Replicas != nil {
5484 expected = * deploy .Spec .Replicas
5585 }
56- recordArtifact (ctx , "DRA Controller Deployment" ,
86+ recordRawTextArtifact (ctx , "DRA Controller Deployment" , " " ,
5787 fmt .Sprintf ("Name: %s/%s\n Replicas: %d/%d available\n Image: %s" ,
5888 deploy .Namespace , deploy .Name ,
5989 deploy .Status .AvailableReplicas , expected ,
6090 firstContainerImage (deploy .Spec .Template .Spec .Containers )))
6191 }
62- if deployErr != nil {
63- return errors .Wrap (errors .ErrCodeNotFound , "DRA driver controller check failed" , deployErr )
64- }
6592
66- // 2 . DRA kubelet plugin DaemonSet ready
93+ // 4 . DRA kubelet plugin DaemonSet ready.
6794 ds , dsErr := getDaemonSetIfReady (ctx , "nvidia-dra-driver" , "nvidia-dra-driver-gpu-kubelet-plugin" )
95+ if dsErr != nil {
96+ return errors .Wrap (errors .ErrCodeNotFound , "DRA kubelet plugin check failed" , dsErr )
97+ }
6898 if ds != nil {
69- recordArtifact (ctx , "DRA Kubelet Plugin DaemonSet" ,
99+ recordRawTextArtifact (ctx , "DRA Kubelet Plugin DaemonSet" , " " ,
70100 fmt .Sprintf ("Name: %s/%s\n Ready: %d/%d pods\n Image: %s" ,
71101 ds .Namespace , ds .Name ,
72102 ds .Status .NumberReady , ds .Status .DesiredNumberScheduled ,
73103 firstContainerImage (ds .Spec .Template .Spec .Containers )))
74104 }
75- if dsErr != nil {
76- return errors .Wrap (errors .ErrCodeNotFound , "DRA kubelet plugin check failed" , dsErr )
77- }
78105
79- // 3 . ResourceSlices exist (GPU resources advertised via resource.k8s.io/v1 — GA)
106+ // 5 . ResourceSlices exist (GPU resources advertised via resource.k8s.io/v1 — GA).
80107 dynClient , err := getDynamicClient (ctx )
81108 if err != nil {
82109 return err
@@ -88,11 +115,98 @@ func CheckDRASupport(ctx *checks.ValidationContext) error {
88115 if err != nil {
89116 return errors .Wrap (errors .ErrCodeInternal , "failed to list ResourceSlices" , err )
90117 }
91- recordArtifact (ctx , "ResourceSlices" ,
92- fmt .Sprintf ("Total ResourceSlices: %d" , len (slices .Items )))
118+ var sliceSummary strings.Builder
119+ fmt .Fprintf (& sliceSummary , "Total ResourceSlices: %d\n " , len (slices .Items ))
120+ for _ , item := range slices .Items {
121+ driver , _ , _ := unstructured .NestedString (item .Object , "spec" , "driver" )
122+ nodeName , _ , _ := unstructured .NestedString (item .Object , "spec" , "nodeName" )
123+ poolName , _ , _ := unstructured .NestedString (item .Object , "spec" , "pool" , "name" )
124+ fmt .Fprintf (& sliceSummary , "%-48s node=%s driver=%s pool=%s\n " ,
125+ item .GetName (), nodeName , driver , poolName )
126+ }
127+ recordRawTextArtifact (ctx , "ResourceSlices" , "kubectl get resourceslices" , sliceSummary .String ())
93128 if len (slices .Items ) == 0 {
94129 return errors .New (errors .ErrCodeNotFound , "no ResourceSlices found (GPU resources not advertised)" )
95130 }
96131
132+ // 6. Behavioral DRA allocation validation (create claim+pod, wait, capture observed state).
133+ run , err := newDRATestRun ()
134+ if err != nil {
135+ return err
136+ }
137+ recordRawTextArtifact (ctx , "Apply test manifest" ,
138+ "kubectl apply -f docs/conformance/cncf/manifests/dra-gpu-test.yaml" ,
139+ fmt .Sprintf ("Created Namespace=%s ResourceClaim=%s Pod=%s via Kubernetes API" ,
140+ draTestNamespace , run .claimName , run .podName ))
141+
142+ if err := deployDRATestResources (ctx .Context , ctx .Clientset , dynClient , run ); err != nil {
143+ return err
144+ }
145+ defer func () {
146+ cleanupDRATestResources (ctx .Context , ctx .Clientset , dynClient , run )
147+ recordRawTextArtifact (ctx , "Delete test namespace" ,
148+ "kubectl delete namespace dra-test --ignore-not-found" ,
149+ "Deleted DRA test pod and ResourceClaim; namespace retained intentionally to avoid DRA finalizer stalls." )
150+ }()
151+
152+ pod , err := waitForDRATestPod (ctx .Context , ctx .Clientset , run )
153+ if err != nil {
154+ return err
155+ }
156+
157+ claimObj , err := dynClient .Resource (claimGVR ).Namespace (draTestNamespace ).Get (
158+ ctx .Context , run .claimName , metav1.GetOptions {})
159+ if err != nil {
160+ return errors .Wrap (errors .ErrCodeInternal , "failed to read DRA test ResourceClaim" , err )
161+ }
162+ state , _ , _ := unstructured .NestedString (claimObj .Object , "status" , "state" )
163+ claimLines := []string {
164+ fmt .Sprintf ("Name: %s/%s" , draTestNamespace , run .claimName ),
165+ fmt .Sprintf ("State: %s" , valueOrUnknown (state )),
166+ }
167+ recordRawTextArtifact (ctx , "ResourceClaim status" ,
168+ "kubectl get resourceclaim -n dra-test -o wide" , strings .Join (claimLines , "\n " ))
169+
170+ podLines := []string {
171+ fmt .Sprintf ("Name: %s/%s" , pod .Namespace , pod .Name ),
172+ fmt .Sprintf ("Phase: %s" , pod .Status .Phase ),
173+ fmt .Sprintf ("Node: %s" , valueOrUnknown (pod .Spec .NodeName )),
174+ fmt .Sprintf ("PodIP: %s" , valueOrUnknown (pod .Status .PodIP )),
175+ fmt .Sprintf ("Claims: %d" , len (pod .Spec .ResourceClaims )),
176+ }
177+ recordRawTextArtifact (ctx , "Pod status" ,
178+ "kubectl get pod dra-gpu-test -n dra-test -o wide" , strings .Join (podLines , "\n " ))
179+
180+ logBytes , logErr := ctx .Clientset .CoreV1 ().Pods (draTestNamespace ).GetLogs (run .podName , & corev1.PodLogOptions {}).DoRaw (ctx .Context )
181+ if logErr != nil {
182+ recordRawTextArtifact (ctx , "Pod logs" , "kubectl logs dra-gpu-test -n dra-test" ,
183+ fmt .Sprintf ("failed to read logs: %v" , logErr ))
184+ } else {
185+ recordChunkedTextArtifact (ctx , "Pod logs" , "kubectl logs dra-gpu-test -n dra-test" , string (logBytes ))
186+ }
187+
188+ if pod .Status .Phase != corev1 .PodSucceeded {
189+ return errors .New (errors .ErrCodeInternal ,
190+ fmt .Sprintf ("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed" , pod .Status .Phase ))
191+ }
192+
97193 return nil
98194}
195+
// valueOrUnknown returns v unchanged when it contains at least one
// non-whitespace character. For an empty or whitespace-only string it
// returns the placeholder "unknown" instead. The returned value is never
// trimmed: surrounding whitespace on a non-blank input is preserved.
func valueOrUnknown(v string) string {
	if trimmed := strings.TrimSpace(v); len(trimmed) > 0 {
		return v
	}
	return "unknown"
}
202+
203+ func podReadyCount (pod corev1.Pod ) string {
204+ var ready , total int
205+ for _ , cs := range pod .Status .ContainerStatuses {
206+ total ++
207+ if cs .Ready {
208+ ready ++
209+ }
210+ }
211+ return fmt .Sprintf ("%d/%d" , ready , total )
212+ }
0 commit comments