Skip to content

Commit b2c352e

Browse files
committed
feat(conformance): enrich DRA, gang, and secure access evidence
1 parent 1f1758a commit b2c352e

File tree

7 files changed

+512
-52
lines changed

7 files changed

+512
-52
lines changed

pkg/validator/checks/conformance/dra_support_check.go

Lines changed: 127 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@ package conformance
1616

1717
import (
1818
"fmt"
19+
"strings"
1920

2021
"github.com/NVIDIA/aicr/pkg/errors"
2122
"github.com/NVIDIA/aicr/pkg/validator/checks"
23+
corev1 "k8s.io/api/core/v1"
2224
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25+
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2326
"k8s.io/apimachinery/pkg/runtime/schema"
2427
)
2528

@@ -46,37 +49,61 @@ func CheckDRASupport(ctx *checks.ValidationContext) error {
4649
return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
4750
}
4851

49-
// 1. DRA driver controller Deployment available
52+
// 1. DRA API resources are discoverable.
53+
resources, err := ctx.Clientset.Discovery().ServerResourcesForGroupVersion("resource.k8s.io/v1")
54+
if err != nil {
55+
return errors.Wrap(errors.ErrCodeNotFound, "resource.k8s.io/v1 API resources not available", err)
56+
}
57+
var apiResources strings.Builder
58+
for _, r := range resources.APIResources {
59+
fmt.Fprintf(&apiResources, "%-26s %-22s namespaced=%t\n", r.Name, r.Kind, r.Namespaced)
60+
}
61+
recordRawTextArtifact(ctx, "DRA API resources",
62+
"kubectl api-resources --api-group=resource.k8s.io", apiResources.String())
63+
64+
// 2. DRA driver pods inventory.
65+
pods, err := ctx.Clientset.CoreV1().Pods("nvidia-dra-driver").List(ctx.Context, metav1.ListOptions{})
66+
if err != nil {
67+
return errors.Wrap(errors.ErrCodeInternal, "failed to list DRA driver pods", err)
68+
}
69+
var driverPods strings.Builder
70+
for _, pod := range pods.Items {
71+
fmt.Fprintf(&driverPods, "%-48s ready=%s phase=%s node=%s\n",
72+
pod.Name, podReadyCount(pod), pod.Status.Phase, pod.Spec.NodeName)
73+
}
74+
recordRawTextArtifact(ctx, "DRA driver pods", "kubectl get pods -n nvidia-dra-driver -o wide", driverPods.String())
75+
76+
// 3. DRA driver controller Deployment available.
5077
deploy, deployErr := getDeploymentIfAvailable(ctx, "nvidia-dra-driver", "nvidia-dra-driver-gpu-controller")
78+
if deployErr != nil {
79+
return errors.Wrap(errors.ErrCodeNotFound, "DRA driver controller check failed", deployErr)
80+
}
5181
if deploy != nil {
5282
expected := int32(1)
5383
if deploy.Spec.Replicas != nil {
5484
expected = *deploy.Spec.Replicas
5585
}
56-
recordArtifact(ctx, "DRA Controller Deployment",
86+
recordRawTextArtifact(ctx, "DRA Controller Deployment", "",
5787
fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s",
5888
deploy.Namespace, deploy.Name,
5989
deploy.Status.AvailableReplicas, expected,
6090
firstContainerImage(deploy.Spec.Template.Spec.Containers)))
6191
}
62-
if deployErr != nil {
63-
return errors.Wrap(errors.ErrCodeNotFound, "DRA driver controller check failed", deployErr)
64-
}
6592

66-
// 2. DRA kubelet plugin DaemonSet ready
93+
// 4. DRA kubelet plugin DaemonSet ready.
6794
ds, dsErr := getDaemonSetIfReady(ctx, "nvidia-dra-driver", "nvidia-dra-driver-gpu-kubelet-plugin")
95+
if dsErr != nil {
96+
return errors.Wrap(errors.ErrCodeNotFound, "DRA kubelet plugin check failed", dsErr)
97+
}
6898
if ds != nil {
69-
recordArtifact(ctx, "DRA Kubelet Plugin DaemonSet",
99+
recordRawTextArtifact(ctx, "DRA Kubelet Plugin DaemonSet", "",
70100
fmt.Sprintf("Name: %s/%s\nReady: %d/%d pods\nImage: %s",
71101
ds.Namespace, ds.Name,
72102
ds.Status.NumberReady, ds.Status.DesiredNumberScheduled,
73103
firstContainerImage(ds.Spec.Template.Spec.Containers)))
74104
}
75-
if dsErr != nil {
76-
return errors.Wrap(errors.ErrCodeNotFound, "DRA kubelet plugin check failed", dsErr)
77-
}
78105

79-
// 3. ResourceSlices exist (GPU resources advertised via resource.k8s.io/v1 — GA)
106+
// 5. ResourceSlices exist (GPU resources advertised via resource.k8s.io/v1 — GA).
80107
dynClient, err := getDynamicClient(ctx)
81108
if err != nil {
82109
return err
@@ -88,11 +115,98 @@ func CheckDRASupport(ctx *checks.ValidationContext) error {
88115
if err != nil {
89116
return errors.Wrap(errors.ErrCodeInternal, "failed to list ResourceSlices", err)
90117
}
91-
recordArtifact(ctx, "ResourceSlices",
92-
fmt.Sprintf("Total ResourceSlices: %d", len(slices.Items)))
118+
var sliceSummary strings.Builder
119+
fmt.Fprintf(&sliceSummary, "Total ResourceSlices: %d\n", len(slices.Items))
120+
for _, item := range slices.Items {
121+
driver, _, _ := unstructured.NestedString(item.Object, "spec", "driver")
122+
nodeName, _, _ := unstructured.NestedString(item.Object, "spec", "nodeName")
123+
poolName, _, _ := unstructured.NestedString(item.Object, "spec", "pool", "name")
124+
fmt.Fprintf(&sliceSummary, "%-48s node=%s driver=%s pool=%s\n",
125+
item.GetName(), nodeName, driver, poolName)
126+
}
127+
recordRawTextArtifact(ctx, "ResourceSlices", "kubectl get resourceslices", sliceSummary.String())
93128
if len(slices.Items) == 0 {
94129
return errors.New(errors.ErrCodeNotFound, "no ResourceSlices found (GPU resources not advertised)")
95130
}
96131

132+
// 6. Behavioral DRA allocation validation (create claim+pod, wait, capture observed state).
133+
run, err := newDRATestRun()
134+
if err != nil {
135+
return err
136+
}
137+
recordRawTextArtifact(ctx, "Apply test manifest",
138+
"kubectl apply -f docs/conformance/cncf/manifests/dra-gpu-test.yaml",
139+
fmt.Sprintf("Created Namespace=%s ResourceClaim=%s Pod=%s via Kubernetes API",
140+
draTestNamespace, run.claimName, run.podName))
141+
142+
if err := deployDRATestResources(ctx.Context, ctx.Clientset, dynClient, run); err != nil {
143+
return err
144+
}
145+
defer func() {
146+
cleanupDRATestResources(ctx.Context, ctx.Clientset, dynClient, run)
147+
recordRawTextArtifact(ctx, "Delete test namespace",
148+
"kubectl delete namespace dra-test --ignore-not-found",
149+
"Deleted DRA test pod and ResourceClaim; namespace retained intentionally to avoid DRA finalizer stalls.")
150+
}()
151+
152+
pod, err := waitForDRATestPod(ctx.Context, ctx.Clientset, run)
153+
if err != nil {
154+
return err
155+
}
156+
157+
claimObj, err := dynClient.Resource(claimGVR).Namespace(draTestNamespace).Get(
158+
ctx.Context, run.claimName, metav1.GetOptions{})
159+
if err != nil {
160+
return errors.Wrap(errors.ErrCodeInternal, "failed to read DRA test ResourceClaim", err)
161+
}
162+
state, _, _ := unstructured.NestedString(claimObj.Object, "status", "state")
163+
claimLines := []string{
164+
fmt.Sprintf("Name: %s/%s", draTestNamespace, run.claimName),
165+
fmt.Sprintf("State: %s", valueOrUnknown(state)),
166+
}
167+
recordRawTextArtifact(ctx, "ResourceClaim status",
168+
"kubectl get resourceclaim -n dra-test -o wide", strings.Join(claimLines, "\n"))
169+
170+
podLines := []string{
171+
fmt.Sprintf("Name: %s/%s", pod.Namespace, pod.Name),
172+
fmt.Sprintf("Phase: %s", pod.Status.Phase),
173+
fmt.Sprintf("Node: %s", valueOrUnknown(pod.Spec.NodeName)),
174+
fmt.Sprintf("PodIP: %s", valueOrUnknown(pod.Status.PodIP)),
175+
fmt.Sprintf("Claims: %d", len(pod.Spec.ResourceClaims)),
176+
}
177+
recordRawTextArtifact(ctx, "Pod status",
178+
"kubectl get pod dra-gpu-test -n dra-test -o wide", strings.Join(podLines, "\n"))
179+
180+
logBytes, logErr := ctx.Clientset.CoreV1().Pods(draTestNamespace).GetLogs(run.podName, &corev1.PodLogOptions{}).DoRaw(ctx.Context)
181+
if logErr != nil {
182+
recordRawTextArtifact(ctx, "Pod logs", "kubectl logs dra-gpu-test -n dra-test",
183+
fmt.Sprintf("failed to read logs: %v", logErr))
184+
} else {
185+
recordChunkedTextArtifact(ctx, "Pod logs", "kubectl logs dra-gpu-test -n dra-test", string(logBytes))
186+
}
187+
188+
if pod.Status.Phase != corev1.PodSucceeded {
189+
return errors.New(errors.ErrCodeInternal,
190+
fmt.Sprintf("DRA test pod phase=%s (want Succeeded), GPU allocation may have failed", pod.Status.Phase))
191+
}
192+
97193
return nil
98194
}
195+
196+
func valueOrUnknown(v string) string {
197+
if strings.TrimSpace(v) == "" {
198+
return "unknown"
199+
}
200+
return v
201+
}
202+
203+
func podReadyCount(pod corev1.Pod) string {
204+
var ready, total int
205+
for _, cs := range pod.Status.ContainerStatuses {
206+
total++
207+
if cs.Ready {
208+
ready++
209+
}
210+
}
211+
return fmt.Sprintf("%d/%d", ready, total)
212+
}

pkg/validator/checks/conformance/dra_support_check_unit_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,16 @@ import (
2020
"testing"
2121

2222
"github.com/NVIDIA/aicr/pkg/validator/checks"
23+
corev1 "k8s.io/api/core/v1"
24+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2326
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
2427
"k8s.io/apimachinery/pkg/runtime"
2528
"k8s.io/apimachinery/pkg/runtime/schema"
29+
discoveryfake "k8s.io/client-go/discovery/fake"
2630
dynamicfake "k8s.io/client-go/dynamic/fake"
2731
"k8s.io/client-go/kubernetes/fake"
32+
k8stesting "k8s.io/client-go/testing"
2833
)
2934

3035
func TestCheckDRASupport(t *testing.T) {
@@ -104,11 +109,65 @@ func TestCheckDRASupport(t *testing.T) {
104109
//nolint:staticcheck // SA1019: fake.NewSimpleClientset is sufficient for tests
105110
clientset := fake.NewSimpleClientset(tt.k8sObjects...)
106111

112+
// Discovery API resources for resource.k8s.io/v1.
113+
if fd, ok := clientset.Discovery().(*discoveryfake.FakeDiscovery); ok {
114+
fd.Resources = []*metav1.APIResourceList{
115+
{
116+
GroupVersion: "resource.k8s.io/v1",
117+
APIResources: []metav1.APIResource{
118+
{Name: "deviceclasses", Kind: "DeviceClass", Namespaced: false},
119+
{Name: "resourceclaims", Kind: "ResourceClaim", Namespaced: true},
120+
{Name: "resourceclaimtemplates", Kind: "ResourceClaimTemplate", Namespaced: true},
121+
{Name: "resourceslices", Kind: "ResourceSlice", Namespaced: false},
122+
},
123+
},
124+
}
125+
}
126+
127+
// Behavioral test pod status stub: pods created with dra test prefix
128+
// immediately appear completed to avoid poll timeouts in unit tests.
129+
podDeleted := false
130+
clientset.PrependReactor("get", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
131+
ga, ok := action.(k8stesting.GetAction)
132+
if !ok {
133+
// Let non-standard pod gets (for example log subresource plumbing)
134+
// fall back to the default fake behavior.
135+
return false, nil, nil
136+
}
137+
if strings.HasPrefix(ga.GetName(), draTestPrefix) && ga.GetNamespace() == draTestNamespace {
138+
if podDeleted {
139+
return true, nil, k8serrors.NewNotFound(
140+
schema.GroupResource{Resource: "pods"}, ga.GetName())
141+
}
142+
run := &draTestRun{podName: ga.GetName(), claimName: draClaimPrefix + ga.GetName()[len(draTestPrefix):]}
143+
return true, &corev1.Pod{
144+
ObjectMeta: metav1.ObjectMeta{
145+
Name: run.podName,
146+
Namespace: draTestNamespace,
147+
},
148+
Spec: *buildDRATestPod(run).Spec.DeepCopy(),
149+
Status: corev1.PodStatus{
150+
Phase: corev1.PodSucceeded,
151+
},
152+
}, nil
153+
}
154+
return false, nil, nil
155+
})
156+
clientset.PrependReactor("delete", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
157+
da := action.(k8stesting.DeleteAction)
158+
if strings.HasPrefix(da.GetName(), draTestPrefix) && da.GetNamespace() == draTestNamespace {
159+
podDeleted = true
160+
return true, nil, nil
161+
}
162+
return false, nil, nil
163+
})
164+
107165
scheme := runtime.NewScheme()
108166
// Always register custom list kinds so List() works even with 0 objects.
109167
dynClient := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme,
110168
map[schema.GroupVersionResource]string{
111169
{Group: "resource.k8s.io", Version: "v1", Resource: "resourceslices"}: "ResourceSliceList",
170+
{Group: "resource.k8s.io", Version: "v1", Resource: "resourceclaims"}: "ResourceClaimList",
112171
},
113172
tt.dynamicObjects...)
114173

pkg/validator/checks/conformance/gang_scheduling_check.go

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
115115
}
116116

117117
// 1. All KAI scheduler deployments available.
118-
var schedulerSummary strings.Builder
118+
var deploymentsSummary strings.Builder
119119
for _, name := range kaiSchedulerDeployments {
120120
deploy, err := getDeploymentIfAvailable(ctx, "kai-scheduler", name)
121121
if err != nil {
@@ -126,11 +126,24 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
126126
if deploy.Spec.Replicas != nil {
127127
expected = *deploy.Spec.Replicas
128128
}
129-
fmt.Fprintf(&schedulerSummary, " %-25s available=%d/%d image=%s\n",
129+
fmt.Fprintf(&deploymentsSummary, "%-25s available=%d/%d image=%s\n",
130130
name, deploy.Status.AvailableReplicas, expected,
131131
firstContainerImage(deploy.Spec.Template.Spec.Containers))
132132
}
133-
recordArtifact(ctx, "KAI Scheduler Components", schedulerSummary.String())
133+
recordRawTextArtifact(ctx, "KAI scheduler deployments",
134+
"kubectl get deploy -n kai-scheduler", deploymentsSummary.String())
135+
136+
// KAI scheduler pods.
137+
kaiPods, err := ctx.Clientset.CoreV1().Pods("kai-scheduler").List(ctx.Context, metav1.ListOptions{})
138+
if err != nil {
139+
return errors.Wrap(errors.ErrCodeInternal, "failed to list KAI scheduler pods", err)
140+
}
141+
var podsSummary strings.Builder
142+
for _, p := range kaiPods.Items {
143+
fmt.Fprintf(&podsSummary, "%-44s ready=%s phase=%s\n", p.Name, podReadyCount(p), p.Status.Phase)
144+
}
145+
recordRawTextArtifact(ctx, "KAI scheduler pods",
146+
"kubectl get pods -n kai-scheduler", podsSummary.String())
134147

135148
// 2. Required CRDs for gang scheduling.
136149
dynClient, err := getDynamicClient(ctx)
@@ -152,7 +165,9 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
152165
}
153166
fmt.Fprintf(&crdSummary, " %s: present\n", crd)
154167
}
155-
recordArtifact(ctx, "Gang Scheduling CRDs", crdSummary.String())
168+
recordRawTextArtifact(ctx, "Gang Scheduling CRDs",
169+
"kubectl get crd queues.scheduling.run.ai podgroups.scheduling.run.ai",
170+
crdSummary.String())
156171

157172
// 3. Pre-flight: ensure enough free GPUs for the gang test.
158173
total, free, gpuErr := countAvailableGPUs(ctx.Context, dynClient)
@@ -173,7 +188,18 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
173188
return err
174189
}
175190

176-
defer cleanupGangTestResources(ctx.Context, ctx.Clientset, dynClient, run)
191+
defer func() {
192+
cleanupGangTestResources(ctx.Context, ctx.Clientset, dynClient, run)
193+
recordRawTextArtifact(ctx, "Delete test namespace",
194+
"kubectl delete namespace gang-scheduling-test --ignore-not-found",
195+
"Deleted gang test pods, claims, and PodGroup; namespace retained intentionally to avoid DRA finalizer stalls.")
196+
}()
197+
198+
recordRawTextArtifact(ctx, "Apply test manifest",
199+
"kubectl apply -f docs/conformance/cncf/manifests/gang-scheduling-test.yaml",
200+
fmt.Sprintf("Created PodGroup=%s ResourceClaims=%s,%s Pods=%s,%s in namespace=%s",
201+
run.groupName, run.claims[0], run.claims[1], run.pods[0], run.pods[1], gangTestNamespace))
202+
177203
if err = deployGangTestResources(ctx.Context, ctx.Clientset, dynClient, run); err != nil {
178204
return err
179205
}
@@ -188,7 +214,24 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
188214
return err
189215
}
190216

191-
// Record gang test results with scheduling timestamps.
217+
// PodGroup status.
218+
pgList, listErr := dynClient.Resource(podGroupGVR).Namespace(gangTestNamespace).List(
219+
ctx.Context, metav1.ListOptions{})
220+
if listErr != nil {
221+
recordRawTextArtifact(ctx, "PodGroup status",
222+
"kubectl get podgroups -n gang-scheduling-test -o wide",
223+
fmt.Sprintf("failed to list PodGroups: %v", listErr))
224+
} else {
225+
var pgSummary strings.Builder
226+
for _, item := range pgList.Items {
227+
minMember, _, _ := unstructured.NestedInt64(item.Object, "spec", "minMember")
228+
fmt.Fprintf(&pgSummary, "%-36s minMember=%d\n", item.GetName(), minMember)
229+
}
230+
recordRawTextArtifact(ctx, "PodGroup status",
231+
"kubectl get podgroups -n gang-scheduling-test -o wide", pgSummary.String())
232+
}
233+
234+
// Pod status and scheduling timestamps.
192235
var gangResults strings.Builder
193236
for i, pod := range pods {
194237
if pod == nil {
@@ -209,7 +252,24 @@ func CheckGangScheduling(ctx *checks.ValidationContext) error {
209252
fmt.Fprintf(&gangResults, "Earliest/Latest: %s / %s\n",
210253
gangReport.EarliestScheduled.Format(time.RFC3339),
211254
gangReport.LatestScheduled.Format(time.RFC3339))
212-
recordArtifact(ctx, "Gang Scheduling Test Results", gangResults.String())
255+
recordRawTextArtifact(ctx, "Pod status",
256+
"kubectl get pods -n gang-scheduling-test -o wide", gangResults.String())
257+
258+
// Worker logs.
259+
for i := range gangMinMembers {
260+
logBytes, logErr := ctx.Clientset.CoreV1().Pods(gangTestNamespace).GetLogs(
261+
run.pods[i], &corev1.PodLogOptions{}).DoRaw(ctx.Context)
262+
label := fmt.Sprintf("gang-worker-%d logs", i)
263+
if logErr != nil {
264+
recordRawTextArtifact(ctx, label,
265+
fmt.Sprintf("kubectl logs gang-worker-%d -n gang-scheduling-test", i),
266+
fmt.Sprintf("failed to read logs: %v", logErr))
267+
continue
268+
}
269+
recordChunkedTextArtifact(ctx, label,
270+
fmt.Sprintf("kubectl logs gang-worker-%d -n gang-scheduling-test", i),
271+
string(logBytes))
272+
}
213273

214274
return nil
215275
}

0 commit comments

Comments
 (0)