Skip to content

Commit fff5371

Browse files
authored
Merge pull request #1922 from mboersma/fix-gpu-e2e
Update NVIDIA GPU operator componentry
2 parents 28fc000 + 330ab44 commit fff5371

6 files changed

Lines changed: 51 additions & 15 deletions

File tree

templates/cluster-template-nvidia-gpu.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6036,7 +6036,7 @@ metadata:
60366036
---
60376037
apiVersion: v1
60386038
data:
6039-
gpu-operator-components.yaml: |-
6039+
gpu-operator-components.yaml: |
60406040
---
60416041
# Source: gpu-operator/templates/resources-namespace.yaml
60426042
apiVersion: v1
@@ -6383,7 +6383,7 @@ data:
63836383
- name: node-feature-discovery-master
63846384
securityContext:
63856385
{}
6386-
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
6386+
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
63876387
imagePullPolicy: IfNotPresent
63886388
ports:
63896389
- name: api
@@ -6509,7 +6509,7 @@ data:
65096509
driver:
65106510
repository: nvcr.io/nvidia
65116511
image: driver
6512-
version: 460.32.03
6512+
version: 470.82.01
65136513
imagePullPolicy: IfNotPresent
65146514
repoConfig:
65156515
configMapName: ""
@@ -6529,7 +6529,7 @@ data:
65296529
toolkit:
65306530
repository: nvcr.io/nvidia/k8s
65316531
image: container-toolkit
6532-
version: 1.4.7-ubuntu18.04
6532+
version: 1.7.2
65336533
imagePullPolicy: IfNotPresent
65346534
tolerations:
65356535
- key: CriticalAddonsOnly

templates/flavors/nvidia-gpu/gpu-operator-components.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ spec:
344344
- name: node-feature-discovery-master
345345
securityContext:
346346
{}
347-
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
347+
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
348348
imagePullPolicy: IfNotPresent
349349
ports:
350350
- name: api
@@ -470,7 +470,7 @@ spec:
470470
driver:
471471
repository: nvcr.io/nvidia
472472
image: driver
473-
version: 460.32.03
473+
version: 470.82.01
474474
imagePullPolicy: IfNotPresent
475475
repoConfig:
476476
configMapName: ""
@@ -490,7 +490,7 @@ spec:
490490
toolkit:
491491
repository: nvcr.io/nvidia/k8s
492492
image: container-toolkit
493-
version: 1.4.7-ubuntu18.04
493+
version: 1.7.2
494494
imagePullPolicy: IfNotPresent
495495
tolerations:
496496
- key: CriticalAddonsOnly
@@ -535,4 +535,4 @@ spec:
535535
nodeSelector:
536536
nvidia.com/gpu.present: "true"
537537
migStrategy: single
538-
discoveryIntervalSeconds: 60
538+
discoveryIntervalSeconds: 60

templates/flavors/nvidia-gpu/patches/cluster.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ kind: Cluster
33
metadata:
44
name: ${CLUSTER_NAME}
55
labels:
6-
gpu: "nvidia"
6+
gpu: "nvidia"

templates/test/ci/cluster-template-prow-nvidia-gpu.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6041,7 +6041,7 @@ metadata:
60416041
---
60426042
apiVersion: v1
60436043
data:
6044-
gpu-operator-components.yaml: |-
6044+
gpu-operator-components.yaml: |
60456045
---
60466046
# Source: gpu-operator/templates/resources-namespace.yaml
60476047
apiVersion: v1
@@ -6388,7 +6388,7 @@ data:
63886388
- name: node-feature-discovery-master
63896389
securityContext:
63906390
{}
6391-
image: "quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0"
6391+
image: "k8s.gcr.io/nfd/node-feature-discovery:v0.9.0"
63926392
imagePullPolicy: IfNotPresent
63936393
ports:
63946394
- name: api
@@ -6514,7 +6514,7 @@ data:
65146514
driver:
65156515
repository: nvcr.io/nvidia
65166516
image: driver
6517-
version: 460.32.03
6517+
version: 470.82.01
65186518
imagePullPolicy: IfNotPresent
65196519
repoConfig:
65206520
configMapName: ""
@@ -6534,7 +6534,7 @@ data:
65346534
toolkit:
65356535
repository: nvcr.io/nvidia/k8s
65366536
image: container-toolkit
6537-
version: 1.4.7-ubuntu18.04
6537+
version: 1.7.2
65386538
imagePullPolicy: IfNotPresent
65396539
tolerations:
65406540
- key: CriticalAddonsOnly

test/e2e/azure_gpu.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,17 @@ package e2e
2020

2121
import (
2222
"context"
23+
"fmt"
2324
"os"
25+
"strings"
2426

2527
. "github.com/onsi/ginkgo"
2628
. "github.com/onsi/gomega"
2729
batchv1 "k8s.io/api/batch/v1"
2830
corev1 "k8s.io/api/core/v1"
2931
"k8s.io/apimachinery/pkg/api/resource"
3032
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
33+
"k8s.io/client-go/kubernetes"
3134
"sigs.k8s.io/cluster-api/test/framework"
3235
)
3336

@@ -73,7 +76,9 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
7376
}
7477
}
7578
return false
76-
}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue())
79+
}, e2eConfig.GetIntervals(specName, "wait-worker-nodes")...).Should(BeTrue(), func() string {
80+
return getGPUOperatorPodLogs(ctx, clientset)
81+
})
7782

7883
By("running a CUDA vector calculation job")
7984
jobsClient := clientset.BatchV1().Jobs(corev1.NamespaceDefault)
@@ -90,7 +95,7 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
9095
Containers: []corev1.Container{
9196
{
9297
Name: jobName,
93-
Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.1-ubuntu18.04",
98+
Image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.2.1",
9499
Resources: corev1.ResourceRequirements{
95100
Limits: corev1.ResourceList{
96101
"nvidia.com/gpu": resource.MustParse("1"),
@@ -112,3 +117,18 @@ func AzureGPUSpec(ctx context.Context, inputGetter func() AzureGPUSpecInput) {
112117
}
113118
WaitForJobComplete(ctx, gpuJobInput, e2eConfig.GetIntervals(specName, "wait-job")...)
114119
}
120+
121+
// getGPUOperatorPodLogs returns the concatenated logs of the NVIDIA GPU operator pods.
//
// It lists pods using corev1.NamespaceAll with the label selector
// "app.kubernetes.io/instance=gpu-operator" and, for each pod found, appends a
// "Logs for pod <name>:" header followed by the output of getPodLogs for that pod.
// If listing the pods fails, the list error's text is returned instead of any logs,
// so the result is always a plain string.
122+
func getGPUOperatorPodLogs(ctx context.Context, clientset *kubernetes.Clientset) string {
123+
podsClient := clientset.CoreV1().Pods(corev1.NamespaceAll)
124+
pods, err := podsClient.List(ctx, metav1.ListOptions{LabelSelector: "app.kubernetes.io/instance=gpu-operator"})
125+
if err != nil {
126+
return err.Error()
127+
}
128+
b := strings.Builder{}
129+
for _, pod := range pods.Items {
130+
b.WriteString(fmt.Sprintf("\nLogs for pod %s:\n", pod.Name))
131+
b.WriteString(getPodLogs(ctx, clientset, pod))
132+
}
133+
return b.String()
134+
}

test/e2e/helpers.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,3 +634,19 @@ func resolveKubetestRepoListPath(version string, path string) (string, error) {
634634

635635
return filepath.Join(path, "repo-list.yaml"), nil
636636
}
637+
638+
// getPodLogs returns the logs of a pod, or an error in string format.
//
// The log stream is requested for pod.Name in pod.Namespace with an empty
// corev1.PodLogOptions, read fully into memory, and returned as a string.
// Failures are never returned as an error value: if opening the stream or
// copying from it fails, the returned string is a message naming the pod and
// the underlying error.
// NOTE(review): the options do not name a container; presumably the API server
// default applies for multi-container pods — confirm.
639+
func getPodLogs(ctx context.Context, clientset *kubernetes.Clientset, pod corev1.Pod) string {
640+
req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &corev1.PodLogOptions{})
641+
logs, err := req.Stream(ctx)
642+
if err != nil {
643+
return fmt.Sprintf("error streaming logs for pod %s: %v", pod.Name, err)
644+
}
645+
defer logs.Close()
646+
647+
b := new(bytes.Buffer)
648+
if _, err = io.Copy(b, logs); err != nil {
649+
return fmt.Sprintf("error copying logs for pod %s: %v", pod.Name, err)
650+
}
651+
return b.String()
652+
}

0 commit comments

Comments
 (0)