Skip to content

Commit 31a8207

Browse files
Refactor e2e test infrastructure with unified helpers leveraging GPU operator clientsets
Signed-off-by: Karthik Vetrivel <[email protected]>
1 parent 70b458d commit 31a8207

File tree

12 files changed

+853
-167
lines changed

12 files changed

+853
-167
lines changed

tests/e2e/go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ require (
3636
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
3737
github.com/emicklei/go-restful/v3 v3.11.2 // indirect
3838
github.com/evanphx/json-patch v5.9.11+incompatible // indirect
39+
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
3940
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect
4041
github.com/fatih/color v1.16.0 // indirect
4142
github.com/fxamacker/cbor/v2 v2.8.0 // indirect

tests/e2e/go.sum

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ github.com/emicklei/go-restful/v3 v3.11.2 h1:1onLa9DcsMYO9P+CXaL0dStDqQ2EHHXLiz+
7575
github.com/emicklei/go-restful/v3 v3.11.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
7676
github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8=
7777
github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
78+
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
79+
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
7880
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f h1:Wl78ApPPB2Wvf/TIe2xdyJxTlb6obmF18d8QdkxNDu4=
7981
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f/go.mod h1:OSYXu++VVOHnXeitef/D8n/6y4QV8uLHSFXX4NeXMGc=
8082
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
@@ -101,6 +103,8 @@ github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
101103
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
102104
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
103105
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
106+
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
107+
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
104108
github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic=
105109
github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk=
106110
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
@@ -353,8 +357,8 @@ go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFU
353357
go.opentelemetry.io/otel/sdk v1.36.0/go.mod h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY=
354358
go.opentelemetry.io/otel/sdk/log v0.8.0 h1:zg7GUYXqxk1jnGF/dTdLPrK06xJdrXgqgFLnI4Crxvs=
355359
go.opentelemetry.io/otel/sdk/log v0.8.0/go.mod h1:50iXr0UVwQrYS45KbruFrEt4LvAdCaWWgIrsN3ZQggo=
356-
go.opentelemetry.io/otel/sdk/metric v1.36.0 h1:r0ntwwGosWGaa0CrSt8cuNuTcccMXERFwHX4dThiPis=
357-
go.opentelemetry.io/otel/sdk/metric v1.36.0/go.mod h1:qTNOhFDfKRwX0yXOqJYegL5WRaW376QbB7P4Pb0qva4=
360+
go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o=
361+
go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w=
358362
go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w=
359363
go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA=
360364
go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg=
@@ -363,6 +367,10 @@ go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs=
363367
go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8=
364368
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
365369
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
370+
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
371+
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
372+
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
373+
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
366374
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
367375
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
368376
go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=

tests/e2e/gpu_operator_test.go

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ import (
2828

2929
"github.com/NVIDIA/gpu-operator/tests/e2e/framework"
3030
e2elog "github.com/NVIDIA/gpu-operator/tests/e2e/framework/logs"
31-
k8stest "github.com/NVIDIA/gpu-operator/tests/e2e/kubernetes"
32-
"github.com/NVIDIA/gpu-operator/tests/e2e/operator"
31+
32+
"github.com/NVIDIA/gpu-operator/tests/e2e/helpers"
3333
)
3434

3535
var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
@@ -44,15 +44,15 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
4444

4545
// Init global suite vars
4646
var (
47-
operatorClient *operator.Client
47+
operatorClient *helpers.OperatorClient
4848
helmReleaseName string
49-
k8sClient *k8stest.Client
49+
k8sClient *helpers.PodClient
5050
testNamespace *corev1.Namespace
5151
)
5252

5353
BeforeAll(func(ctx context.Context) {
5454
var err error
55-
k8sClient = k8stest.NewClient(f.ClientSet.CoreV1())
55+
k8sClient = helpers.NewPodClient(f.ClientSet.CoreV1())
5656
nsLabels := map[string]string{
5757
"e2e-run": string(framework.RunID),
5858
}
@@ -62,10 +62,10 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
6262
Fail(fmt.Sprintf("failed to create gpu operator namespace %s: %v", tcfg.namespace, err))
6363
}
6464

65-
operatorClient, err = operator.NewClient(
66-
operator.WithNamespace(testNamespace.Name),
67-
operator.WithKubeConfig(framework.TestContext.KubeConfig),
68-
operator.WithChart(tcfg.helmChart),
65+
operatorClient, err = helpers.NewOperatorClient(
66+
helpers.WithNamespace(testNamespace.Name),
67+
helpers.WithKubeConfig(framework.TestContext.KubeConfig),
68+
helpers.WithChart(tcfg.helmChart),
6969
)
7070
if err != nil {
7171
Fail(fmt.Sprintf("failed to instantiate gpu operator client: %v", err))
@@ -79,7 +79,7 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
7979
fmt.Sprintf("validator.image=%s", tcfg.validatorImage),
8080
fmt.Sprintf("validator.version=%s", tcfg.validatorVersion),
8181
}
82-
helmReleaseName, err = operatorClient.Install(ctx, values, operator.ChartOptions{
82+
helmReleaseName, err = operatorClient.Install(ctx, values, helpers.ChartOptions{
8383
CleanupOnFail: true,
8484
GenerateName: true,
8585
Timeout: 5 * time.Minute,
@@ -157,8 +157,12 @@ var _ = Describe(e2eTestPrefix+"-premerge-suite", func() {
157157
hasRestarts, err := k8sClient.EnsureNoPodRestarts(ctx, pod.Name, pod.Namespace)
158158
Expect(err).NotTo(HaveOccurred())
159159
if !hasRestarts {
160-
errLogs := k8sClient.GetPodLogs(ctx, pod)
161-
e2elog.Logf("printing logs from the pod %s/%s: %s", pod.Namespace, pod.Name, errLogs)
160+
errLogs, err := k8sClient.GetPodLogs(ctx, pod)
161+
if err != nil {
162+
e2elog.Logf("WARN: failed to retrieve logs from pod %s/%s: %v", pod.Namespace, pod.Name, err)
163+
} else {
164+
e2elog.Logf("printing logs from the pod %s/%s: %s", pod.Namespace, pod.Name, errLogs)
165+
}
162166
e2elog.Failf("pod %s/%s has unexpected restarts", pod.Namespace, pod.Name)
163167
}
164168
}

tests/e2e/helpers/clusterpolicy.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package helpers
18+
19+
import (
20+
"context"
21+
"time"
22+
23+
nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
24+
gpuclientset "github.com/NVIDIA/gpu-operator/api/versioned"
25+
"github.com/NVIDIA/gpu-operator/internal/conditions"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/util/wait"
28+
"k8s.io/client-go/util/retry"
29+
"k8s.io/utils/ptr"
30+
)
31+
32+
// ClusterPolicyClient is a test helper that reads and modifies ClusterPolicy
// objects through the GPU operator's generated clientset.
type ClusterPolicyClient struct {
33+
// client is the GPU operator clientset every method of this type calls into.
client gpuclientset.Interface
34+
}
35+
36+
func NewClusterPolicyClient(client gpuclientset.Interface) *ClusterPolicyClient {
37+
return &ClusterPolicyClient{
38+
client: client,
39+
}
40+
}
41+
42+
func (h *ClusterPolicyClient) Get(ctx context.Context, name string) (*nvidiav1.ClusterPolicy, error) {
43+
return h.client.NvidiaV1().ClusterPolicies().Get(ctx, name, metav1.GetOptions{})
44+
}
45+
46+
func (h *ClusterPolicyClient) Update(ctx context.Context, cp *nvidiav1.ClusterPolicy) (*nvidiav1.ClusterPolicy, error) {
47+
return h.client.NvidiaV1().ClusterPolicies().Update(ctx, cp, metav1.UpdateOptions{})
48+
}
49+
50+
// modify applies a mutation function to a ClusterPolicy and persists the changes.
51+
// It uses RetryOnConflict to handle concurrent modifications by the operator controller.
52+
func (h *ClusterPolicyClient) modify(ctx context.Context, name string, mutate func(*nvidiav1.ClusterPolicy)) error {
53+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
54+
clusterPolicy, err := h.Get(ctx, name)
55+
if err != nil {
56+
return err
57+
}
58+
59+
mutate(clusterPolicy)
60+
61+
_, err = h.Update(ctx, clusterPolicy)
62+
return err
63+
})
64+
}
65+
66+
func (h *ClusterPolicyClient) UpdateDriverVersion(ctx context.Context, name, version string) error {
67+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
68+
clusterPolicy.Spec.Driver.Version = version
69+
})
70+
}
71+
72+
func (h *ClusterPolicyClient) EnableDCGM(ctx context.Context, name string) error {
73+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
74+
clusterPolicy.Spec.DCGM.Enabled = ptr.To(true)
75+
})
76+
}
77+
78+
func (h *ClusterPolicyClient) DisableDCGM(ctx context.Context, name string) error {
79+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
80+
clusterPolicy.Spec.DCGM.Enabled = ptr.To(false)
81+
})
82+
}
83+
84+
func (h *ClusterPolicyClient) EnableDCGMExporter(ctx context.Context, name string) error {
85+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
86+
clusterPolicy.Spec.DCGMExporter.Enabled = ptr.To(true)
87+
})
88+
}
89+
90+
func (h *ClusterPolicyClient) DisableDCGMExporter(ctx context.Context, name string) error {
91+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
92+
clusterPolicy.Spec.DCGMExporter.Enabled = ptr.To(false)
93+
})
94+
}
95+
96+
func (h *ClusterPolicyClient) EnableGFD(ctx context.Context, name string) error {
97+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
98+
clusterPolicy.Spec.GPUFeatureDiscovery.Enabled = ptr.To(true)
99+
})
100+
}
101+
102+
func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error {
103+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
104+
clusterPolicy.Spec.GPUFeatureDiscovery.Enabled = ptr.To(false)
105+
})
106+
}
107+
108+
func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error {
109+
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
110+
clusterPolicy.Spec.MIG.Strategy = nvidiav1.MIGStrategy(strategy)
111+
})
112+
}
113+
114+
func (h *ClusterPolicyClient) WaitForReady(ctx context.Context, name string, timeout time.Duration) error {
115+
return wait.PollUntilContextTimeout(ctx, defaultPollingInterval, timeout, true, func(ctx context.Context) (bool, error) {
116+
clusterPolicy, err := h.Get(ctx, name)
117+
if err != nil {
118+
return false, err
119+
}
120+
121+
for _, condition := range clusterPolicy.Status.Conditions {
122+
if condition.Type == conditions.Ready && condition.Status == metav1.ConditionTrue {
123+
return true, nil
124+
}
125+
}
126+
127+
return false, nil
128+
})
129+
}
130+

tests/e2e/helpers/constants.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package helpers
18+
19+
import "time"
20+
21+
// Shared constants for the e2e helpers package.
const (
22+
// defaultPollingInterval is the delay between successive checks in the
// package's polling helpers; it is fixed at five seconds.
23+
defaultPollingInterval = 5 * time.Second
24+
25+
// upgradeDoneState is the state string that marks a completed driver upgrade.
// NOTE(review): the code that compares against this value is not visible
// here — confirm where the state is read from.
26+
upgradeDoneState = "upgrade-done"
27+
)
28+

0 commit comments

Comments
 (0)