Skip to content

Commit 284a9a6

Browse files
committed
Add integration tests for GPU Partitioner node controller
1 parent b756568 commit 284a9a6

File tree

3 files changed

+320
-0
lines changed

3 files changed

+320
-0
lines changed
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
//go:build integration
2+
3+
/*
4+
* Copyright 2023 nebuly.com
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package gpupartitioner_test
20+
21+
import (
22+
"fmt"
23+
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
24+
"github.com/nebuly-ai/nos/pkg/constant"
25+
"github.com/nebuly-ai/nos/pkg/gpu"
26+
"github.com/nebuly-ai/nos/pkg/test/factory"
27+
. "github.com/onsi/ginkgo/v2"
28+
. "github.com/onsi/gomega"
29+
"github.com/stretchr/testify/mock"
30+
"time"
31+
)
32+
33+
var _ = Describe("Node Controller", func() {
34+
const (
35+
timeout = time.Second * 10
36+
interval = time.Second * 1
37+
)
38+
39+
BeforeEach(func() {
40+
})
41+
42+
AfterEach(func() {
43+
})
44+
45+
When("A node does not have GPU Count label", func() {
46+
It("Should not be added to the Cluster State", func() {
47+
By("By creating a node without GPU Count label")
48+
nodeName := "node-without-gpu-count-label"
49+
node := factory.BuildNode(nodeName).WithLabels(map[string]string{
50+
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(),
51+
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
52+
}).Get()
53+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
54+
55+
By("Checking that the node is not added to the Cluster State")
56+
Consistently(func() bool {
57+
_, ok := clusterState.GetNode(nodeName)
58+
return ok
59+
}, 3, interval).Should(BeFalse())
60+
})
61+
})
62+
63+
When("A node does not have GPU Model label", func() {
64+
It("Should not be added to the Cluster State", func() {
65+
66+
By("By creating a node without GPU Model label")
67+
nodeName := "node-without-gpu-model-label"
68+
node := factory.BuildNode(nodeName).WithLabels(map[string]string{
69+
constant.LabelNvidiaCount: "1",
70+
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
71+
}).Get()
72+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
73+
74+
By("Checking that the node is not added to the Cluster State")
75+
Consistently(func() bool {
76+
_, ok := clusterState.GetNode(nodeName)
77+
return ok
78+
}, 3, interval).Should(BeFalse())
79+
})
80+
})
81+
82+
When("A node with GPU labels has MPS partitioning enabled", func() {
83+
It("Should always be added to the Cluster State", func() {
84+
By("By creating a node with MPS partitioning enabled")
85+
nodeName := "node-mps"
86+
node := factory.BuildNode(nodeName).WithLabels(map[string]string{
87+
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(),
88+
constant.LabelNvidiaCount: "1",
89+
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMps.String(),
90+
}).Get()
91+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
92+
93+
By("Checking that the node is added to the Cluster State")
94+
Eventually(func() bool {
95+
_, ok := clusterState.GetNode(nodeName)
96+
return ok
97+
}, timeout, interval).Should(BeTrue())
98+
})
99+
})
100+
101+
When("A node with GPU labels has MIG partitioning enabled", func() {
102+
It("Should *not* be added to the Cluster State it is not initialized", func() {
103+
By("By creating a node with MIG partitioning enabled, but not initialized")
104+
nodeName := "node-mig-not-initialized"
105+
node := factory.BuildNode(nodeName).WithLabels(map[string]string{
106+
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(),
107+
constant.LabelNvidiaCount: "1",
108+
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
109+
}).Get()
110+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
111+
112+
By("Checking that the controller triggers Node initialization")
113+
migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
114+
Return(nil).
115+
Once()
116+
117+
By("Checking that the node is *not* added to the Cluster State")
118+
Consistently(func() bool {
119+
_, ok := clusterState.GetNode(nodeName)
120+
return ok
121+
}, 5, interval).Should(BeFalse())
122+
})
123+
124+
It("Should be added to the Cluster State it is initialized", func() {
125+
By("By creating an initialized node with MIG partitioning enabled")
126+
nodeName := "node-mig-initialized"
127+
node := factory.BuildNode(nodeName).
128+
WithLabels(map[string]string{
129+
constant.LabelNvidiaProduct: gpu.GPUModel_A100_PCIe_80GB.String(),
130+
constant.LabelNvidiaCount: "1",
131+
v1alpha1.LabelGpuPartitioning: gpu.PartitioningKindMig.String(),
132+
}).
133+
WithAnnotations(map[string]string{
134+
fmt.Sprintf(v1alpha1.AnnotationGpuSpecFormat, 0, "10gb"): "1",
135+
}).
136+
Get()
137+
Expect(k8sClient.Create(ctx, &node)).To(Succeed())
138+
139+
migNodeInitializer.On("InitNodePartitioning", mock.Anything, mock.Anything).
140+
Return(nil).
141+
Maybe()
142+
143+
By("Checking that the node is added to the Cluster State")
144+
Eventually(func() bool {
145+
_, ok := clusterState.GetNode(nodeName)
146+
return ok
147+
}, timeout, interval).Should(BeTrue())
148+
})
149+
})
150+
})
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
//go:build integration
2+
3+
/*
4+
* Copyright 2023 nebuly.com
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package gpupartitioner_test
20+
21+
import (
22+
"context"
23+
"github.com/nebuly-ai/nos/internal/controllers/gpupartitioner"
24+
"github.com/nebuly-ai/nos/internal/partitioning/core"
25+
"github.com/nebuly-ai/nos/internal/partitioning/state"
26+
"github.com/nebuly-ai/nos/pkg/api/nos.nebuly.com/v1alpha1"
27+
partitioningmock "github.com/nebuly-ai/nos/pkg/test/mocks/partitioning"
28+
. "github.com/onsi/ginkgo/v2"
29+
. "github.com/onsi/gomega"
30+
"k8s.io/client-go/kubernetes/scheme"
31+
"k8s.io/client-go/rest"
32+
"k8s.io/kubernetes/pkg/scheduler/framework"
33+
"path/filepath"
34+
ctrl "sigs.k8s.io/controller-runtime"
35+
"sigs.k8s.io/controller-runtime/pkg/client"
36+
"sigs.k8s.io/controller-runtime/pkg/envtest"
37+
logf "sigs.k8s.io/controller-runtime/pkg/log"
38+
"sigs.k8s.io/controller-runtime/pkg/log/zap"
39+
"testing"
40+
)
41+
42+
var cfg *rest.Config
43+
var k8sClient client.Client
44+
var testEnv *envtest.Environment
45+
var (
46+
ctx context.Context
47+
cancel context.CancelFunc
48+
migNodeInitializer *partitioningmock.NodeInitializer
49+
clusterState *state.ClusterState
50+
)
51+
52+
var _ core.NodeInitializer = migNodeInitializer
53+
54+
func TestAPIs(t *testing.T) {
55+
RegisterFailHandler(Fail)
56+
migNodeInitializer = partitioningmock.NewNodeInitializer(t)
57+
RunSpecs(t, "Controllers Suite")
58+
}
59+
60+
var _ = BeforeSuite(func() {
61+
logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)))
62+
ctx, cancel = context.WithCancel(context.Background())
63+
64+
By("bootstrapping test environment")
65+
testEnv = &envtest.Environment{
66+
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "operator", "crd", "bases")},
67+
ErrorIfCRDPathMissing: true,
68+
}
69+
70+
var err error
71+
72+
// cfg is defined in this file globally.
73+
cfg, err = testEnv.Start()
74+
Expect(err).NotTo(HaveOccurred())
75+
Expect(cfg).NotTo(BeNil())
76+
77+
err = v1alpha1.AddToScheme(scheme.Scheme)
78+
Expect(err).NotTo(HaveOccurred())
79+
80+
k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme})
81+
Expect(err).NotTo(HaveOccurred())
82+
Expect(k8sClient).NotTo(BeNil())
83+
84+
k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{
85+
Scheme: scheme.Scheme,
86+
MetricsBindAddress: ":8083",
87+
})
88+
Expect(err).ToNot(HaveOccurred())
89+
90+
// Init Cluster State
91+
clusterState = state.NewClusterState(map[string]framework.NodeInfo{})
92+
93+
// Setup Node Controller
94+
reporter := gpupartitioner.NewNodeController(k8sClient, scheme.Scheme, migNodeInitializer, clusterState)
95+
Expect(reporter.SetupWithManager(k8sManager, "NodeController")).To(Succeed())
96+
97+
go func() {
98+
defer GinkgoRecover()
99+
err = k8sManager.Start(ctx)
100+
Expect(err).ToNot(HaveOccurred(), "failed to run manager")
101+
}()
102+
})
103+
104+
var _ = AfterSuite(func() {
105+
cancel()
106+
By("tearing down the test environment")
107+
err := testEnv.Stop()
108+
Expect(err).NotTo(HaveOccurred())
109+
})

pkg/test/mocks/partitioning/initializer.go

Lines changed: 61 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)