Skip to content

Commit 4dce616

Browse files
committed
Add unit test for multi node NIMService
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent 0f32b3d commit 4dce616

2 files changed

Lines changed: 231 additions & 0 deletions

File tree

internal/controller/platform/standalone/nimservice_test.go

Lines changed: 80 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,7 @@ import (
4949
ctrl "sigs.k8s.io/controller-runtime"
5050
"sigs.k8s.io/controller-runtime/pkg/client"
5151
"sigs.k8s.io/controller-runtime/pkg/client/fake"
52+
lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
5253

5354
appsv1alpha1 "github.com/NVIDIA/k8s-nim-operator/api/apps/v1alpha1"
5455
"github.com/NVIDIA/k8s-nim-operator/internal/conditions"
@@ -134,6 +135,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
134135
Expect(networkingv1.AddToScheme(scheme)).To(Succeed())
135136
Expect(corev1.AddToScheme(scheme)).To(Succeed())
136137
Expect(monitoringv1.AddToScheme(scheme)).To(Succeed())
138+
Expect(lwsv1.AddToScheme(scheme)).To(Succeed())
137139

138140
client = fake.NewClientBuilder().WithScheme(scheme).
139141
WithStatusSubresource(&appsv1alpha1.NIMService{}).
@@ -816,6 +818,67 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
816818
})
817819
})
818820

821+
Describe("LWS deployment for multi-node inferencing NIMService", func() { // exercises reconciler.isLeaderWorkerSetReady against LeaderWorkerSets stored in the fake client
822+
AfterEach(func() { // each spec creates the same LWS name, so remove it before the next spec runs
823+
lws := &lwsv1.LeaderWorkerSet{
824+
ObjectMeta: metav1.ObjectMeta{
825+
Name: "test-nimservice-lws",
826+
Namespace: "default",
827+
},
828+
}
829+
err := client.Delete(context.TODO(), lws)
830+
Expect(err).NotTo(HaveOccurred())
831+
})
832+
833+
It("should report ready when LWS is available", func() {
834+
lws := &lwsv1.LeaderWorkerSet{
835+
ObjectMeta: metav1.ObjectMeta{
836+
Name: "test-nimservice-lws",
837+
Namespace: "default",
838+
},
839+
Status: lwsv1.LeaderWorkerSetStatus{
840+
Conditions: []metav1.Condition{
841+
{
842+
Type: string(lwsv1.LeaderWorkerSetAvailable),
843+
Status: metav1.ConditionTrue,
844+
},
845+
},
846+
},
847+
}
848+
err := client.Create(context.TODO(), lws)
849+
Expect(err).NotTo(HaveOccurred())
850+
msg, ready, err := reconciler.isLeaderWorkerSetReady(context.TODO(), nimService)
851+
Expect(err).ToNot(HaveOccurred())
852+
Expect(ready).To(BeTrue())
853+
Expect(msg).To(Equal(fmt.Sprintf("leaderworkerset %q is ready", lws.Name)))
854+
})
855+
It("should report not ready when LWS is not available", func() {
856+
lws := &lwsv1.LeaderWorkerSet{
857+
ObjectMeta: metav1.ObjectMeta{
858+
Name: "test-nimservice-lws",
859+
Namespace: "default",
860+
},
861+
Status: lwsv1.LeaderWorkerSetStatus{
862+
Conditions: []metav1.Condition{ // still progressing, Available explicitly false
863+
{
864+
Type: string(lwsv1.LeaderWorkerSetProgressing),
865+
Status: metav1.ConditionTrue,
866+
},
867+
{
868+
Type: string(lwsv1.LeaderWorkerSetAvailable),
869+
Status: metav1.ConditionFalse,
870+
},
871+
},
872+
},
873+
}
874+
err := client.Create(context.TODO(), lws)
875+
Expect(err).NotTo(HaveOccurred())
876+
msg, ready, err := reconciler.isLeaderWorkerSetReady(context.TODO(), nimService)
877+
Expect(err).ToNot(HaveOccurred())
878+
Expect(ready).To(BeFalse())
879+
Expect(msg).To(Equal(fmt.Sprintf("leaderworkerset %q is not ready", lws.Name)))
880+
})
881+
})
819882
Describe("update model status on NIMService", func() {
820883
BeforeEach(func() {
821884
ingress := &networkingv1.Ingress{
@@ -1275,6 +1338,23 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
12751338
Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
12761339
})
12771340

1341+
It("should assign GPU resource equal to multiNode.GPUPerWorker if tensor parallelism is not provided", func() {
1342+
nimService.Spec.MultiNode = &appsv1alpha1.NimServiceMultiNodeConfig{
1343+
GPUPerWorker: 2,
1344+
}
1345+
profile := &appsv1alpha1.NIMProfile{
1346+
Name: "test-profile",
1347+
Config: map[string]string{}, // empty config: no tensor parallelism entry is supplied
1348+
}
1349+
1350+
resources, err := reconciler.addGPUResources(context.TODO(), nimService, profile)
1351+
Expect(err).ToNot(HaveOccurred())
1352+
Expect(resources).ToNot(BeNil())
1353+
1354+
Expect(resources.Requests).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2"))) // matches GPUPerWorker set above
1355+
Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2")))
1356+
})
1357+
12781358
It("should return an error if tensor parallelism cannot be parsed", func() {
12791359
profile := &appsv1alpha1.NIMProfile{
12801360
Name: "test-profile",

internal/render/render_test.go

Lines changed: 151 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,157 @@ var _ = Describe("K8s Resources Rendering", func() {
129129
templatesDir := filepath.Join(path.Dir(path.Dir(cwd)), "manifests")
130130

131131
Context("Rendering templates", func() {
132+
It("should render LeaderWorkerSet template correctly", func() { // renders a fully populated LeaderWorkerSetParams and checks leader and worker templates
133+
params := types.LeaderWorkerSetParams{
134+
Name: "test-lws",
135+
Namespace: "default",
136+
Labels: map[string]string{"app": "test-app"},
137+
Annotations: map[string]string{"annotation-key": "annotation-value"},
138+
Replicas: 3,
139+
Size: 2,
140+
Image: "nim-llm:latest",
141+
ImagePullSecrets: []string{"ngc-secret"},
142+
LeaderVolumes: []corev1.Volume{
143+
{
144+
Name: "test-leader-volume",
145+
VolumeSource: corev1.VolumeSource{
146+
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
147+
ClaimName: "test-leader-pvc",
148+
},
149+
},
150+
},
151+
},
152+
WorkerVolumes: []corev1.Volume{
153+
{
154+
Name: "test-worker-volume",
155+
VolumeSource: corev1.VolumeSource{
156+
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
157+
ClaimName: "test-worker-pvc",
158+
},
159+
},
160+
},
161+
},
162+
LeaderVolumeMounts: []corev1.VolumeMount{
163+
{
164+
Name: "test-leader-volume",
165+
MountPath: "/data",
166+
},
167+
},
168+
WorkerVolumeMounts: []corev1.VolumeMount{
169+
{
170+
Name: "test-worker-volume",
171+
MountPath: "/data",
172+
},
173+
},
174+
LeaderEnvs: []corev1.EnvVar{
175+
{
176+
Name: "LEADER_ENV_VAR",
177+
Value: "value",
178+
},
179+
},
180+
WorkerEnvs: []corev1.EnvVar{
181+
{
182+
Name: "WORKER_ENV_VAR",
183+
Value: "value",
184+
},
185+
},
186+
Ports: []corev1.ContainerPort{
187+
{
188+
ContainerPort: 8080,
189+
},
190+
},
191+
Resources: &corev1.ResourceRequirements{
192+
Limits: corev1.ResourceList{
193+
corev1.ResourceCPU: resource.MustParse("500m"),
194+
corev1.ResourceMemory: resource.MustParse("128Mi"),
195+
},
196+
Requests: corev1.ResourceList{
197+
corev1.ResourceCPU: resource.MustParse("250m"),
198+
corev1.ResourceMemory: resource.MustParse("64Mi"),
199+
},
200+
},
201+
ReadinessProbe: &corev1.Probe{
202+
InitialDelaySeconds: 15,
203+
TimeoutSeconds: 1,
204+
PeriodSeconds: 10,
205+
SuccessThreshold: 1,
206+
FailureThreshold: 3,
207+
ProbeHandler: corev1.ProbeHandler{
208+
HTTPGet: &corev1.HTTPGetAction{
209+
Path: "/v1/health/ready", // readiness probe targets the ready endpoint
210+
Port: intstr.FromString("8080"),
211+
},
212+
},
213+
},
214+
LivenessProbe: &corev1.Probe{
215+
InitialDelaySeconds: 15,
216+
TimeoutSeconds: 1,
217+
PeriodSeconds: 10,
218+
SuccessThreshold: 1,
219+
FailureThreshold: 3,
220+
ProbeHandler: corev1.ProbeHandler{
221+
HTTPGet: &corev1.HTTPGetAction{
222+
Path: "/v1/health/live", // liveness probe targets the live endpoint
223+
Port: intstr.FromString("8080"),
224+
},
225+
},
226+
},
227+
StartupProbe: &corev1.Probe{
228+
InitialDelaySeconds: 15,
229+
TimeoutSeconds: 1,
230+
PeriodSeconds: 10,
231+
SuccessThreshold: 1,
232+
FailureThreshold: 3,
233+
ProbeHandler: corev1.ProbeHandler{
234+
HTTPGet: &corev1.HTTPGetAction{
235+
Path: "/v1/health/ready",
236+
Port: intstr.FromString("8080"),
237+
},
238+
},
239+
},
240+
NodeSelector: map[string]string{"disktype": "ssd"},
241+
Tolerations: []corev1.Toleration{
242+
{
243+
Key: "key1",
244+
Operator: corev1.TolerationOpExists,
245+
Effect: corev1.TaintEffectNoSchedule,
246+
},
247+
},
248+
}
249+
250+
r := render.NewRenderer(templatesDir)
251+
lws, err := r.LeaderWorkerSet(&params)
252+
Expect(err).NotTo(HaveOccurred())
253+
Expect(lws.Name).To(Equal("test-lws"))
254+
Expect(lws.Namespace).To(Equal("default"))
255+
Expect(lws.Labels["app"]).To(Equal("test-app"))
256+
Expect(lws.Annotations["annotation-key"]).To(Equal("annotation-value"))
257+
Expect(*lws.Spec.Replicas).To(Equal(int32(3)))
258+
Expect(*lws.Spec.LeaderWorkerTemplate.Size).To(Equal(int32(2)))
259+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Name).To(Equal("nim-leader"))
260+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image).To(Equal("nim-llm:latest"))
261+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Name).To(Equal("nim-worker"))
262+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image).To(Equal("nim-llm:latest"))
263+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].Name).To(Equal("test-leader-volume"))
264+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].VolumeSource.PersistentVolumeClaim.ClaimName).To(Equal("test-leader-pvc"))
265+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].Name).To(Equal("test-worker-volume"))
266+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].VolumeSource.PersistentVolumeClaim.ClaimName).To(Equal("test-worker-pvc"))
267+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].VolumeMounts[0].Name).To(Equal("test-leader-volume"))
268+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].VolumeMounts[0].MountPath).To(Equal("/data"))
269+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].VolumeMounts[0].Name).To(Equal("test-worker-volume"))
270+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].VolumeMounts[0].MountPath).To(Equal("/data"))
271+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].ReadinessProbe).ToNot(BeNil())
272+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].StartupProbe).ToNot(BeNil())
273+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].LivenessProbe).ToNot(BeNil())
274+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.ImagePullSecrets[0].Name).To(Equal("ngc-secret"))
275+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.ImagePullSecrets[0].Name).To(Equal("ngc-secret"))
276+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Ports[0].ContainerPort).To(Equal(int32(8080)))
277+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Ports[0].ContainerPort).To(Equal(int32(8080)))
278+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.NodeSelector).To(Equal(map[string]string{"disktype": "ssd"}))
279+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.NodeSelector).To(Equal(map[string]string{"disktype": "ssd"}))
280+
Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Tolerations).To(Equal([]corev1.Toleration{{Key: "key1", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}))
281+
Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Tolerations).To(Equal([]corev1.Toleration{{Key: "key1", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}))
282+
})
132283
It("should render Deployment template correctly", func() {
133284
params := types.DeploymentParams{
134285
Name: "test-deployment",

0 commit comments

Comments
 (0)