Skip to content

Commit 0bde2d7

Browse files
committed
Add unit test for multi node NIMService
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent ee06055 commit 0bde2d7

2 files changed

Lines changed: 231 additions & 0 deletions

File tree

internal/controller/platform/standalone/nimservice_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ import (
5252
ctrl "sigs.k8s.io/controller-runtime"
5353
"sigs.k8s.io/controller-runtime/pkg/client"
5454
"sigs.k8s.io/controller-runtime/pkg/client/fake"
55+
lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1"
5556

5657
"k8s.io/apimachinery/pkg/version"
5758

@@ -140,6 +141,7 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
140141
Expect(networkingv1.AddToScheme(scheme)).To(Succeed())
141142
Expect(corev1.AddToScheme(scheme)).To(Succeed())
142143
Expect(monitoringv1.AddToScheme(scheme)).To(Succeed())
144+
Expect(lwsv1.AddToScheme(scheme)).To(Succeed())
143145

144146
client = fake.NewClientBuilder().WithScheme(scheme).
145147
WithStatusSubresource(&appsv1alpha1.NIMService{}).
@@ -1011,6 +1013,67 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
10111013
})
10121014
})
10131015

1016+
// Specs for the LeaderWorkerSet (LWS) readiness check used by multi-node
// inferencing NIMServices. Each spec creates its own LWS object named after
// the NIMService under test and asserts the message/ready tuple returned by
// reconciler.isLeaderWorkerSetReady.
Describe("LWS deployment for multi-node inferencing NIMService", func() {
	// Every spec in this block creates the LWS itself, so delete it after
	// each spec to keep the specs independent of one another.
	AfterEach(func() {
		lws := &lwsv1.LeaderWorkerSet{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-nimservice-lws",
				Namespace: "default",
			},
		}
		err := client.Delete(context.TODO(), lws)
		Expect(err).NotTo(HaveOccurred())
	})

	It("should report ready when LWS is available", func() {
		// An LWS whose Available condition is True must be reported ready.
		lws := &lwsv1.LeaderWorkerSet{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-nimservice-lws",
				Namespace: "default",
			},
			Status: lwsv1.LeaderWorkerSetStatus{
				Conditions: []metav1.Condition{
					{
						Type:   string(lwsv1.LeaderWorkerSetAvailable),
						Status: metav1.ConditionTrue,
					},
				},
			},
		}
		err := client.Create(context.TODO(), lws)
		Expect(err).NotTo(HaveOccurred())

		msg, ready, err := reconciler.isLeaderWorkerSetReady(context.TODO(), nimService)
		Expect(err).ToNot(HaveOccurred())
		Expect(ready).To(BeTrue())
		Expect(msg).To(Equal(fmt.Sprintf("leaderworkerset %q is ready", lws.Name)))
	})

	It("should report not ready when LWS is not available", func() {
		// An LWS that is still Progressing with Available=False must be
		// reported not ready.
		lws := &lwsv1.LeaderWorkerSet{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "test-nimservice-lws",
				Namespace: "default",
			},
			Status: lwsv1.LeaderWorkerSetStatus{
				Conditions: []metav1.Condition{
					{
						Type:   string(lwsv1.LeaderWorkerSetProgressing),
						Status: metav1.ConditionTrue,
					},
					{
						Type:   string(lwsv1.LeaderWorkerSetAvailable),
						Status: metav1.ConditionFalse,
					},
				},
			},
		}
		err := client.Create(context.TODO(), lws)
		Expect(err).NotTo(HaveOccurred())

		msg, ready, err := reconciler.isLeaderWorkerSetReady(context.TODO(), nimService)
		Expect(err).ToNot(HaveOccurred())
		Expect(ready).To(BeFalse())
		Expect(msg).To(Equal(fmt.Sprintf("leaderworkerset %q is not ready", lws.Name)))
	})
})
10141077
Describe("update model status on NIMService", func() {
10151078
BeforeEach(func() {
10161079
ingress := &networkingv1.Ingress{
@@ -1470,6 +1533,23 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
14701533
Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("1")))
14711534
})
14721535

1536+
It("should assign GPU resource equal to multiNode.GPUPerWorker if tensor parallelism is not provided", func() {
	// When the selected profile carries no tensorParallelism entry in its
	// config, the GPU request/limit should fall back to the per-worker GPU
	// count declared in the multi-node spec.
	nimService.Spec.MultiNode = &appsv1alpha1.NimServiceMultiNodeConfig{
		GPUPerWorker: 2,
	}
	profile := &appsv1alpha1.NIMProfile{
		Name:   "test-profile",
		Config: map[string]string{}, // deliberately empty: no tensor parallelism hint
	}

	resources, err := reconciler.addGPUResources(context.TODO(), nimService, profile)
	Expect(err).ToNot(HaveOccurred())
	Expect(resources).ToNot(BeNil())

	// Both requests and limits should equal GPUPerWorker.
	Expect(resources.Requests).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2")))
	Expect(resources.Limits).To(HaveKeyWithValue(corev1.ResourceName("nvidia.com/gpu"), resource.MustParse("2")))
})
1552+
14731553
It("should return an error if tensor parallelism cannot be parsed", func() {
14741554
profile := &appsv1alpha1.NIMProfile{
14751555
Name: "test-profile",

internal/render/render_test.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,157 @@ var _ = Describe("K8s Resources Rendering", func() {
129129
templatesDir := filepath.Join(path.Dir(path.Dir(cwd)), "manifests")
130130

131131
Context("Rendering templates", func() {
132+
It("should render LeaderWorkerSet template correctly", func() {
	// Build a fully-populated params struct so the rendered manifest
	// exercises every template field: metadata, leader/worker volumes and
	// mounts, env vars, probes, scheduling constraints, and resources.
	params := types.LeaderWorkerSetParams{
		Name:             "test-lws",
		Namespace:        "default",
		Labels:           map[string]string{"app": "test-app"},
		Annotations:      map[string]string{"annotation-key": "annotation-value"},
		Replicas:         3,
		Size:             2,
		Image:            "nim-llm:latest",
		ImagePullSecrets: []string{"ngc-secret"},
		LeaderVolumes: []corev1.Volume{
			{
				Name: "test-leader-volume",
				VolumeSource: corev1.VolumeSource{
					PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
						ClaimName: "test-leader-pvc",
					},
				},
			},
		},
		WorkerVolumes: []corev1.Volume{
			{
				Name: "test-worker-volume",
				VolumeSource: corev1.VolumeSource{
					PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
						ClaimName: "test-worker-pvc",
					},
				},
			},
		},
		LeaderVolumeMounts: []corev1.VolumeMount{
			{
				Name:      "test-leader-volume",
				MountPath: "/data",
			},
		},
		WorkerVolumeMounts: []corev1.VolumeMount{
			{
				Name:      "test-worker-volume",
				MountPath: "/data",
			},
		},
		LeaderEnvs: []corev1.EnvVar{
			{
				Name:  "LEADER_ENV_VAR",
				Value: "value",
			},
		},
		WorkerEnvs: []corev1.EnvVar{
			{
				Name:  "WORKER_ENV_VAR",
				Value: "value",
			},
		},
		Ports: []corev1.ContainerPort{
			{
				ContainerPort: 8080,
			},
		},
		Resources: &corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("500m"),
				corev1.ResourceMemory: resource.MustParse("128Mi"),
			},
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("250m"),
				corev1.ResourceMemory: resource.MustParse("64Mi"),
			},
		},
		// Readiness probes the "ready" endpoint; liveness probes the
		// "live" endpoint. (These were previously swapped in the fixture.)
		ReadinessProbe: &corev1.Probe{
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			PeriodSeconds:       10,
			SuccessThreshold:    1,
			FailureThreshold:    3,
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/v1/health/ready",
					Port: intstr.FromString("8080"),
				},
			},
		},
		LivenessProbe: &corev1.Probe{
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			PeriodSeconds:       10,
			SuccessThreshold:    1,
			FailureThreshold:    3,
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/v1/health/live",
					Port: intstr.FromString("8080"),
				},
			},
		},
		StartupProbe: &corev1.Probe{
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			PeriodSeconds:       10,
			SuccessThreshold:    1,
			FailureThreshold:    3,
			ProbeHandler: corev1.ProbeHandler{
				HTTPGet: &corev1.HTTPGetAction{
					Path: "/v1/health/ready",
					Port: intstr.FromString("8080"),
				},
			},
		},
		NodeSelector: map[string]string{"disktype": "ssd"},
		Tolerations: []corev1.Toleration{
			{
				Key:      "key1",
				Operator: corev1.TolerationOpExists,
				Effect:   corev1.TaintEffectNoSchedule,
			},
		},
	}

	r := render.NewRenderer(templatesDir)
	lws, err := r.LeaderWorkerSet(&params)
	Expect(err).NotTo(HaveOccurred())

	// Metadata and top-level spec fields.
	Expect(lws.Name).To(Equal("test-lws"))
	Expect(lws.Namespace).To(Equal("default"))
	Expect(lws.Labels["app"]).To(Equal("test-app"))
	Expect(lws.Annotations["annotation-key"]).To(Equal("annotation-value"))
	Expect(*lws.Spec.Replicas).To(Equal(int32(3)))
	Expect(*lws.Spec.LeaderWorkerTemplate.Size).To(Equal(int32(2)))

	// Leader and worker containers share the same image.
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Name).To(Equal("nim-leader"))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Image).To(Equal("nim-llm:latest"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Name).To(Equal("nim-worker"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Image).To(Equal("nim-llm:latest"))

	// Volumes and volume mounts are routed to their respective pod templates.
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].Name).To(Equal("test-leader-volume"))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Volumes[0].VolumeSource.PersistentVolumeClaim.ClaimName).To(Equal("test-leader-pvc"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].Name).To(Equal("test-worker-volume"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Volumes[0].VolumeSource.PersistentVolumeClaim.ClaimName).To(Equal("test-worker-pvc"))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].VolumeMounts[0].Name).To(Equal("test-leader-volume"))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].VolumeMounts[0].MountPath).To(Equal("/data"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].VolumeMounts[0].Name).To(Equal("test-worker-volume"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].VolumeMounts[0].MountPath).To(Equal("/data"))

	// Probes are only checked for presence on the leader container.
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].ReadinessProbe).ToNot(BeNil())
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].StartupProbe).ToNot(BeNil())
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].LivenessProbe).ToNot(BeNil())

	// Pull secrets, ports, and scheduling constraints apply to both templates.
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.ImagePullSecrets[0].Name).To(Equal("ngc-secret"))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.ImagePullSecrets[0].Name).To(Equal("ngc-secret"))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Containers[0].Ports[0].ContainerPort).To(Equal(int32(8080)))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Containers[0].Ports[0].ContainerPort).To(Equal(int32(8080)))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.NodeSelector).To(Equal(map[string]string{"disktype": "ssd"}))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.NodeSelector).To(Equal(map[string]string{"disktype": "ssd"}))
	Expect(lws.Spec.LeaderWorkerTemplate.LeaderTemplate.Spec.Tolerations).To(Equal([]corev1.Toleration{{Key: "key1", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}))
	Expect(lws.Spec.LeaderWorkerTemplate.WorkerTemplate.Spec.Tolerations).To(Equal([]corev1.Toleration{{Key: "key1", Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}}))
})
132283
It("should render Deployment template correctly", func() {
133284
params := types.DeploymentParams{
134285
Name: "test-deployment",

0 commit comments

Comments
 (0)