Skip to content

Commit 1188da9

Browse files
Updating pipelineparallelism field
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent aa56a04 commit 1188da9

File tree

11 files changed

+112
-55
lines changed

11 files changed

+112
-55
lines changed

api/apps/v1alpha1/nimservice_types.go

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,8 @@ type NimServiceMultiNodeConfig struct {
140140
// BackendType specifies the backend type for the multi-node NIMService. Currently only LWS is supported.
141141
BackendType NIMBackendType `json:"backendType,omitempty"`
142142

143-
// +kubebuilder:default:=1
144-
// PipelineParallelism specifies the number of pods to create for the multi-node NIMService.
145-
// +kubebuilder:validation:Minimum=1
146-
PipelineParallelism int `json:"pipelineParallelism,omitempty"`
143+
// Parallelism specifies the parallelism strategy for the multi-node NIMService.
144+
Parallelism Parallelism `json:"parallelism,omitempty"`
147145

148146
// MPI config for NIMService using LeaderWorkerSet
149147
MPI *MultiNodeMPIConfig `json:"mpi,omitempty"`
@@ -155,6 +153,13 @@ type MultiNodeMPIConfig struct {
155153
MPIStartTimeout int `json:"mpiStartTimeout"`
156154
}
157155

156+
// Parallelism defines the parallelism settings for a multi-node NIMService.
// NOTE(review): the default of 1 on PipelineParallelism is declared on a nested
// field; presumably it is only applied when the enclosing `parallelism` object
// is present in the manifest, leaving the value 0 otherwise — confirm that
// consumers of this value tolerate 0.
type Parallelism struct {
157+
// +kubebuilder:default:=1
158+
// PipelineParallelism specifies the number of pods to create for the multi-node NIMService.
159+
// +kubebuilder:validation:Minimum=1
160+
PipelineParallelism int `json:"pipelineParallelism,omitempty"`
161+
}
162+
158163
// NIMCacheVolSpec defines the spec to use NIMCache volume.
159164
type NIMCacheVolSpec struct {
160165
Name string `json:"name,omitempty"`
@@ -338,7 +343,7 @@ func (n *NIMService) getLWSCommonEnv() []corev1.EnvVar {
338343
},
339344
{
340345
Name: "NIM_NUM_COMPUTE_NODES",
341-
Value: fmt.Sprintf("%d", n.Spec.MultiNode.PipelineParallelism),
346+
Value: fmt.Sprintf("%d", n.Spec.MultiNode.Parallelism.PipelineParallelism),
342347
},
343348
{
344349
Name: "NIM_MULTI_NODE",
@@ -350,7 +355,7 @@ func (n *NIMService) getLWSCommonEnv() []corev1.EnvVar {
350355
},
351356
{
352357
Name: "NIM_PIPELINE_PARALLEL_SIZE",
353-
Value: fmt.Sprintf("%d", n.Spec.MultiNode.PipelineParallelism),
358+
Value: fmt.Sprintf("%d", n.Spec.MultiNode.Parallelism.PipelineParallelism),
354359
},
355360
{
356361
Name: "NIM_NODE_RANK",
@@ -915,7 +920,7 @@ func (n *NIMService) GetLWSSize() int {
915920
if n.Spec.MultiNode == nil {
916921
return 0
917922
}
918-
return n.Spec.MultiNode.PipelineParallelism
923+
return n.Spec.MultiNode.Parallelism.PipelineParallelism
919924
}
920925

921926
// GetDeploymentKind returns the kind of deployment for NIMService.
@@ -1201,7 +1206,7 @@ func (n *NIMService) generateMPIConfigData() map[string]string {
12011206
data := make(map[string]string)
12021207
for i := 0; i < n.Spec.Replicas; i++ {
12031208
hostfile := fmt.Sprintf("localhost slots=%d\n", n.GetGPUCountPerPod())
1204-
for j := 1; j < n.Spec.MultiNode.PipelineParallelism; j++ {
1209+
for j := 1; j < n.Spec.MultiNode.Parallelism.PipelineParallelism; j++ {
12051210
workerHostname := fmt.Sprintf("%s-%d-%d.%s.%s.svc slots=%d",
12061211
n.GetLWSName(), i, j, n.GetLWSName(), n.GetNamespace(), n.GetGPUCountPerPod())
12071212
hostfile += workerHostname + "\n"

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimpipelines.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,12 +1042,17 @@ spec:
10421042
required:
10431043
- mpiStartTimeout
10441044
type: object
1045-
pipelineParallelism:
1046-
default: 1
1047-
description: PipelineParallelism specifies the number
1048-
of pods to create for the multi-node NIMService.
1049-
minimum: 1
1050-
type: integer
1045+
parallelism:
1046+
description: Parallelism specifies the parallelism strategy
1047+
for the multi-node NIMService.
1048+
properties:
1049+
pipelineParallelism:
1050+
default: 1
1051+
description: PipelineParallelism specifies the number
1052+
of pods to create for the multi-node NIMService.
1053+
minimum: 1
1054+
type: integer
1055+
type: object
10511056
type: object
10521057
nodeSelector:
10531058
additionalProperties:

bundle/manifests/apps.nvidia.com_nimservices.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -976,12 +976,17 @@ spec:
976976
required:
977977
- mpiStartTimeout
978978
type: object
979-
pipelineParallelism:
980-
default: 1
981-
description: PipelineParallelism specifies the number of pods
982-
to create for the multi-node NIMService.
983-
minimum: 1
984-
type: integer
979+
parallelism:
980+
description: Parallelism specifies the parallelism strategy for
981+
the multi-node NIMService.
982+
properties:
983+
pipelineParallelism:
984+
default: 1
985+
description: PipelineParallelism specifies the number of pods
986+
to create for the multi-node NIMService.
987+
minimum: 1
988+
type: integer
989+
type: object
985990
type: object
986991
nodeSelector:
987992
additionalProperties:

config/crd/bases/apps.nvidia.com_nimpipelines.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,12 +1042,17 @@ spec:
10421042
required:
10431043
- mpiStartTimeout
10441044
type: object
1045-
pipelineParallelism:
1046-
default: 1
1047-
description: PipelineParallelism specifies the number
1048-
of pods to create for the multi-node NIMService.
1049-
minimum: 1
1050-
type: integer
1045+
parallelism:
1046+
description: Parallelism specifies the parallelism strategy
1047+
for the multi-node NIMService.
1048+
properties:
1049+
pipelineParallelism:
1050+
default: 1
1051+
description: PipelineParallelism specifies the number
1052+
of pods to create for the multi-node NIMService.
1053+
minimum: 1
1054+
type: integer
1055+
type: object
10511056
type: object
10521057
nodeSelector:
10531058
additionalProperties:

config/crd/bases/apps.nvidia.com_nimservices.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -976,12 +976,17 @@ spec:
976976
required:
977977
- mpiStartTimeout
978978
type: object
979-
pipelineParallelism:
980-
default: 1
981-
description: PipelineParallelism specifies the number of pods
982-
to create for the multi-node NIMService.
983-
minimum: 1
984-
type: integer
979+
parallelism:
980+
description: Parallelism specifies the parallelism strategy for
981+
the multi-node NIMService.
982+
properties:
983+
pipelineParallelism:
984+
default: 1
985+
description: PipelineParallelism specifies the number of pods
986+
to create for the multi-node NIMService.
987+
minimum: 1
988+
type: integer
989+
type: object
985990
type: object
986991
nodeSelector:
987992
additionalProperties:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,12 +1042,17 @@ spec:
10421042
required:
10431043
- mpiStartTimeout
10441044
type: object
1045-
pipelineParallelism:
1046-
default: 1
1047-
description: PipelineParallelism specifies the number
1048-
of pods to create for the multi-node NIMService.
1049-
minimum: 1
1050-
type: integer
1045+
parallelism:
1046+
description: Parallelism specifies the parallelism strategy
1047+
for the multi-node NIMService.
1048+
properties:
1049+
pipelineParallelism:
1050+
default: 1
1051+
description: PipelineParallelism specifies the number
1052+
of pods to create for the multi-node NIMService.
1053+
minimum: 1
1054+
type: integer
1055+
type: object
10511056
type: object
10521057
nodeSelector:
10531058
additionalProperties:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -976,12 +976,17 @@ spec:
976976
required:
977977
- mpiStartTimeout
978978
type: object
979-
pipelineParallelism:
980-
default: 1
981-
description: PipelineParallelism specifies the number of pods
982-
to create for the multi-node NIMService.
983-
minimum: 1
984-
type: integer
979+
parallelism:
980+
description: Parallelism specifies the parallelism strategy for
981+
the multi-node NIMService.
982+
properties:
983+
pipelineParallelism:
984+
default: 1
985+
description: PipelineParallelism specifies the number of pods
986+
to create for the multi-node NIMService.
987+
minimum: 1
988+
type: integer
989+
type: object
985990
type: object
986991
nodeSelector:
987992
additionalProperties:

internal/controller/platform/standalone/nimservice_test.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1183,7 +1183,9 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
11831183
Service: appsv1alpha1.Service{Type: corev1.ServiceTypeLoadBalancer, Port: ptr.To[int32](8123), Annotations: map[string]string{"annotation-key-specific": "service"}},
11841184
},
11851185
MultiNode: &appsv1alpha1.NimServiceMultiNodeConfig{
1186-
PipelineParallelism: 2,
1186+
Parallelism: appsv1alpha1.Parallelism{
1187+
PipelineParallelism: 2,
1188+
},
11871189
},
11881190
},
11891191
}
@@ -2004,8 +2006,10 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
20042006
Service: appsv1alpha1.Service{Type: corev1.ServiceTypeLoadBalancer, Port: ptr.To[int32](8123)},
20052007
},
20062008
MultiNode: &appsv1alpha1.NimServiceMultiNodeConfig{
2007-
BackendType: appsv1alpha1.NIMBackendTypeLWS,
2008-
PipelineParallelism: 2,
2009+
BackendType: appsv1alpha1.NIMBackendTypeLWS,
2010+
Parallelism: appsv1alpha1.Parallelism{
2011+
PipelineParallelism: 2,
2012+
},
20092013
},
20102014
Resources: &corev1.ResourceRequirements{
20112015
Requests: corev1.ResourceList{
@@ -2231,8 +2235,10 @@ var _ = Describe("NIMServiceReconciler for a standalone platform", func() {
22312235
Service: appsv1alpha1.Service{Type: corev1.ServiceTypeLoadBalancer, Port: ptr.To[int32](8123)},
22322236
},
22332237
MultiNode: &appsv1alpha1.NimServiceMultiNodeConfig{
2234-
BackendType: appsv1alpha1.NIMBackendTypeLWS,
2235-
PipelineParallelism: 2,
2238+
BackendType: appsv1alpha1.NIMBackendTypeLWS,
2239+
Parallelism: appsv1alpha1.Parallelism{
2240+
PipelineParallelism: 2,
2241+
},
22362242
},
22372243
Resources: &corev1.ResourceRequirements{
22382244
Requests: corev1.ResourceList{

internal/webhook/apps/v1alpha1/nimservice_webhook_validation_helper.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ func validateGPURequirements(spec *appsv1alpha1.NIMServiceSpec, fldPath *field.P
413413
return errList
414414
}
415415

416-
if spec.Resources != nil {
416+
if spec.Resources != nil && len(spec.DRAResources) == 0 {
417417
// At least one of requests or limits must be specified
418418
_, hasRequests := spec.Resources.Requests[gpuResourceName]
419419
_, hasLimits := spec.Resources.Limits[gpuResourceName]

0 commit comments

Comments
 (0)