Skip to content

Commit 34a14ab

Browse files
Adding support for universal NIMs
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent ccff6e0 commit 34a14ab

10 files changed

Lines changed: 160 additions & 28 deletions

File tree

api/apps/v1alpha1/nimcache_types.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ type HuggingFaceHubSource struct {
108108
}
109109

110110
// NGCSource references a model stored on NVIDIA NGC.
111+
// +kubebuilder:validation:XValidation:rule="!(has(self.model) && has(self.modelEndpoint))",message="Only one of 'model' or 'modelEndpoint' can be specified"
111112
type NGCSource struct {
112113
// The name of an existing pull secret containing the NGC_API_KEY
113114
AuthSecret string `json:"authSecret"`
@@ -117,7 +118,9 @@ type NGCSource struct {
117118
// PullSecret to pull the model puller image
118119
PullSecret string `json:"pullSecret,omitempty"`
119120
// Model spec for caching
120-
Model ModelSpec `json:"model,omitempty"`
121+
Model *ModelSpec `json:"model,omitempty"`
122+
// ModelEndpoint is the endpoint for the model to be cached for Universal NIM
123+
ModelEndpoint *string `json:"modelEndpoint,omitempty"`
121124
}
122125

123126
// ModelSpec is the spec required to cache selected models.
@@ -309,6 +312,36 @@ func (n *NIMCache) GetRuntimeClassName() *string {
309312
return &n.Spec.RuntimeClassName
310313
}
311314

315+
// IsUniversalNIM returns true if the NIMCache is for a universal NIM.
316+
func (n *NIMCache) IsUniversalNIM() bool {
317+
// Universal NIM is when the modelEndpoint is set in the NGCSource.
318+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.ModelEndpoint != nil {
319+
return true
320+
}
321+
// Universal NIM also supports Hugging Face endpoints.
322+
if n.Spec.Source.HF != nil {
323+
return true
324+
}
325+
return false
326+
}
327+
328+
// IsOptimizedNIM returns true if the NIMCache is for an optimized NIM.
329+
func (n *NIMCache) IsOptimizedNIM() bool {
330+
// Optimized NIM is when the NGCSource is set without a modelEndpoint.
331+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.ModelEndpoint == nil {
332+
return true
333+
}
334+
return false
335+
}
336+
337+
// GetModelSpec returns the model spec for the NIMCache.
338+
func (n *NIMCache) GetModelSpec() ModelSpec {
339+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.Model != nil {
340+
return *n.Spec.Source.NGC.Model
341+
}
342+
return ModelSpec{}
343+
}
344+
312345
// GetProxySpec returns the proxy spec for the NIMService deployment.
313346
func (n *NIMCache) GetProxySpec() *ProxySpec {
314347
return n.Spec.Proxy

api/apps/v1alpha1/nimservice_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ func (n *NIMService) GetStandardEnv() []corev1.EnvVar {
189189
envVars := []corev1.EnvVar{
190190
{
191191
Name: "NIM_CACHE_PATH",
192-
Value: "/model-store",
192+
Value: utils.DefaultModelStorePath,
193193
},
194194
{
195195
Name: "NGC_API_KEY",

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 10 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

config/crd/bases/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

internal/controller/nimcache_controller.go

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ func (r *NIMCacheReconciler) reconcileServiceAccount(ctx context.Context, nimCac
494494
}
495495

496496
// Create the ServiceAccount
497+
497498
err = r.Create(ctx, newSA)
498499
if err != nil {
499500
logger.Error(err, "Failed to create ServiceAccount", "Name", saName)
@@ -550,8 +551,8 @@ func (r *NIMCacheReconciler) reconcilePVC(ctx context.Context, nimCache *appsv1a
550551
// Model auto-selection is enabled and
551552
// Explicit model profiles are not provided by the user.
552553
func isModelSelectionRequired(nimCache *appsv1alpha1.NIMCache) bool {
553-
if nimCache.Spec.Source.NGC != nil &&
554-
len(nimCache.Spec.Source.NGC.Model.Profiles) == 0 {
554+
if nimCache.IsOptimizedNIM() &&
555+
len(nimCache.GetModelSpec().Profiles) == 0 {
555556
return true
556557
}
557558
return false
@@ -567,8 +568,8 @@ func isModelSelectionDone(nimCache *appsv1alpha1.NIMCache) bool {
567568
}
568569

569570
func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
570-
if nimCache.Spec.Source.NGC != nil {
571-
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
571+
if nimCache.IsOptimizedNIM() {
572+
if len(nimCache.GetModelSpec().Profiles) > 0 {
572573
return nimCache.Spec.Source.NGC.Model.Profiles, nil
573574
}
574575

@@ -589,7 +590,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
589590
logger := r.GetLogger()
590591

591592
// Model manifest is available only for optimized NIMs (NGC source without a modelEndpoint)
592-
if nimCache.Spec.Source.NGC == nil {
593+
if !nimCache.IsOptimizedNIM() {
593594
return false, nil
594595
}
595596

@@ -677,7 +678,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
677678
if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) {
678679
var discoveredGPUs []string
679680
// If no specific GPUs are provided, then auto-detect GPUs in the cluster for profile selection
680-
if len(nimCache.Spec.Source.NGC.Model.GPUs) == 0 {
681+
if len(nimCache.GetModelSpec().GPUs) == 0 {
681682
gpusByNode, err := r.GetNodeGPUProducts(ctx)
682683
if err != nil {
683684
logger.Error(err, "Failed to get gpus in the cluster")
@@ -693,7 +694,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
693694
}
694695

695696
// Match profiles with user input
696-
profiles, err := nimManifest.MatchProfiles(nimCache.Spec.Source.NGC.Model, discoveredGPUs)
697+
profiles, err := nimManifest.MatchProfiles(nimCache.GetModelSpec(), discoveredGPUs)
697698
if err != nil {
698699
logger.Error(err, "Failed to match profiles for given model parameters")
699700
return err
@@ -1084,7 +1085,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
10841085
}
10851086
}
10861087

1087-
if nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil {
1088+
switch {
1089+
case nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil:
10881090
var hfDataSource HFInterface
10891091
if nimCache.Spec.Source.DataStore != nil {
10901092
hfDataSource = nimCache.Spec.Source.DataStore
@@ -1117,6 +1119,10 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11171119
Name: "HF_ENDPOINT",
11181120
Value: hfDataSource.GetEndpoint(),
11191121
},
1122+
{
1123+
Name: "HF_HUB_OFFLINE",
1124+
Value: "0",
1125+
},
11201126
},
11211127
VolumeMounts: []corev1.VolumeMount{
11221128
{
@@ -1152,7 +1158,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11521158
Name: hfDataSource.GetPullSecret(),
11531159
},
11541160
}
1155-
} else if nimCache.Spec.Source.NGC != nil {
1161+
1162+
case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint == nil:
11561163
job.Spec.Template.Spec.Containers = []corev1.Container{
11571164
{
11581165
Name: NIMCacheContainerName,
@@ -1220,8 +1227,57 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
12201227
job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...)
12211228
}
12221229
}
1223-
}
12241230

1231+
case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint != nil:
1232+
job.Spec.Template.Spec.Containers = []corev1.Container{
1233+
{
1234+
Name: NIMCacheContainerName,
1235+
Image: nimCache.Spec.Source.NGC.ModelPuller,
1236+
Command: []string{"create-model-store"},
1237+
Args: []string{"--model-repo", *nimCache.Spec.Source.NGC.ModelEndpoint, "--model-store", "/model-store"},
1238+
EnvFrom: nimCache.Spec.Source.EnvFromSecrets(),
1239+
Env: []corev1.EnvVar{
1240+
{
1241+
Name: "NIM_CACHE_PATH",
1242+
Value: utils.DefaultModelStorePath,
1243+
},
1244+
},
1245+
VolumeMounts: []corev1.VolumeMount{
1246+
{
1247+
Name: "nim-cache-volume",
1248+
MountPath: utils.DefaultModelStorePath,
1249+
SubPath: nimCache.Spec.Storage.PVC.SubPath,
1250+
},
1251+
},
1252+
Resources: corev1.ResourceRequirements{
1253+
Limits: map[corev1.ResourceName]apiResource.Quantity{
1254+
"cpu": nimCache.Spec.Resources.CPU,
1255+
"memory": nimCache.Spec.Resources.Memory,
1256+
},
1257+
Requests: map[corev1.ResourceName]apiResource.Quantity{
1258+
"cpu": nimCache.Spec.Resources.CPU,
1259+
"memory": nimCache.Spec.Resources.Memory,
1260+
},
1261+
},
1262+
TerminationMessagePath: "/dev/termination-log",
1263+
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
1264+
SecurityContext: &corev1.SecurityContext{
1265+
AllowPrivilegeEscalation: ptr.To[bool](false),
1266+
Capabilities: &corev1.Capabilities{
1267+
Drop: []corev1.Capability{"ALL"},
1268+
},
1269+
RunAsNonRoot: ptr.To[bool](true),
1270+
RunAsGroup: nimCache.GetGroupID(),
1271+
RunAsUser: nimCache.GetUserID(),
1272+
},
1273+
},
1274+
}
1275+
job.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
1276+
{
1277+
Name: nimCache.Spec.Source.NGC.PullSecret,
1278+
},
1279+
}
1280+
}
12251281
// Merge env with the user provided values
12261282
job.Spec.Template.Spec.Containers[0].Env = utils.MergeEnvVars(job.Spec.Template.Spec.Containers[0].Env, nimCache.Spec.Env)
12271283

internal/controller/nimcache_controller_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ var _ = Describe("NIMCache Controller", func() {
487487
Namespace: "default",
488488
},
489489
Spec: appsv1alpha1.NIMCacheSpec{
490-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
490+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
491491
},
492492
}
493493

@@ -513,7 +513,7 @@ var _ = Describe("NIMCache Controller", func() {
513513
Annotations: map[string]string{SelectedNIMProfilesAnnotationKey: string(profilesJSON)},
514514
},
515515
Spec: appsv1alpha1.NIMCacheSpec{
516-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
516+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
517517
Env: []corev1.EnvVar{
518518
{
519519
Name: "NGC_HOME",
@@ -567,7 +567,7 @@ var _ = Describe("NIMCache Controller", func() {
567567
Namespace: "default",
568568
},
569569
Spec: appsv1alpha1.NIMCacheSpec{
570-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
570+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
571571
Proxy: &appsv1alpha1.ProxySpec{
572572
HttpProxy: "http://proxy:1000",
573573
HttpsProxy: "https://proxy:1000",

internal/controller/platform/standalone/nimservice.go

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
252252

253253
// Select PVC for model store
254254
nimCacheName := nimService.GetNIMCacheName()
255+
nimCache := appsv1alpha1.NIMCache{}
255256
if nimCacheName != "" { // nolint:gocritic
256-
nimCache := appsv1alpha1.NIMCache{}
257257
if err := r.Get(ctx, types.NamespacedName{Name: nimCacheName, Namespace: nimService.GetNamespace()}, &nimCache); err != nil {
258258
// Fail the NIMService if the NIMCache is not found
259259
if k8serrors.IsNotFound(err) {
@@ -338,24 +338,34 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
338338
}
339339
deploymentParams.Env = append(deploymentParams.Env, profileEnv)
340340

341-
// Retrieve and set profile details from NIMCache
342-
var profile *appsv1alpha1.NIMProfile
343-
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
344-
if err != nil {
345-
logger.Error(err, "Failed to get cached NIM profile")
346-
return ctrl.Result{}, err
347-
}
348-
349-
// Auto assign GPU resources in case of the optimized profile
350-
if profile != nil {
351-
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
341+
// Only assign GPU resources if the NIMCache is for optimized NIM
342+
if nimCache.IsOptimizedNIM() {
343+
// Retrieve and set profile details from NIMCache
344+
var profile *appsv1alpha1.NIMProfile
345+
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
346+
if err != nil {
347+
logger.Error(err, "Failed to get cached NIM profile")
352348
return ctrl.Result{}, err
353349
}
350+
351+
// Auto assign GPU resources in case of the optimized profile
352+
if profile != nil {
353+
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
354+
return ctrl.Result{}, err
355+
}
356+
}
354357
}
355358

356359
// TODO: assign GPU resources and node selector that is required for the selected profile
357360
}
358361

362+
if nimCache.IsUniversalNIM() {
363+
deploymentParams.Env = append(deploymentParams.Env, corev1.EnvVar{
364+
Name: "NIM_MODEL_NAME",
365+
Value: utils.DefaultModelStorePath,
366+
})
367+
}
368+
359369
// Setup pod resource claims
360370
namedDraResources := shared.GenerateNamedDRAResources(nimService)
361371
deploymentParams.PodResourceClaims = shared.GetPodResourceClaims(namedDraResources)

internal/utils/utils.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ const (
4949

5050
// DRAPodClaimNameAnnotationKey indicates annotation name for the identifier of a resource claim template in a pod spec.
5151
DRAPodClaimNameAnnotationKey = "resource.kubernetes.io/pod-claim-name"
52+
53+
// DefaultModelStorePath is the default path for model store.
54+
DefaultModelStorePath = "/model-store"
5255
)
5356

5457
const (

0 commit comments

Comments
 (0)