Skip to content

Commit 71ca494

Browse files
Adding support for universal NIMs
Signed-off-by: Vishesh Tanksale <vtanksale@nvidia.com>
1 parent 51169e1 commit 71ca494

10 files changed

Lines changed: 160 additions & 28 deletions

File tree

api/apps/v1alpha1/nimcache_types.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ type HuggingFaceHubSource struct {
108108
}
109109

110110
// NGCSource references a model stored on NVIDIA NGC.
111+
// +kubebuilder:validation:XValidation:rule="!(has(self.model) && has(self.modelEndpoint))",message="Only one of 'model' or 'modelEndpoint' can be specified"
111112
type NGCSource struct {
112113
// The name of an existing pull secret containing the NGC_API_KEY
113114
AuthSecret string `json:"authSecret"`
@@ -117,7 +118,9 @@ type NGCSource struct {
117118
// PullSecret to pull the model puller image
118119
PullSecret string `json:"pullSecret,omitempty"`
119120
// Model spec for caching
120-
Model ModelSpec `json:"model,omitempty"`
121+
Model *ModelSpec `json:"model,omitempty"`
122+
// ModelEndpoint is the endpoint for the model to be cached for Universal NIM
123+
ModelEndpoint *string `json:"modelEndpoint,omitempty"`
121124
}
122125

123126
// ModelSpec is the spec required to cache selected models.
@@ -309,6 +312,36 @@ func (n *NIMCache) GetRuntimeClassName() *string {
309312
return &n.Spec.RuntimeClassName
310313
}
311314

315+
// IsUniversalNIM returns true if the NIMCache is for a universal NIM.
316+
func (n *NIMCache) IsUniversalNIM() bool {
317+
// Universal NIM is when the modelEndpoint is set in the NGCSource.
318+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.ModelEndpoint != nil {
319+
return true
320+
}
321+
// Universal NIM also supports Hugging Face endpoints.
322+
if n.Spec.Source.HF != nil {
323+
return true
324+
}
325+
return false
326+
}
327+
328+
// IsOptimizedNIM returns true if the NIMCache is for an optimized NIM.
329+
func (n *NIMCache) IsOptimizedNIM() bool {
330+
// Optimized NIM is when the NGCSource is set and its modelEndpoint is not set.
331+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.ModelEndpoint == nil {
332+
return true
333+
}
334+
return false
335+
}
336+
337+
// GetModelSpec returns the model spec for the NIMCache.
338+
func (n *NIMCache) GetModelSpec() ModelSpec {
339+
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.Model != nil {
340+
return *n.Spec.Source.NGC.Model
341+
}
342+
return ModelSpec{}
343+
}
344+
312345
// GetProxySpec returns the proxy spec for the NIMService deployment.
313346
func (n *NIMCache) GetProxySpec() *ProxySpec {
314347
return n.Spec.Proxy

api/apps/v1alpha1/nimservice_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ func (n *NIMService) GetStandardEnv() []corev1.EnvVar {
189189
envVars := []corev1.EnvVar{
190190
{
191191
Name: "NIM_CACHE_PATH",
192-
Value: "/model-store",
192+
Value: utils.DefaultModelStorePath,
193193
},
194194
{
195195
Name: "NGC_API_KEY",

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 10 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

config/crd/bases/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ spec:
389389
for the model computations
390390
type: string
391391
type: object
392+
modelEndpoint:
393+
description: ModelEndpoint is the endpoint for the model to
394+
be cached for Universal NIM
395+
type: string
392396
modelPuller:
393397
description: ModelPuller is the container image that can pull
394398
the model
@@ -405,6 +409,9 @@ spec:
405409
- authSecret
406410
- modelPuller
407411
type: object
412+
x-kubernetes-validations:
413+
- message: Only one of 'model' or 'modelEndpoint' can be specified
414+
rule: '!(has(self.model) && has(self.modelEndpoint))'
408415
type: object
409416
x-kubernetes-validations:
410417
- message: Exactly one of ngc, dataStore, or hf must be defined

internal/controller/nimcache_controller.go

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,7 @@ func (r *NIMCacheReconciler) reconcileServiceAccount(ctx context.Context, nimCac
498498
}
499499

500500
// Create the ServiceAccount
501+
501502
err = r.Create(ctx, newSA)
502503
if err != nil {
503504
logger.Error(err, "Failed to create ServiceAccount", "Name", saName)
@@ -554,8 +555,8 @@ func (r *NIMCacheReconciler) reconcilePVC(ctx context.Context, nimCache *appsv1a
554555
// Model auto-selection is enabled and
555556
// Explicit model profiles are not provided by the user.
556557
func isModelSelectionRequired(nimCache *appsv1alpha1.NIMCache) bool {
557-
if nimCache.Spec.Source.NGC != nil &&
558-
len(nimCache.Spec.Source.NGC.Model.Profiles) == 0 {
558+
if nimCache.IsOptimizedNIM() &&
559+
len(nimCache.GetModelSpec().Profiles) == 0 {
559560
return true
560561
}
561562
return false
@@ -571,8 +572,8 @@ func isModelSelectionDone(nimCache *appsv1alpha1.NIMCache) bool {
571572
}
572573

573574
func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
574-
if nimCache.Spec.Source.NGC != nil {
575-
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
575+
if nimCache.IsOptimizedNIM() {
576+
if len(nimCache.GetModelSpec().Profiles) > 0 {
576577
return nimCache.Spec.Source.NGC.Model.Profiles, nil
577578
}
578579

@@ -593,7 +594,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
593594
logger := r.GetLogger()
594595

595596
// Model manifest is available only for NGC model pullers
596-
if nimCache.Spec.Source.NGC == nil {
597+
if !nimCache.IsOptimizedNIM() {
597598
return false, nil
598599
}
599600

@@ -681,7 +682,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
681682
if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) {
682683
var discoveredGPUs []string
683684
// If no specific GPUs are provided, then auto-detect GPUs in the cluster for profile selection
684-
if len(nimCache.Spec.Source.NGC.Model.GPUs) == 0 {
685+
if len(nimCache.GetModelSpec().GPUs) == 0 {
685686
gpusByNode, err := r.GetNodeGPUProducts(ctx)
686687
if err != nil {
687688
logger.Error(err, "Failed to get gpus in the cluster")
@@ -697,7 +698,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
697698
}
698699

699700
// Match profiles with user input
700-
profiles, err := nimManifest.MatchProfiles(nimCache.Spec.Source.NGC.Model, discoveredGPUs)
701+
profiles, err := nimManifest.MatchProfiles(nimCache.GetModelSpec(), discoveredGPUs)
701702
if err != nil {
702703
logger.Error(err, "Failed to match profiles for given model parameters")
703704
return err
@@ -1117,7 +1118,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11171118
}
11181119
}
11191120

1120-
if nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil {
1121+
switch {
1122+
case nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil:
11211123
var hfDataSource HFInterface
11221124
if nimCache.Spec.Source.DataStore != nil {
11231125
hfDataSource = nimCache.Spec.Source.DataStore
@@ -1150,6 +1152,10 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11501152
Name: "HF_ENDPOINT",
11511153
Value: hfDataSource.GetEndpoint(),
11521154
},
1155+
{
1156+
Name: "HF_HUB_OFFLINE",
1157+
Value: "0",
1158+
},
11531159
},
11541160
VolumeMounts: []corev1.VolumeMount{
11551161
{
@@ -1185,7 +1191,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11851191
Name: hfDataSource.GetPullSecret(),
11861192
},
11871193
}
1188-
} else if nimCache.Spec.Source.NGC != nil {
1194+
1195+
case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint == nil:
11891196
job.Spec.Template.Spec.Containers = []corev1.Container{
11901197
{
11911198
Name: NIMCacheContainerName,
@@ -1253,8 +1260,57 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
12531260
job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...)
12541261
}
12551262
}
1256-
}
12571263

1264+
case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint != nil:
1265+
job.Spec.Template.Spec.Containers = []corev1.Container{
1266+
{
1267+
Name: NIMCacheContainerName,
1268+
Image: nimCache.Spec.Source.NGC.ModelPuller,
1269+
Command: []string{"create-model-store"},
1270+
Args: []string{"--model-repo", *nimCache.Spec.Source.NGC.ModelEndpoint, "--model-store", "/model-store"},
1271+
EnvFrom: nimCache.Spec.Source.EnvFromSecrets(),
1272+
Env: []corev1.EnvVar{
1273+
{
1274+
Name: "NIM_CACHE_PATH",
1275+
Value: utils.DefaultModelStorePath,
1276+
},
1277+
},
1278+
VolumeMounts: []corev1.VolumeMount{
1279+
{
1280+
Name: "nim-cache-volume",
1281+
MountPath: utils.DefaultModelStorePath,
1282+
SubPath: nimCache.Spec.Storage.PVC.SubPath,
1283+
},
1284+
},
1285+
Resources: corev1.ResourceRequirements{
1286+
Limits: map[corev1.ResourceName]apiResource.Quantity{
1287+
"cpu": nimCache.Spec.Resources.CPU,
1288+
"memory": nimCache.Spec.Resources.Memory,
1289+
},
1290+
Requests: map[corev1.ResourceName]apiResource.Quantity{
1291+
"cpu": nimCache.Spec.Resources.CPU,
1292+
"memory": nimCache.Spec.Resources.Memory,
1293+
},
1294+
},
1295+
TerminationMessagePath: "/dev/termination-log",
1296+
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
1297+
SecurityContext: &corev1.SecurityContext{
1298+
AllowPrivilegeEscalation: ptr.To[bool](false),
1299+
Capabilities: &corev1.Capabilities{
1300+
Drop: []corev1.Capability{"ALL"},
1301+
},
1302+
RunAsNonRoot: ptr.To[bool](true),
1303+
RunAsGroup: nimCache.GetGroupID(),
1304+
RunAsUser: nimCache.GetUserID(),
1305+
},
1306+
},
1307+
}
1308+
job.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
1309+
{
1310+
Name: nimCache.Spec.Source.NGC.PullSecret,
1311+
},
1312+
}
1313+
}
12581314
// Merge env with the user provided values
12591315
job.Spec.Template.Spec.Containers[0].Env = utils.MergeEnvVars(job.Spec.Template.Spec.Containers[0].Env, nimCache.Spec.Env)
12601316

internal/controller/nimcache_controller_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ var _ = Describe("NIMCache Controller", func() {
487487
Namespace: "default",
488488
},
489489
Spec: appsv1alpha1.NIMCacheSpec{
490-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
490+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
491491
},
492492
}
493493

@@ -513,7 +513,7 @@ var _ = Describe("NIMCache Controller", func() {
513513
Annotations: map[string]string{SelectedNIMProfilesAnnotationKey: string(profilesJSON)},
514514
},
515515
Spec: appsv1alpha1.NIMCacheSpec{
516-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
516+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
517517
Env: []corev1.EnvVar{
518518
{
519519
Name: "NGC_HOME",
@@ -567,7 +567,7 @@ var _ = Describe("NIMCache Controller", func() {
567567
Namespace: "default",
568568
},
569569
Spec: appsv1alpha1.NIMCacheSpec{
570-
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
570+
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
571571
Proxy: &appsv1alpha1.ProxySpec{
572572
HttpProxy: "http://proxy:1000",
573573
HttpsProxy: "https://proxy:1000",

internal/controller/platform/standalone/nimservice.go

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
252252

253253
// Select PVC for model store
254254
nimCacheName := nimService.GetNIMCacheName()
255+
nimCache := appsv1alpha1.NIMCache{}
255256
if nimCacheName != "" { // nolint:gocritic
256-
nimCache := appsv1alpha1.NIMCache{}
257257
if err := r.Get(ctx, types.NamespacedName{Name: nimCacheName, Namespace: nimService.GetNamespace()}, &nimCache); err != nil {
258258
// Fail the NIMService if the NIMCache is not found
259259
if k8serrors.IsNotFound(err) {
@@ -338,24 +338,34 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
338338
}
339339
deploymentParams.Env = append(deploymentParams.Env, profileEnv)
340340

341-
// Retrieve and set profile details from NIMCache
342-
var profile *appsv1alpha1.NIMProfile
343-
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
344-
if err != nil {
345-
logger.Error(err, "Failed to get cached NIM profile")
346-
return ctrl.Result{}, err
347-
}
348-
349-
// Auto assign GPU resources in case of the optimized profile
350-
if profile != nil {
351-
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
341+
// Only assign GPU resources if the NIMCache is for an optimized NIM
342+
if nimCache.IsOptimizedNIM() {
343+
// Retrieve and set profile details from NIMCache
344+
var profile *appsv1alpha1.NIMProfile
345+
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
346+
if err != nil {
347+
logger.Error(err, "Failed to get cached NIM profile")
352348
return ctrl.Result{}, err
353349
}
350+
351+
// Auto assign GPU resources in case of the optimized profile
352+
if profile != nil {
353+
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
354+
return ctrl.Result{}, err
355+
}
356+
}
354357
}
355358

356359
// TODO: assign GPU resources and node selector that is required for the selected profile
357360
}
358361

362+
if nimCache.IsUniversalNIM() {
363+
deploymentParams.Env = append(deploymentParams.Env, corev1.EnvVar{
364+
Name: "NIM_MODEL_NAME",
365+
Value: utils.DefaultModelStorePath,
366+
})
367+
}
368+
359369
// Setup pod resource claims
360370
namedDraResources := shared.GenerateNamedDRAResources(nimService)
361371
deploymentParams.PodResourceClaims = shared.GetPodResourceClaims(namedDraResources)

internal/utils/utils.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ const (
4949

5050
// DRAPodClaimNameAnnotationKey indicates annotation name for the identifier of a resource claim template in a pod spec.
5151
DRAPodClaimNameAnnotationKey = "resource.kubernetes.io/pod-claim-name"
52+
53+
// DefaultModelStorePath is the default path for model store.
54+
DefaultModelStorePath = "/model-store"
5255
)
5356

5457
const (

0 commit comments

Comments
 (0)