Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion api/apps/v1alpha1/nimcache_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ type HuggingFaceHubSource struct {
}

// NGCSource references a model stored on NVIDIA NGC.
// +kubebuilder:validation:XValidation:rule="!(has(self.model) && has(self.modelEndpoint))",message="Only one of 'model' or 'modelEndpoint' can be specified"
type NGCSource struct {
// The name of an existing pull secret containing the NGC_API_KEY
AuthSecret string `json:"authSecret"`
Expand All @@ -117,7 +118,9 @@ type NGCSource struct {
// PullSecret to pull the model puller image
PullSecret string `json:"pullSecret,omitempty"`
// Model spec for caching
Model ModelSpec `json:"model,omitempty"`
Model *ModelSpec `json:"model,omitempty"`
// ModelEndpoint is the endpoint for the model to be cached for Universal NIM
ModelEndpoint *string `json:"modelEndpoint,omitempty"`
Comment thread
shivamerla marked this conversation as resolved.
}

// ModelSpec is the spec required to cache selected models.
Expand Down Expand Up @@ -309,6 +312,36 @@ func (n *NIMCache) GetRuntimeClassName() *string {
return &n.Spec.RuntimeClassName
}

// IsUniversalNIM reports whether this NIMCache targets a universal NIM.
//
// A cache is treated as universal when its NGC source specifies a
// modelEndpoint, or when a HuggingFace (HF) source is configured.
func (n *NIMCache) IsUniversalNIM() bool {
	src := &n.Spec.Source
	ngcEndpointSet := src.NGC != nil && src.NGC.ModelEndpoint != nil
	return ngcEndpointSet || src.HF != nil
}

// IsOptimizedNIM reports whether this NIMCache targets an optimized NIM.
//
// A cache is treated as optimized when an NGC source is configured and that
// source does not specify a modelEndpoint.
func (n *NIMCache) IsOptimizedNIM() bool {
	ngc := n.Spec.Source.NGC
	if ngc == nil {
		// No NGC source at all, so this cannot be an optimized NIM.
		return false
	}
	return ngc.ModelEndpoint == nil
}

// GetModelSpec returns the model spec for the NIMCache.
func (n *NIMCache) GetModelSpec() ModelSpec {
if n.Spec.Source.NGC != nil && n.Spec.Source.NGC.Model != nil {
return *n.Spec.Source.NGC.Model
}
return ModelSpec{}
Comment thread
visheshtanksale marked this conversation as resolved.
}

// GetProxySpec returns the proxy spec for the NIMService deployment.
func (n *NIMCache) GetProxySpec() *ProxySpec {
return n.Spec.Proxy
Expand Down
2 changes: 1 addition & 1 deletion api/apps/v1alpha1/nimservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ func (n *NIMService) GetStandardEnv() []corev1.EnvVar {
envVars := []corev1.EnvVar{
{
Name: "NIM_CACHE_PATH",
Value: "/model-store",
Value: utils.DefaultModelStorePath,
},
{
Name: "NGC_API_KEY",
Expand Down
11 changes: 10 additions & 1 deletion api/apps/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions bundle/manifests/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ spec:
for the model computations
type: string
type: object
modelEndpoint:
description: ModelEndpoint is the endpoint for the model to
be cached for Universal NIM
type: string
modelPuller:
description: ModelPuller is the container image that can pull
the model
Expand All @@ -405,6 +409,9 @@ spec:
- authSecret
- modelPuller
type: object
x-kubernetes-validations:
- message: Only one of 'model' or 'modelEndpoint' can be specified
rule: '!(has(self.model) && has(self.modelEndpoint))'
type: object
x-kubernetes-validations:
- message: Exactly one of ngc, dataStore, or hf must be defined
Expand Down
7 changes: 7 additions & 0 deletions config/crd/bases/apps.nvidia.com_nimcaches.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ spec:
for the model computations
type: string
type: object
modelEndpoint:
description: ModelEndpoint is the endpoint for the model to
be cached for Universal NIM
type: string
modelPuller:
description: ModelPuller is the container image that can pull
the model
Expand All @@ -405,6 +409,9 @@ spec:
- authSecret
- modelPuller
type: object
x-kubernetes-validations:
- message: Only one of 'model' or 'modelEndpoint' can be specified
rule: '!(has(self.model) && has(self.modelEndpoint))'
type: object
x-kubernetes-validations:
- message: Exactly one of ngc, dataStore, or hf must be defined
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ spec:
for the model computations
type: string
type: object
modelEndpoint:
description: ModelEndpoint is the endpoint for the model to
be cached for Universal NIM
type: string
modelPuller:
description: ModelPuller is the container image that can pull
the model
Expand All @@ -405,6 +409,9 @@ spec:
- authSecret
- modelPuller
type: object
x-kubernetes-validations:
- message: Only one of 'model' or 'modelEndpoint' can be specified
rule: '!(has(self.model) && has(self.modelEndpoint))'
type: object
x-kubernetes-validations:
- message: Exactly one of ngc, dataStore, or hf must be defined
Expand Down
76 changes: 66 additions & 10 deletions internal/controller/nimcache_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ func (r *NIMCacheReconciler) reconcileServiceAccount(ctx context.Context, nimCac
}

// Create the ServiceAccount

err = r.Create(ctx, newSA)
if err != nil {
logger.Error(err, "Failed to create ServiceAccount", "Name", saName)
Expand Down Expand Up @@ -550,8 +551,8 @@ func (r *NIMCacheReconciler) reconcilePVC(ctx context.Context, nimCache *appsv1a
// Model auto-selection is enabled and
// Explicit model profiles are not provided by the user.
func isModelSelectionRequired(nimCache *appsv1alpha1.NIMCache) bool {
if nimCache.Spec.Source.NGC != nil &&
len(nimCache.Spec.Source.NGC.Model.Profiles) == 0 {
if nimCache.IsOptimizedNIM() &&
len(nimCache.GetModelSpec().Profiles) == 0 {
return true
}
return false
Expand All @@ -567,8 +568,8 @@ func isModelSelectionDone(nimCache *appsv1alpha1.NIMCache) bool {
}

func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
if nimCache.Spec.Source.NGC != nil {
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
if nimCache.IsOptimizedNIM() {
if len(nimCache.GetModelSpec().Profiles) > 0 {
return nimCache.Spec.Source.NGC.Model.Profiles, nil
}

Expand All @@ -589,7 +590,7 @@ func (r *NIMCacheReconciler) reconcileModelManifest(ctx context.Context, nimCach
logger := r.GetLogger()

// Model manifest is available only for NGC model pullers
if nimCache.Spec.Source.NGC == nil {
if !nimCache.IsOptimizedNIM() {
return false, nil
}

Expand Down Expand Up @@ -677,7 +678,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
if isModelSelectionRequired(nimCache) && !isModelSelectionDone(nimCache) {
var discoveredGPUs []string
// If no specific GPUs are provided, then auto-detect GPUs in the cluster for profile selection
if len(nimCache.Spec.Source.NGC.Model.GPUs) == 0 {
if len(nimCache.GetModelSpec().GPUs) == 0 {
gpusByNode, err := r.GetNodeGPUProducts(ctx)
if err != nil {
logger.Error(err, "Failed to get gpus in the cluster")
Expand All @@ -693,7 +694,7 @@ func (r *NIMCacheReconciler) reconcileModelSelection(ctx context.Context, nimCac
}

// Match profiles with user input
profiles, err := nimManifest.MatchProfiles(nimCache.Spec.Source.NGC.Model, discoveredGPUs)
profiles, err := nimManifest.MatchProfiles(nimCache.GetModelSpec(), discoveredGPUs)
if err != nil {
logger.Error(err, "Failed to match profiles for given model parameters")
return err
Expand Down Expand Up @@ -1084,7 +1085,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
}
}

if nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil {
switch {
case nimCache.Spec.Source.DataStore != nil || nimCache.Spec.Source.HF != nil:
var hfDataSource HFInterface
if nimCache.Spec.Source.DataStore != nil {
hfDataSource = nimCache.Spec.Source.DataStore
Expand Down Expand Up @@ -1117,6 +1119,10 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
Name: "HF_ENDPOINT",
Value: hfDataSource.GetEndpoint(),
},
{
Name: "HF_HUB_OFFLINE",
Value: "0",
},
},
VolumeMounts: []corev1.VolumeMount{
{
Expand Down Expand Up @@ -1152,7 +1158,8 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
Name: hfDataSource.GetPullSecret(),
},
}
} else if nimCache.Spec.Source.NGC != nil {

case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint == nil:
job.Spec.Template.Spec.Containers = []corev1.Container{
{
Name: NIMCacheContainerName,
Expand Down Expand Up @@ -1220,8 +1227,57 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...)
}
}
}

case nimCache.Spec.Source.NGC != nil && nimCache.Spec.Source.NGC.ModelEndpoint != nil:
job.Spec.Template.Spec.Containers = []corev1.Container{
{
Name: NIMCacheContainerName,
Image: nimCache.Spec.Source.NGC.ModelPuller,
Command: []string{"create-model-store"},
Args: []string{"--model-repo", *nimCache.Spec.Source.NGC.ModelEndpoint, "--model-store", "/model-store"},
EnvFrom: nimCache.Spec.Source.EnvFromSecrets(),
Env: []corev1.EnvVar{
{
Name: "NIM_CACHE_PATH",
Value: utils.DefaultModelStorePath,
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "nim-cache-volume",
MountPath: utils.DefaultModelStorePath,
SubPath: nimCache.Spec.Storage.PVC.SubPath,
},
},
Resources: corev1.ResourceRequirements{
Limits: map[corev1.ResourceName]apiResource.Quantity{
"cpu": nimCache.Spec.Resources.CPU,
"memory": nimCache.Spec.Resources.Memory,
},
Requests: map[corev1.ResourceName]apiResource.Quantity{
"cpu": nimCache.Spec.Resources.CPU,
"memory": nimCache.Spec.Resources.Memory,
},
},
TerminationMessagePath: "/dev/termination-log",
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
SecurityContext: &corev1.SecurityContext{
AllowPrivilegeEscalation: ptr.To[bool](false),
Capabilities: &corev1.Capabilities{
Drop: []corev1.Capability{"ALL"},
},
RunAsNonRoot: ptr.To[bool](true),
RunAsGroup: nimCache.GetGroupID(),
RunAsUser: nimCache.GetUserID(),
},
},
}
job.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
{
Name: nimCache.Spec.Source.NGC.PullSecret,
},
}
}
// Merge env with the user provided values
job.Spec.Template.Spec.Containers[0].Env = utils.MergeEnvVars(job.Spec.Template.Spec.Containers[0].Env, nimCache.Spec.Env)

Expand Down
6 changes: 3 additions & 3 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ var _ = Describe("NIMCache Controller", func() {
Namespace: "default",
},
Spec: appsv1alpha1.NIMCacheSpec{
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
},
}

Expand All @@ -513,7 +513,7 @@ var _ = Describe("NIMCache Controller", func() {
Annotations: map[string]string{SelectedNIMProfilesAnnotationKey: string(profilesJSON)},
},
Spec: appsv1alpha1.NIMCacheSpec{
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{GPUs: []appsv1alpha1.GPUSpec{{IDs: []string{"26b5"}}}}}},
Env: []corev1.EnvVar{
{
Name: "NGC_HOME",
Expand Down Expand Up @@ -567,7 +567,7 @@ var _ = Describe("NIMCache Controller", func() {
Namespace: "default",
},
Spec: appsv1alpha1.NIMCacheSpec{
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}},
Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: &appsv1alpha1.ModelSpec{Profiles: profiles}}},
Proxy: &appsv1alpha1.ProxySpec{
HttpProxy: "http://proxy:1000",
HttpsProxy: "https://proxy:1000",
Expand Down
34 changes: 22 additions & 12 deletions internal/controller/platform/standalone/nimservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,8 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi

// Select PVC for model store
nimCacheName := nimService.GetNIMCacheName()
nimCache := appsv1alpha1.NIMCache{}
if nimCacheName != "" { // nolint:gocritic
nimCache := appsv1alpha1.NIMCache{}
if err := r.Get(ctx, types.NamespacedName{Name: nimCacheName, Namespace: nimService.GetNamespace()}, &nimCache); err != nil {
// Fail the NIMService if the NIMCache is not found
if k8serrors.IsNotFound(err) {
Expand Down Expand Up @@ -338,24 +338,34 @@ func (r *NIMServiceReconciler) reconcileNIMService(ctx context.Context, nimServi
}
deploymentParams.Env = append(deploymentParams.Env, profileEnv)

// Retrieve and set profile details from NIMCache
var profile *appsv1alpha1.NIMProfile
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
if err != nil {
logger.Error(err, "Failed to get cached NIM profile")
return ctrl.Result{}, err
}

// Auto assign GPU resources in case of the optimized profile
if profile != nil {
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
// Only assign GPU resources if the NIMCache is for optimized NIM
if nimCache.IsOptimizedNIM() {
// Retrieve and set profile details from NIMCache
var profile *appsv1alpha1.NIMProfile
profile, err = r.getNIMCacheProfile(ctx, nimService, modelProfile)
if err != nil {
logger.Error(err, "Failed to get cached NIM profile")
return ctrl.Result{}, err
}

// Auto assign GPU resources in case of the optimized profile
if profile != nil {
if err = r.assignGPUResources(ctx, nimService, profile, deploymentParams); err != nil {
return ctrl.Result{}, err
}
}
}

// TODO: assign GPU resources and node selector that is required for the selected profile
}

if nimCache.IsUniversalNIM() {
deploymentParams.Env = append(deploymentParams.Env, corev1.EnvVar{
Name: "NIM_MODEL_NAME",
Comment thread
visheshtanksale marked this conversation as resolved.
Value: utils.DefaultModelStorePath,
})
}

// Setup pod resource claims
namedDraResources := shared.GenerateNamedDRAResources(nimService)
deploymentParams.PodResourceClaims = shared.GetPodResourceClaims(namedDraResources)
Expand Down
3 changes: 3 additions & 0 deletions internal/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ const (

// DRAPodClaimNameAnnotationKey indicates annotation name for the identifier of a resource claim template in a pod spec.
DRAPodClaimNameAnnotationKey = "resource.kubernetes.io/pod-claim-name"

// DefaultModelStorePath is the default path for model store.
DefaultModelStorePath = "/model-store"
)

const (
Expand Down
Loading