Skip to content

Commit 9710ab5

Browse files
committed
Update Datastore source for NIMCache
1 parent b630bd8 commit 9710ab5

7 files changed

Lines changed: 80 additions & 45 deletions

File tree

api/apps/v1alpha1/nimcache_types.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,17 @@ type GPUSpec struct {
110110

111111
// DataStoreSource references a model stored on NVIDIA DataStore service.
112112
type DataStoreSource struct {
113-
// The endpoint for datastore
113+
// HF-compatible datastore endpoint
114+
// +kubebuilder:validation:Pattern=`^http(s)?://.*v1/hf/?$`
114115
Endpoint string `json:"endpoint"`
115-
// Name of either model/checkpoint or dataset to download
116-
ModelName *string `json:"modelName,omitempty"`
117-
CheckpointName *string `json:"checkpointName,omitempty"`
118-
DatasetName *string `json:"datasetName,omitempty"`
119-
// The name of an existing auth secret containing the AUTH_TOKEN"
116+
// +kubebuilder:default="default"
117+
Namespace string `json:"namespace"`
118+
119+
// Name of either model or dataset to download
120+
ModelName *string `json:"modelName,omitempty"`
121+
DatasetName *string `json:"datasetName,omitempty"`
122+
123+
// The name of an existing auth secret containing the HF_TOKEN
120124
AuthSecret string `json:"authSecret"`
121125
// ModelPuller is the container image that can pull the model
122126
ModelPuller string `json:"modelPuller"`

api/apps/v1alpha1/zz_generated.deepcopy.go

Lines changed: 0 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bundle/manifests/apps.nvidia.com_nimcaches.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,30 +244,32 @@ spec:
244244
properties:
245245
authSecret:
246246
description: The name of an existing auth secret containing
247-
the AUTH_TOKEN"
248-
type: string
249-
checkpointName:
247+
the HF_TOKEN
250248
type: string
251249
datasetName:
252250
type: string
253251
endpoint:
254-
description: The endpoint for datastore
252+
description: HF-compatible datastore endpoint
253+
pattern: ^http(s)?://.*v1/hf/?$
255254
type: string
256255
modelName:
257-
description: Name of either model/checkpoint or dataset to
258-
download
256+
description: Name of either model or dataset to download
259257
type: string
260258
modelPuller:
261259
description: ModelPuller is the container image that can pull
262260
the model
263261
type: string
262+
namespace:
263+
default: default
264+
type: string
264265
pullSecret:
265266
description: PullSecret for the model puller image
266267
type: string
267268
required:
268269
- authSecret
269270
- endpoint
270271
- modelPuller
272+
- namespace
271273
type: object
272274
ngc:
273275
description: NGCSource represents models stored in NGC

config/crd/bases/apps.nvidia.com_nimcaches.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,30 +244,32 @@ spec:
244244
properties:
245245
authSecret:
246246
description: The name of an existing auth secret containing
247-
the AUTH_TOKEN"
248-
type: string
249-
checkpointName:
247+
the HF_TOKEN
250248
type: string
251249
datasetName:
252250
type: string
253251
endpoint:
254-
description: The endpoint for datastore
252+
description: HF-compatible datastore endpoint
253+
pattern: ^http(s)?://.*v1/hf/?$
255254
type: string
256255
modelName:
257-
description: Name of either model/checkpoint or dataset to
258-
download
256+
description: Name of either model or dataset to download
259257
type: string
260258
modelPuller:
261259
description: ModelPuller is the container image that can pull
262260
the model
263261
type: string
262+
namespace:
263+
default: default
264+
type: string
264265
pullSecret:
265266
description: PullSecret for the model puller image
266267
type: string
267268
required:
268269
- authSecret
269270
- endpoint
270271
- modelPuller
272+
- namespace
271273
type: object
272274
ngc:
273275
description: NGCSource represents models stored in NGC

config/samples/nim/llm/nimcache-llm.yaml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,27 @@
11
apiVersion: apps.nvidia.com/v1alpha1
22
kind: NIMCache
33
metadata:
4-
name: meta-llama3-8b-instruct
4+
name: meta-llama3-1b-instruct-datastore
5+
spec:
6+
source:
7+
dataStore:
8+
endpoint: http://nemodatastore-sample.nemo.svc.cluster.local:8000/v1/hf
9+
modelName: "llama-3-1b-instruct"
10+
authSecret: hfAuth
11+
modelPuller: nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.04
12+
pullSecret: nvcrimagepullsecret
13+
storage:
14+
pvc:
15+
create: true
16+
storageClass: ""
17+
size: "50Gi"
18+
volumeAccessMode: ReadWriteOnce
19+
---
20+
21+
apiVersion: apps.nvidia.com/v1alpha1
22+
kind: NIMCache
23+
metadata:
24+
name: meta-llama3-8b-instruct-ngc
525
spec:
626
source:
727
ngc:

deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,30 +244,32 @@ spec:
244244
properties:
245245
authSecret:
246246
description: The name of an existing auth secret containing
247-
the AUTH_TOKEN"
248-
type: string
249-
checkpointName:
247+
the HF_TOKEN
250248
type: string
251249
datasetName:
252250
type: string
253251
endpoint:
254-
description: The endpoint for datastore
252+
description: HF-compatible datastore endpoint
253+
pattern: ^http(s)?://.*v1/hf/?$
255254
type: string
256255
modelName:
257-
description: Name of either model/checkpoint or dataset to
258-
download
256+
description: Name of either model or dataset to download
259257
type: string
260258
modelPuller:
261259
description: ModelPuller is the container image that can pull
262260
the model
263261
type: string
262+
namespace:
263+
default: default
264+
type: string
264265
pullSecret:
265266
description: PullSecret for the model puller image
266267
type: string
267268
required:
268269
- authSecret
269270
- endpoint
270271
- modelPuller
272+
- namespace
271273
type: object
272274
ngc:
273275
description: NGCSource represents models stored in NGC

internal/controller/nimcache_controller.go

Lines changed: 25 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -555,18 +555,20 @@ func isModelSelectionDone(nimCache *appsv1alpha1.NIMCache) bool {
555555
}
556556

557557
func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
558-
// Return profiles explicitly specified by the user in the spec
559-
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
560-
return nimCache.Spec.Source.NGC.Model.Profiles, nil
561-
} else if isModelSelectionRequired(nimCache) {
562-
// Retrieve the selected profiles from the annotation
563-
var selectedProfiles []string
564-
if annotation, exists := nimCache.Annotations[SelectedNIMProfilesAnnotationKey]; exists {
565-
if err := json.Unmarshal([]byte(annotation), &selectedProfiles); err != nil {
566-
return nil, err
558+
if nimCache.Spec.Source.NGC != nil {
559+
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
560+
return nimCache.Spec.Source.NGC.Model.Profiles, nil
561+
}
562+
563+
if isModelSelectionDone(nimCache) {
564+
var selectedProfiles []string
565+
if annotation, exists := nimCache.Annotations[SelectedNIMProfilesAnnotationKey]; exists {
566+
if err := json.Unmarshal([]byte(annotation), &selectedProfiles); err != nil {
567+
return nil, err
568+
}
567569
}
570+
return selectedProfiles, nil
568571
}
569-
return selectedProfiles, nil
570572
}
571573
return nil, nil
572574
}
@@ -1105,19 +1107,27 @@ func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1a
11051107
outputPath = fmt.Sprintf("%v/%v", outputPath, *nimCache.Spec.Storage.HostPath)
11061108
}
11071109
var command []string
1108-
if nimCache.Spec.Source.DataStore.ModelName != nil && nimCache.Spec.Source.DataStore.CheckpointName != nil { // nolint:gocritic
1109-
command = []string{"datastore-tools", "checkpoint", "download", "--model-name", *nimCache.Spec.Source.DataStore.ModelName, "--checkpoint-name", *nimCache.Spec.Source.DataStore.CheckpointName, "--path", outputPath, "--end-point", nimCache.Spec.Source.DataStore.Endpoint}
1110+
1111+
if nimCache.Spec.Source.DataStore.ModelName != nil { // nolint:gocritic
1112+
hfRepo := fmt.Sprintf("%s/%s", nimCache.Spec.Source.DataStore.Namespace, *nimCache.Spec.Source.DataStore.ModelName)
1113+
command = []string{"huggingface-cli", "download", "--local-dir", outputPath, "--repo-type", "model", hfRepo}
11101114
} else if nimCache.Spec.Source.DataStore.DatasetName != nil {
1111-
command = []string{"datastore-tools", "dataset", "download", "--dataset-name", *nimCache.Spec.Source.DataStore.DatasetName, "--path", outputPath, "--end-point", nimCache.Spec.Source.DataStore.Endpoint}
1115+
hfRepo := fmt.Sprintf("%s/%s", nimCache.Spec.Source.DataStore.Namespace, *nimCache.Spec.Source.DataStore.DatasetName)
1116+
command = []string{"huggingface-cli", "download", "--local-dir", outputPath, "--repo-type", "dataset", hfRepo}
11121117
} else {
1113-
return nil, errors.NewBadRequest("either datasetName or (modelName and checkpointName) must be provided")
1118+
return nil, errors.NewBadRequest("either modelName or datasetName must be provided")
11141119
}
11151120
job.Spec.Template.Spec.Containers = []corev1.Container{
11161121
{
11171122
Name: NIMCacheContainerName,
11181123
Image: nimCache.Spec.Source.DataStore.ModelPuller,
11191124
EnvFrom: nimCache.Spec.Source.EnvFromSecrets(),
1120-
Env: []corev1.EnvVar{},
1125+
Env: []corev1.EnvVar{
1126+
{
1127+
Name: "HF_ENDPOINT",
1128+
Value: nimCache.Spec.Source.DataStore.Endpoint,
1129+
},
1130+
},
11211131
VolumeMounts: []corev1.VolumeMount{
11221132
{
11231133
Name: "nim-cache-volume",

0 commit comments

Comments
 (0)