Skip to content

Commit 8bf5379

Browse files
committed
Add samples for NMR 25.08 bits
Signed-off-by: Shiva Krishna, Merla <smerla@nvidia.com>
1 parent 5425f59 commit 8bf5379

15 files changed

+870
-12
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
# NeMo Custom Resources

These custom resources (CRs) are designed to deploy NeMo microservices using the NIM Operator.

## Compatible NIM Operator Version

- **NIM Operator v2.0.1**

> Using these CRs with any other version may lead to validation or runtime errors.

## Notes

- The CR schema and fields in this version match the capabilities of NIM Operator v2.0.1.

## Upgrade Notes

If you are upgrading from a previous NeMo service version (e.g., `25.06`) while keeping the existing operator version:

- Check for renamed or deprecated fields.
- Review updated model config parameters.
- Revalidate against the new CRs with a server-side dry run:

```bash
kubectl apply --dry-run=server -f apps_v1alpha1_nemodatastore.yaml \
  -f apps_v1alpha1_nemocustomizer.yaml \
  -f apps_v1alpha1_nemoentitystore.yaml \
  -f apps_v1alpha1_nemoguardrails.yaml \
  -f apps_v1alpha1_nemoevaluator.yaml
```

Example output:

```text
nemodatastore.apps.nvidia.com/nemodatastore-sample created (server dry run)
nemocustomizer.apps.nvidia.com/nemocustomizer-sample created (server dry run)
nemoentitystore.apps.nvidia.com/nemoentitystore-sample created (server dry run)
nemoguardrail.apps.nvidia.com/nemoguardrails-sample configured (server dry run)
nemoevaluator.apps.nvidia.com/nemoevaluator-sample created (server dry run)
```
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
---
# Sample NemoCustomizer custom resource for the NIM Operator.
# Every Secret, ConfigMap, and service endpoint referenced below is looked up
# by name; this manifest does not create them.
apiVersion: apps.nvidia.com/v1alpha1
kind: NemoCustomizer
metadata:
  name: nemocustomizer-sample
  namespace: nemo
spec:
  # Scheduler used for training jobs (volcano is the default)
  scheduler:
    type: "volcano"
  # Weights & Biases configuration for experiment tracking
  wandb:
    secretName: wandb-secret  # Kubernetes secret that stores WANDB_API_KEY and optionally encryption key
    apiKeyKey: apiKey  # Key in the secret that holds the W&B API key
    encryptionKey: encryptionKey  # Key in the secret that holds optional encryption key
  # OpenTelemetry tracing configuration
  otel:
    enabled: true
    exporterOtlpEndpoint: http://customizer-otel-opentelemetry-collector.nemo.svc.cluster.local:4317
  # PostgreSQL database connection configuration
  databaseConfig:
    credentials:
      user: ncsuser  # Database username
      secretName: customizer-pg-existing-secret  # Secret containing password
      passwordKey: password  # Key inside secret that contains the password
    host: customizer-pg-postgresql.nemo.svc.cluster.local
    port: 5432
    databaseName: ncsdb
  # Customizer API service exposure settings
  expose:
    service:
      type: ClusterIP
      port: 8000
  # Global image pull settings used in various subcomponents
  image:
    repository: nvcr.io/nvidia/nemo-microservices/customizer-api
    tag: "25.06"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  # URL to the NeMo Entity Store microservice
  entitystore:
    endpoint: http://nemoentitystore-sample.nemo.svc.cluster.local:8000
  # URL to the NeMo Data Store microservice
  datastore:
    endpoint: http://nemodatastore-sample.nemo.svc.cluster.local:8000
  # URL for MLflow tracking server
  mlflow:
    endpoint: http://mlflow-tracking.nemo.svc.cluster.local:80
  # Configuration for the data store CLI tools
  nemoDatastoreTools:
    image: nvcr.io/nvidia/nemo-microservices/nds-v2-huggingface-cli:25.06
  # Configuration for model download jobs
  modelDownloadJobs:
    image: "nvcr.io/nvidia/nemo-microservices/customizer-api:25.06"
    ngcAPISecret:
      # Secret that stores NGC API key
      name: ngc-api-secret
      # Key inside secret
      key: "NGC_API_KEY"
    securityContext:
      fsGroup: 1000
      runAsNonRoot: true
      runAsUser: 1000
      runAsGroup: 1000
    # Time (in seconds) to retain job after completion
    ttlSecondsAfterFinished: 600
    # Polling frequency to check job status
    pollIntervalSeconds: 15
  # Name of the ConfigMap containing model definitions
  modelConfig:
    name: nemo-model-config
  # Training configuration
  trainingConfig:
    configMap:
      # Optional: Additional configuration to merge into training config
      name: nemo-training-config
    # PVC where model artifacts are cached or used during training
    modelPVC:
      create: true
      name: finetuning-ms-models-pvc
      # StorageClass for the PVC (can be empty to use default)
      storageClass: ""
      volumeAccessMode: ReadWriteOnce
      size: 50Gi
    # Workspace PVC automatically created per job
    workspacePVC:
      storageClass: "local-path"
      volumeAccessMode: ReadWriteOnce
      size: 10Gi
      # Mount path for workspace inside container
      mountPath: /pvc/workspace
    # Training image; distinct from the customizer-api image under spec.image
    image:
      repository: nvcr.io/nvidia/nemo-microservices/customizer
      tag: "25.06"
    env:
      - name: LOG_LEVEL
        value: INFO
    # Multi-node networking environment variables for training (CSPs)
    networkConfig:
      - name: NCCL_IB_SL
        value: "0"
      - name: NCCL_IB_TC
        value: "41"
      - name: NCCL_IB_QPS_PER_CONNECTION
        value: "4"
      - name: UCX_TLS
        value: TCP
      - name: UCX_NET_DEVICES
        value: eth0
      - name: HCOLL_ENABLE_MCAST_ALL
        value: "0"
      - name: NCCL_IB_GID_INDEX
        value: "3"
    # TTL for training job after it completes
    ttlSecondsAfterFinished: 3600
    # Timeout duration (in seconds) for training job
    timeout: 3600
    # Node tolerations
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
---
# Sample NemoDatastore custom resource for the NIM Operator.
apiVersion: apps.nvidia.com/v1alpha1
kind: NemoDatastore
metadata:
  name: nemodatastore-sample
  namespace: nemo
spec:
  # Names of the Secrets the datastore reads; not created by this manifest
  secrets:
    datastoreConfigSecret: "nemo-ms-nemo-datastore"
    datastoreInitSecret: "nemo-ms-nemo-datastore-init"
    datastoreInlineConfigSecret: "nemo-ms-nemo-datastore-inline-config"
    giteaAdminSecret: "gitea-admin-credentials"
    # NOTE(review): the double hyphen is kept exactly as in the sample;
    # confirm it matches the actual Secret name
    lfsJwtSecret: "nemo-ms-nemo-datastore--lfs-jwt"
  # PostgreSQL connection settings
  databaseConfig:
    credentials:
      user: ndsuser
      secretName: datastore-pg-existing-secret
      passwordKey: password
    host: datastore-pg-postgresql.nemo.svc.cluster.local
    port: 5432
    databaseName: ndsdb
  pvc:
    name: "pvc-shared-data"
    create: true
    storageClass: ""
    volumeAccessMode: ReadWriteOnce
    size: "10Gi"
  expose:
    service:
      type: ClusterIP
      port: 8000
  image:
    repository: nvcr.io/nvidia/nemo-microservices/datastore
    tag: "25.06"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  replicas: 1
  resources:
    requests:
      memory: "256Mi"
      cpu: "500m"
    limits:
      memory: "512Mi"
      cpu: "1"
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
---
# Sample NemoEntitystore custom resource for the NIM Operator.
apiVersion: apps.nvidia.com/v1alpha1
kind: NemoEntitystore
metadata:
  name: nemoentitystore-sample
  namespace: nemo
spec:
  image:
    repository: nvcr.io/nvidia/nemo-microservices/entity-store
    tag: "25.06"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  expose:
    service:
      type: ClusterIP
      port: 8000
  # PostgreSQL connection settings
  databaseConfig:
    databaseName: nesdb
    host: entity-store-pg-postgresql.nemo.svc.cluster.local
    port: 5432
    credentials:
      user: nesuser
      secretName: entity-store-pg-existing-secret
      passwordKey: password
  # Same endpoint host as the nemodatastore-sample resource in namespace nemo
  datastore:
    endpoint: http://nemodatastore-sample.nemo.svc.cluster.local:8000
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
---
# Sample NemoEvaluator custom resource for the NIM Operator.
apiVersion: apps.nvidia.com/v1alpha1
kind: NemoEvaluator
metadata:
  name: nemoevaluator-sample
  namespace: nemo
spec:
  # One image reference per evaluation type
  evaluationImages:
    bigcodeEvalHarness: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-bigcode:0.12.21"
    lmEvalHarness: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-lm-eval-harness:0.12.21"
    similarityMetrics: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-custom-eval:0.12.21"
    llmAsJudge: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-llm-as-a-judge:0.12.21"
    # mtBench points at the same image as llmAsJudge
    mtBench: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-llm-as-a-judge:0.12.21"
    retriever: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-retriever:0.12.21"
    rag: "nvcr.io/nvidia/nemo-microservices/eval-tool-benchmark-rag:0.12.21"
    bfcl: "nvcr.io/nvidia/nemo-microservices/eval-factory-benchmark-bfcl:25.6.1"
    agenticEval: "nvcr.io/nvidia/nemo-microservices/eval-factory-benchmark-agentic-eval:25.6.1"
  image:
    repository: nvcr.io/nvidia/nemo-microservices/evaluator
    tag: "25.06"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  expose:
    service:
      type: ClusterIP
      port: 8000
  # The Argo Workflows endpoint is the only https URL in this sample
  argoWorkflows:
    endpoint: https://argo-workflows-server.nemo.svc.cluster.local:2746
    serviceAccount: argo-workflows-executor
  vectorDB:
    endpoint: http://milvus.nemo.svc.cluster.local:19530
  # The datastore endpoint carries the /v1/hf path suffix
  datastore:
    endpoint: http://nemodatastore-sample.nemo.svc.cluster.local:8000/v1/hf
  entitystore:
    endpoint: http://nemoentitystore-sample.nemo.svc.cluster.local:8000
  # PostgreSQL connection settings
  databaseConfig:
    host: evaluator-pg-postgresql.nemo.svc.cluster.local
    port: 5432
    databaseName: evaldb
    credentials:
      user: evaluser
      secretName: evaluator-pg-existing-secret
      passwordKey: password
  # OpenTelemetry configuration
  otel:
    enabled: true
    exporterOtlpEndpoint: http://evaluator-otel-opentelemetry-collector.nemo.svc.cluster.local:4317
  replicas: 1
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
---
# Sample NemoGuardrail custom resource for the NIM Operator.
apiVersion: apps.nvidia.com/v1alpha1
kind: NemoGuardrail
metadata:
  name: nemoguardrails-sample
  namespace: nemo
spec:
  # required if a NIM endpoint is hosted by NVIDIA
  # NOTE(review): the comment above is kept verbatim from the sample, but it
  # sits on configStore rather than nimEndpoint; confirm which field it
  # is meant to describe
  configStore:
    pvc:
      name: "pvc-guardrail-config"
      create: true
      storageClass: ""
      volumeAccessMode: ReadWriteOnce
      size: "1Gi"
  nimEndpoint:
    baseURL: "http://meta-llama3-1b-instruct.nemo.svc.cluster.local:8000/v1"
  expose:
    service:
      type: ClusterIP
      port: 8000
  image:
    repository: nvcr.io/nvidia/nemo-microservices/guardrails
    tag: "25.06"
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  metrics:
    serviceMonitor: {}
  replicas: 1
  resources:
    limits:
      cpu: "1"
      ephemeral-storage: 10Gi
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
---
# Sample NIMCache custom resource for the llama-3.2-1b-instruct NIM image.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: meta-llama3-1b-instruct
  namespace: nemo
spec:
  source:
    ngc:
      modelPuller: nvcr.io/nim/meta/llama-3.2-1b-instruct:1.8.3
      pullSecret: ngc-secret
      authSecret: ngc-api-secret
      model:
        engine: tensorrt_llm
        # Quoted on purpose: the sample passes this value as a string
        tensorParallelism: "1"
  storage:
    pvc:
      create: true
      storageClass: ""
      size: "50Gi"
      volumeAccessMode: ReadWriteOnce
---
# Sample NIMPipeline custom resource with a single NIM service entry.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMPipeline
metadata:
  name: llama3-1b-pipeline
  namespace: nemo
spec:
  services:
    - name: meta-llama3-1b-instruct
      enabled: true
      spec:
        env:
          # NIM_PEFT_SOURCE is set to the nemoentitystore-sample service URL
          - name: NIM_PEFT_SOURCE
            value: http://nemoentitystore-sample.nemo.svc.cluster.local:8000
          - name: NIM_PEFT_REFRESH_INTERVAL
            value: "180"
          - name: NIM_MAX_CPU_LORAS
            value: "16"
          - name: NIM_MAX_GPU_LORAS
            value: "8"
          - name: NIM_GUIDED_DECODING_BACKEND
            value: fast_outlines
        image:
          repository: nvcr.io/nim/meta/llama-3.2-1b-instruct
          tag: "1.8.3"
          pullPolicy: IfNotPresent
          pullSecrets:
            - ngc-secret
        authSecret: ngc-api-secret
        storage:
          # Same name as the NIMCache resource meta-llama3-1b-instruct
          nimCache:
            name: meta-llama3-1b-instruct
            profile: ''
        replicas: 1
        resources:
          limits:
            nvidia.com/gpu: 1
        expose:
          service:
            type: ClusterIP
            port: 8000

0 commit comments

Comments
 (0)