Skip to content

Commit b3631f3

Browse files
shengnuo authored and shivamerla committed
Add samples of multi-node NIM
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent 4a6959a commit b3631f3

3 files changed

Lines changed: 102 additions & 5 deletions

File tree

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
---
# NIMCache sample for the multi-node DeepSeek-R1 deployment.
# Pulls the model via the NGC source below into a PVC created by the operator.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMCache
metadata:
  name: deepseek-r1-nimcache
spec:
  source:
    ngc:
      modelPuller: nvcr.io/nim/deepseek-ai/deepseek-r1:1.7.3
      pullSecret: ngc-secret
      authSecret: ngc-api-secret
      # No model selection filters are set in this sample; written as an
      # explicit empty mapping rather than a bare (null) key.
      model: {}
  storage:
    pvc:
      create: true
      storageClass: 'nfs-storage'
      # NOTE(review): 100Gi looks small for DeepSeek-R1 weights; confirm the
      # required capacity before using this sample on quota-enforcing storage.
      size: '100Gi'
      # Multi-node serving runs pods on more than one node that all mount this
      # cache, so the volume must be mountable by many nodes (was
      # ReadWriteOnce, which only permits a single node).
      volumeAccessMode: ReadWriteMany
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
---
# NIMService sample: DeepSeek-R1 served across multiple nodes.
# The multiNode stanza at the bottom requests 2 pods with 8 GPUs per pod.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: deepseek-r1
spec:
  env:
    # Backend / parallelism settings (all env values are strings).
    - name: NIM_USE_SGLANG
      value: '1'
    - name: NIM_MULTI_NODE
      value: '1'
    - name: NIM_TENSOR_PARALLEL_SIZE
      value: '8'
    - name: NIM_PIPELINE_PARALLEL_SIZE
      value: '2'

    # Cache directories.
    - name: NGC_HOME
      value: /model-store/ngc/hub
    - name: HF_HOME
      value: /model-store/huggingface/hub
    - name: NUMBA_CACHE_DIR
      value: /tmp/numba
    - name: OUTLINES_CACHE_DIR
      value: /tmp/outlines

    # Communication library settings.
    - name: UCX_TLS
      value: ib,tcp,shm
    - name: UCC_TLS
      value: ucp
    # The value is a single space, not an empty string; keep it as is.
    - name: UCC_CONFIG_FILE
      value: ' '
    - name: GLOO_SOCKET_IFNAME
      value: eth0
    - name: NCCL_SOCKET_IFNAME
      value: eth0

    - name: NIM_TRUST_CUSTOM_CODE
      value: '1'

    # Node rank is read from the pod's LeaderWorkerSet worker-index label.
    - name: NIM_NODE_RANK
      valueFrom:
        fieldRef:
          fieldPath: "metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']"

  readinessProbe:
    probe:
      failureThreshold: 3
      httpGet:
        path: /v1/health/ready
        port: api
      initialDelaySeconds: 15
      periodSeconds: 10
      successThreshold: 1
      timeoutSeconds: 1

  # Startup is given a 900 s initial delay, then up to 100 probes 10 s apart.
  startupProbe:
    probe:
      failureThreshold: 100
      httpGet:
        path: /v1/health/ready
        port: api
      initialDelaySeconds: 900
      periodSeconds: 10
      successThreshold: 1
      timeoutSeconds: 1

  image:
    repository: nvcr.io/nim/deepseek-ai/deepseek-r1
    tag: '1.7.3'
    pullPolicy: IfNotPresent
    pullSecrets:
      - ngc-secret
  authSecret: ngc-api-secret

  storage:
    nimCache:
      name: deepseek-r1-nimcache
      # profile: '12c2a31b069ad9d7c027fcec7083e3c1c900d75211de97f15df9beb45a81e954'

  replicas: 1
  resources:
    limits:
      nvidia.com/gpu: 8
    requests:
      nvidia.com/gpu: 8

  expose:
    service:
      type: ClusterIP
      port: 8000

  multiNode:
    size: 2
    gpusPerPod: 8
    mpi:
      mpiStartTimeout: 6000

config/samples/nim/llm/nimservice.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,3 @@ spec:
2222
service:
2323
type: ClusterIP
2424
port: 8000
25-
multiNode:
26-
workers: 2
27-
gpusPerWorker: 1
28-
# mpi:
29-
# clusterStartTimeout: 300

0 commit comments

Comments
 (0)