# Source: deploy.yaml (forked from ai-dynamo/dynamo)
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated SGLang deployment: prefill/decode split with nixl KV transfer.
# Tested with dynamo 1.0 (SGLang 0.5.9).
#
# Uses TP=2 per worker (prefill: 2 GPUs, decode: 2 GPUs) for a total of 4 GPUs.
# KV cache is transferred between workers via nixl (GPU-direct).
#
# NOT working on dynamo 0.9.1 — same blocking bugs as sglang/agg.
#
# Known issue: prefill warmup logs a non-blocking warning:
#   "Prefill warmup failed: 'SamplingParams' object is not subscriptable"
# The warning does not affect functionality.
#
# DynamoGraphDeployment with three services: one frontend, one prefill worker
# and one decode worker. Nesting is significant: everything below `spec:`
# belongs to the deployment spec, and each entry under `services:` is one
# service definition.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: nemotron-super-fp8-sglang-disagg
spec:
  backendFramework: sglang

  # Environment applied at the spec level. HF_HOME points at the same path
  # where every service mounts the model-cache volume.
  envs:
    - name: HF_HOME
      value: /opt/models

  # `create: false` — presumably the model-cache PVC must already exist in
  # the namespace before this manifest is applied; verify against the
  # operator documentation.
  pvcs:
    - name: model-cache
      create: false

  services:
    # Frontend: launches `dynamo.frontend` through `/bin/sh -c`, serving HTTP
    # on port 8000 with `--router-mode kv --no-kv-events`.
    Frontend:
      componentType: frontend
      replicas: 1
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      extraPodSpec:
        mainContainer:
          # NOTE(review): the file header says "Tested with dynamo 1.0" but
          # all three images are tagged 1.2.0 — confirm which is current.
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0
          command:
            - /bin/sh
            - -c
          # Single shell string: it is the script passed to `sh -c`.
          args:
            - python3 -m dynamo.frontend --router-mode kv --no-kv-events --http-port 8000

    # Prefill worker: `dynamo.sglang` in `--disaggregation-mode prefill`,
    # TP=2 on 2 GPUs. Apart from the mode value, its settings mirror the
    # decode worker (same model, bootstrap port, transfer backend, parsers).
    prefill:
      componentType: worker
      subComponentType: prefill
      # Presumably supplies the Hugging Face token as environment variables;
      # the secret's contents are not visible in this file.
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        limits:
          gpu: "2"
        requests:
          gpu: "2"
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      sharedMemory:
        size: 16Gi
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0
          workingDir: /workspace
          command:
            - python3
            - -m
            - dynamo.sglang
          # One argv token per list item; numeric values stay quoted so they
          # are passed as strings.
          args:
            - --model-path
            - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
            - --served-model-name
            - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
            - --tp
            - "2"
            - --trust-remote-code
            - --disaggregation-mode
            - prefill
            - --disaggregation-bootstrap-port
            - "12345"
            - --disaggregation-transfer-backend
            - nixl
            - --host
            - 0.0.0.0
            - --dyn-tool-call-parser
            - nemotron_nano
            - --dyn-reasoning-parser
            - nemotron_nano

    # Decode worker: `dynamo.sglang` in `--disaggregation-mode decode`,
    # TP=2 on 2 GPUs. Apart from the mode value, its settings mirror the
    # prefill worker.
    decode:
      componentType: worker
      subComponentType: decode
      # Presumably supplies the Hugging Face token as environment variables;
      # the secret's contents are not visible in this file.
      envFromSecret: hf-token-secret
      replicas: 1
      resources:
        limits:
          gpu: "2"
        requests:
          gpu: "2"
      volumeMounts:
        - name: model-cache
          mountPoint: /opt/models
      sharedMemory:
        size: 16Gi
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:1.2.0
          workingDir: /workspace
          command:
            - python3
            - -m
            - dynamo.sglang
          # One argv token per list item; numeric values stay quoted so they
          # are passed as strings.
          args:
            - --model-path
            - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
            - --served-model-name
            - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
            - --tp
            - "2"
            - --trust-remote-code
            - --disaggregation-mode
            - decode
            - --disaggregation-bootstrap-port
            - "12345"
            - --disaggregation-transfer-backend
            - nixl
            - --host
            - 0.0.0.0
            - --dyn-tool-call-parser
            - nemotron_nano
            - --dyn-reasoning-parser
            - nemotron_nano