forked from ai-dynamo/dynamo
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeploy.yaml
More file actions
119 lines (119 loc) · 3.47 KB
/
deploy.yaml
File metadata and controls
119 lines (119 loc) · 3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
---
# TensorRT-LLM engine configuration for the aggregated (agg) worker, mounted
# into the worker pod at /engine_configs/agg.yaml (see volumeMounts below).
apiVersion: v1
kind: ConfigMap
metadata:
  name: agg-config
data:
  # Contents are passed verbatim to trtllm via --extra-engine-args.
  agg.yaml: |
    backend: pytorch
    trust_remote_code: true
    # TP=4 with EP=4 / MoE-TP=1: experts are sharded across the 4 GPUs.
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    kv_cache_config:
      enable_block_reuse: true
      # Fraction of free GPU memory reserved for the KV cache pool.
      free_gpu_memory_fraction: 0.8
      dtype: auto
    cache_transceiver_config:
      backend: DEFAULT
    cuda_graph_config:
      enable_padding: true
      max_batch_size: 128
    disable_overlap_scheduler: false
    print_iter_log: false
    # DEEPGEMM requires Blackwell (SM100+). Remove moe_config block to run on Hopper (SM90).
    moe_config:
      backend: DEEPGEMM
      max_num_tokens: 8192
---
# Aggregated (prefill+decode colocated) deployment of Qwen3-235B-A22B-FP8 on
# the TensorRT-LLM backend: one KV-routing frontend plus 4 worker replicas,
# each replica holding 4 GPUs (matching tensor_parallel_size: 4 in agg.yaml).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: qwen3-235b-a22b-agg
spec:
  backendFramework: trtllm
  pvcs:
    # Pre-existing PVC used as a shared HuggingFace model cache (create: false).
    - name: model-cache
      create: false
  services:
    Frontend:
      componentType: frontend
      replicas: 1
      extraPodSpec:
        tolerations: []
        affinity:
          # Spread frontend replicas across nodes (one frontend per host).
          podAntiAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              - labelSelector:
                  matchExpressions:
                    - key: nvidia.com/dynamo-graph-deployment-name
                      operator: In
                      values:
                        - qwen3-235b-a22b-agg-frontend
                topologyKey: kubernetes.io/hostname
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          command:
            - /bin/sh
            - -c
          # KV-aware routing so requests land on workers with warm KV blocks.
          args:
            - python3 -m dynamo.frontend --router-mode kv --http-port 8000
    TrtllmWorker:
      componentType: worker
      # Secret providing HF_TOKEN for gated model download.
      envFromSecret: hf-token-secret
      sharedMemory:
        size: 256Gi
      extraPodSpec:
        tolerations: []
        affinity:
          # Schedule only onto GPU nodes.
          nodeAffinity:
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: nvidia.com/gpu.present
                      operator: In
                      values:
                        - "true"
        mainContainer:
          env:
            - name: MODEL_PATH
              value: Qwen/Qwen3-235B-A22B-FP8
            # Point the HF cache at the shared PVC mount.
            - name: HF_HOME
              value: /mnt/model-cache
            # Engine config rendered from the agg-config ConfigMap above.
            - name: ENGINE_ARGS
              value: /engine_configs/agg.yaml
          command:
            - /bin/sh
            - -c
          args:
            - |
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
                --max-batch-size 128 \
                --max-num-tokens 8192 \
                --max-seq-len 8192 \
                --extra-engine-args "${ENGINE_ARGS}"
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1
          workingDir: /workspace/components/backends/trtllm
          volumeMounts:
            - name: agg-config
              mountPath: /engine_configs
            - name: model-cache
              mountPath: /mnt/model-cache
        volumes:
          - name: agg-config
            configMap:
              name: agg-config
          - name: model-cache
            persistentVolumeClaim:
              claimName: model-cache
      replicas: 4
      # GPU counts are strings per Kubernetes resource-quantity convention.
      resources:
        limits:
          gpu: "4"
        requests:
          gpu: "4"