-
Notifications
You must be signed in to change notification settings - Fork 36
Expand file tree
/
Copy pathbs64-2p3d-mtp.yaml
More file actions
120 lines (96 loc) · 2.81 KB
/
bs64-2p3d-mtp.yaml
File metadata and controls
120 lines (96 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
---
# Benchmark/deployment recipe: DeepSeek-R1 (FP8) on H200 GPUs with
# prefill/decode disaggregation — 2 prefill nodes + 3 decode nodes —
# and MTP (EAGLE speculative decoding) enabled on the decode side.
name: "bs64-2p3d-h200-fp8-mtp"
model:
  path: "dsr1"
  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
  precision: "fp8"
frontend:
  nginx_container: "nginx"
resources:
  gpu_type: "h200"
  prefill_nodes: 2
  prefill_workers: 2
  decode_nodes: 3
  decode_workers: 3
  gpus_per_node: 8
backend:
  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    # Very large heartbeat/timeout values — presumably to keep long benchmark
    # runs from being killed by disaggregation watchdogs; confirm against the
    # SGLang release in use before tightening.
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  # Decode-specific environment variables (identical to prefill above)
  decode_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
sglang_config:
  prefill:
    # Model configuration
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    skip-tokenizer-init: true
    trust-remote-code: true
    watchdog-timeout: 1000000
    # Parallelism: TP across all 8 GPUs of a node; no DP/EP sharding
    tp-size: 8
    dp-size: 1
    ep-size: 1
    # KV cache and attention
    attention-backend: "flashinfer"
    # Radix cache disabled
    disable-radix-cache: true
    # Other flags
    # stream-interval: 50
    max-running-requests: 16
    # Prefill-specific disaggregation mode
    disaggregation-bootstrap-port: 30001
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: "nixl"
    # Memory and token limits
    mem-fraction-static: 0.82
    max-prefill-tokens: 32768
    chunked-prefill-size: 32768
    # Request handling
    load-balance-method: "round_robin"
  decode:
    # Model configuration (mirrors prefill)
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    skip-tokenizer-init: true
    trust-remote-code: true
    watchdog-timeout: 1000000
    # Parallelism
    tp-size: 8
    dp-size: 1
    ep-size: 1
    # KV cache and attention
    attention-backend: "flashinfer"
    # Other flags
    disable-radix-cache: true
    stream-interval: 10
    # Disaggregation: decode side, same bootstrap port/backend as prefill
    disaggregation-bootstrap-port: 30001
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: "nixl"
    context-length: 72000
    max-total-tokens: 128000
    # Memory and token limits
    mem-fraction-static: 0.75
    max-running-requests: 16
    cuda-graph-max-bs: 16
    # MTP settings: EAGLE speculative decoding, 2 draft steps, top-1,
    # 3 draft tokens per step
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 2
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 3
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # NOTE(review): "32x64x128" looks like an x-separated concurrency sweep
  # (32, 64, 128) — verify against the benchmark harness parser.
  concurrencies: "32x64x128"
  req_rate: "inf"
# See configs/gpqa/run.sh + docs/accuracy.md for the script-based GPQA recipe.