# Qwen3.5-397B-A17B-FP8 Disaggregated 1P1D: DEP4 Prefill + TP4 Decode
# Prefill uses DEP4 (DP4 + TP4 + EP4 with dp-attention)
# Decode uses pure TP4
# No GPU staging buffer (direct scatter RDMA)
# GPQA accuracy benchmark
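# Layout note: with gpus_per_node=4 and a single node per role, prefill and
# decode each occupy 4 GPUs. Under SGLang dp-attention the DP/EP groups are
# presumably carved out of the same 4-GPU TP group, so DEP4 does not multiply
# the GPU count.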
name: "qwen3.5-1p1d-dep4tp4-gpqa"
model:
path: "qwen3.5-fp8"
container: "dev"
precision: "fp8"
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 1
decode_nodes: 1
prefill_workers: 1
decode_workers: 1
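# The prefill and decode environments below are nearly identical; decode adds
# SGLANG_DECODE_BOOTSTRAP_TIMEOUT and SGLANG_HACK_SEQ_BOOTSTRAP_ROOM on top of
# the shared set. The very large disaggregation heartbeat/bootstrap/waiting
# values presumably keep the prefill-to-decode KV-transfer handshake from
# timing out during long accuracy runs.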
backend:
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    MC_FORCE_MNNVL: "1"
    SGLANG_DG_CACHE_DIR: "/configs/deepgemm-cache"
    FLASHINFER_WORKSPACE_BASE: "/configs/flashinfer-cache"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
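# Server arguments for the SGLang prefill and decode workers. Both sides share
# the model, quantization, and mamba settings; they differ only in parallel
# layout (DEP4 vs. plain TP4), disaggregation-mode, and the decode-only
# CUDA-graph / logging knobs.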
sglang_config:
  prefill:
    served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
    model-path: "/model/"
    attention-backend: "trtllm_mha"
    quantization: "fp8"
    kv-cache-dtype: "fp8_e4m3"
    moe-runner-backend: "flashinfer_trtllm"
    # DEP4: DP4 + TP4 + EP4 with dp-attention (same layout as dep4dep4 prefill)
    tensor-parallel-size: 4
    data-parallel-size: 4
    expert-parallel-size: 4
    enable-dp-attention: true
    enable-dp-lm-head: true
    moe-dense-tp-size: 1
    mamba-scheduler-strategy: "no_buffer"
    disable-radix-cache: true
    mamba-track-interval: 2048
    mamba-ssm-dtype: "bfloat16"
    disaggregation-mode: "prefill"
    mem-fraction-static: 0.80
    chunked-prefill-size: 16384
    load-balance-method: "round_robin"
    watchdog-timeout: 1000000
  decode:
    served-model-name: "Qwen/Qwen3.5-397B-A17B-FP8"
    model-path: "/model/"
    attention-backend: "trtllm_mha"
    quantization: "fp8"
    kv-cache-dtype: "fp8_e4m3"
    moe-runner-backend: "flashinfer_trtllm"
    # TP4: pure tensor parallel, no dp-attention
    tensor-parallel-size: 4
    mamba-scheduler-strategy: "no_buffer"
    disable-radix-cache: true
    mamba-track-interval: 2048
    mamba-ssm-dtype: "bfloat16"
    disaggregation-mode: "decode"
    mem-fraction-static: 0.80
    chunked-prefill-size: 16384
    cuda-graph-max-bs: 1024
    decode-log-interval: 1
    stream-interval: 50
    watchdog-timeout: 1000000
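# GPQA accuracy benchmark: the nemo-skills container runs /configs/gpqa/run.sh
# against the served endpoint. REPEAT=8 presumably repeats the evaluation 8
# times, with up to 65536 generated tokens per sample and 128 concurrent
# request threads.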
benchmark:
  type: custom
  container_image: nemo-skills
  env:
    OPENAI_API_KEY: "EMPTY"
    HF_TOKEN: "${HF_TOKEN}"
    MODEL: "Qwen/Qwen3.5-397B-A17B-FP8"
    MAX_TOKENS: "65536"
    REPEAT: "8"
    NUM_THREADS: "128"
  command: |
    bash /configs/gpqa/run.sh