Skip to content

Commit 31c3e59

Browse files
authored
Sync ishandhanani/srt-slurm history into NVIDIA/srt-slurm (#14)
1 parent 896eabe commit 31c3e59

58 files changed

Lines changed: 3245 additions & 466 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

configs/rebuild-deepep.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ fi
1515
cd "$DEEPEP_SRC"
1616

1717
# Find NVSHMEM
18-
NVSHMEM_DIR=$(find /usr/local -name "nvshmem" -type d 2>/dev/null | head -1)
18+
NVSHMEM_DIR=$(find /usr/local -name "nvshmem" -type d -not -path "*/flashinfer*" 2>/dev/null | head -1)
1919
if [ -z "${NVSHMEM_DIR:-}" ]; then
2020
echo "ERROR: NVSHMEM installation not found under /usr/local" >&2
2121
exit 1
Lines changed: 119 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -1,117 +1,128 @@
1-
name: "gb200-fp8-8k1k-low-latency"
1+
base:
2+
name: "gb200-fp8-8k1k-low-latency"
23

3-
dynamo:
4-
version: 0.8.1
4+
dynamo:
5+
version: 0.8.1
56

6-
frontend:
7-
type: dynamo
8-
enable_multiple_frontends: true
9-
num_additional_frontends: 2
10-
nginx_container: nginx
7+
frontend:
8+
type: dynamo
9+
enable_multiple_frontends: true
10+
num_additional_frontends: 2
11+
nginx_container: nginx
1112

12-
model:
13-
path: "dsr1-fp8"
14-
container: "dynamo-sglang"
15-
precision: "fp8"
13+
model:
14+
path: "dsr1-fp8"
15+
container: "dynamo-sglang"
16+
precision: "fp8"
1617

17-
resources:
18-
gpu_type: "gb200"
19-
prefill_nodes: 2
20-
decode_nodes: 2
21-
prefill_workers: 1
22-
decode_workers: 1
23-
gpus_per_node: 4
18+
resources:
19+
gpu_type: "gb200"
20+
gpus_per_node: 4
21+
prefill_nodes: 2
22+
prefill_workers: 1
2423

25-
backend:
26-
prefill_environment:
27-
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
28-
PYTHONUNBUFFERED: "1"
29-
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
30-
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
31-
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
32-
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
33-
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
34-
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
35-
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
36-
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
37-
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
38-
MC_TE_METRIC: "true"
39-
MC_FORCE_MNNVL: "1"
40-
NCCL_MNNVL_ENABLE: "1"
41-
NCCL_CUMEM_ENABLE: "1"
4224

43-
decode_environment:
44-
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
45-
PYTHONUNBUFFERED: "1"
46-
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
47-
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
48-
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
49-
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
50-
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
51-
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
52-
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
53-
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
54-
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
55-
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
56-
MC_TE_METRIC: "true"
57-
MC_FORCE_MNNVL: "1"
58-
NCCL_MNNVL_ENABLE: "1"
59-
NCCL_CUMEM_ENABLE: "1"
25+
backend:
26+
prefill_environment:
27+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
28+
PYTHONUNBUFFERED: "1"
29+
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
30+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
31+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
32+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
33+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
34+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
35+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
36+
MC_TE_METRIC: "true"
37+
MC_FORCE_MNNVL: "1"
38+
NCCL_MNNVL_ENABLE: "1"
39+
NCCL_CUMEM_ENABLE: "1"
6040

61-
sglang_config:
62-
prefill:
63-
served-model-name: "deepseek-ai/DeepSeek-R1"
64-
trust-remote-code: true
65-
kv-cache-dtype: "fp8_e4m3"
66-
attention-backend: "trtllm_mla"
67-
quantization: "fp8"
68-
moe-runner-backend: "flashinfer_trtllm"
69-
disable-radix-cache: true
70-
watchdog-timeout: 1000000
71-
context-length: 9600
72-
disaggregation-mode: "prefill"
73-
mem-fraction-static: 0.8
74-
max-total-tokens: 32768
75-
chunked-prefill-size: 24576
76-
cuda-graph-max-bs: 512
77-
max-running-requests: 512
78-
load-balance-method: "round_robin"
79-
scheduler-recv-interval: 10
80-
tensor-parallel-size: 8
81-
data-parallel-size: 1
82-
expert-parallel-size: 1
83-
fp8-gemm-backend: "flashinfer_trtllm"
84-
disaggregation-bootstrap-port: 30001
85-
disaggregation-transfer-backend: nixl
41+
decode_environment:
42+
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
43+
PYTHONUNBUFFERED: "1"
44+
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
45+
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
46+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
47+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
48+
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
49+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
50+
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
51+
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
52+
MC_TE_METRIC: "true"
53+
MC_FORCE_MNNVL: "1"
54+
NCCL_MNNVL_ENABLE: "1"
55+
NCCL_CUMEM_ENABLE: "1"
8656

87-
decode:
88-
served-model-name: "deepseek-ai/DeepSeek-R1"
89-
trust-remote-code: true
90-
kv-cache-dtype: "fp8_e4m3"
91-
attention-backend: "trtllm_mla"
92-
quantization: "fp8"
93-
moe-runner-backend: "flashinfer_trtllm"
94-
disable-radix-cache: true
95-
watchdog-timeout: 1000000
96-
context-length: 9600
97-
disaggregation-mode: "decode"
98-
mem-fraction-static: 0.8
99-
chunked-prefill-size: 8192
100-
cuda-graph-max-bs: 512
101-
max-running-requests: 512
102-
scheduler-recv-interval: 10
103-
enable-symm-mem: true
104-
prefill-round-robin-balance: true
105-
tensor-parallel-size: 8
106-
data-parallel-size: 1
107-
expert-parallel-size: 1
108-
fp8-gemm-backend: "flashinfer_trtllm"
109-
disaggregation-bootstrap-port: 30001
110-
disaggregation-transfer-backend: nixl
57+
sglang_config:
58+
prefill:
59+
served-model-name: "deepseek-ai/DeepSeek-R1"
60+
trust-remote-code: true
61+
kv-cache-dtype: "fp8_e4m3"
62+
attention-backend: "trtllm_mla"
63+
quantization: "fp8"
64+
moe-runner-backend: "flashinfer_trtllm"
65+
disable-radix-cache: true
66+
watchdog-timeout: 1000000
67+
context-length: 9600
68+
disaggregation-mode: "prefill"
69+
mem-fraction-static: 0.6
70+
max-running-requests: 32
71+
cuda-graph-max-bs: 32
72+
load-balance-method: "round_robin"
73+
scheduler-recv-interval: 10
74+
tensor-parallel-size: 8
75+
data-parallel-size: 1
76+
expert-parallel-size: 1
77+
fp8-gemm-backend: "flashinfer_trtllm"
78+
disaggregation-bootstrap-port: 30001
79+
disaggregation-transfer-backend: nixl
11180

112-
benchmark:
113-
type: "sa-bench"
114-
isl: 8192
115-
osl: 1024
116-
concurrencies: "4x8x16"
117-
req_rate: "inf"
81+
decode:
82+
served-model-name: "deepseek-ai/DeepSeek-R1"
83+
trust-remote-code: true
84+
kv-cache-dtype: "fp8_e4m3"
85+
attention-backend: "trtllm_mla"
86+
quantization: "fp8"
87+
moe-runner-backend: "flashinfer_trtllm"
88+
disable-radix-cache: true
89+
watchdog-timeout: 1000000
90+
context-length: 9600
91+
disaggregation-mode: "decode"
92+
mem-fraction-static: 0.7
93+
cuda-graph-max-bs: 32
94+
max-running-requests: 32
95+
scheduler-recv-interval: 10
96+
enable-symm-mem: true
97+
prefill-round-robin-balance: true
98+
tensor-parallel-size: 8
99+
data-parallel-size: 1
100+
expert-parallel-size: 1
101+
fp8-gemm-backend: "flashinfer_trtllm"
102+
disaggregation-bootstrap-port: 30001
103+
disaggregation-transfer-backend: nixl
104+
105+
benchmark:
106+
type: "sa-bench"
107+
isl: 8192
108+
osl: 1024
109+
concurrencies: "4x8x16"
110+
req_rate: "inf"
111+
112+
zip_override_lowlat:
113+
resources:
114+
decode_nodes: [14, 14, 6]
115+
decode_workers: [7, 7, 3]
116+
name:
117+
- "gb200-fp8-8k1k-low-latency-c8"
118+
- "gb200-fp8-8k1k-low-latency-c16-32"
119+
- "gb200-fp8-8k1k-low-latency-c64"
120+
backend:
121+
sglang_config:
122+
prefill:
123+
max-prefill-tokens: [8192, 16384, 32768]
124+
chunked-prefill-size: [8192, 16384, 32768]
125+
decode:
126+
max-running-requests: [1, 4, 16]
127+
benchmark:
128+
concurrencies: ["8", "16x32", "64"]

recipes/qwen3.5/experimental/1p1d-tp4-deepep-deepgemm.yaml

Lines changed: 0 additions & 147 deletions
This file was deleted.

0 commit comments

Comments
 (0)