# decode-bench-gb200-tep8.yaml
# Identifier for this configuration.
# NOTE(review): "tep8" presumably refers to tensor parallel size 8 with expert
# parallelism, matching the vLLM settings in this file - confirm.
name: 'dsv4-vllm-decode-only-tep8'
# Model selection. The three fields are nested under `model` so they are not
# parsed as unrelated top-level keys.
# NOTE(review): `path` and `container` look like aliases resolved by the
# launcher rather than filesystem paths - confirm.
model:
  path: "deepseek-v4-pro"
  container: "dsv4-sqsh"
  precision: "fp4"
# Dynamo settings, nested under `dynamo`.
dynamo:
  # Quoted so the version is unambiguously a string for every YAML parser.
  version: "1.0.2"
  # NOTE(review): what "install" triggers is defined by the consumer of this
  # file, not visible here.
  install: true
# Script referenced by bare file name; where it is looked up and when it runs
# is decided by the consumer of this file (not visible here).
setup_script: "vllm-container-deps.sh"
# Slurm job settings, nested under `slurm`.
slurm:
  # Keep quoted: an unquoted digits-and-colons value can be read as a base-60
  # integer by YAML 1.1 parsers.
  time_limit: "02:00:00"
# Hardware allocation, nested under `resources`.
# Arithmetic check: agg_nodes (2) x gpus_per_node (4) = 8 = gpus_per_agg,
# which also equals tensor-parallel-size in the vLLM config of this file.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  agg_nodes: 2
  agg_workers: 1
  gpus_per_agg: 8
# Frontend settings. `type` must be nested here: this file defines a `type`
# key in three sections, and at a shared indentation level they would be
# duplicate keys where only the last value survives.
frontend:
  type: dynamo
  enable_multiple_frontends: false
# Backend settings. Only an `aggregated` variant is configured in this file.
# `type` must be nested here so it does not collide with the `type` keys of
# other sections.
backend:
  type: vllm
  connector: null

  # Environment variables for the aggregated variant. Values are quoted so
  # they stay strings instead of being typed as integers by the YAML parser.
  aggregated_environment:
    TILELANG_CLEANUP_TEMP_FILES: "1"
    # The commented-out entries below are currently not set.
    # VLLM_USE_NCCL_SYMM_MEM: "1"
    # NCCL_CUMEM_ENABLE: "1"
    # NCCL_MNNVL_ENABLE: "1"
    # NCCL_NVLS_ENABLE: "1"
    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
    VLLM_LOG_STATS_INTERVAL: "1"
    # TORCH_SYMMMEM: "NVSHMEM"

  # NOTE(review): the keys below look like vLLM command-line flag names
  # without the leading "--"; confirm how the launcher maps them.
  vllm_config:
    aggregated:
      # JSON kept on one line inside single quotes so the embedded double
      # quotes need no escaping; the string is passed through unchanged.
      kv-transfer-config: '{"kv_connector": "DecodeBenchConnector", "kv_role": "kv_both", "kv_connector_extra_config": {"fill_mean": 0.015, "fill_std": 0.0}}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      # Equals gpus_per_agg (8) in the resources section of this file.
      tensor-parallel-size: 8
      pipeline-parallel-size: 1
      enable-expert-parallel: true
      max-model-len: 16384
      max-num-seqs: 512
      # Same value as max-num-seqs above.
      max-cudagraph-capture-size: 512
      trust-remote-code: true
      no-enable-prefix-caching: true
      block-size: 256
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      no-disable-hybrid-kv-cache-manager: true
      tokenizer-mode: deepseek_v4
      enable-ep-weight-filter: true
      moe-backend: "deep_gemm_mega_moe"
# Benchmark settings. `type` must be nested here; left at the top level it
# would overwrite the `type` values of the frontend and backend sections.
benchmark:
  type: "vllm-bench"
  # isl + osl = 8192 + 1024 = 9216, below max-model-len (16384) in this file.
  # NOTE(review): presumably input/output sequence lengths in tokens - confirm.
  isl: 8192
  osl: 1024
  # Quoted string. NOTE(review): presumably an "x"-separated list of
  # concurrency levels (128, 256, 512); the largest equals max-num-seqs (512)
  # in this file - confirm the format with the benchmark runner.
  concurrencies: "128x256x512"
  random_range_ratio: 1.0
  num_warmups: 64