# Source: NVIDIA/srt-slurm @ 5dfaf9b00c17d12f5104e701aef2e688b10807bb
# Path: recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-tep8.yaml
# Recipe identifier. The "tep8" suffix lines up with tensor-parallel-size: 8
# plus enable-expert-parallel: true in the backend vllm_config of this file.
name: "dsv4-vllm-decode-only-tep8"
# Model under test.
model:
  # NOTE(review): this is not an absolute path; presumably an alias that the
  # launcher resolves to a real model directory - confirm.
  path: "deepseek-v4-pro"
  # NOTE(review): presumably an alias for a squashfs container image - confirm.
  container: "dsv4-sqsh"
  precision: "fp4"

# Settings for the dynamo component (also selected as the frontend type in
# this file).
dynamo:
  # Quoted so the version always loads as a string. The three-component form
  # happens to load as a string unquoted, but a two-component bump such as
  # 1.10 would be read as the float 1.1.
  version: "1.0.2"
  # NOTE(review): presumably asks the launcher to install the version above
  # at job start - confirm.
  install: true

# NOTE(review): presumably a script run to install container dependencies
# before the workers start; confirm which directory it is looked up in.
setup_script: vllm-container-deps.sh

# Slurm job settings.
slurm:
  # Kept quoted: unquoted digit:digit values can be read as base-60 integers
  # by YAML 1.1 parsers. Presumably HH:MM:SS, i.e. a 2 hour limit - confirm.
  time_limit: "02:00:00"

# Hardware allocation. The figures are self-consistent:
# agg_nodes (2) x gpus_per_node (4) = gpus_per_agg (8), which also equals
# tensor-parallel-size in the backend vllm_config of this file.
resources:
  gpu_type: "gb200"
  gpus_per_node: 4
  # NOTE(review): "agg" presumably means aggregated (non-disaggregated)
  # serving, matching the `aggregated` keys under backend - confirm.
  agg_nodes: 2
  agg_workers: 1
  gpus_per_agg: 8

# Request frontend selection.
frontend:
  type: dynamo
  # Multiple frontends are switched off for this recipe.
  enable_multiple_frontends: false

# Inference backend: vLLM, configured through the `aggregated` sections below.
backend:
  type: vllm
  # Explicit null: no connector is set at this level. A KV connector is still
  # configured through kv-transfer-config in vllm_config below.
  connector: null

  # Environment variables for the aggregated worker. Values are quoted so
  # they load as strings rather than integers.
  aggregated_environment:
    TILELANG_CLEANUP_TEMP_FILES: "1"
    # The commented-out entries below are disabled toggles kept for reference.
    # VLLM_USE_NCCL_SYMM_MEM: "1"
    # NCCL_CUMEM_ENABLE: "1"
    # NCCL_MNNVL_ENABLE: "1"
    # NCCL_NVLS_ENABLE: "1"
    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
    # NOTE(review): presumably replaces learned MoE routing with uniformly
    # random expert selection for benchmarking - confirm against vLLM.
    VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
    VLLM_LOG_STATS_INTERVAL: "1"
    # TORCH_SYMMMEM: "NVSHMEM"

  # NOTE(review): keys look like vLLM engine CLI flags without the leading
  # "--"; confirm how the launcher turns boolean entries into flags.
  vllm_config:
    aggregated:
      # JSON passed as a single string; it must stay valid JSON. Selects
      # DecodeBenchConnector with kv_role "kv_both".
      # NOTE(review): fill_mean/fill_std presumably describe the synthetic
      # values written into the KV cache in place of a real prefill - confirm.
      kv-transfer-config: '{"kv_connector": "DecodeBenchConnector", "kv_role": "kv_both", "kv_connector_extra_config": {"fill_mean": 0.015, "fill_std": 0.0}}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      # 8-way tensor parallel with expert parallelism enabled; 8 matches
      # resources.gpus_per_agg.
      tensor-parallel-size: 8
      pipeline-parallel-size: 1
      enable-expert-parallel: true
      # Leaves headroom over the benchmark's isl + osl (8192 + 1024 = 9216).
      max-model-len: 16384
      # Equal to the largest value in benchmark.concurrencies, and to the
      # CUDA graph capture size on the next line.
      max-num-seqs: 512
      max-cudagraph-capture-size: 512
      trust-remote-code: true
      # NOTE(review): "no-" prefixed key set to true; presumably emitted as
      # --no-enable-prefix-caching (prefix caching off) - confirm.
      no-enable-prefix-caching: true
      block-size: 256
      # JSON passed as a single string, like kv-transfer-config above.
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      # NOTE(review): double negative; presumably keeps the hybrid KV cache
      # manager enabled - confirm.
      no-disable-hybrid-kv-cache-manager: true
      tokenizer-mode: deepseek_v4
      enable-ep-weight-filter: true
      moe-backend: "deep_gemm_mega_moe"

# Load-generation settings.
benchmark:
  type: "vllm-bench"
  # NOTE(review): presumably input / output sequence lengths in tokens.
  # Their sum (9216) is below max-model-len (16384) in the backend config.
  isl: 8192
  osl: 1024
  # NOTE(review): presumably an "x"-separated list of concurrency levels to
  # sweep (128, 256, 512) - confirm. Quoted, so it loads as a string.
  concurrencies: "128x256x512"
  # NOTE(review): meaning is defined by the benchmark tool; confirm whether
  # 1.0 means fixed-length or fully varied request lengths.
  random_range_ratio: 1.0
  num_warmups: 64