-
Notifications
You must be signed in to change notification settings - Fork 36
Expand file tree
/
Copy pathbs64-2p3d-mtp.yaml
More file actions
120 lines (96 loc) · 2.81 KB
/
bs64-2p3d-mtp.yaml
File metadata and controls
120 lines (96 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
---
# Benchmark/deployment recipe: DeepSeek-R1 (FP8) on H200 GPUs with
# prefill/decode disaggregation — 2 prefill nodes + 3 decode nodes —
# and MTP (EAGLE speculative decoding) enabled on the decode side.
name: "bs64-2p3d-h200-fp8-mtp"
model:
  path: "dsr1"
  container: "lmsysorg/sglang:v0.5.8.post1-cu130"
  precision: "fp8"
frontend:
  nginx_container: "nginx"
resources:
  gpu_type: "h200"
  prefill_nodes: 2
  prefill_workers: 2
  decode_nodes: 3
  decode_workers: 3
  gpus_per_node: 8
backend:
  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    # Very large heartbeat/timeout values — presumably to keep long benchmark
    # runs from being killed by disaggregation watchdogs; confirm against the
    # SGLang release in use before tightening.
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
  # Decode-specific environment variables (identical to prefill above)
  decode_environment:
    SGLANG_ENABLE_SPEC_V2: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
sglang_config:
  prefill:
    # Model configuration
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    skip-tokenizer-init: true
    trust-remote-code: true
    watchdog-timeout: 1000000
    # Parallelism: TP across all 8 GPUs of a node; no DP/EP sharding
    tp-size: 8
    dp-size: 1
    ep-size: 1
    # KV cache and attention
    attention-backend: "flashinfer"
    # Radix cache disabled
    disable-radix-cache: true
    # Other flags
    # stream-interval: 50
    max-running-requests: 16
    # Prefill-specific disaggregation mode
    disaggregation-bootstrap-port: 30001
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: "nixl"
    # Memory and token limits
    mem-fraction-static: 0.82
    max-prefill-tokens: 32768
    chunked-prefill-size: 32768
    # Request handling
    load-balance-method: "round_robin"
  decode:
    # Model configuration (mirrors prefill)
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    skip-tokenizer-init: true
    trust-remote-code: true
    watchdog-timeout: 1000000
    # Parallelism
    tp-size: 8
    dp-size: 1
    ep-size: 1
    # KV cache and attention
    attention-backend: "flashinfer"
    # Other flags
    disable-radix-cache: true
    stream-interval: 10
    # Disaggregation: decode side, same bootstrap port/backend as prefill
    disaggregation-bootstrap-port: 30001
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: "nixl"
    context-length: 72000
    max-total-tokens: 128000
    # Memory and token limits
    mem-fraction-static: 0.75
    max-running-requests: 16
    cuda-graph-max-bs: 16
    # MTP settings: EAGLE speculative decoding, 2 draft steps, top-1,
    # 3 draft tokens per step
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 2
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 3
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # NOTE(review): "32x64x128" looks like an x-separated concurrency sweep
  # (32, 64, 128) — verify against the benchmark harness parser.
  concurrencies: "32x64x128"
  req_rate: "inf"
# See configs/gpqa/run.sh + docs/accuracy.md for the script-based GPQA recipe.