Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8-gsm8k-smoke"
setup_script: "vllm-container-deps.sh"
model:
path: "deepseekv4-fp4"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

health_check:
max_attempts: 360
interval_seconds: 10

resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8
frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
# offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 128
max-cudagraph-capture-size: 128
max-num-batched-tokens: 128
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
benchmark:
type: "lm-eval"
isl: 8192
osl: 1024
concurrencies: "8x16x32x64x128x256x512"
req_rate: "inf"
env:
RANDOM_RANGE_RATIO: "0.8"
EXP_NAME: "dsv4_8k1k"
IMAGE: "vllm/vllm-openai:v0.20.0-ubuntu2404"
MODEL_PREFIX: "dsv4"
MODEL: "deepseek-ai/DeepSeek-V4-Pro"
MODEL_PATH: "/model"
FRAMEWORK: "dynamo-vllm"
PRECISION: "fp4"
ISL: "8192"
OSL: "1024"
MAX_MODEL_LEN: "9472"
CONC_LIST: "8 16 32 64 128 256 512"
SPEC_DECODING: "none"
DISAGG: "true"
RUN_EVAL: "true"
EVAL_ONLY: "true"
EVAL_CONC: "128"
EVAL_TASKS_DIR: "/srtctl-benchmarks/lm-eval/gsm8k.yaml"
VALIDATE_EVAL_SCORES: "true"
IS_MULTINODE: "true"
RUNNER_TYPE: "gb200"
PYTHONDONTWRITEBYTECODE: "1"
PYTHONPYCACHEPREFIX: "/tmp/inferencex-pycache"
PREFILL_NUM_WORKERS: "1"
PREFILL_TP: "8"
PREFILL_EP: "8"
PREFILL_DP_ATTN: "true"
DECODE_NUM_WORKERS: "1"
DECODE_TP: "1"
DECODE_EP: "8"
DECODE_DP_ATTN: "true"
RESULT_FILENAME: "dsv4_8k1k_fp4_dynamo-vllm_prefill-tp8-ep8-dptrue-nw1_decode-tp1-ep8-dptrue-nw1_disagg-true_spec-none_conc8x16x32x64x128x256x512_gb200-nv_1"

identity:
model:
repo: "deepseek-ai/DeepSeek-V4-Pro"
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
container:
image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
name: "svf-vllm-disagg-gb200-1p1d-dep8-tp8-gsm8k-smoke"
setup_script: "vllm-container-deps.sh"
model:
path: "deepseekv4-fp4"
container: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3"
precision: "fp4"

dynamo:
install: true
wheel: "1.2.0.dev20260426"

health_check:
max_attempts: 360
interval_seconds: 10

resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8
frontend:
type: dynamo
enable_multiple_frontends: false
backend:
type: vllm
connector: null
prefill_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
decode_environment:
TILELANG_CLEANUP_TEMP_FILES: "1"
VLLM_USE_NCCL_SYMM_MEM: "1"
NCCL_CUMEM_ENABLE: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_NVLS_ENABLE: "1"
VLLM_SERVER_DEV_MODE: "1"
# VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
# VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random"
UCX_MEMTYPE_CACHE: "n"
UCX_MEMTYPE_REG_WHOLE: "n"
UCX_TLS: "cuda_copy,cuda_ipc,tcp"
UCX_CUDA_IPC_ENABLE_MNNVL: "y"
NCCL_P2P_LEVEL: NVL
vllm_config:
prefill:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enforce-eager: true
max-model-len: 16384
max-num-seqs: 16
max-num-batched-tokens: 32768
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
no-async-scheduling: true
block-size: 256
gpu-memory-utilization: 0.8
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
numa-bind: true
offload-group-size: 3
offload-num-in-group: 1
offload-prefetch-step: 2
# offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts"
tokenizer-mode: deepseek_v4
decode:
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
tensor-parallel-size: 8
pipeline-parallel-size: 1
# data-parallel-size: 8
# data-parallel-rpc-port: 13345
# enable-expert-parallel: true
max-model-len: 16384
max-num-seqs: 256
max-cudagraph-capture-size: 256
max-num-batched-tokens: 256
trust-remote-code: true
no-enable-prefix-caching: true
block-size: 256
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4
benchmark:
type: "lm-eval"
isl: 8192
osl: 1024
concurrencies: "8x16x32x64x128x256x512"
req_rate: "inf"
env:
RANDOM_RANGE_RATIO: "0.8"
EXP_NAME: "dsv4_8k1k"
IMAGE: "vllm/vllm-openai:v0.20.0-ubuntu2404"
MODEL_PREFIX: "dsv4"
MODEL: "deepseek-ai/DeepSeek-V4-Pro"
MODEL_PATH: "/model"
FRAMEWORK: "dynamo-vllm"
PRECISION: "fp4"
ISL: "8192"
OSL: "1024"
MAX_MODEL_LEN: "9472"
CONC_LIST: "8 16 32 64 128 256 512"
SPEC_DECODING: "none"
DISAGG: "true"
RUN_EVAL: "true"
EVAL_ONLY: "true"
EVAL_CONC: "128"
EVAL_TASKS_DIR: "/srtctl-benchmarks/lm-eval/gsm8k.yaml"
VALIDATE_EVAL_SCORES: "true"
IS_MULTINODE: "true"
RUNNER_TYPE: "gb200"
PYTHONDONTWRITEBYTECODE: "1"
PYTHONPYCACHEPREFIX: "/tmp/inferencex-pycache"
PREFILL_NUM_WORKERS: "1"
PREFILL_TP: "8"
PREFILL_EP: "8"
PREFILL_DP_ATTN: "true"
DECODE_NUM_WORKERS: "1"
DECODE_TP: "8"
DECODE_EP: "1"
DECODE_DP_ATTN: "false"
RESULT_FILENAME: "dsv4_8k1k_fp4_dynamo-vllm_prefill-tp8-ep8-dptrue-nw1_decode-tp8-ep1-dpfalse-nw1_disagg-true_spec-none_conc8x16x32x64x128x256x512_gb200-nv_1"

identity:
model:
repo: "deepseek-ai/DeepSeek-V4-Pro"
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
container:
image: "vllm/vllm-openai:v0.20.0-ubuntu2404@sha256:46da022ce07aae43e4ffae844aeab467a223437e071abadf566555699fbf16c3"
frameworks:
dynamo: "1.2.0.dev20260426"
vllm: "0.20.0"
2 changes: 2 additions & 0 deletions src/srtctl/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
custom,
gpqa,
gsm8k,
lm_eval,
longbenchv2,
mmlu,
mooncake_router,
Expand All @@ -32,6 +33,7 @@
"custom",
"sa_bench",
"sglang_bench",
"lm_eval",
"mmlu",
"gpqa",
"gsm8k",
Expand Down
56 changes: 56 additions & 0 deletions src/srtctl/benchmarks/lm_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2026 SemiAnalysis LLC. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""lm-eval benchmark runner for self-contained accuracy evals."""

from __future__ import annotations

from typing import TYPE_CHECKING

from srtctl.benchmarks.base import SCRIPTS_DIR, BenchmarkRunner, register_benchmark

if TYPE_CHECKING:
from srtctl.core.runtime import RuntimeContext
from srtctl.core.schema import SrtConfig


@register_benchmark("lm-eval")
class LMEvalRunner(BenchmarkRunner):
    """Runs lm-eval accuracy benchmarks against an OpenAI-compatible chat API.

    The actual evaluation logic lives in a shell script shipped under the
    srtctl benchmark scripts directory; this runner only assembles the
    command line and the environment for that script.
    """

    @property
    def name(self) -> str:
        """Registry name of this benchmark runner."""
        return "lm-eval"

    @property
    def script_path(self) -> str:
        """In-container path of the benchmark entry script."""
        return "/srtctl-benchmarks/lm-eval/bench.sh"

    @property
    def local_script_dir(self) -> str:
        """Host-side directory holding the lm-eval benchmark scripts."""
        return str(SCRIPTS_DIR / "lm-eval")

    def validate_config(self, config: SrtConfig) -> list[str]:
        """Return validation errors; lm-eval accepts any config as-is."""
        del config
        return []

    def build_command(
        self,
        config: SrtConfig,
        runtime: RuntimeContext,
    ) -> list[str]:
        """Build the shell invocation pointing at the local frontend port."""
        del config
        target = "http://localhost:" + str(runtime.frontend_port)
        return ["bash", self.script_path, target]

    def get_environment(self, config: SrtConfig, runtime: RuntimeContext) -> dict[str, str]:
        """Return the script environment: user-provided vars plus model fallbacks.

        User-specified entries from ``config.benchmark.env`` always win;
        MODEL_NAME / MODEL_PATH are filled in only when absent.
        """
        del runtime
        result = dict(config.benchmark.env)
        fallbacks = (
            ("MODEL_NAME", config.served_model_name),
            ("MODEL_PATH", "/model"),
        )
        for key, value in fallbacks:
            if key not in result:
                result[key] = value
        return result
Loading
Loading