diff --git a/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-dep8.yaml b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-dep8.yaml new file mode 100644 index 00000000..5652bd2d --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-dep8.yaml @@ -0,0 +1,73 @@ +name: "dsv4-vllm-decode-only-dep8" +model: + path: "deepseek-v4-pro" + container: "dsv4-sqsh" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "02:00:00" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + aggregated: + kv-transfer-config: '{"kv_connector": "DecodeBenchConnector", "kv_role": "kv_both", "kv_connector_extra_config": {"fill_mean": 0.015, "fill_std": 0.0}}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 384 + max-cudagraph-capture-size: 384 + trust-remote-code: true + no-enable-flashinfer-autotune: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.96 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "vllm-bench" + isl: 8192 + osl: 1024 + 
concurrencies: "3200" + random_range_ratio: 1.0 + num_warmups: 256 diff --git a/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-tep8.yaml b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-tep8.yaml new file mode 100644 index 00000000..2c888898 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/decode-bench-gb200-tep8.yaml @@ -0,0 +1,70 @@ +name: "dsv4-vllm-decode-only-tep8" +model: + path: "deepseek-v4-pro" + container: "dsv4-sqsh" + precision: "fp4" + +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "02:00:00" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + # VLLM_USE_NCCL_SYMM_MEM: "1" + # NCCL_CUMEM_ENABLE: "1" + # NCCL_MNNVL_ENABLE: "1" + # NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + # TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + aggregated: + kv-transfer-config: '{"kv_connector": "DecodeBenchConnector", "kv_role": "kv_both", "kv_connector_extra_config": {"fill_mean": 0.015, "fill_std": 0.0}}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "vllm-bench" + isl: 8192 + osl: 1024 + concurrencies: "128x256x512" + 
random_range_ratio: 1.0 + num_warmups: 64 diff --git a/recipes/vllm/deepseek-v4-pro/GB200/8k1k/prefill-bench-gb200-dep8.yaml b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/prefill-bench-gb200-dep8.yaml new file mode 100644 index 00000000..b19b56af --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/GB200/8k1k/prefill-bench-gb200-dep8.yaml @@ -0,0 +1,75 @@ +name: "dsv4-vllm-prefill-only-dep8" + +model: + path: "deepseek-v4-pro" + container: "dsv4-sqsh" + precision: "fp4" + +# dynamo: +# hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# # Install handled by our custom vllm-container-deps.sh, which builds +# # the dynamo wheel ONCE on /mnt/vast and lets every rank pip-install +# # from cache. See runners/gb300-cw-vllm-container-deps.sh. +# install: false +dynamo: + version: 1.0.2 + install: true + +setup_script: vllm-container-deps.sh + +slurm: + time_limit: "02:00:00" + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + agg_nodes: 2 + agg_workers: 1 + gpus_per_agg: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + aggregated_environment: + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + + vllm_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9216 + max-num-seqs: 16 + max-num-batched-tokens: 24576 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.92 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + +benchmark: 
+ type: "vllm-bench" + isl: 8192 + osl: 1 + concurrencies: "64" + num_prompts_mult: 100