# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "dynamo-vllm-slurm"

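# One test case: Qwen3-0.6B served by Dynamo's vLLM backend across 3 Slurm nodes
# (2 for the prefill worker, 1 for the decode worker) with a 20-minute time limit.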
[[Tests]]
id = "qwen3-0.6B"
num_nodes = 3
time_limit = "00:20:00"

name = "vllm"
description = "Disaggregated vLLM serving of Qwen3-0.6B via Dynamo, benchmarked with genai-perf"
test_template_name = "AIDynamo"

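 # Container image providing the Dynamo vLLM runtime used by this test.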
 [Tests.cmd_args]
 docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"

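 # Launch commands and ports for the Dynamo components: etcd (2379) and NATS (4222)
 # for coordination, the KV-routing ingress/frontend, and the vLLM prefill and decode
 # workers. The *-initialized-regex values are presumably matched against worker logs
 # to detect readiness; node-setup-cmd installs the RDMA/verbs userspace packages and
 # lists the UCX transports available on each node.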
 [Tests.cmd_args.dynamo]
 backend = "vllm"
 model = "Qwen/Qwen3-0.6B"
 decode-cmd = 'python3 -m dynamo.vllm'
 decode-initialized-regex = 'VllmWorker.*has.been.initialized'
 etcd-cmd = "etcd --log-level debug"
 etcd-port = 2379
 genai-perf-cmd = 'genai-perf profile'
 ingress-cmd = "python -m dynamo.frontend --router-mode kv"
 nats-cmd = "nats-server -js"
 nats-port = 4222
 node-setup-cmd = "apt-get update -o APT::Sandbox::User=root && apt-get install -y curl libibverbs1 rdma-core ibverbs-utils libibumad3 libnuma1 librdmacm1 ibverbs-providers; /usr/local/ucx/bin/ucx_info -d | grep Transport | sort -u;"
 port = 8787
 prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
 prefill-initialized-regex = 'VllmWorker.*has.been.initialized'
 workspace-path = "/workspace/"

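 # Prefill worker: spread over 2 nodes with tensor-parallel size 2; pipeline and data
 # parallelism stay at 1 and expert parallelism is disabled.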
 [Tests.cmd_args.dynamo.prefill_worker]
 data-parallel-size = 1
 gpu-memory-utilization = 0.90
 max_model_len = 19280
 num-nodes = 2
 pipeline-parallel-size = 1
 tensor-parallel-size = 2
 extra-args = "--no-enable-expert-parallel"

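 # Decode worker: a single node with tensor-parallel size 2, otherwise the same
 # parallelism settings as the prefill worker.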
 [Tests.cmd_args.dynamo.decode_worker]
 data-parallel-size = 1
 gpu-memory-utilization = 0.90
 max_model_len = 19280
 num-nodes = 1
 pipeline-parallel-size = 1
 tensor-parallel-size = 2
 extra-args = "--no-enable-expert-parallel"

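 # genai-perf load shape: 128 streamed chat requests (plus 8 warmup) at concurrency 8,
 # with 3000-token synthetic prompts and 150 output tokens per request (stddev 0);
 # arguments after "--" are passed through to the underlying perf analyzer.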
 [Tests.cmd_args.genai_perf]
 concurrency = 8
 endpoint = "v1/chat/completions"
 endpoint-type = "chat"
 extra-inputs = 'min_tokens:10'
 output-tokens-mean = 150
 output-tokens-stddev = 0
 random-seed = 123
 request-count = 128
 synthetic-input-tokens-mean = 3000
 synthetic-input-tokens-stddev = 0
 warmup-request-count = 8
 extra-args = "--streaming -- -v --async"

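 # UCX is limited to CUDA-copy and RC verbs transports with quieter logging;
 # DYNAMO_NODELIST expands the Slurm job's node list into a comma-separated hostname string.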
 [Tests.extra_env_vars]
 UCX_LOG_LEVEL = "warn"
 UCX_TLS = "cuda_copy,rc_x"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"