Skip to content

Commit be304b7

Browse files
authored
Merge pull request #736 from NVIDIA/am/dynamo-pass-fail
Dynamo pass/fail and slurm example
2 parents 01b3ab7 + 69d14c8 commit be304b7

File tree

4 files changed

+112
-2
lines changed

4 files changed

+112
-2
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
name = "dynamo-vllm-slurm"
18+
19+
[[Tests]]
20+
id = "qwen3-0.6B"
21+
num_nodes = 3
22+
time_limit = "00:20:00"
23+
24+
name = "vllm"
25+
description = "vllm"
26+
test_template_name = "AIDynamo"
27+
28+
[Tests.cmd_args]
29+
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
30+
31+
[Tests.cmd_args.dynamo]
32+
backend = "vllm"
33+
model = "Qwen/Qwen3-0.6B"
34+
decode-cmd = 'python3 -m dynamo.vllm'
35+
decode-initialized-regex = 'VllmWorker.*has.been.initialized'
36+
etcd-cmd = "etcd --log-level debug"
37+
etcd-port = 2379
38+
genai-perf-cmd = 'genai-perf profile'
39+
ingress-cmd = "python -m dynamo.frontend --router-mode kv"
40+
nats-cmd = "nats-server -js"
41+
nats-port = 4222
42+
node-setup-cmd = "apt-get update -o APT::Sandbox::User=root && apt-get install -y curl libibverbs1 rdma-core ibverbs-utils libibumad3 libnuma1 librdmacm1 ibverbs-providers; /usr/local/ucx/bin/ucx_info -d |grep Transport | sort -u;"
43+
port = 8787
44+
prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
45+
prefill-initialized-regex = 'VllmWorker.*has.been.initialized'
46+
workspace-path = "/workspace/"
47+
48+
[Tests.cmd_args.dynamo.prefill_worker]
49+
data-parallel-size = 1
50+
gpu-memory-utilization = 0.90
51+
max_model_len = 19280
52+
num-nodes = 2
53+
pipeline-parallel-size = 1
54+
tensor-parallel-size = 2
55+
extra-args = "--no-enable-expert-parallel"
56+
57+
[Tests.cmd_args.dynamo.decode_worker]
58+
data-parallel-size = 1
59+
gpu-memory-utilization = 0.90
60+
max_model_len = 19280
61+
num-nodes = 1
62+
pipeline-parallel-size = 1
63+
tensor-parallel-size = 2
64+
extra-args = "--no-enable-expert-parallel"
65+
66+
[Tests.cmd_args.genai_perf]
67+
concurrency = 8
68+
endpoint = "v1/chat/completions"
69+
endpoint-type = "chat"
70+
extra-inputs = 'min_tokens:10'
71+
output-tokens-mean = 150
72+
output-tokens-stddev = 0
73+
random-seed = 123
74+
request-count = 128
75+
synthetic-input-tokens-mean = 3000
76+
synthetic-input-tokens-stddev = 0
77+
warmup-request-count = 8
78+
extra-args = "--streaming -- -v --async"
79+
80+
[Tests.extra_env_vars]
81+
UCX_LOG_LEVEL = "warn"
82+
UCX_TLS = "cuda_copy,rc_x"
83+
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"

src/cloudai/reporter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def print_summary(self) -> None:
187187
logging.debug("No test runs found, skipping summary.")
188188
return
189189

190-
table = Table(title="Scenario results", title_justify="left", show_lines=True, box=box.MINIMAL_HEAVY_HEAD)
190+
table = Table(title="Scenario results", title_justify="left", show_lines=True, box=box.DOUBLE_EDGE)
191191
for col in ["Case", "Status", "Details"]:
192192
table.add_column(col, overflow="fold")
193193

src/cloudai/workloads/ai_dynamo/ai_dynamo.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,17 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
import logging
1718
from pathlib import Path
1819
from typing import Optional, Union
1920

2021
from pydantic import BaseModel, ConfigDict, Field
2122

22-
from cloudai.core import DockerImage, File, GitRepo, HFModel, Installable, PythonExecutable
23+
from cloudai.core import DockerImage, File, GitRepo, HFModel, Installable, JobStatusResult, PythonExecutable, TestRun
2324
from cloudai.models.workload import CmdArgs, TestDefinition
2425

26+
from .report_generation_strategy import CSV_FILES_PATTERN, JSON_FILES_PATTERN
27+
2528

2629
class WorkerBaseArgs(BaseModel):
2730
"""Base arguments for VLLM workers."""
@@ -117,3 +120,13 @@ def python_executable(self) -> PythonExecutable:
117120
GitRepo(url=self.genai_perf_repo.url, commit=self.genai_perf_repo.commit),
118121
)
119122
return self._python_executable
123+
124+
def was_run_successful(self, tr: TestRun) -> JobStatusResult:
    """
    Report whether the given test run left result files behind.

    The run's output directory is searched recursively for files matching
    ``CSV_FILES_PATTERN`` and ``JSON_FILES_PATTERN``. The run counts as
    successful only when each pattern matches at least one file.

    Args:
        tr: Test run whose ``output_path`` is inspected.

    Returns:
        A successful ``JobStatusResult`` when both kinds of files are present,
        otherwise a failed one carrying an explanatory message.
    """
    search_root = tr.output_path
    csv_files = [*search_root.rglob(CSV_FILES_PATTERN)]
    json_files = [*search_root.rglob(JSON_FILES_PATTERN)]
    logging.debug(f"Found CSV files: {csv_files}, JSON files: {json_files}")

    # Both kinds of result files are required for a passing status.
    if csv_files and json_files:
        return JobStatusResult(True)
    return JobStatusResult(False, "No result files found in the output directory.")

tests/report_generation_strategy/test_ai_dynamo_report_generation_strategy.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,17 @@ def test_ai_dynamo_get_metric_invalid(slurm_system: SlurmSystem, ai_dynamo_tr: T
171171

172172
(ai_dynamo_tr.output_path / "profile_genai_perf.csv").write_text("")
173173
assert strategy.get_metric("default") == METRIC_ERROR
174+
175+
176+
def test_was_run_successful(ai_dynamo_tr: TestRun) -> None:
    """Check that the run supplied by the fixture is reported as successful."""
    status = ai_dynamo_tr.test.was_run_successful(ai_dynamo_tr)
    assert status.is_successful is True
180+
181+
182+
def test_was_run_successful_no_results(ai_dynamo_tr: TestRun, tmp_path: Path) -> None:
    """Check that a run pointed at an empty output directory is reported as failed."""
    workload = ai_dynamo_tr.test

    # Redirect the run to a freshly created directory that contains no files.
    empty_dir = tmp_path / "empty_output"
    ai_dynamo_tr.output_path = empty_dir
    empty_dir.mkdir(parents=True, exist_ok=True)

    status = workload.was_run_successful(ai_dynamo_tr)
    assert status.is_successful is False

0 commit comments

Comments
 (0)