# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "dynamo-vllm-slurm"

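# One test case: Qwen3-0.6B served by Dynamo's vLLM backend across 3 Slurm nodes
# (2 for the prefill worker, 1 for the decode worker) with a 20-minute time limit.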
[[Tests]]
id = "qwen3-0.6B"
num_nodes = 3
time_limit = "00:20:00"

name = "vllm"
description = "Disaggregated vLLM serving of Qwen3-0.6B via Dynamo, benchmarked with genai-perf"
test_template_name = "AIDynamo"

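 # Container image providing the Dynamo vLLM runtime used by this test.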
 [Tests.cmd_args]
 docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"

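 # Launch commands and ports for the Dynamo components: etcd (2379) and NATS (4222)
 # for coordination, the KV-routing ingress/frontend, and the vLLM prefill and decode
 # workers. The *-initialized-regex values are presumably matched against worker logs
 # to detect readiness; node-setup-cmd installs the RDMA/verbs userspace packages and
 # lists the UCX transports available on each node.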
 [Tests.cmd_args.dynamo]
 backend = "vllm"
 model = "Qwen/Qwen3-0.6B"
 decode-cmd = 'python3 -m dynamo.vllm'
 decode-initialized-regex = 'VllmWorker.*has.been.initialized'
 etcd-cmd = "etcd --log-level debug"
 etcd-port = 2379
 genai-perf-cmd = 'genai-perf profile'
 ingress-cmd = "python -m dynamo.frontend --router-mode kv"
 nats-cmd = "nats-server -js"
 nats-port = 4222
 node-setup-cmd = "apt-get update -o APT::Sandbox::User=root && apt-get install -y curl libibverbs1 rdma-core ibverbs-utils libibumad3 libnuma1 librdmacm1 ibverbs-providers; /usr/local/ucx/bin/ucx_info -d | grep Transport | sort -u;"
 port = 8787
 prefill-cmd = 'python3 -m dynamo.vllm --is-prefill-worker'
 prefill-initialized-regex = 'VllmWorker.*has.been.initialized'
 workspace-path = "/workspace/"

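 # Prefill worker: spread over 2 nodes with tensor-parallel size 2; pipeline and data
 # parallelism stay at 1 and expert parallelism is disabled.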
 [Tests.cmd_args.dynamo.prefill_worker]
 data-parallel-size = 1
 gpu-memory-utilization = 0.90
 max_model_len = 19280
 num-nodes = 2
 pipeline-parallel-size = 1
 tensor-parallel-size = 2
 extra-args = "--no-enable-expert-parallel"

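 # Decode worker: a single node with tensor-parallel size 2, otherwise the same
 # parallelism settings as the prefill worker.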
 [Tests.cmd_args.dynamo.decode_worker]
 data-parallel-size = 1
 gpu-memory-utilization = 0.90
 max_model_len = 19280
 num-nodes = 1
 pipeline-parallel-size = 1
 tensor-parallel-size = 2
 extra-args = "--no-enable-expert-parallel"

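 # genai-perf load shape: 128 streamed chat requests (plus 8 warmup) at concurrency 8,
 # with 3000-token synthetic prompts and 150 output tokens per request (stddev 0);
 # arguments after "--" are passed through to the underlying perf analyzer.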
 [Tests.cmd_args.genai_perf]
 concurrency = 8
 endpoint = "v1/chat/completions"
 endpoint-type = "chat"
 extra-inputs = 'min_tokens:10'
 output-tokens-mean = 150
 output-tokens-stddev = 0
 random-seed = 123
 request-count = 128
 synthetic-input-tokens-mean = 3000
 synthetic-input-tokens-stddev = 0
 warmup-request-count = 8
 extra-args = "--streaming -- -v --async"

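 # UCX is limited to CUDA-copy and RC verbs transports with quieter logging;
 # DYNAMO_NODELIST expands the Slurm job's node list into a comma-separated hostname string.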
 [Tests.extra_env_vars]
 UCX_LOG_LEVEL = "warn"
 UCX_TLS = "cuda_copy,rc_x"
 DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"