
Commit 7b74e68

Merge branch 'main' into main
2 parents: f8ca295 + f5b2ceb

File tree: 13 files changed (+126 −82 lines)


conf/experimental/ai_dynamo/test/vllm.toml

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ description = "vLLM backend with Qwen3-0.6B model"
 test_template_name = "AIDynamo"

 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.0"

 [cmd_args.dynamo]
 backend = "vllm"
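The only substantive change is the runtime image bump to 0.7.0, matching the Helm release version used in the docs below. As a minimal sketch (the path is the one in this repo; toml is already a dependency touched by this commit), the value can be checked from Python:

    import toml

    # Load the workload config and verify the image tag lines up with the chart release.
    cfg = toml.load("conf/experimental/ai_dynamo/test/vllm.toml")
    assert cfg["cmd_args"]["docker_image_url"].endswith(":0.7.0")
    print(cfg["cmd_args"]["dynamo"]["backend"])  # vllm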

doc/workloads/ai_dynamo.rst

Lines changed: 7 additions & 5 deletions

@@ -16,13 +16,15 @@ Before running the AI Dynamo workload on a Kubernetes cluster, ensure that the c
 .. code-block:: bash

    export NAMESPACE=dynamo-system
-   export RELEASE_VERSION=0.6.1 # replace with the desired release version
+   export RELEASE_VERSION=0.7.0 # replace with the desired release version

-   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
-   helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default
+   helm upgrade -n default -i dynamo-crds https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
+   helm upgrade -n default -i dynamo-platform https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz

-   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
-   helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
+   # The following components are required for multi-node only.
+   # Versions should be aligned with the Dynamo version.
+   helm upgrade -n default -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts:v0.0.0-gd462e65
+   helm upgrade -n default -i kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler:0.0.0-4c29820

 Launch and Monitor the Job
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
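Unlike the old fetch-then-install sequence, helm upgrade -i installs a release when it is absent and upgrades it in place otherwise, so the snippet can be re-run safely. A sketch of driving the two core installs from Python, assuming helm is on PATH; the chart URLs and version are the ones documented above:

    import subprocess

    RELEASE_VERSION = "0.7.0"  # replace with the desired release version
    base = "https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts"
    for release in ("dynamo-crds", "dynamo-platform"):
        # "upgrade -i" is idempotent: install if missing, upgrade otherwise.
        chart = f"{base}/{release}-{RELEASE_VERSION}.tgz"
        subprocess.run(["helm", "upgrade", "-n", "default", "-i", release, chart], check=True)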

src/cloudai/_core/json_gen_strategy.py

Lines changed: 12 additions & 0 deletions

@@ -18,6 +18,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict

+import toml
+
 from .system import System
 from .test_scenario import TestRun

@@ -29,6 +31,8 @@ class JsonGenStrategy(ABC):
     It specifies how to generate JSON job specifications based on system and test parameters.
     """

+    TEST_RUN_DUMP_FILE_NAME: str = "test-run.toml"
+
     def __init__(self, system: System, test_run: TestRun) -> None:
         self.system = system
         self.test_run = test_run

@@ -54,6 +58,14 @@ def sanitize_k8s_job_name(self, job_name: str) -> str:
         sanitized_name = re.sub(r"[^a-z0-9]+$", "", sanitized_name)
         return sanitized_name[:253]

+    def store_test_run(self) -> None:
+        from cloudai.models.scenario import TestRunDetails
+
+        test_cmd, srun_cmd = ("", "n/a")
+        with (self.test_run.output_path / self.TEST_RUN_DUMP_FILE_NAME).open("w") as f:
+            trd = TestRunDetails.from_test_run(self.test_run, test_cmd=test_cmd, full_cmd=srun_cmd)
+            toml.dump(trd.model_dump(), f)
+
     @abstractmethod
     def gen_json(self) -> Dict[Any, Any]:
         """

src/cloudai/_core/registry.py

Lines changed: 10 additions & 0 deletions

@@ -223,6 +223,16 @@ def update_scenario_report(self, name: str, report: type[Reporter], config: Repo
         self.scenario_reports[name] = report
         self.report_configs[name] = config

+    def ordered_scenario_reports(self) -> list[tuple[str, type[Reporter]]]:
+        def report_order(k: str) -> int:
+            return {
+                "per_test": 0,  # first
+                "status": 2,
+                "tarball": 3,  # last
+            }.get(k, 1)
+
+        return sorted(self.scenario_reports.items(), key=lambda kv: report_order(kv[0]))
+
     def add_reward_function(self, name: str, value: RewardFunction) -> None:
         if name in self.reward_functions_map:
             raise ValueError(f"Duplicating implementation for '{name}', use 'update()' for replacement.")
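The key function pins per_test first and tarball last, with status just before it; any unregistered name gets rank 1 and keeps its relative position because sorted() is stable. A quick self-contained check (the "comparison" name here is made up):

    def report_order(k: str) -> int:
        return {"per_test": 0, "status": 2, "tarball": 3}.get(k, 1)

    names = ["tarball", "status", "comparison", "per_test"]
    print(sorted(names, key=report_order))
    # ['per_test', 'comparison', 'status', 'tarball']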

src/cloudai/cli/cli.py

Lines changed: 4 additions & 0 deletions

@@ -74,6 +74,10 @@ def setup_logging(log_file: str, log_level: str) -> None:
             "handlers": ["debug_file"],
             "propagate": False,
         },
+        "kubernetes": {
+            "handlers": [],
+            "propagate": False,
+        },
     },
 }
 logging.config.dictConfig(LOGGING_CONFIG)
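A logger entry with no handlers and propagate set to False drops records from the kubernetes Python client (which logs under the "kubernetes" namespace) without affecting cloudai's own loggers. The same idea in a standalone sketch:

    import logging
    import logging.config

    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "loggers": {
            # No handlers, no propagation: kubernetes client records are discarded.
            "kubernetes": {"handlers": [], "propagate": False},
        },
    })

    logging.getLogger("kubernetes.client.rest").debug("never shown")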

src/cloudai/cli/handlers.py

Lines changed: 1 addition & 2 deletions

@@ -169,8 +169,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int:
 def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None:
     registry = Registry()

-    # Ensure "status" report goes last for better readability
-    for name, reporter_class in sorted(registry.scenario_reports.items(), key=lambda x: (x[0] == "status", x[0])):
+    for name, reporter_class in registry.ordered_scenario_reports():
         logging.debug(f"Generating report '{name}' ({reporter_class.__name__})")

         cfg = registry.report_configs.get(name, ReportConfig(enable=False))

src/cloudai/report_generator/comparison_report.py

Lines changed: 16 additions & 7 deletions

@@ -143,12 +143,13 @@ def create_table(
         no_wrap=False,
     )

-    for row_idx in range(len(dfs[0][info_columns[0]])):
+    df_with_max_rows = max(dfs, key=len)
+    for row_idx in range(len(df_with_max_rows)):
         data = []
         for df in dfs:
-            data.extend([str(df[col].get(row_idx)) for col in data_columns])
+            data.extend([str(df[col].get(row_idx, "n/a")) for col in data_columns])

-        table.add_row(*[str(dfs[0][col][row_idx]) for col in info_columns], *data)
+        table.add_row(*[str(df_with_max_rows[col][row_idx]) for col in info_columns], *data)

     return table

@@ -178,7 +179,14 @@ def create_chart(
     hover = lazy.bokeh_models.HoverTool(tooltips=[("X", "@x"), ("Y", "@y"), ("Segment Type", "@segment_type")])
     p.add_tools(hover)

+    if all(df.empty for df in dfs):
+        logging.debug(f"No data available to create chart for group {group.name}, skipping.")
+        return p
+
     for df, name in zip(dfs, [item.name for item in group.items], strict=True):
+        if df.empty:
+            continue
+
         for col in data_columns:
             source = lazy.bokeh_models.ColumnDataSource(
                 data={

@@ -195,12 +203,13 @@ def create_chart(
     p.legend.location = "top_left"
     p.legend.click_policy = "hide"

-    y_max = max(df[col].max() for df in dfs for col in data_columns)
-    y_min = min(df[col].min() for df in dfs for col in data_columns)
+    y_max = max(df[col].max() for df in dfs for col in data_columns if not df.empty)
+    y_min = min(df[col].min() for df in dfs for col in data_columns if not df.empty)
     p.y_range = lazy.bokeh_models.Range1d(start=y_min * -1 * y_max * 0.01, end=y_max * 1.1)

-    x_min = dfs[0][info_columns[0]].min()
-    x_max = dfs[0][info_columns[0]].max()
+    df_with_max_rows = max(dfs, key=len)
+    x_min = df_with_max_rows[info_columns[0]].min()
+    x_max = df_with_max_rows[info_columns[0]].max()
     p.xaxis.ticker = calculate_power_of_two_ticks(x_min, x_max)
     p.xaxis.formatter = lazy.bokeh_models.CustomJSTickFormatter(code=bokeh_size_unit_js_tick_formatter)
     p.xaxis.major_label_orientation = lazy.np.pi / 4
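The table fix leans on two pandas behaviors: max(dfs, key=len) picks the frame with the most rows, and Series.get(key, default) returns the default instead of raising when a shorter frame has no such row. A minimal sketch with made-up columns:

    import pandas as pd

    dfs = [
        pd.DataFrame({"size": [1, 2, 4], "bw": [10.0, 20.0, 40.0]}),
        pd.DataFrame({"size": [1, 2], "bw": [11.0, 21.0]}),  # shorter run
    ]

    df_with_max_rows = max(dfs, key=len)  # the 3-row frame
    for row_idx in range(len(df_with_max_rows)):
        # .get() falls back to "n/a" where the shorter frame has no row 2.
        row = [str(df["bw"].get(row_idx, "n/a")) for df in dfs]
        print(df_with_max_rows["size"][row_idx], row)
    # 1 ['10.0', '11.0']
    # 2 ['20.0', '21.0']
    # 4 ['40.0', 'n/a']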

src/cloudai/systems/kubernetes/kubernetes_runner.py

Lines changed: 4 additions & 0 deletions

@@ -42,6 +42,10 @@ def _submit_test(self, tr: TestRun) -> KubernetesJob:

         return job

+    def on_job_submit(self, tr: TestRun) -> None:
+        json_gen = self.get_json_gen_strategy(self.system, tr)
+        json_gen.store_test_run()
+
     def on_job_completion(self, job: BaseJob) -> None:
         k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
         k_job = cast(KubernetesJob, job)

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 25 additions & 60 deletions

@@ -16,7 +16,6 @@

 from __future__ import annotations

-import json
 import logging
 import subprocess
 import time

@@ -43,7 +42,6 @@ class KubernetesSystem(System):
     _core_v1: Optional[k8s.client.CoreV1Api] = None
     _batch_v1: Optional[k8s.client.BatchV1Api] = None
     _custom_objects_api: Optional[k8s.client.CustomObjectsApi] = None
-    _port_forward_process: subprocess.Popen | None = None
     _genai_perf_completed: bool = False

     def __getstate__(self) -> dict[str, Any]:

@@ -279,58 +277,15 @@ def are_vllm_pods_ready(self, job: KubernetesJob) -> bool:

         return all_ready

-    def _setup_port_forward(self, job: KubernetesJob) -> None:
-        if self._port_forward_process and self._port_forward_process.poll() is None:
-            logging.debug("Port forwarding is already running")
-            return
-
-        if not self.are_vllm_pods_ready(job):
-            logging.debug("Pods are not ready yet, skipping port forward")
-            return
-
-        cmd = f"kubectl port-forward svc/{job.name}-frontend 8000:8000 -n {self.default_namespace}"
-        logging.debug("Starting port forwarding")
-        self._port_forward_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-        logging.debug(f"Port forwarding started (pid={self._port_forward_process.pid})")
-
-    def _check_model_server(self) -> bool:
-        if not self._port_forward_process:
-            logging.debug("Port forward process is not running")
-            return False
-
-        server = "localhost:8000"
-        cmd = f"curl -s http://{server}/v1/models"
-        logging.debug(f"Checking if model server is up at {server}: {cmd}")
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            logging.debug(
-                f"Failed to connect to model server={server}, "
-                f"output={result.stdout.strip()}, "
-                f"error={result.stderr.strip()}"
-            )
-            return False
-
-        try:
-            response = json.loads(result.stdout)
-            if response.get("data") and len(response["data"]) > 0:
-                logging.debug(f"Model server is running. Response: {result.stdout}")
-                return True
-            else:
-                logging.debug("Model server is up but no models are loaded yet")
-                return False
-        except json.JSONDecodeError:
-            logging.warning("Invalid JSON response from model server")
-            return False
-
-    def _get_frontend_pod_name(self) -> str:
+    def _get_dynamo_pod_by_role(self, role: str) -> str:
         for pod in self.core_v1.list_namespaced_pod(namespace=self.default_namespace).items:
             labels = pod.metadata.labels
             logging.debug(f"Found pod: {pod.metadata.name} with labels: {labels}")
-            if labels and str(labels.get("nvidia.com/dynamo-component", "")).lower() == "frontend":
+            if labels and str(labels.get("nvidia.com/dynamo-component", "")).lower() == role.lower():  # v0.6.x
+                return pod.metadata.name
+            if labels and str(labels.get("nvidia.com/dynamo-component-type", "")).lower() == role.lower():  # v0.7.x
                 return pod.metadata.name
-        raise RuntimeError("No frontend pod found for the job")
+        raise RuntimeError(f"No pod found for the role '{role}'")

     def _run_genai_perf(self, job: KubernetesJob) -> None:
         from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition

@@ -350,7 +305,7 @@ def _run_genai_perf(self, job: KubernetesJob) -> None:
         genai_perf_cmd.extend(extra_args.split())
         logging.debug(f"GenAI perf arguments: {genai_perf_cmd=}")

-        frontend_pod = self._get_frontend_pod_name()
+        frontend_pod = self._get_dynamo_pod_by_role(role="frontend")

         logging.debug(f"Executing genai-perf in pod={frontend_pod} cmd={genai_perf_cmd}")
         try:

@@ -400,12 +355,20 @@ def _is_dynamo_graph_deployment_running(self, job: KubernetesJob) -> bool:
             return False

         if self.are_vllm_pods_ready(job):
-            self._setup_port_forward(job)
-            if self._port_forward_process and self._check_model_server():
-                logging.debug("vLLM server is up and models are loaded")
-                self._run_genai_perf(job)
-                self._genai_perf_completed = True
-                return False
+            self._run_genai_perf(job)
+            self._genai_perf_completed = True
+
+            for pod_role in {"decode", "prefill", "frontend"}:
+                try:
+                    pod_name = self._get_dynamo_pod_by_role(pod_role)
+                    logging.debug(f"Fetching logs for {pod_role=} {pod_name=}")
+                    logs = self.core_v1.read_namespaced_pod_log(name=pod_name, namespace=self.default_namespace)
+                    with (job.test_run.output_path / f"{pod_role}_pod.log").open("w") as f:
+                        f.write(logs)
+                except Exception as e:
+                    logging.debug(f"Error fetching logs for role '{pod_role}': {e}")
+
+            return False

         deployment = cast(
             dict,

@@ -483,9 +446,7 @@ def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
         if result.returncode != 0:
             logging.debug(f"Failed to delete DynamoGraphDeployment: {result.stderr}")

-        if self._port_forward_process and self._port_forward_process.poll() is None:
-            self._port_forward_process.kill()
-            self._port_forward_process = None
+        self._genai_perf_completed = False

     def create_job(self, job_spec: Dict[Any, Any], timeout: int = 60, interval: int = 1) -> str:
         """

@@ -560,6 +521,10 @@ def _create_mpi_job(self, job_spec: Dict[Any, Any]) -> str:
         return job_name

     def _create_dynamo_graph_deployment(self, job_spec: Dict[Any, Any]) -> str:
+        logging.debug(f"Attempting to delete existing job='{job_spec['metadata']['name']}' before creation.")
+        self._delete_dynamo_graph_deployment(job_spec["metadata"]["name"])
+
+        logging.debug("Creating DynamoGraphDeployment with spec")
         try:
             api_response = self.custom_objects_api.create_namespaced_custom_object(
                 group="nvidia.com",
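The pod lookup now matches either label key because Dynamo renamed it between releases: nvidia.com/dynamo-component in v0.6.x and nvidia.com/dynamo-component-type in v0.7.x. The selection logic in isolation, with made-up label dicts standing in for pod metadata:

    def matches_role(labels: dict | None, role: str) -> bool:
        if not labels:
            return False
        return (
            str(labels.get("nvidia.com/dynamo-component", "")).lower() == role.lower()       # v0.6.x
            or str(labels.get("nvidia.com/dynamo-component-type", "")).lower() == role.lower()  # v0.7.x
        )

    print(matches_role({"nvidia.com/dynamo-component": "Frontend"}, "frontend"))       # True
    print(matches_role({"nvidia.com/dynamo-component-type": "frontend"}, "frontend"))  # True
    print(matches_role({"app": "etcd"}, "frontend"))                                   # False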

src/cloudai/util/nixl_report_template.jinja2

Lines changed: 3 additions & 3 deletions

@@ -4,9 +4,9 @@
 <head>
     <title>{{ title }}</title>
     <meta charset="UTF-8">
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.4.0.min.js"></script>
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.0.min.js"></script>
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.8.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.8.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.8.0.min.js"></script>
     <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
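The BokehJS version loaded from the CDN has to track the Python bokeh package that serializes the plot data; Bokeh logs a warning (and may render nothing) on a mismatch, which is presumably why the template moves to 3.8.0 in lockstep with the dependency bump. A quick local check:

    import bokeh

    # Should report 3.8.x to match the bokeh-3.8.0.min.js scripts above.
    print(bokeh.__version__)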
