
Commit 7b74e68

Merge branch 'main' into main
2 parents: f8ca295 + f5b2ceb

File tree: 13 files changed (+126 −82 lines)


conf/experimental/ai_dynamo/test/vllm.toml

Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ description = "vLLM backend with Qwen3-0.6B model"
 test_template_name = "AIDynamo"

 [cmd_args]
-docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
+docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.7.0"

 [cmd_args.dynamo]
 backend = "vllm"
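The only substantive change is the runtime image bump to 0.7.0, matching the Helm release version used in the docs below. As a minimal sketch (the path is the one in this repo; toml is already a dependency touched by this commit), the value can be checked from Python:

    import toml

    # Load the workload config and verify the image tag lines up with the chart release.
    cfg = toml.load("conf/experimental/ai_dynamo/test/vllm.toml")
    assert cfg["cmd_args"]["docker_image_url"].endswith(":0.7.0")
    print(cfg["cmd_args"]["dynamo"]["backend"])  # vllm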

doc/workloads/ai_dynamo.rst

Lines changed: 7 additions & 5 deletions

@@ -16,13 +16,15 @@ Before running the AI Dynamo workload on a Kubernetes cluster, ensure that the c
 .. code-block:: bash

    export NAMESPACE=dynamo-system
-   export RELEASE_VERSION=0.6.1 # replace with the desired release version
+   export RELEASE_VERSION=0.7.0 # replace with the desired release version

-   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
-   helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default
+   helm upgrade -n default -i dynamo-crds https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
+   helm upgrade -n default -i dynamo-platform https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz

-   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
-   helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace
+   # The following components are required for multi-node only.
+   # Versions should be aligned with the Dynamo version.
+   helm upgrade -n default -i grove oci://ghcr.io/ai-dynamo/grove/grove-charts:v0.0.0-gd462e65
+   helm upgrade -n default -i kai-scheduler oci://ghcr.io/nvidia/kai-scheduler/kai-scheduler:0.0.0-4c29820

 Launch and Monitor the Job
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
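Unlike the old fetch-then-install sequence, helm upgrade -i installs a release when it is absent and upgrades it in place otherwise, so the snippet can be re-run safely. A sketch of driving the two core installs from Python, assuming helm is on PATH; the chart URLs and version are the ones documented above:

    import subprocess

    RELEASE_VERSION = "0.7.0"  # replace with the desired release version
    base = "https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts"
    for release in ("dynamo-crds", "dynamo-platform"):
        # "upgrade -i" is idempotent: install if missing, upgrade otherwise.
        chart = f"{base}/{release}-{RELEASE_VERSION}.tgz"
        subprocess.run(["helm", "upgrade", "-n", "default", "-i", release, chart], check=True)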

src/cloudai/_core/json_gen_strategy.py

Lines changed: 12 additions & 0 deletions

@@ -18,6 +18,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict

+import toml
+
 from .system import System
 from .test_scenario import TestRun

@@ -29,6 +31,8 @@ class JsonGenStrategy(ABC):
     It specifies how to generate JSON job specifications based on system and test parameters.
     """

+    TEST_RUN_DUMP_FILE_NAME: str = "test-run.toml"
+
     def __init__(self, system: System, test_run: TestRun) -> None:
         self.system = system
         self.test_run = test_run

@@ -54,6 +58,14 @@ def sanitize_k8s_job_name(self, job_name: str) -> str:
         sanitized_name = re.sub(r"[^a-z0-9]+$", "", sanitized_name)
         return sanitized_name[:253]

+    def store_test_run(self) -> None:
+        from cloudai.models.scenario import TestRunDetails
+
+        test_cmd, srun_cmd = ("", "n/a")
+        with (self.test_run.output_path / self.TEST_RUN_DUMP_FILE_NAME).open("w") as f:
+            trd = TestRunDetails.from_test_run(self.test_run, test_cmd=test_cmd, full_cmd=srun_cmd)
+            toml.dump(trd.model_dump(), f)
+
     @abstractmethod
     def gen_json(self) -> Dict[Any, Any]:
         """

src/cloudai/_core/registry.py

Lines changed: 10 additions & 0 deletions

@@ -223,6 +223,16 @@ def update_scenario_report(self, name: str, report: type[Reporter], config: Repo
         self.scenario_reports[name] = report
         self.report_configs[name] = config

+    def ordered_scenario_reports(self) -> list[tuple[str, type[Reporter]]]:
+        def report_order(k: str) -> int:
+            return {
+                "per_test": 0,  # first
+                "status": 2,
+                "tarball": 3,  # last
+            }.get(k, 1)
+
+        return sorted(self.scenario_reports.items(), key=lambda kv: report_order(kv[0]))
+
     def add_reward_function(self, name: str, value: RewardFunction) -> None:
         if name in self.reward_functions_map:
             raise ValueError(f"Duplicating implementation for '{name}', use 'update()' for replacement.")
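The key function pins per_test first and tarball last, with status just before it; any unregistered name gets rank 1 and keeps its relative position because sorted() is stable. A quick self-contained check (the "comparison" name here is made up):

    def report_order(k: str) -> int:
        return {"per_test": 0, "status": 2, "tarball": 3}.get(k, 1)

    names = ["tarball", "status", "comparison", "per_test"]
    print(sorted(names, key=report_order))
    # ['per_test', 'comparison', 'status', 'tarball']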

src/cloudai/cli/cli.py

Lines changed: 4 additions & 0 deletions

@@ -74,6 +74,10 @@ def setup_logging(log_file: str, log_level: str) -> None:
             "handlers": ["debug_file"],
             "propagate": False,
         },
+        "kubernetes": {
+            "handlers": [],
+            "propagate": False,
+        },
     },
 }
 logging.config.dictConfig(LOGGING_CONFIG)
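A logger entry with no handlers and propagate set to False drops records from the kubernetes Python client (which logs under the "kubernetes" namespace) without affecting cloudai's own loggers. The same idea in a standalone sketch:

    import logging
    import logging.config

    logging.config.dictConfig({
        "version": 1,
        "disable_existing_loggers": False,
        "loggers": {
            # No handlers, no propagation: kubernetes client records are discarded.
            "kubernetes": {"handlers": [], "propagate": False},
        },
    })

    logging.getLogger("kubernetes.client.rest").debug("never shown")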

src/cloudai/cli/handlers.py

Lines changed: 1 addition & 2 deletions

@@ -169,8 +169,7 @@ def handle_dse_job(runner: Runner, args: argparse.Namespace) -> int:
 def generate_reports(system: System, test_scenario: TestScenario, result_dir: Path) -> None:
     registry = Registry()

-    # Ensure "status" report goes last for better readability
-    for name, reporter_class in sorted(registry.scenario_reports.items(), key=lambda x: (x[0] == "status", x[0])):
+    for name, reporter_class in registry.ordered_scenario_reports():
         logging.debug(f"Generating report '{name}' ({reporter_class.__name__})")

         cfg = registry.report_configs.get(name, ReportConfig(enable=False))

src/cloudai/report_generator/comparison_report.py

Lines changed: 16 additions & 7 deletions

@@ -143,12 +143,13 @@ def create_table(
         no_wrap=False,
     )

-    for row_idx in range(len(dfs[0][info_columns[0]])):
+    df_with_max_rows = max(dfs, key=len)
+    for row_idx in range(len(df_with_max_rows)):
         data = []
         for df in dfs:
-            data.extend([str(df[col].get(row_idx)) for col in data_columns])
+            data.extend([str(df[col].get(row_idx, "n/a")) for col in data_columns])

-        table.add_row(*[str(dfs[0][col][row_idx]) for col in info_columns], *data)
+        table.add_row(*[str(df_with_max_rows[col][row_idx]) for col in info_columns], *data)

     return table

@@ -178,7 +179,14 @@ def create_chart(
     hover = lazy.bokeh_models.HoverTool(tooltips=[("X", "@x"), ("Y", "@y"), ("Segment Type", "@segment_type")])
     p.add_tools(hover)

+    if all(df.empty for df in dfs):
+        logging.debug(f"No data available to create chart for group {group.name}, skipping.")
+        return p
+
     for df, name in zip(dfs, [item.name for item in group.items], strict=True):
+        if df.empty:
+            continue
+
         for col in data_columns:
             source = lazy.bokeh_models.ColumnDataSource(
                 data={

@@ -195,12 +203,13 @@ def create_chart(
     p.legend.location = "top_left"
     p.legend.click_policy = "hide"

-    y_max = max(df[col].max() for df in dfs for col in data_columns)
-    y_min = min(df[col].min() for df in dfs for col in data_columns)
+    y_max = max(df[col].max() for df in dfs for col in data_columns if not df.empty)
+    y_min = min(df[col].min() for df in dfs for col in data_columns if not df.empty)
     p.y_range = lazy.bokeh_models.Range1d(start=y_min * -1 * y_max * 0.01, end=y_max * 1.1)

-    x_min = dfs[0][info_columns[0]].min()
-    x_max = dfs[0][info_columns[0]].max()
+    df_with_max_rows = max(dfs, key=len)
+    x_min = df_with_max_rows[info_columns[0]].min()
+    x_max = df_with_max_rows[info_columns[0]].max()
     p.xaxis.ticker = calculate_power_of_two_ticks(x_min, x_max)
     p.xaxis.formatter = lazy.bokeh_models.CustomJSTickFormatter(code=bokeh_size_unit_js_tick_formatter)
     p.xaxis.major_label_orientation = lazy.np.pi / 4
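The table fix leans on two pandas behaviors: max(dfs, key=len) picks the frame with the most rows, and Series.get(key, default) returns the default instead of raising when a shorter frame has no such row. A minimal sketch with made-up columns:

    import pandas as pd

    dfs = [
        pd.DataFrame({"size": [1, 2, 4], "bw": [10.0, 20.0, 40.0]}),
        pd.DataFrame({"size": [1, 2], "bw": [11.0, 21.0]}),  # shorter run
    ]

    df_with_max_rows = max(dfs, key=len)  # the 3-row frame
    for row_idx in range(len(df_with_max_rows)):
        # .get() falls back to "n/a" where the shorter frame has no row 2.
        row = [str(df["bw"].get(row_idx, "n/a")) for df in dfs]
        print(df_with_max_rows["size"][row_idx], row)
    # 1 ['10.0', '11.0']
    # 2 ['20.0', '21.0']
    # 4 ['40.0', 'n/a']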

src/cloudai/systems/kubernetes/kubernetes_runner.py

Lines changed: 4 additions & 0 deletions

@@ -42,6 +42,10 @@ def _submit_test(self, tr: TestRun) -> KubernetesJob:

         return job

+    def on_job_submit(self, tr: TestRun) -> None:
+        json_gen = self.get_json_gen_strategy(self.system, tr)
+        json_gen.store_test_run()
+
     def on_job_completion(self, job: BaseJob) -> None:
         k8s_system: KubernetesSystem = cast(KubernetesSystem, self.system)
         k_job = cast(KubernetesJob, job)

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 25 additions & 60 deletions

@@ -16,7 +16,6 @@

 from __future__ import annotations

-import json
 import logging
 import subprocess
 import time

@@ -43,7 +42,6 @@ class KubernetesSystem(System):
     _core_v1: Optional[k8s.client.CoreV1Api] = None
     _batch_v1: Optional[k8s.client.BatchV1Api] = None
     _custom_objects_api: Optional[k8s.client.CustomObjectsApi] = None
-    _port_forward_process: subprocess.Popen | None = None
     _genai_perf_completed: bool = False

     def __getstate__(self) -> dict[str, Any]:

@@ -279,58 +277,15 @@ def are_vllm_pods_ready(self, job: KubernetesJob) -> bool:

         return all_ready

-    def _setup_port_forward(self, job: KubernetesJob) -> None:
-        if self._port_forward_process and self._port_forward_process.poll() is None:
-            logging.debug("Port forwarding is already running")
-            return
-
-        if not self.are_vllm_pods_ready(job):
-            logging.debug("Pods are not ready yet, skipping port forward")
-            return
-
-        cmd = f"kubectl port-forward svc/{job.name}-frontend 8000:8000 -n {self.default_namespace}"
-        logging.debug("Starting port forwarding")
-        self._port_forward_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-        logging.debug(f"Port forwarding started (pid={self._port_forward_process.pid})")
-
-    def _check_model_server(self) -> bool:
-        if not self._port_forward_process:
-            logging.debug("Port forward process is not running")
-            return False
-
-        server = "localhost:8000"
-        cmd = f"curl -s http://{server}/v1/models"
-        logging.debug(f"Checking if model server is up at {server}: {cmd}")
-        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
-
-        if result.returncode != 0:
-            logging.debug(
-                f"Failed to connect to model server={server}, "
-                f"output={result.stdout.strip()}, "
-                f"error={result.stderr.strip()}"
-            )
-            return False
-
-        try:
-            response = json.loads(result.stdout)
-            if response.get("data") and len(response["data"]) > 0:
-                logging.debug(f"Model server is running. Response: {result.stdout}")
-                return True
-            else:
-                logging.debug("Model server is up but no models are loaded yet")
-                return False
-        except json.JSONDecodeError:
-            logging.warning("Invalid JSON response from model server")
-            return False
-
-    def _get_frontend_pod_name(self) -> str:
+    def _get_dynamo_pod_by_role(self, role: str) -> str:
         for pod in self.core_v1.list_namespaced_pod(namespace=self.default_namespace).items:
             labels = pod.metadata.labels
             logging.debug(f"Found pod: {pod.metadata.name} with labels: {labels}")
-            if labels and str(labels.get("nvidia.com/dynamo-component", "")).lower() == "frontend":
+            if labels and str(labels.get("nvidia.com/dynamo-component", "")).lower() == role.lower():  # v0.6.x
+                return pod.metadata.name
+            if labels and str(labels.get("nvidia.com/dynamo-component-type", "")).lower() == role.lower():  # v0.7.x
                 return pod.metadata.name
-        raise RuntimeError("No frontend pod found for the job")
+        raise RuntimeError(f"No pod found for the role '{role}'")

     def _run_genai_perf(self, job: KubernetesJob) -> None:
         from cloudai.workloads.ai_dynamo.ai_dynamo import AIDynamoTestDefinition

@@ -350,7 +305,7 @@ def _run_genai_perf(self, job: KubernetesJob) -> None:
         genai_perf_cmd.extend(extra_args.split())
         logging.debug(f"GenAI perf arguments: {genai_perf_cmd=}")

-        frontend_pod = self._get_frontend_pod_name()
+        frontend_pod = self._get_dynamo_pod_by_role(role="frontend")

         logging.debug(f"Executing genai-perf in pod={frontend_pod} cmd={genai_perf_cmd}")
         try:

@@ -400,12 +355,20 @@ def _is_dynamo_graph_deployment_running(self, job: KubernetesJob) -> bool:
             return False

         if self.are_vllm_pods_ready(job):
-            self._setup_port_forward(job)
-            if self._port_forward_process and self._check_model_server():
-                logging.debug("vLLM server is up and models are loaded")
-                self._run_genai_perf(job)
-                self._genai_perf_completed = True
-                return False
+            self._run_genai_perf(job)
+            self._genai_perf_completed = True
+
+            for pod_role in {"decode", "prefill", "frontend"}:
+                try:
+                    pod_name = self._get_dynamo_pod_by_role(pod_role)
+                    logging.debug(f"Fetching logs for {pod_role=} {pod_name=}")
+                    logs = self.core_v1.read_namespaced_pod_log(name=pod_name, namespace=self.default_namespace)
+                    with (job.test_run.output_path / f"{pod_role}_pod.log").open("w") as f:
+                        f.write(logs)
+                except Exception as e:
+                    logging.debug(f"Error fetching logs for role '{pod_role}': {e}")
+
+            return False

         deployment = cast(
             dict,

@@ -483,9 +446,7 @@ def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
         if result.returncode != 0:
             logging.debug(f"Failed to delete DynamoGraphDeployment: {result.stderr}")

-        if self._port_forward_process and self._port_forward_process.poll() is None:
-            self._port_forward_process.kill()
-            self._port_forward_process = None
+        self._genai_perf_completed = False

     def create_job(self, job_spec: Dict[Any, Any], timeout: int = 60, interval: int = 1) -> str:
         """

@@ -560,6 +521,10 @@ def _create_mpi_job(self, job_spec: Dict[Any, Any]) -> str:
         return job_name

     def _create_dynamo_graph_deployment(self, job_spec: Dict[Any, Any]) -> str:
+        logging.debug(f"Attempting to delete existing job='{job_spec['metadata']['name']}' before creation.")
+        self._delete_dynamo_graph_deployment(job_spec["metadata"]["name"])
+
+        logging.debug("Creating DynamoGraphDeployment with spec")
         try:
             api_response = self.custom_objects_api.create_namespaced_custom_object(
                 group="nvidia.com",
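The pod lookup now matches either label key because Dynamo renamed it between releases: nvidia.com/dynamo-component in v0.6.x and nvidia.com/dynamo-component-type in v0.7.x. The selection logic in isolation, with made-up label dicts standing in for pod metadata:

    def matches_role(labels: dict | None, role: str) -> bool:
        if not labels:
            return False
        return (
            str(labels.get("nvidia.com/dynamo-component", "")).lower() == role.lower()       # v0.6.x
            or str(labels.get("nvidia.com/dynamo-component-type", "")).lower() == role.lower()  # v0.7.x
        )

    print(matches_role({"nvidia.com/dynamo-component": "Frontend"}, "frontend"))       # True
    print(matches_role({"nvidia.com/dynamo-component-type": "frontend"}, "frontend"))  # True
    print(matches_role({"app": "etcd"}, "frontend"))                                   # False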

src/cloudai/util/nixl_report_template.jinja2

Lines changed: 3 additions & 3 deletions

@@ -4,9 +4,9 @@
 <head>
     <title>{{ title }}</title>
     <meta charset="UTF-8">
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.4.0.min.js"></script>
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.0.min.js"></script>
-    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-3.8.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.8.0.min.js"></script>
+    <script src="https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.8.0.min.js"></script>
     <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
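The BokehJS version loaded from the CDN has to track the Python bokeh package that serializes the plot data; Bokeh logs a warning (and may render nothing) on a mismatch, which is presumably why the template moves to 3.8.0 in lockstep with the dependency bump. A quick local check:

    import bokeh

    # Should report 3.8.x to match the bokeh-3.8.0.min.js scripts above.
    print(bokeh.__version__)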
