
Commit dd70526

Merge pull request #724 from NVIDIA/am/dynamo-k8s
Updates for Dynamo over K8s
2 parents 0f69871 + 8fa1dff commit dd70526

File tree: 7 files changed, +291 −46 lines
conf/experimental/ai_dynamo/test/agg.yaml

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: cloudai-vllm-agg
spec:
  services:
    Frontend:
      dynamoNamespace: cloudai-vllm-agg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: cloudai-vllm-agg
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          workingDir: /workspace/examples/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-0.6B
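
Outside of CloudAI, this manifest can be applied directly to sanity-check a cluster. A minimal sketch, assuming the Dynamo CRDs and platform are already installed (see the ai_dynamo.rst changes below); hf-token-secret and the resource names come from the manifest itself, while the HF_TOKEN key name is a conventional assumption:

    # Create the secret the decode worker references via envFromSecret
    # (using HF_TOKEN as the key is an assumption, not shown in the diff).
    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<your-token>

    # Apply the DynamoGraphDeployment and watch it come up; "dgd" is the
    # kubectl shortname for dynamographdeployments, as used by the code below.
    kubectl apply -f conf/experimental/ai_dynamo/test/agg.yaml
    kubectl get dgd cloudai-vllm-agg
    kubectl get pods | grep cloudai-vllm-agg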
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "vllm"
description = "vllm"
test_template_name = "AIDynamo"

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"

[cmd_args.dynamo]
backend = "vllm"

[cmd_args.dynamo.prefill_worker]
num-nodes = 1
[cmd_args.dynamo.decode_worker]
num-nodes = 1

[cmd_args.genai_perf]
endpoint = "v1/chat/completions"
endpoint-type = "chat"
extra-inputs = 'min_tokens:10'
output-tokens-mean = 500
output-tokens-stddev = 0
random-seed = 123
request-count = 20
synthetic-input-tokens-mean = 3000
synthetic-input-tokens-stddev = 0
warmup-request-count = 10
concurrency = 1
extra-args = "--streaming -- -v --async"

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
UCX_TLS = "cuda_copy,rc_x"
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
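
A note on DYNAMO_NODELIST: scontrol show hostname expands Slurm's compact nodelist and tr joins the lines with commas (the doubled backslash is TOML string escaping for \n). An illustrative expansion with hypothetical hostnames:

    # Hypothetical two-node allocation.
    SLURM_JOB_NODELIST="node[01-02]"
    scontrol show hostname "$SLURM_JOB_NODELIST" | tr -s '\n' ','
    # -> node01,node02,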
conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "vllm_k8s"

[[Tests]]
id = "dynamo.vllm"
test_name = "vllm"

[Tests.cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
dynamo_graph_path = "conf/experimental/ai_dynamo/test/agg.yaml"
[Tests.cmd_args.dynamo]
[Tests.cmd_args.dynamo.prefill_worker]
num-nodes = 1
[Tests.cmd_args.dynamo.decode_worker]
num-nodes = 1
[Tests.cmd_args.genai_perf]
model = "Qwen/Qwen3-0.6B"
endpoint = "v1/chat/completions"
endpoint-type = "chat"
extra-inputs = 'min_tokens:10'
output-tokens-mean = 500
output-tokens-stddev = 0
random-seed = 123
request-count = 2
synthetic-input-tokens-mean = 300
synthetic-input-tokens-stddev = 0
warmup-request-count = 1
concurrency = 1
extra-args = "--streaming -- -v --async"
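
For reference, CloudAI flattens the [Tests.cmd_args.genai_perf] table into genai-perf CLI flags. Roughly, the assembled command looks like the sketch below; the flag-for-flag mapping is an assumption, the values are verbatim from the table, and everything after the bare -- in extra-args is passed through to Perf Analyzer:

    # Hedged sketch; the exact assembly is CloudAI's, values are from above.
    genai-perf profile \
      -m Qwen/Qwen3-0.6B \
      --endpoint v1/chat/completions --endpoint-type chat \
      --extra-inputs min_tokens:10 \
      --output-tokens-mean 500 --output-tokens-stddev 0 \
      --synthetic-input-tokens-mean 300 --synthetic-input-tokens-stddev 0 \
      --random-seed 123 --request-count 2 --warmup-request-count 1 \
      --concurrency 1 --streaming -- -v --async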

doc/workloads/ai_dynamo.rst

Lines changed: 29 additions & 0 deletions
@@ -25,3 +25,32 @@ Test Definition
.. autoclass:: cloudai.workloads.ai_dynamo.ai_dynamo.AIDynamoTestDefinition
   :members:
   :show-inheritance:

Run using Kubernetes
--------------------

Prepare cluster
~~~~~~~~~~~~~~~

Before running the AI Dynamo workload on a Kubernetes cluster, ensure that the cluster is set up according to the instructions in the `official documentation`_. Below is a short summary of the required steps:

.. _official documentation: https://docs.nvidia.com/dynamo/latest/_sections/k8s_deployment.html

.. code-block:: bash

   export NAMESPACE=dynamo-system
   export RELEASE_VERSION=0.6.1  # replace with the desired release version

   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
   helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default

   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
   helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace

Run CloudAI to deploy AI Dynamo worker nodes according to your spec and run ``genai-perf`` tests:

.. code-block:: bash

   uv run cloudai run --system-config <k8s system toml> \
       --tests-dir conf/experimental/ai_dynamo/test \
       --test-scenario conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml
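
Before launching CloudAI, a few standard checks (not part of the diff) confirm the operator installed cleanly; names follow the helm commands above:

    # Charts present and platform pods healthy.
    helm list -n ${NAMESPACE}
    kubectl get pods -n ${NAMESPACE}

    # The CRD backing DynamoGraphDeployment objects (group nvidia.com,
    # plural dynamographdeployments, per the Python client calls below).
    kubectl get crd dynamographdeployments.nvidia.com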

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 54 additions & 34 deletions

@@ -26,6 +26,7 @@
 if TYPE_CHECKING:
     import kubernetes as k8s
 
+
 from pydantic import BaseModel, ConfigDict
 
 from cloudai.core import BaseJob, System
@@ -65,7 +66,7 @@ class KubernetesSystem(BaseModel, System):
     _core_v1: Optional[k8s.client.CoreV1Api] = None
     _batch_v1: Optional[k8s.client.BatchV1Api] = None
     _custom_objects_api: Optional[k8s.client.CustomObjectsApi] = None
-    _port_forward_process = None
+    _port_forward_process: subprocess.Popen | None = None
     _genai_perf_completed: bool = False
 
     def __getstate__(self) -> dict[str, Any]:
@@ -252,7 +253,7 @@ def _is_batch_job_running(self, job_name: str) -> bool:
         )
         raise
 
-    def are_vllm_pods_ready(self) -> bool:
+    def are_vllm_pods_ready(self, job: KubernetesJob) -> bool:
         cmd = ["kubectl", "get", "pods", "-n", self.default_namespace]
         try:
             result = subprocess.run(cmd, capture_output=True, text=True, check=True)
@@ -272,7 +273,7 @@ def are_vllm_pods_ready(self) -> bool:
                 continue
 
             pod_name = columns[0]
-            if "vllm-v1-agg" not in pod_name:
+            if job.name not in pod_name:
                 continue
 
             vllm_pods_found = True
@@ -296,36 +297,42 @@
             all_ready = False
 
         if not vllm_pods_found:
-            logging.warning("No vLLM pods found")
+            logging.debug("No vLLM pods found")
             return False
 
         return all_ready
 
-    def _setup_port_forward(self) -> None:
+    def _setup_port_forward(self, job: KubernetesJob) -> None:
         if self._port_forward_process and self._port_forward_process.poll() is None:
             logging.debug("Port forwarding is already running")
             return
 
-        if not self.are_vllm_pods_ready():
+        if not self.are_vllm_pods_ready(job):
             logging.debug("Pods are not ready yet, skipping port forward")
             return
 
-        get_pod_cmd = (
-            f"kubectl get pods -n {self.default_namespace} --no-headers | "
-            "grep vllm-v1-agg-frontend | "
-            "awk 'NR==1{print $1}'"
-        )
-        cmd = f"kubectl port-forward pod/$({get_pod_cmd}) 8000:8000 -n {self.default_namespace}"
+        cmd = f"kubectl port-forward svc/{job.name}-frontend 8000:8000 -n {self.default_namespace}"
         logging.debug("Starting port forwarding")
         self._port_forward_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        logging.debug("Port forwarding started")
+
+        logging.debug(f"Port forwarding started (pid={self._port_forward_process.pid})")
 
     def _check_model_server(self) -> bool:
-        cmd = "curl -s http://localhost:8000/v1/models"
+        if not self._port_forward_process:
+            logging.debug("Port forward process is not running")
+            return False
+
+        server = "localhost:8000"
+        cmd = f"curl -s http://{server}/v1/models"
+        logging.debug(f"Checking if model server is up at {server}: {cmd}")
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
 
         if result.returncode != 0:
-            logging.debug("Failed to connect to model server")
+            logging.debug(
+                f"Failed to connect to model server={server}, "
+                f"output={result.stdout.strip()}, "
+                f"error={result.stderr.strip()}"
+            )
             return False
 
         try:
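
The readiness and health-check flow in the hunk above can be reproduced by hand when debugging outside CloudAI. A hedged sketch using names from this commit's configs (the workload namespace is an assumption; the code uses self.default_namespace):

    NAMESPACE=dynamo-system      # assumption; match your system config
    JOB=cloudai-vllm-agg

    # are_vllm_pods_ready: pods carrying the job name should be Running/Ready.
    kubectl get pods -n "$NAMESPACE" | grep "$JOB"

    # _setup_port_forward: forward the frontend service, as the new code does.
    kubectl port-forward "svc/${JOB}-frontend" 8000:8000 -n "$NAMESPACE" &

    # _check_model_server: the endpoint lists the loaded model(s) once up.
    curl -s http://localhost:8000/v1/models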
@@ -374,18 +381,23 @@ def _run_genai_perf(self, job: KubernetesJob) -> None:
         args_str = " ".join(args)
 
         venv_path = python_exec.venv_path.absolute()
-        cmd = f". {venv_path}/bin/activate && genai-perf profile {args_str}"
-        logging.debug("Running GenAI performance test with command:")
-        logging.debug(cmd)
+        cmd = f"{venv_path}/bin/genai-perf profile {args_str}"
+        logging.debug(f"Running GenAI performance test: {cmd}")
+        result: subprocess.CompletedProcess | None = None
         try:
             result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
             logging.debug("GenAI performance test completed successfully")
-            logging.debug(f"Output: {result.stdout}")
         except subprocess.CalledProcessError as e:
             logging.error(f"GenAI performance test failed: {e.stderr}")
-            raise
+
+        if result:
+            with (job.test_run.output_path / "stdout.txt").open("w") as f:
+                f.write(result.stdout)
+            with (job.test_run.output_path / "stderr.txt").open("w") as f:
+                f.write(result.stderr)
 
     def _check_deployment_conditions(self, conditions: list) -> bool:
+        logging.debug(f"Checking deployment conditions: {conditions}")
         if not conditions:
             return True
 
@@ -401,8 +413,8 @@ def _is_dynamo_graph_deployment_running(self, job: KubernetesJob) -> bool:
         if self._genai_perf_completed:
             return False
 
-        if self.are_vllm_pods_ready():
-            self._setup_port_forward()
+        if self.are_vllm_pods_ready(job):
+            self._setup_port_forward(job)
         if self._port_forward_process and self._check_model_server():
             logging.debug("vLLM server is up and models are loaded")
             self._run_genai_perf(job)
@@ -438,7 +450,7 @@ def delete_job(self, job_name: str, job_kind: str) -> None:
         elif "job" in job_kind.lower():
             self._delete_batch_job(job_name)
         elif "dynamographdeployment" in job_kind.lower():
-            pass
+            self._delete_dynamo_graph_deployment(job_name)
         else:
             error_message = f"Unsupported job kind: '{job_kind}'."
             logging.error(error_message)
@@ -480,11 +492,14 @@ def _delete_batch_job(self, job_name: str) -> None:
 
     def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
         logging.debug(f"Deleting DynamoGraphDeployment '{job_name}'")
-        cmd = f"kubectl delete dgd vllm-v1-agg -n {self.default_namespace}"
+        cmd = f"kubectl delete dgd {job_name} -n {self.default_namespace}"
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
         if result.returncode != 0:
-            raise subprocess.SubprocessError(f"Failed to delete DynamoGraphDeployment: {result.stderr}")
-        logging.debug("DynamoGraphDeployment deleted successfully")
+            logging.debug(f"Failed to delete DynamoGraphDeployment: {result.stderr}")
+
+        if self._port_forward_process and self._port_forward_process.poll() is None:
+            self._port_forward_process.kill()
+            self._port_forward_process = None
 
     def create_job(self, job_spec: Dict[Any, Any], timeout: int = 60, interval: int = 1) -> str:
         """
@@ -559,15 +574,20 @@ def _create_mpi_job(self, job_spec: Dict[Any, Any]) -> str:
         return job_name
 
     def _create_dynamo_graph_deployment(self, job_spec: Dict[Any, Any]) -> str:
-        api_response = self.custom_objects_api.create_namespaced_custom_object(
-            group="nvidia.com",
-            version="v1alpha1",
-            namespace=self.default_namespace,
-            plural="dynamographdeployments",
-            body=job_spec,
-        )
+        try:
+            api_response = self.custom_objects_api.create_namespaced_custom_object(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=self.default_namespace,
+                plural="dynamographdeployments",
+                body=job_spec,
+            )
+        except lazy.k8s.client.ApiException as e:
+            logging.error(f"An error occurred while creating DynamoGraphDeployment: {e.reason}")
+            self._delete_dynamo_graph_deployment(job_spec["metadata"]["name"])
+            raise
 
-        job_name: str = api_response["metadata"]["name"]
+        job_name = str(api_response["metadata"]["name"])
         logging.debug(f"DynamoGraphDeployment '{job_name}' created with status: {api_response.get('status')}")
         return job_name
