
Commit dd70526

Merge pull request #724 from NVIDIA/am/dynamo-k8s
Updates for Dynamo over K8s
2 parents 0f69871 + 8fa1dff commit dd70526

File tree: 7 files changed, +291 −46 lines
conf/experimental/ai_dynamo/test/agg.yaml

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: cloudai-vllm-agg
spec:
  services:
    Frontend:
      dynamoNamespace: cloudai-vllm-agg
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
    VllmDecodeWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: cloudai-vllm-agg
      componentType: worker
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1
          workingDir: /workspace/examples/backends/vllm
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-0.6B
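
Outside of CloudAI, this manifest can be applied directly to sanity-check a cluster. A minimal sketch, assuming the Dynamo CRDs and platform are already installed (see the ai_dynamo.rst changes below); hf-token-secret and the resource names come from the manifest itself, while the HF_TOKEN key name is a conventional assumption:

    # Create the secret the decode worker references via envFromSecret
    # (using HF_TOKEN as the key is an assumption, not shown in the diff).
    kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=<your-token>

    # Apply the DynamoGraphDeployment and watch it come up; "dgd" is the
    # kubectl shortname for dynamographdeployments, as used by the code below.
    kubectl apply -f conf/experimental/ai_dynamo/test/agg.yaml
    kubectl get dgd cloudai-vllm-agg
    kubectl get pods | grep cloudai-vllm-agg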
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "vllm"
description = "vllm"
test_template_name = "AIDynamo"

[cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"

[cmd_args.dynamo]
backend = "vllm"

[cmd_args.dynamo.prefill_worker]
num-nodes = 1
[cmd_args.dynamo.decode_worker]
num-nodes = 1

[cmd_args.genai_perf]
endpoint = "v1/chat/completions"
endpoint-type = "chat"
extra-inputs = 'min_tokens:10'
output-tokens-mean = 500
output-tokens-stddev = 0
random-seed = 123
request-count = 20
synthetic-input-tokens-mean = 3000
synthetic-input-tokens-stddev = 0
warmup-request-count = 10
concurrency = 1
extra-args = "--streaming -- -v --async"

[extra_env_vars]
UCX_LOG_LEVEL = "warn"
UCX_TLS = "cuda_copy,rc_x"
DYNAMO_NODELIST = "$(scontrol show hostname $SLURM_JOB_NODELIST | tr -s '\\n' ',')"
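
A note on DYNAMO_NODELIST: scontrol show hostname expands Slurm's compact nodelist and tr joins the lines with commas (the doubled backslash is TOML string escaping for \n). An illustrative expansion with hypothetical hostnames:

    # Hypothetical two-node allocation.
    SLURM_JOB_NODELIST="node[01-02]"
    scontrol show hostname "$SLURM_JOB_NODELIST" | tr -s '\n' ','
    # -> node01,node02,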
conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "vllm_k8s"

[[Tests]]
id = "dynamo.vllm"
test_name = "vllm"

[Tests.cmd_args]
docker_image_url = "nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.6.1.post1"
dynamo_graph_path = "conf/experimental/ai_dynamo/test/agg.yaml"
[Tests.cmd_args.dynamo]
[Tests.cmd_args.dynamo.prefill_worker]
num-nodes = 1
[Tests.cmd_args.dynamo.decode_worker]
num-nodes = 1
[Tests.cmd_args.genai_perf]
model = "Qwen/Qwen3-0.6B"
endpoint = "v1/chat/completions"
endpoint-type = "chat"
extra-inputs = 'min_tokens:10'
output-tokens-mean = 500
output-tokens-stddev = 0
random-seed = 123
request-count = 2
synthetic-input-tokens-mean = 300
synthetic-input-tokens-stddev = 0
warmup-request-count = 1
concurrency = 1
extra-args = "--streaming -- -v --async"
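
For reference, CloudAI flattens the [Tests.cmd_args.genai_perf] table into genai-perf CLI flags. Roughly, the assembled command looks like the sketch below; the flag-for-flag mapping is an assumption, the values are verbatim from the table, and everything after the bare -- in extra-args is passed through to Perf Analyzer:

    # Hedged sketch; the exact assembly is CloudAI's, values are from above.
    genai-perf profile \
      -m Qwen/Qwen3-0.6B \
      --endpoint v1/chat/completions --endpoint-type chat \
      --extra-inputs min_tokens:10 \
      --output-tokens-mean 500 --output-tokens-stddev 0 \
      --synthetic-input-tokens-mean 300 --synthetic-input-tokens-stddev 0 \
      --random-seed 123 --request-count 2 --warmup-request-count 1 \
      --concurrency 1 --streaming -- -v --async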

doc/workloads/ai_dynamo.rst

Lines changed: 29 additions & 0 deletions
@@ -25,3 +25,32 @@ Test Definition
.. autoclass:: cloudai.workloads.ai_dynamo.ai_dynamo.AIDynamoTestDefinition
   :members:
   :show-inheritance:

Run using Kubernetes
--------------------

Prepare cluster
~~~~~~~~~~~~~~~

Before running the AI Dynamo workload on a Kubernetes cluster, ensure that the cluster is set up according to the instructions in the `official documentation`_. Below is a short summary of the required steps:

.. _official documentation: https://docs.nvidia.com/dynamo/latest/_sections/k8s_deployment.html

.. code-block:: bash

   export NAMESPACE=dynamo-system
   export RELEASE_VERSION=0.6.1  # replace with the desired release version

   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
   helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz --namespace default

   helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
   helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz --namespace ${NAMESPACE} --create-namespace

Run CloudAI to deploy AI Dynamo worker nodes according to your spec and run ``genai-perf`` tests:

.. code-block:: bash

   uv run cloudai run --system-config <k8s system toml> \
       --tests-dir conf/experimental/ai_dynamo/test \
       --test-scenario conf/experimental/ai_dynamo/test_scenario/vllm_k8s.toml
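
Before launching CloudAI, a few standard checks (not part of the diff) confirm the operator installed cleanly; names follow the helm commands above:

    # Charts present and platform pods healthy.
    helm list -n ${NAMESPACE}
    kubectl get pods -n ${NAMESPACE}

    # The CRD backing DynamoGraphDeployment objects (group nvidia.com,
    # plural dynamographdeployments, per the Python client calls below).
    kubectl get crd dynamographdeployments.nvidia.com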

src/cloudai/systems/kubernetes/kubernetes_system.py

Lines changed: 54 additions & 34 deletions

@@ -26,6 +26,7 @@
 if TYPE_CHECKING:
     import kubernetes as k8s
 
+
 from pydantic import BaseModel, ConfigDict
 
 from cloudai.core import BaseJob, System
@@ -65,7 +66,7 @@ class KubernetesSystem(BaseModel, System):
     _core_v1: Optional[k8s.client.CoreV1Api] = None
     _batch_v1: Optional[k8s.client.BatchV1Api] = None
     _custom_objects_api: Optional[k8s.client.CustomObjectsApi] = None
-    _port_forward_process = None
+    _port_forward_process: subprocess.Popen | None = None
     _genai_perf_completed: bool = False
 
     def __getstate__(self) -> dict[str, Any]:
@@ -252,7 +253,7 @@ def _is_batch_job_running(self, job_name: str) -> bool:
         )
         raise
 
-    def are_vllm_pods_ready(self) -> bool:
+    def are_vllm_pods_ready(self, job: KubernetesJob) -> bool:
         cmd = ["kubectl", "get", "pods", "-n", self.default_namespace]
         try:
             result = subprocess.run(cmd, capture_output=True, text=True, check=True)
@@ -272,7 +273,7 @@ def are_vllm_pods_ready(self) -> bool:
                 continue
 
             pod_name = columns[0]
-            if "vllm-v1-agg" not in pod_name:
+            if job.name not in pod_name:
                 continue
 
             vllm_pods_found = True
@@ -296,36 +297,42 @@
             all_ready = False
 
         if not vllm_pods_found:
-            logging.warning("No vLLM pods found")
+            logging.debug("No vLLM pods found")
             return False
 
         return all_ready
 
-    def _setup_port_forward(self) -> None:
+    def _setup_port_forward(self, job: KubernetesJob) -> None:
         if self._port_forward_process and self._port_forward_process.poll() is None:
             logging.debug("Port forwarding is already running")
             return
 
-        if not self.are_vllm_pods_ready():
+        if not self.are_vllm_pods_ready(job):
             logging.debug("Pods are not ready yet, skipping port forward")
             return
 
-        get_pod_cmd = (
-            f"kubectl get pods -n {self.default_namespace} --no-headers | "
-            "grep vllm-v1-agg-frontend | "
-            "awk 'NR==1{print $1}'"
-        )
-        cmd = f"kubectl port-forward pod/$({get_pod_cmd}) 8000:8000 -n {self.default_namespace}"
+        cmd = f"kubectl port-forward svc/{job.name}-frontend 8000:8000 -n {self.default_namespace}"
         logging.debug("Starting port forwarding")
         self._port_forward_process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        logging.debug("Port forwarding started")
+
+        logging.debug(f"Port forwarding started (pid={self._port_forward_process.pid})")
 
     def _check_model_server(self) -> bool:
-        cmd = "curl -s http://localhost:8000/v1/models"
+        if not self._port_forward_process:
+            logging.debug("Port forward process is not running")
+            return False
+
+        server = "localhost:8000"
+        cmd = f"curl -s http://{server}/v1/models"
+        logging.debug(f"Checking if model server is up at {server}: {cmd}")
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
 
         if result.returncode != 0:
-            logging.debug("Failed to connect to model server")
+            logging.debug(
+                f"Failed to connect to model server={server}, "
+                f"output={result.stdout.strip()}, "
+                f"error={result.stderr.strip()}"
+            )
             return False
 
         try:
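
The readiness and health-check flow in the hunk above can be reproduced by hand when debugging outside CloudAI. A hedged sketch using names from this commit's configs (the workload namespace is an assumption; the code uses self.default_namespace):

    NAMESPACE=dynamo-system      # assumption; match your system config
    JOB=cloudai-vllm-agg

    # are_vllm_pods_ready: pods carrying the job name should be Running/Ready.
    kubectl get pods -n "$NAMESPACE" | grep "$JOB"

    # _setup_port_forward: forward the frontend service, as the new code does.
    kubectl port-forward "svc/${JOB}-frontend" 8000:8000 -n "$NAMESPACE" &

    # _check_model_server: the endpoint lists the loaded model(s) once up.
    curl -s http://localhost:8000/v1/models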
@@ -374,18 +381,23 @@ def _run_genai_perf(self, job: KubernetesJob) -> None:
         args_str = " ".join(args)
 
         venv_path = python_exec.venv_path.absolute()
-        cmd = f". {venv_path}/bin/activate && genai-perf profile {args_str}"
-        logging.debug("Running GenAI performance test with command:")
-        logging.debug(cmd)
+        cmd = f"{venv_path}/bin/genai-perf profile {args_str}"
+        logging.debug(f"Running GenAI performance test: {cmd}")
+        result: subprocess.CompletedProcess | None = None
         try:
             result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
             logging.debug("GenAI performance test completed successfully")
-            logging.debug(f"Output: {result.stdout}")
         except subprocess.CalledProcessError as e:
             logging.error(f"GenAI performance test failed: {e.stderr}")
-            raise
+
+        if result:
+            with (job.test_run.output_path / "stdout.txt").open("w") as f:
+                f.write(result.stdout)
+            with (job.test_run.output_path / "stderr.txt").open("w") as f:
+                f.write(result.stderr)
 
     def _check_deployment_conditions(self, conditions: list) -> bool:
+        logging.debug(f"Checking deployment conditions: {conditions}")
         if not conditions:
             return True
 
@@ -401,8 +413,8 @@ def _is_dynamo_graph_deployment_running(self, job: KubernetesJob) -> bool:
         if self._genai_perf_completed:
             return False
 
-        if self.are_vllm_pods_ready():
-            self._setup_port_forward()
+        if self.are_vllm_pods_ready(job):
+            self._setup_port_forward(job)
         if self._port_forward_process and self._check_model_server():
             logging.debug("vLLM server is up and models are loaded")
             self._run_genai_perf(job)
@@ -438,7 +450,7 @@ def delete_job(self, job_name: str, job_kind: str) -> None:
         elif "job" in job_kind.lower():
             self._delete_batch_job(job_name)
         elif "dynamographdeployment" in job_kind.lower():
-            pass
+            self._delete_dynamo_graph_deployment(job_name)
         else:
             error_message = f"Unsupported job kind: '{job_kind}'."
             logging.error(error_message)
@@ -480,11 +492,14 @@ def _delete_batch_job(self, job_name: str) -> None:
 
     def _delete_dynamo_graph_deployment(self, job_name: str) -> None:
         logging.debug(f"Deleting DynamoGraphDeployment '{job_name}'")
-        cmd = f"kubectl delete dgd vllm-v1-agg -n {self.default_namespace}"
+        cmd = f"kubectl delete dgd {job_name} -n {self.default_namespace}"
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
         if result.returncode != 0:
-            raise subprocess.SubprocessError(f"Failed to delete DynamoGraphDeployment: {result.stderr}")
-        logging.debug("DynamoGraphDeployment deleted successfully")
+            logging.debug(f"Failed to delete DynamoGraphDeployment: {result.stderr}")
+
+        if self._port_forward_process and self._port_forward_process.poll() is None:
+            self._port_forward_process.kill()
+            self._port_forward_process = None
 
     def create_job(self, job_spec: Dict[Any, Any], timeout: int = 60, interval: int = 1) -> str:
         """
@@ -559,15 +574,20 @@ def _create_mpi_job(self, job_spec: Dict[Any, Any]) -> str:
         return job_name
 
     def _create_dynamo_graph_deployment(self, job_spec: Dict[Any, Any]) -> str:
-        api_response = self.custom_objects_api.create_namespaced_custom_object(
-            group="nvidia.com",
-            version="v1alpha1",
-            namespace=self.default_namespace,
-            plural="dynamographdeployments",
-            body=job_spec,
-        )
+        try:
+            api_response = self.custom_objects_api.create_namespaced_custom_object(
+                group="nvidia.com",
+                version="v1alpha1",
+                namespace=self.default_namespace,
+                plural="dynamographdeployments",
+                body=job_spec,
+            )
+        except lazy.k8s.client.ApiException as e:
+            logging.error(f"An error occurred while creating DynamoGraphDeployment: {e.reason}")
+            self._delete_dynamo_graph_deployment(job_spec["metadata"]["name"])
+            raise
 
-        job_name: str = api_response["metadata"]["name"]
+        job_name = str(api_response["metadata"]["name"])
         logging.debug(f"DynamoGraphDeployment '{job_name}' created with status: {api_response.get('status')}")
         return job_name
