
Commit 0d25f73

Add DeepSeek-R1 Inference (#503)
1 parent 92386cc commit 0d25f73

File tree

12 files changed: +646, -2 lines

USER_GUIDE.md

Lines changed: 49 additions & 0 deletions

@@ -442,6 +442,55 @@ test_name = "nccl_test_all_reduce"
time_limit = "00:20:00"
```

## Downloading DeepSeek Weights

To run DeepSeek R1 tests in CloudAI, you must download the model weights in advance. These weights are distributed via the NVIDIA NGC Registry and must be manually downloaded using the NGC CLI.

### Step 1: Install NGC CLI

Download and install the NGC CLI using the following commands:

```bash
wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.64.2/files/ngccli_linux.zip -O ngccli_linux.zip
unzip ngccli_linux.zip
chmod u+x ngc-cli/ngc
echo "export PATH=\"$PATH:$(pwd)/ngc-cli\"" >> ~/.bash_profile && source ~/.bash_profile
```

This will make the `ngc` command available in your terminal.
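
To confirm the installation, you can print the CLI version (a quick sanity check; the exact output format varies by release):

```bash
ngc --version
```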

### Step 2: Configure NGC CLI

Authenticate your CLI with your NGC API key by running:

```bash
ngc config set
```

When prompted, paste your API key, which you can obtain from [https://org.ngc.nvidia.com/setup](https://org.ngc.nvidia.com/setup).
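
To verify that the key was saved, you can print the active configuration (a minimal check; the fields shown depend on your NGC CLI version):

```bash
ngc config current
```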

### Step 3: Download the Weights

Navigate to the directory where you want the DeepSeek model weights to be stored, then run:

```bash
ngc registry model download-version nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8 --dest .
```

This command will create a folder named:

```
deepseek-r1-instruct_vhf-5dde110-nim-fp8/
```

inside your current directory.
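
You can also cross-check the version tag against the registry metadata before or after downloading (this assumes your NGC CLI release supports the `info` subcommand):

```bash
ngc registry model info nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8
```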

### Step 4: Verify the Download

Ensure the full model has been downloaded by checking the folder size:

```bash
du -sh deepseek-r1-instruct_vhf-5dde110-nim-fp8
```

The expected size is approximately 642 GB. If it’s significantly smaller, remove the folder and re-run the download.
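
If you prefer to script the check, here is a minimal sketch that retries an undersized download (the 600 GB threshold is an assumption, chosen safely below the ~642 GB expected size):

```bash
DIR=deepseek-r1-instruct_vhf-5dde110-nim-fp8
# Folder size in whole GB (GNU du, rounds up)
SIZE_GB=$(du -s --block-size=1G "$DIR" | cut -f1)
if [ "$SIZE_GB" -lt 600 ]; then  # assumption: anything under 600 GB is incomplete
  echo "Download looks incomplete (${SIZE_GB} GB); removing and retrying."
  rm -rf "$DIR"
  ngc registry model download-version nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8 --dest .
fi
```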

## Slurm specifics

### Extra srun and sbatch arguments

src/cloudai/__init__.py

Lines changed: 12 additions & 0 deletions

@@ -126,6 +126,11 @@
    SlurmContainerReportGenerationStrategy,
    SlurmContainerTestDefinition,
)
from .workloads.triton_inference import (
    TritonInferenceReportGenerationStrategy,
    TritonInferenceSlurmCommandGenStrategy,
    TritonInferenceTestDefinition,
)
from .workloads.ucc_test import (
    UCCTestDefinition,
    UCCTestGradingStrategy,

@@ -192,6 +197,7 @@
        NeMoRunTestDefinition,
        SlurmContainerTestDefinition,
        MegatronRunTestDefinition,
        TritonInferenceTestDefinition,
    ],
    SlurmJobIdRetrievalStrategy,
)

@@ -229,6 +235,7 @@
        NeMoRunTestDefinition,
        SlurmContainerTestDefinition,
        MegatronRunTestDefinition,
        TritonInferenceTestDefinition,
    ],
    DefaultJobStatusRetrievalStrategy,
)

@@ -254,6 +261,9 @@
Registry().add_strategy(
    CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy
)
Registry().add_strategy(
    CommandGenStrategy, [SlurmSystem], [TritonInferenceTestDefinition], TritonInferenceSlurmCommandGenStrategy
)

Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)

@@ -278,6 +288,7 @@
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
Registry().add_test_definition("TritonInference", TritonInferenceTestDefinition)

Registry().add_agent("grid_search", GridSearchAgent)

@@ -293,6 +304,7 @@
Registry().add_report(SleepTestDefinition, SleepReportGenerationStrategy)
Registry().add_report(SlurmContainerTestDefinition, SlurmContainerReportGenerationStrategy)
Registry().add_report(UCCTestDefinition, UCCTestReportGenerationStrategy)
Registry().add_report(TritonInferenceTestDefinition, TritonInferenceReportGenerationStrategy)

Registry().add_scenario_report(PerTestReporter)
Registry().add_scenario_report(StatusReporter)

src/cloudai/workloads/triton_inference/__init__.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .report_generation_strategy import TritonInferenceReportGenerationStrategy
from .slurm_command_gen_strategy import TritonInferenceSlurmCommandGenStrategy
from .triton_inference import TritonInferenceCmdArgs, TritonInferenceTestDefinition

__all__ = [
    "TritonInferenceCmdArgs",
    "TritonInferenceReportGenerationStrategy",
    "TritonInferenceSlurmCommandGenStrategy",
    "TritonInferenceTestDefinition",
]

src/cloudai/workloads/triton_inference/report_generation_strategy.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cloudai import ReportGenerationStrategy


class TritonInferenceReportGenerationStrategy(ReportGenerationStrategy):
    """Report generation strategy for TritonInference."""

    def can_handle_directory(self) -> bool:
        return False

    def generate_report(self) -> None:
        pass

src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Dict, List, Tuple, Union, cast

from cloudai import TestRun
from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy

from .triton_inference import TritonInferenceTestDefinition


class TritonInferenceSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Command generation strategy for TritonInference server and client."""

    def _container_mounts(self, tr: TestRun) -> list[str]:
        td = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        mounts = [
            f"{td.nim_model_path}:{td.nim_model_path}:ro",
            f"{td.nim_cache_path}:{td.nim_cache_path}:rw",
        ]

        wrapper_host = (tr.output_path / "start_server_wrapper.sh").resolve()
        wrapper_container = "/opt/nim/start_server_wrapper.sh"
        self._generate_start_wrapper_script(wrapper_host, td.extra_env_vars)
        mounts.append(f"{wrapper_host}:{wrapper_container}:ro")

        return mounts

    def _append_sbatch_directives(
        self,
        batch_script_content: List[str],
        args: Dict[str, Any],
        tr: TestRun,
    ) -> None:
        super()._append_sbatch_directives(batch_script_content, args, tr)
        batch_script_content.append("export HEAD_NODE=$SLURM_JOB_MASTER_NODE")
        batch_script_content.append("export NIM_LEADER_IP_ADDRESS=$SLURM_JOB_MASTER_NODE")
        batch_script_content.append(f"export NIM_NUM_COMPUTE_NODES={args['num_nodes'] - 1}")
        batch_script_content.append("export NIM_MODEL_TOKENIZER='deepseek-ai/DeepSeek-R1'")

    def _generate_start_wrapper_script(self, script_path: Path, env_vars: Dict[str, Any]) -> None:
        lines = ["#!/bin/bash", ""]
        lines.append("export NIM_LEADER_IP_ADDRESS=${SLURM_JOB_MASTER_NODE}")
        lines.append("export NIM_NODE_RANK=${SLURM_NODEID}")
        lines.append("")
        for key, val in env_vars.items():
            if key in {"NIM_LEADER_IP_ADDRESS", "NIM_NODE_RANK"}:
                continue
            if isinstance(val, str):
                lines.append(f"export {key}='{val}'")
        lines.append("")
        lines.append('if [ "$NIM_NODE_RANK" -eq 0 ]; then')
        lines.append(" export NIM_LEADER_ROLE=1")
        lines.append("else")
        lines.append(" export NIM_LEADER_ROLE=0")
        lines.append("fi")
        lines.append("")
        lines.append('echo "Starting NIM server on node rank ${NIM_NODE_RANK} with leader role ${NIM_LEADER_ROLE}"')
        lines.append("exec /opt/nim/start_server.sh")
        script_path.parent.mkdir(parents=True, exist_ok=True)
        with script_path.open("w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        script_path.chmod(0o755)

    def _gen_srun_command(
        self,
        slurm_args: Dict[str, Any],
        env_vars: Dict[str, Union[str, List[str]]],
        cmd_args: Dict[str, Union[str, List[str]]],
        tr: TestRun,
    ) -> str:
        num_server_nodes, num_client_nodes = self._get_server_client_split(tr)
        server_line = self._build_server_srun(slurm_args, tr, num_server_nodes)
        client_line = self._build_client_srun(slurm_args, tr, num_client_nodes)
        sleep_sec = cast(TritonInferenceTestDefinition, tr.test.test_definition).cmd_args.sleep_seconds
        return f"{server_line} &\n\nsleep {sleep_sec}\n\n{client_line}"

    def _get_server_client_split(self, tr: TestRun) -> Tuple[int, int]:
        num_nodes, _ = self.system.get_nodes_by_spec(tr.num_nodes, tr.nodes)
        if num_nodes < 3:
            raise ValueError("DeepSeekR1 requires at least 3 nodes: 2 server and 1 client.")
        return num_nodes - 1, 1

    def _build_server_srun(self, slurm_args: Dict[str, Any], tr: TestRun, num_server_nodes: int) -> str:
        test_definition = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        server_slurm_args = {
            **slurm_args,
            "image_path": test_definition.server_docker_image.installed_path,
        }
        srun_prefix = self.gen_srun_prefix(server_slurm_args, tr)
        srun_prefix.append(f"--nodes={num_server_nodes}")
        srun_prefix.append(f"--ntasks={num_server_nodes}")
        srun_prefix.append("--ntasks-per-node=1")
        nsys_command = self.gen_nsys_command(tr)
        server_launch_command = ["/opt/nim/start_server_wrapper.sh"]
        return " ".join(srun_prefix + nsys_command + server_launch_command)

    def _build_client_srun(self, slurm_args: Dict[str, Any], tr: TestRun, num_client_nodes: int) -> str:
        test_definition = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        client_slurm_args = {
            **slurm_args,
            "image_path": test_definition.client_docker_image.installed_path,
        }
        srun_prefix = self.gen_srun_prefix(client_slurm_args, tr)
        srun_prefix.append(f"--nodes={num_client_nodes}")
        srun_prefix.append(f"--ntasks={num_client_nodes}")

        args = test_definition.cmd_args
        client_command = [
            "genai-perf",
            "profile",
            "-m",
            args.served_model_name,
            f"--endpoint-type {args.endpoint_type}",
            f"--service-kind {args.service_kind}",
        ]
        if args.streaming:
            client_command.append("--streaming")
        client_command += [
            "-u",
            f"$SLURM_JOB_MASTER_NODE:{args.port}",
            "--num-prompts",
            str(args.num_prompts),
            "--synthetic-input-tokens-mean",
            str(args.input_sequence_length),
            "--synthetic-input-tokens-stddev",
            "0",
            "--concurrency",
            str(args.concurrency),
            "--output-tokens-mean",
            str(args.output_sequence_length),
            "--extra-inputs",
            f"max_tokens:{args.output_sequence_length}",
            "--extra-inputs",
            f"min_tokens:{args.output_sequence_length}",
            "--extra-inputs",
            "ignore_eos:true",
            "--artifact-dir",
            "/cloudai_run_results",
            "--tokenizer",
            args.tokenizer,
            "--",
            "-v",
            f"--max-threads {args.concurrency}",
            f"--request-count {args.num_prompts}",
        ]
        return " ".join(srun_prefix + client_command)
