
Commit 0d25f73

Add DeepSeek-R1 Inference (#503)
1 parent 92386cc commit 0d25f73

File tree

12 files changed: +646, -2 lines

USER_GUIDE.md

Lines changed: 49 additions & 0 deletions

@@ -442,6 +442,55 @@ test_name = "nccl_test_all_reduce"
time_limit = "00:20:00"
```

## Downloading DeepSeek Weights

To run DeepSeek R1 tests in CloudAI, you must download the model weights in advance. These weights are distributed via the NVIDIA NGC Registry and must be manually downloaded using the NGC CLI.

### Step 1: Install NGC CLI

Download and install the NGC CLI using the following commands:

```bash
wget --content-disposition https://api.ngc.nvidia.com/v2/resources/nvidia/ngc-apps/ngc_cli/versions/3.64.2/files/ngccli_linux.zip -O ngccli_linux.zip
unzip ngccli_linux.zip
chmod u+x ngc-cli/ngc
echo "export PATH=\"$PATH:$(pwd)/ngc-cli\"" >> ~/.bash_profile && source ~/.bash_profile
```

This will make the `ngc` command available in your terminal.
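
To confirm the installation, you can print the CLI version (a quick sanity check; the exact output format varies by release):

```bash
ngc --version
```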

### Step 2: Configure NGC CLI

Authenticate your CLI with your NGC API key by running:

```bash
ngc config set
```

When prompted, paste your API key, which you can obtain from [https://org.ngc.nvidia.com/setup](https://org.ngc.nvidia.com/setup).
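
To verify that the key was saved, you can print the active configuration (a minimal check; the fields shown depend on your NGC CLI version):

```bash
ngc config current
```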

### Step 3: Download the Weights

Navigate to the directory where you want the DeepSeek model weights to be stored, then run:

```bash
ngc registry model download-version nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8 --dest .
```

This command will create a folder named:

```
deepseek-r1-instruct_vhf-5dde110-nim-fp8/
```

inside your current directory.
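
You can also cross-check the version tag against the registry metadata before or after downloading (this assumes your NGC CLI release supports the `info` subcommand):

```bash
ngc registry model info nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8
```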

### Step 4: Verify the Download

Ensure the full model has been downloaded by checking the folder size:

```bash
du -sh deepseek-r1-instruct_vhf-5dde110-nim-fp8
```

The expected size is approximately 642 GB. If it’s significantly smaller, remove the folder and re-run the download.
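
If you prefer to script the check, here is a minimal sketch that retries an undersized download (the 600 GB threshold is an assumption, chosen safely below the ~642 GB expected size):

```bash
DIR=deepseek-r1-instruct_vhf-5dde110-nim-fp8
# Folder size in whole GB (GNU du, rounds up)
SIZE_GB=$(du -s --block-size=1G "$DIR" | cut -f1)
if [ "$SIZE_GB" -lt 600 ]; then  # assumption: anything under 600 GB is incomplete
  echo "Download looks incomplete (${SIZE_GB} GB); removing and retrying."
  rm -rf "$DIR"
  ngc registry model download-version nim/deepseek-ai/deepseek-r1-instruct:hf-5dde110-nim-fp8 --dest .
fi
```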

## Slurm specifics

### Extra srun and sbatch arguments

src/cloudai/__init__.py

Lines changed: 12 additions & 0 deletions

@@ -126,6 +126,11 @@
    SlurmContainerReportGenerationStrategy,
    SlurmContainerTestDefinition,
)
from .workloads.triton_inference import (
    TritonInferenceReportGenerationStrategy,
    TritonInferenceSlurmCommandGenStrategy,
    TritonInferenceTestDefinition,
)
from .workloads.ucc_test import (
    UCCTestDefinition,
    UCCTestGradingStrategy,

@@ -192,6 +197,7 @@
        NeMoRunTestDefinition,
        SlurmContainerTestDefinition,
        MegatronRunTestDefinition,
        TritonInferenceTestDefinition,
    ],
    SlurmJobIdRetrievalStrategy,
)

@@ -229,6 +235,7 @@
        NeMoRunTestDefinition,
        SlurmContainerTestDefinition,
        MegatronRunTestDefinition,
        TritonInferenceTestDefinition,
    ],
    DefaultJobStatusRetrievalStrategy,
)

@@ -254,6 +261,9 @@
Registry().add_strategy(
    CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy
)
Registry().add_strategy(
    CommandGenStrategy, [SlurmSystem], [TritonInferenceTestDefinition], TritonInferenceSlurmCommandGenStrategy
)

Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)

@@ -278,6 +288,7 @@
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
Registry().add_test_definition("TritonInference", TritonInferenceTestDefinition)

Registry().add_agent("grid_search", GridSearchAgent)

@@ -293,6 +304,7 @@
Registry().add_report(SleepTestDefinition, SleepReportGenerationStrategy)
Registry().add_report(SlurmContainerTestDefinition, SlurmContainerReportGenerationStrategy)
Registry().add_report(UCCTestDefinition, UCCTestReportGenerationStrategy)
Registry().add_report(TritonInferenceTestDefinition, TritonInferenceReportGenerationStrategy)

Registry().add_scenario_report(PerTestReporter)
Registry().add_scenario_report(StatusReporter)

src/cloudai/workloads/triton_inference/__init__.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .report_generation_strategy import TritonInferenceReportGenerationStrategy
from .slurm_command_gen_strategy import TritonInferenceSlurmCommandGenStrategy
from .triton_inference import TritonInferenceCmdArgs, TritonInferenceTestDefinition

__all__ = [
    "TritonInferenceCmdArgs",
    "TritonInferenceReportGenerationStrategy",
    "TritonInferenceSlurmCommandGenStrategy",
    "TritonInferenceTestDefinition",
]

src/cloudai/workloads/triton_inference/report_generation_strategy.py

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cloudai import ReportGenerationStrategy


class TritonInferenceReportGenerationStrategy(ReportGenerationStrategy):
    """Report generation strategy for TritonInference."""

    def can_handle_directory(self) -> bool:
        return False

    def generate_report(self) -> None:
        pass

src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Dict, List, Tuple, Union, cast

from cloudai import TestRun
from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy

from .triton_inference import TritonInferenceTestDefinition


class TritonInferenceSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Command generation strategy for TritonInference server and client."""

    def _container_mounts(self, tr: TestRun) -> list[str]:
        td = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        mounts = [
            f"{td.nim_model_path}:{td.nim_model_path}:ro",
            f"{td.nim_cache_path}:{td.nim_cache_path}:rw",
        ]

        wrapper_host = (tr.output_path / "start_server_wrapper.sh").resolve()
        wrapper_container = "/opt/nim/start_server_wrapper.sh"
        self._generate_start_wrapper_script(wrapper_host, td.extra_env_vars)
        mounts.append(f"{wrapper_host}:{wrapper_container}:ro")

        return mounts

    def _append_sbatch_directives(
        self,
        batch_script_content: List[str],
        args: Dict[str, Any],
        tr: TestRun,
    ) -> None:
        super()._append_sbatch_directives(batch_script_content, args, tr)
        batch_script_content.append("export HEAD_NODE=$SLURM_JOB_MASTER_NODE")
        batch_script_content.append("export NIM_LEADER_IP_ADDRESS=$SLURM_JOB_MASTER_NODE")
        batch_script_content.append(f"export NIM_NUM_COMPUTE_NODES={args['num_nodes'] - 1}")
        batch_script_content.append("export NIM_MODEL_TOKENIZER='deepseek-ai/DeepSeek-R1'")

    def _generate_start_wrapper_script(self, script_path: Path, env_vars: Dict[str, Any]) -> None:
        lines = ["#!/bin/bash", ""]
        lines.append("export NIM_LEADER_IP_ADDRESS=${SLURM_JOB_MASTER_NODE}")
        lines.append("export NIM_NODE_RANK=${SLURM_NODEID}")
        lines.append("")
        for key, val in env_vars.items():
            if key in {"NIM_LEADER_IP_ADDRESS", "NIM_NODE_RANK"}:
                continue
            if isinstance(val, str):
                lines.append(f"export {key}='{val}'")
        lines.append("")
        lines.append('if [ "$NIM_NODE_RANK" -eq 0 ]; then')
        lines.append(" export NIM_LEADER_ROLE=1")
        lines.append("else")
        lines.append(" export NIM_LEADER_ROLE=0")
        lines.append("fi")
        lines.append("")
        lines.append('echo "Starting NIM server on node rank ${NIM_NODE_RANK} with leader role ${NIM_LEADER_ROLE}"')
        lines.append("exec /opt/nim/start_server.sh")
        script_path.parent.mkdir(parents=True, exist_ok=True)
        with script_path.open("w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        script_path.chmod(0o755)

    def _gen_srun_command(
        self,
        slurm_args: Dict[str, Any],
        env_vars: Dict[str, Union[str, List[str]]],
        cmd_args: Dict[str, Union[str, List[str]]],
        tr: TestRun,
    ) -> str:
        num_server_nodes, num_client_nodes = self._get_server_client_split(tr)
        server_line = self._build_server_srun(slurm_args, tr, num_server_nodes)
        client_line = self._build_client_srun(slurm_args, tr, num_client_nodes)
        sleep_sec = cast(TritonInferenceTestDefinition, tr.test.test_definition).cmd_args.sleep_seconds
        return f"{server_line} &\n\nsleep {sleep_sec}\n\n{client_line}"

    def _get_server_client_split(self, tr: TestRun) -> Tuple[int, int]:
        num_nodes, _ = self.system.get_nodes_by_spec(tr.num_nodes, tr.nodes)
        if num_nodes < 3:
            raise ValueError("DeepSeekR1 requires at least 3 nodes: 2 server and 1 client.")
        return num_nodes - 1, 1

    def _build_server_srun(self, slurm_args: Dict[str, Any], tr: TestRun, num_server_nodes: int) -> str:
        test_definition = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        server_slurm_args = {
            **slurm_args,
            "image_path": test_definition.server_docker_image.installed_path,
        }
        srun_prefix = self.gen_srun_prefix(server_slurm_args, tr)
        srun_prefix.append(f"--nodes={num_server_nodes}")
        srun_prefix.append(f"--ntasks={num_server_nodes}")
        srun_prefix.append("--ntasks-per-node=1")
        nsys_command = self.gen_nsys_command(tr)
        server_launch_command = ["/opt/nim/start_server_wrapper.sh"]
        return " ".join(srun_prefix + nsys_command + server_launch_command)

    def _build_client_srun(self, slurm_args: Dict[str, Any], tr: TestRun, num_client_nodes: int) -> str:
        test_definition = cast(TritonInferenceTestDefinition, tr.test.test_definition)
        client_slurm_args = {
            **slurm_args,
            "image_path": test_definition.client_docker_image.installed_path,
        }
        srun_prefix = self.gen_srun_prefix(client_slurm_args, tr)
        srun_prefix.append(f"--nodes={num_client_nodes}")
        srun_prefix.append(f"--ntasks={num_client_nodes}")

        args = test_definition.cmd_args
        client_command = [
            "genai-perf",
            "profile",
            "-m",
            args.served_model_name,
            f"--endpoint-type {args.endpoint_type}",
            f"--service-kind {args.service_kind}",
        ]
        if args.streaming:
            client_command.append("--streaming")
        client_command += [
            "-u",
            f"$SLURM_JOB_MASTER_NODE:{args.port}",
            "--num-prompts",
            str(args.num_prompts),
            "--synthetic-input-tokens-mean",
            str(args.input_sequence_length),
            "--synthetic-input-tokens-stddev",
            "0",
            "--concurrency",
            str(args.concurrency),
            "--output-tokens-mean",
            str(args.output_sequence_length),
            "--extra-inputs",
            f"max_tokens:{args.output_sequence_length}",
            "--extra-inputs",
            f"min_tokens:{args.output_sequence_length}",
            "--extra-inputs",
            "ignore_eos:true",
            "--artifact-dir",
            "/cloudai_run_results",
            "--tokenizer",
            args.tokenizer,
            "--",
            "-v",
            f"--max-threads {args.concurrency}",
            f"--request-count {args.num_prompts}",
        ]
        return " ".join(srun_prefix + client_command)
