Skip to content

Commit d0c000a

Browse files
authored
Merge pull request #364 from NVIDIA/am/nsys
Enable and configure Nsys tracing via test config
2 parents add80a2 + 9c4e7a1 commit d0c000a

File tree

7 files changed

+167
-5
lines changed

7 files changed

+167
-5
lines changed

USER_GUIDE.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,26 @@ CloudAI runs all slurm jobs using containers. To simplify file system related ta
441441
#### Dev details
442442
`SlurmCommandGenStrategy` defines abstract method `_container_mounts(tr: TestRun)` that must be implemented by every subclass. This method is used in `SlurmCommandGenStrategy.container_mounts(tr: TestRun)` (defined as `@final`) where mounts like `/cloudai_run_results` (default mount), `TestDefinition.extra_container_mounts` (from Test TOML) and test specific mounts (defined in-code) are added.
443443

444+
### Nsys tracing
445+
Users can enable Nsys tracing for any workload when running via Slurm. Note that `nsys` must be available on the compute nodes; CloudAI does not manage it.
446+
447+
Configuration fields are:
448+
```py
449+
enable: bool = True
450+
nsys_binary: str = "nsys"
451+
task: str = "profile"
452+
output: Optional[str] = None
453+
sample: Optional[str] = None
454+
trace: Optional[str] = None
455+
force_overwrite: Optional[bool] = None
456+
capture_range: Optional[str] = None
457+
capture_range_end: Optional[str] = None
458+
cuda_graph_trace: Optional[str] = None
459+
gpu_metrics_devices: Optional[str] = None
460+
extra_args: list[str] = []
461+
```
462+
Fields whose value is `None` are not passed to the `nsys` command.
463+
444464
## Troubleshooting
445465
In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, tests, and test scenarios.
446466

src/cloudai/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
from ._core.reporter import Reporter
3939
from ._core.runner import Runner
4040
from ._core.system import System
41-
from ._core.test import CmdArgs, Test, TestDefinition
41+
from ._core.test import CmdArgs, NsysConfiguration, Test, TestDefinition
4242
from ._core.test_parser import TestParser
4343
from ._core.test_scenario import TestRun, TestScenario
4444
from ._core.test_scenario_parser import TestScenarioParser
@@ -263,6 +263,7 @@
263263
"JobIdRetrievalError",
264264
"JobStatusResult",
265265
"JsonGenStrategy",
266+
"NsysConfiguration",
266267
"Parser",
267268
"PythonExecutable",
268269
"ReportGenerationStrategy",

src/cloudai/_core/test.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# limitations under the License.
1616

1717
from abc import ABC, abstractmethod
18-
from typing import Any, Dict, List, Union
18+
from typing import Any, Dict, List, Optional, Union
1919

2020
from pydantic import BaseModel, ConfigDict
2121

@@ -81,6 +81,48 @@ class CmdArgs(BaseModel):
8181
model_config = ConfigDict(extra="forbid")
8282

8383

84+
class NsysConfiguration(BaseModel):
    """
    Options for wrapping a workload command with ``nsys``.

    Optional fields that are left as ``None`` contribute nothing to the generated command line.
    """

    model_config = ConfigDict(extra="forbid")

    enable: bool = True
    nsys_binary: str = "nsys"
    task: str = "profile"
    output: Optional[str] = None
    sample: Optional[str] = None
    trace: Optional[str] = None
    force_overwrite: Optional[bool] = None
    capture_range: Optional[str] = None
    capture_range_end: Optional[str] = None
    cuda_graph_trace: Optional[str] = None
    gpu_metrics_devices: Optional[str] = None
    extra_args: list[str] = []

    @property
    def cmd_args(self) -> list[str]:
        """
        Build the ``nsys`` invocation as a list of command-line fragments.

        The list starts with the binary and the task, continues with one fragment per configured option and ends
        with ``extra_args`` unchanged. String options that are ``None`` or empty are skipped, while
        ``force_overwrite`` is emitted (lower-cased) whenever it is not ``None``.
        """
        # Options rendered as "<flag> <value>" in a single fragment.
        short_opts = (
            ("-s", self.sample),
            ("-o", self.output),
            ("-t", self.trace),
        )
        # Options rendered as "<flag>=<value>".
        long_opts = (
            ("--capture-range", self.capture_range),
            ("--capture-range-end", self.capture_range_end),
            ("--cuda-graph-trace", self.cuda_graph_trace),
            ("--gpu-metrics-devices", self.gpu_metrics_devices),
        )

        args = [str(self.nsys_binary), str(self.task)]
        args += [f"{flag} {value}" for flag, value in short_opts if value]
        # False is a meaningful value here, so test against None rather than truthiness.
        if self.force_overwrite is not None:
            args.append(f"--force-overwrite={str(self.force_overwrite).lower()}")
        args += [f"{flag}={value}" for flag, value in long_opts if value]
        args += self.extra_args

        return args
124+
125+
84126
class TestDefinition(BaseModel, ABC):
85127
"""Base Test object."""
86128

@@ -96,6 +138,7 @@ class TestDefinition(BaseModel, ABC):
96138
extra_cmd_args: dict[str, str] = {}
97139
extra_container_mounts: list[str] = []
98140
git_repos: list[GitRepo] = []
141+
nsys: Optional[NsysConfiguration] = None
99142

100143
@property
101144
def cmd_args_dict(self) -> Dict[str, Union[str, List[str]]]:

src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy):
2727
def _container_mounts(self, tr: TestRun) -> list[str]:
2828
return []
2929

30+
def gen_nsys_command(self, tr: TestRun) -> list[str]:
    """
    Return no NSYS arguments at this level.

    NSYS command is generated as part of the test command, so it is disabled here.
    """
    disabled: list[str] = []
    return disabled
33+
3034
def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]:
3135
tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition)
3236
slurm_args["image_path"] = tdef.docker_image.installed_path
@@ -37,7 +41,7 @@ def generate_test_command(
3741
self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
3842
) -> list[str]:
3943
tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition)
40-
srun_command_parts: list[str] = [tdef.cmd_args.cmd]
44+
srun_command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd]
4145
if tr.test.extra_cmd_args:
4246
srun_command_parts.append(tr.test.extra_cmd_args)
4347

src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,13 @@ def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str:
220220

221221
return "\n".join(post_test_commands)
222222

223+
def gen_nsys_command(self, tr: TestRun) -> list[str]:
    """
    Return the NSYS command fragments for a test run.

    Args:
        tr: Test run whose test definition may carry an ``nsys`` configuration.

    Returns:
        The configuration's ``cmd_args`` when a configuration is present and enabled, otherwise an empty list.
    """
    nsys_config = tr.test.test_definition.nsys
    if nsys_config and nsys_config.enable:
        return nsys_config.cmd_args

    return []
229+
223230
def _gen_srun_command(
224231
self,
225232
slurm_args: Dict[str, Any],
@@ -228,8 +235,9 @@ def _gen_srun_command(
228235
tr: TestRun,
229236
) -> str:
230237
srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
238+
nsys_command_parts = self.gen_nsys_command(tr)
231239
test_command_parts = self.generate_test_command(env_vars, cmd_args, tr)
232-
return " ".join(srun_command_parts + test_command_parts)
240+
return " ".join(srun_command_parts + nsys_command_parts + test_command_parts)
233241

234242
def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]:
235243
srun_command_parts = ["srun", f"--mpi={self.system.mpi}"]
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from pathlib import Path
18+
19+
import pytest
20+
21+
from cloudai import TestRun
22+
from cloudai._core.test import NsysConfiguration, Test
23+
from cloudai._core.test_template import TestTemplate
24+
from cloudai.schema.test_template.slurm_container.slurm_command_gen_strategy import SlurmContainerCommandGenStrategy
25+
from cloudai.systems import SlurmSystem
26+
from cloudai.test_definitions.slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition
27+
28+
29+
@pytest.fixture
def test_run(slurm_system: SlurmSystem) -> TestRun:
    """Provide a single-node ``TestRun`` for a Slurm container test whose command is ``cmd``."""
    container_args = SlurmContainerCmdArgs(docker_image_url="docker://url", cmd="cmd")
    definition = SlurmContainerTestDefinition(
        name="sc",
        description="desc",
        test_template_name="tt",
        cmd_args=container_args,
    )
    template = TestTemplate(name="tt", system=slurm_system)
    test = Test(test_definition=definition, test_template=template)
    return TestRun(name="name", test=test, num_nodes=1, nodes=[])
40+
41+
42+
def test_default(slurm_system: SlurmSystem, test_run: TestRun) -> None:
    """Check the generated srun command when the test definition has no NSYS configuration set."""
    strategy = SlurmContainerCommandGenStrategy(slurm_system, {})
    actual = strategy.gen_srun_command(test_run)

    mpi = slurm_system.mpi
    image_url = test_run.test.test_definition.cmd_args.docker_image_url
    results_dir = Path.cwd().absolute()
    expected_prefix = (
        f"srun --mpi={mpi} --container-image={image_url} "
        f"--container-mounts={results_dir}:/cloudai_run_results --no-container-mount-home"
    )

    assert actual == f'{expected_prefix} bash -c "cmd"'
52+
53+
54+
def test_with_nsys(slurm_system: SlurmSystem, test_run: TestRun) -> None:
    """Check that a default NSYS configuration is placed inside ``bash -c``, right before the test command."""
    strategy = SlurmContainerCommandGenStrategy(slurm_system, {})
    nsys_config = NsysConfiguration()
    test_run.test.test_definition.nsys = nsys_config
    actual = strategy.gen_srun_command(test_run)

    mpi = slurm_system.mpi
    image_url = test_run.test.test_definition.cmd_args.docker_image_url
    results_dir = Path.cwd().absolute()
    expected_prefix = (
        f"srun --mpi={mpi} --container-image={image_url} "
        f"--container-mounts={results_dir}:/cloudai_run_results --no-container-mount-home"
    )
    nsys_prefix = " ".join(nsys_config.cmd_args)

    assert actual == f'{expected_prefix} bash -c "{nsys_prefix} cmd"'

tests/test_test_definitions.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import pytest
2121
import toml
2222

23-
from cloudai import Parser, Registry
23+
from cloudai import NsysConfiguration, Parser, Registry
2424
from cloudai.test_definitions import ChakraReplayCmdArgs, NCCLCmdArgs, NCCLTestDefinition
2525
from cloudai.test_definitions.chakra_replay import ChakraReplayTestDefinition
2626
from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition
@@ -146,3 +146,24 @@ def test_python_executable_installable_persists(test: NeMoLauncherTestDefinition
146146
test.python_executable.venv_path = tmp_path
147147
assert test.python_executable.git_repo.installed_path == tmp_path
148148
assert test.python_executable.venv_path == tmp_path
149+
150+
151+
class TestNsysConfiguration:
    """Checks for ``NsysConfiguration`` defaults and the command-line fragments it produces."""

    def test_default(self):
        """A default configuration is enabled and targets ``nsys profile``."""
        config = NsysConfiguration()
        assert config.enable is True
        assert config.nsys_binary == "nsys"
        assert config.task == "profile"

    def test_cmd_args(self):
        """With no options set, only the binary and the task are emitted."""
        assert NsysConfiguration().cmd_args == ["nsys", "profile"]

    @pytest.mark.parametrize("value", [True, False])
    def test_force_overwrite(self, value: bool):
        """Both boolean values of ``force_overwrite`` are rendered, in lower case."""
        if value:
            expected_flag = "--force-overwrite=true"
        else:
            expected_flag = "--force-overwrite=false"

        config = NsysConfiguration(force_overwrite=value)
        assert config.cmd_args == ["nsys", "profile", expected_flag]

    def test_extra_args(self):
        """Extra arguments are appended unchanged after the binary and the task."""
        extra = ["--extra", "args"]
        config = NsysConfiguration(extra_args=extra)
        assert config.cmd_args == ["nsys", "profile", "--extra", "args"]

0 commit comments

Comments
 (0)