Merge pull request #364 from NVIDIA/am/nsys

amaslenn · web-flow · commit d0c000a2c2fa · 2025-02-21T09:31:27.000+01:00
Enable and configure Nsys tracing via test config
diff --git a/USER_GUIDE.md b/USER_GUIDE.md
@@ -441,6 +441,26 @@ CloudAI runs all slurm jobs using containers. To simplify file system related ta
 #### Dev details
 `SlurmCommandGenStrategy` defines abstract method `_container_mounts(tr: TestRun)` that must be implemented by every subclass. This method is used in `SlurmCommandGenStrategy.container_mounts(tr: TestRun)` (defined as `@final`) where mounts like `/cloudai_run_results` (default mount), `TestDefinition.extra_container_mounts` (from Test TOML) and test specific mounts (defined in-code) are added.
 
+### Nsys tracing
+Users can enable Nsys tracing for any workload when running via Slurm. Note that `nsys` should be available on the compute nodes; CloudAI doesn't manage it.
+
+Configuration fields are:
+```py
+enable: bool = True
+nsys_binary: str = "nsys"
+task: str = "profile"
+output: Optional[str] = None
+sample: Optional[str] = None
+trace: Optional[str] = None
+force_overwrite: Optional[bool] = None
+capture_range: Optional[str] = None
+capture_range_end: Optional[str] = None
+cuda_graph_trace: Optional[str] = None
+gpu_metrics_devices: Optional[str] = None
+extra_args: list[str] = []
+```
+Fields with a `None` value are not passed to the `nsys` command.
+
 ## Troubleshooting
 In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, tests, and test scenarios.
 
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -38,7 +38,7 @@
 from ._core.reporter import Reporter
 from ._core.runner import Runner
 from ._core.system import System
-from ._core.test import CmdArgs, Test, TestDefinition
+from ._core.test import CmdArgs, NsysConfiguration, Test, TestDefinition
 from ._core.test_parser import TestParser
 from ._core.test_scenario import TestRun, TestScenario
 from ._core.test_scenario_parser import TestScenarioParser
@@ -263,6 +263,7 @@
     "JobIdRetrievalError",
     "JobStatusResult",
     "JsonGenStrategy",
+    "NsysConfiguration",
     "Parser",
     "PythonExecutable",
     "ReportGenerationStrategy",
diff --git a/src/cloudai/_core/test.py b/src/cloudai/_core/test.py
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
 
 from pydantic import BaseModel, ConfigDict
 
@@ -81,6 +81,48 @@ class CmdArgs(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
 
+class NsysConfiguration(BaseModel):
+    """Nsys tracing settings for a test; `cmd_args` renders them as a list of command-line parts."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    enable: bool = True
+    nsys_binary: str = "nsys"
+    task: str = "profile"
+    output: Optional[str] = None
+    sample: Optional[str] = None
+    trace: Optional[str] = None
+    force_overwrite: Optional[bool] = None
+    capture_range: Optional[str] = None
+    capture_range_end: Optional[str] = None
+    cuda_graph_trace: Optional[str] = None
+    gpu_metrics_devices: Optional[str] = None
+    extra_args: list[str] = []
+
+    @property
+    def cmd_args(self) -> list[str]:
+        parts = [f"{self.nsys_binary}", f"{self.task}"]
+        if self.sample:  # truthiness check: both None and "" leave the option out
+            parts.append(f"-s {self.sample}")
+        if self.output:
+            parts.append(f"-o {self.output}")
+        if self.trace:
+            parts.append(f"-t {self.trace}")
+        if self.force_overwrite is not None:  # False must still be emitted, so only None is skipped
+            parts.append(f"--force-overwrite={str(self.force_overwrite).lower()}")
+        if self.capture_range:
+            parts.append(f"--capture-range={self.capture_range}")
+        if self.capture_range_end:
+            parts.append(f"--capture-range-end={self.capture_range_end}")
+        if self.cuda_graph_trace:
+            parts.append(f"--cuda-graph-trace={self.cuda_graph_trace}")
+        if self.gpu_metrics_devices:
+            parts.append(f"--gpu-metrics-devices={self.gpu_metrics_devices}")
+        parts.extend(self.extra_args)  # appended unchanged, after all named options
+
+        return parts
+
+
 class TestDefinition(BaseModel, ABC):
     """Base Test object."""
 
@@ -96,6 +138,7 @@ class TestDefinition(BaseModel, ABC):
     extra_cmd_args: dict[str, str] = {}
     extra_container_mounts: list[str] = []
     git_repos: list[GitRepo] = []
+    nsys: Optional[NsysConfiguration] = None
 
     @property
     def cmd_args_dict(self) -> Dict[str, Union[str, List[str]]]:
diff --git a/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/slurm_container/slurm_command_gen_strategy.py
@@ -27,6 +27,10 @@ class SlurmContainerCommandGenStrategy(SlurmCommandGenStrategy):
     def _container_mounts(self, tr: TestRun) -> list[str]:
         return []
 
+    def gen_nsys_command(self, tr: TestRun) -> list[str]:
+        """Return no Nsys parts here; generate_test_command puts the Nsys command into the test command itself."""
+        return []
+
     def gen_srun_prefix(self, slurm_args: dict[str, Any], tr: TestRun) -> list[str]:
         tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition)
         slurm_args["image_path"] = tdef.docker_image.installed_path
@@ -37,7 +41,7 @@ def generate_test_command(
         self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
     ) -> list[str]:
         tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, tr.test.test_definition)
-        srun_command_parts: list[str] = [tdef.cmd_args.cmd]
+        srun_command_parts: list[str] = [*super().gen_nsys_command(tr), tdef.cmd_args.cmd]
         if tr.test.extra_cmd_args:
             srun_command_parts.append(tr.test.extra_cmd_args)
 
diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
@@ -220,6 +220,13 @@ def gen_post_test(self, post_test: TestScenario, base_output_path: Path) -> str:
 
         return "\n".join(post_test_commands)
 
+    def gen_nsys_command(self, tr: TestRun) -> list[str]:
+        nsys = tr.test.test_definition.nsys
+        if not nsys or not nsys.enable:  # no Nsys settings on the test, or present but switched off
+            return []
+
+        return nsys.cmd_args
+
     def _gen_srun_command(
         self,
         slurm_args: Dict[str, Any],
@@ -228,8 +235,9 @@ def _gen_srun_command(
         tr: TestRun,
     ) -> str:
         srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
+        nsys_command_parts = self.gen_nsys_command(tr)
         test_command_parts = self.generate_test_command(env_vars, cmd_args, tr)
-        return " ".join(srun_command_parts + test_command_parts)
+        return " ".join(srun_command_parts + nsys_command_parts + test_command_parts)
 
     def gen_srun_prefix(self, slurm_args: Dict[str, Any], tr: TestRun) -> List[str]:
         srun_command_parts = ["srun", f"--mpi={self.system.mpi}"]
diff --git a/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_slurm_container_slurm_command_gen_strategy.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import pytest
+
+from cloudai import TestRun
+from cloudai._core.test import NsysConfiguration, Test
+from cloudai._core.test_template import TestTemplate
+from cloudai.schema.test_template.slurm_container.slurm_command_gen_strategy import SlurmContainerCommandGenStrategy
+from cloudai.systems import SlurmSystem
+from cloudai.test_definitions.slurm_container import SlurmContainerCmdArgs, SlurmContainerTestDefinition
+
+
+@pytest.fixture
+def test_run(slurm_system: SlurmSystem) -> TestRun:
+    tdef = SlurmContainerTestDefinition(
+        name="sc",
+        description="desc",
+        test_template_name="tt",
+        cmd_args=SlurmContainerCmdArgs(docker_image_url="docker://url", cmd="cmd"),
+    )  # `nsys` is not passed here; tests that need it set it on the definition afterwards
+    t = Test(test_definition=tdef, test_template=TestTemplate(name="tt", system=slurm_system))
+    tr = TestRun(name="name", test=t, num_nodes=1, nodes=[])
+    return tr
+
+
+def test_default(slurm_system: SlurmSystem, test_run: TestRun) -> None:
+    cgs = SlurmContainerCommandGenStrategy(slurm_system, {})
+    cmd = cgs.gen_srun_command(test_run)
+
+    srun_part = (
+        f"srun --mpi={slurm_system.mpi} --container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
+        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results --no-container-mount-home"
+    )
+
+    assert cmd == f'{srun_part} bash -c "cmd"'  # no Nsys settings: only the configured cmd is run
+
+
+def test_with_nsys(slurm_system: SlurmSystem, test_run: TestRun) -> None:
+    cgs = SlurmContainerCommandGenStrategy(slurm_system, {})
+    nsys = NsysConfiguration()  # all defaults, so tracing is enabled
+    test_run.test.test_definition.nsys = nsys
+    cmd = cgs.gen_srun_command(test_run)
+
+    srun_part = (
+        f"srun --mpi={slurm_system.mpi} --container-image={test_run.test.test_definition.cmd_args.docker_image_url} "
+        f"--container-mounts={Path.cwd().absolute()}:/cloudai_run_results --no-container-mount-home"
+    )
+
+    assert cmd == f'{srun_part} bash -c "{" ".join(nsys.cmd_args)} cmd"'  # Nsys sits inside bash -c, before cmd
diff --git a/tests/test_test_definitions.py b/tests/test_test_definitions.py
@@ -20,7 +20,7 @@
 import pytest
 import toml
 
-from cloudai import Parser, Registry
+from cloudai import NsysConfiguration, Parser, Registry
 from cloudai.test_definitions import ChakraReplayCmdArgs, NCCLCmdArgs, NCCLTestDefinition
 from cloudai.test_definitions.chakra_replay import ChakraReplayTestDefinition
 from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition
@@ -146,3 +146,24 @@ def test_python_executable_installable_persists(test: NeMoLauncherTestDefinition
     test.python_executable.venv_path = tmp_path
     assert test.python_executable.git_repo.installed_path == tmp_path
     assert test.python_executable.venv_path == tmp_path
+
+
+class TestNsysConfiguration:
+    def test_default(self):
+        nsys = NsysConfiguration()
+        assert nsys.enable is True
+        assert nsys.nsys_binary == "nsys"
+        assert nsys.task == "profile"
+
+    def test_cmd_args(self):
+        nsys = NsysConfiguration()
+        assert nsys.cmd_args == ["nsys", "profile"]  # optional fields left at None add nothing
+
+    @pytest.mark.parametrize("value", [True, False])
+    def test_force_overwrite(self, value: bool):
+        nsys = NsysConfiguration(force_overwrite=value)
+        assert nsys.cmd_args == ["nsys", "profile", f"--force-overwrite={'true' if value else 'false'}"]
+
+    def test_extra_args(self):
+        nsys = NsysConfiguration(extra_args=["--extra", "args"])
+        assert nsys.cmd_args == ["nsys", "profile", "--extra", "args"]  # extra args follow unchanged