|
29 | 29 | from cloudai.cli.handlers import generate_reports |
30 | 30 | from cloudai.core import CommandGenStrategy, TestTemplate |
31 | 31 | from cloudai.models.scenario import ReportConfig, TestRunDetails |
32 | | -from cloudai.reporter import PerTestReporter, StatusReporter, TarballReporter |
| 32 | +from cloudai.reporter import PerTestReporter, SlurmReportItem, StatusReporter, TarballReporter |
| 33 | +from cloudai.systems.slurm.slurm_metadata import ( |
| 34 | + MetadataCUDA, |
| 35 | + MetadataMPI, |
| 36 | + MetadataNCCL, |
| 37 | + MetadataNetwork, |
| 38 | + MetadataSlurm, |
| 39 | + MetadataSystem, |
| 40 | + SlurmSystemMetadata, |
| 41 | +) |
33 | 42 | from cloudai.systems.slurm.slurm_system import SlurmSystem |
34 | 43 | from cloudai.systems.standalone.standalone_system import StandaloneSystem |
35 | 44 | from cloudai.workloads.nccl_test import NCCLCmdArgs, NCCLTestDefinition |
@@ -254,3 +263,84 @@ def test_report_scenario_has_highest_priority(self, slurm_system: SlurmSystem) - |
254 | 263 | slurm_system.output_path, |
255 | 264 | ) |
256 | 265 | assert MY_REPORT_CALLED == 1 |
| 266 | + |
| 267 | + |
@pytest.fixture
def slurm_metadata() -> SlurmSystemMetadata:
    """Return a fully populated ``SlurmSystemMetadata`` made of placeholder values.

    Most fields are set to a string equal to the field's own name; the NCCL
    and Slurm sections use short literal sample values instead.
    """
    system_section = MetadataSystem(
        os_type="os_type",
        os_version="os_version",
        linux_kernel_version="linux_kernel_version",
        gpu_arch_type="gpu_arch_type",
        cpu_model_name="cpu_model_name",
        cpu_arch_type="cpu_arch_type",
    )
    mpi_section = MetadataMPI(
        mpi_type="mpi_type",
        mpi_version="mpi_version",
        hpcx_version="hpcx_version",
    )
    cuda_section = MetadataCUDA(
        cuda_build_version="cuda_build_version",
        cuda_runtime_version="cuda_runtime_version",
        cuda_driver_version="cuda_driver_version",
    )
    network_section = MetadataNetwork(
        nics="nics",
        switch_type="switch_type",
        network_name="network_name",
        mofed_version="mofed_version",
        libfabric_version="libfabric_version",
    )
    nccl_section = MetadataNCCL(version="1.1.1", commit_sha="abcdef15")
    slurm_section = MetadataSlurm(
        cluster_name="cluster_name",
        node_list="node1,node2",
        num_nodes="2",
        ntasks_per_node="8",
        ntasks="16",
        job_id="123456",
    )

    return SlurmSystemMetadata(
        user="user",
        system=system_section,
        mpi=mpi_section,
        cuda=cuda_section,
        network=network_section,
        nccl=nccl_section,
        slurm=slurm_section,
    )
| 310 | + |
| 311 | + |
class TestSlurmReportItem:
    """Tests for ``SlurmReportItem.get_metadata`` lookup of ``metadata/node-0.toml``."""

    def test_no_metadata_folder(self, slurm_system: SlurmSystem) -> None:
        """Without any ``metadata`` directory the lookup yields ``None``."""
        results_root = slurm_system.output_path
        run_dir = results_root / "run_dir"
        run_dir.mkdir(parents=True, exist_ok=True)

        assert SlurmReportItem.get_metadata(run_dir, results_root) is None

    def test_no_metadata_files(self, slurm_system: SlurmSystem) -> None:
        """An empty ``metadata`` directory inside the run directory yields ``None``."""
        results_root = slurm_system.output_path
        run_dir = results_root / "run_dir"
        metadata_dir = run_dir / "metadata"
        metadata_dir.mkdir(parents=True, exist_ok=True)

        assert SlurmReportItem.get_metadata(run_dir, results_root) is None

    def test_metadata_file_in_run_dir(self, slurm_system: SlurmSystem, slurm_metadata: SlurmSystemMetadata) -> None:
        """A TOML dump stored under ``<run_dir>/metadata`` is loaded back."""
        results_root = slurm_system.output_path
        run_dir = results_root / "run_dir"
        metadata_dir = run_dir / "metadata"
        metadata_dir.mkdir(parents=True, exist_ok=True)
        serialized = toml.dumps(slurm_metadata.model_dump())
        (metadata_dir / "node-0.toml").write_text(serialized)

        loaded = SlurmReportItem.get_metadata(run_dir, results_root)

        assert loaded is not None
        assert loaded.slurm.node_list == slurm_metadata.slurm.node_list

    def test_metadata_for_single_sbatch(self, slurm_system: SlurmSystem, slurm_metadata: SlurmSystemMetadata) -> None:
        """A TOML dump stored under ``<output_path>/metadata`` is found as well.

        Here the run directory itself has no ``metadata`` folder; the file
        lives next to it, directly under the system output path.
        """
        results_root = slurm_system.output_path
        run_dir = results_root / "run_dir"
        run_dir.mkdir(parents=True, exist_ok=True)
        metadata_dir = results_root / "metadata"
        metadata_dir.mkdir(parents=True, exist_ok=True)
        serialized = toml.dumps(slurm_metadata.model_dump())
        (metadata_dir / "node-0.toml").write_text(serialized)

        loaded = SlurmReportItem.get_metadata(run_dir, results_root)

        assert loaded is not None
        assert loaded.slurm.node_list == slurm_metadata.slurm.node_list
0 commit comments