Skip to content

Commit 3adef1f

Browse files
authored
Merge pull request #666 from NVIDIA/am/bug-4554508
Handle single-sbatch metadata layout in report
2 parents f900d8b + 7217a68 commit 3adef1f

File tree

2 files changed

+104
-9
lines changed

2 files changed

+104
-9
lines changed

src/cloudai/reporter.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -57,14 +57,19 @@ class SlurmReportItem:
5757
nodes: Optional[str] = None
5858

5959
@classmethod
60-
def get_metadata(cls, run_dir: Path) -> Optional[SlurmSystemMetadata]:
61-
if not (run_dir / "metadata").exists():
62-
logging.debug(f"No metadata folder found in {run_dir}")
63-
return None
64-
65-
node_files = list(run_dir.glob("metadata/node-*.toml"))
60+
def get_metadata(cls, run_dir: Path, results_root: Path) -> Optional[SlurmSystemMetadata]:
61+
metadata_path = run_dir / "metadata"
62+
if not metadata_path.exists():
63+
logging.debug(f"No metadata folder found in {run_dir=}")
64+
if not (results_root / "metadata").exists():
65+
logging.debug(f"No metadata folder found in {results_root=}")
66+
return None
67+
else: # single-sbatch case
68+
metadata_path = results_root / "metadata"
69+
70+
node_files = list(metadata_path.glob("node-*.toml"))
6671
if not node_files:
67-
logging.debug(f"No node files found in {run_dir}/metadata")
72+
logging.debug(f"No node files found in {metadata_path}")
6873
return None
6974

7075
node_file = node_files[0]
@@ -83,7 +88,7 @@ def from_test_runs(cls, test_runs: list[TestRun], results_root: Path) -> list["S
8388
ri = SlurmReportItem(case_name(tr), tr.test.description)
8489
if tr.output_path.exists():
8590
ri.logs_path = f"./{tr.output_path.relative_to(results_root)}"
86-
if metadata := cls.get_metadata(tr.output_path):
91+
if metadata := cls.get_metadata(tr.output_path, results_root):
8792
ri.nodes = metadata.slurm.node_list
8893
report_items.append(ri)
8994

tests/test_reporter.py

Lines changed: 91 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,16 @@
2929
from cloudai.cli.handlers import generate_reports
3030
from cloudai.core import CommandGenStrategy, TestTemplate
3131
from cloudai.models.scenario import ReportConfig, TestRunDetails
32-
from cloudai.reporter import PerTestReporter, StatusReporter, TarballReporter
32+
from cloudai.reporter import PerTestReporter, SlurmReportItem, StatusReporter, TarballReporter
33+
from cloudai.systems.slurm.slurm_metadata import (
34+
MetadataCUDA,
35+
MetadataMPI,
36+
MetadataNCCL,
37+
MetadataNetwork,
38+
MetadataSlurm,
39+
MetadataSystem,
40+
SlurmSystemMetadata,
41+
)
3342
from cloudai.systems.slurm.slurm_system import SlurmSystem
3443
from cloudai.systems.standalone.standalone_system import StandaloneSystem
3544
from cloudai.workloads.nccl_test import NCCLCmdArgs, NCCLTestDefinition
@@ -254,3 +263,84 @@ def test_report_scenario_has_highest_priority(self, slurm_system: SlurmSystem) -
254263
slurm_system.output_path,
255264
)
256265
assert MY_REPORT_CALLED == 1
266+
267+
268+
@pytest.fixture
269+
def slurm_metadata() -> SlurmSystemMetadata:
270+
return SlurmSystemMetadata(
271+
user="user",
272+
system=MetadataSystem(
273+
os_type="os_type",
274+
os_version="os_version",
275+
linux_kernel_version="linux_kernel_version",
276+
gpu_arch_type="gpu_arch_type",
277+
cpu_model_name="cpu_model_name",
278+
cpu_arch_type="cpu_arch_type",
279+
),
280+
mpi=MetadataMPI(
281+
mpi_type="mpi_type",
282+
mpi_version="mpi_version",
283+
hpcx_version="hpcx_version",
284+
),
285+
cuda=MetadataCUDA(
286+
cuda_build_version="cuda_build_version",
287+
cuda_runtime_version="cuda_runtime_version",
288+
cuda_driver_version="cuda_driver_version",
289+
),
290+
network=MetadataNetwork(
291+
nics="nics",
292+
switch_type="switch_type",
293+
network_name="network_name",
294+
mofed_version="mofed_version",
295+
libfabric_version="libfabric_version",
296+
),
297+
nccl=MetadataNCCL(
298+
version="1.1.1",
299+
commit_sha="abcdef15",
300+
),
301+
slurm=MetadataSlurm(
302+
cluster_name="cluster_name",
303+
node_list="node1,node2",
304+
num_nodes="2",
305+
ntasks_per_node="8",
306+
ntasks="16",
307+
job_id="123456",
308+
),
309+
)
310+
311+
312+
class TestSlurmReportItem:
313+
def test_no_metadata_folder(self, slurm_system: SlurmSystem) -> None:
314+
run_dir = slurm_system.output_path / "run_dir"
315+
run_dir.mkdir(parents=True, exist_ok=True)
316+
317+
meta = SlurmReportItem.get_metadata(run_dir, slurm_system.output_path)
318+
assert meta is None
319+
320+
def test_no_metadata_files(self, slurm_system: SlurmSystem) -> None:
321+
run_dir = slurm_system.output_path / "run_dir"
322+
(run_dir / "metadata").mkdir(parents=True, exist_ok=True)
323+
324+
meta = SlurmReportItem.get_metadata(run_dir, slurm_system.output_path)
325+
assert meta is None
326+
327+
def test_metadata_file_in_run_dir(self, slurm_system: SlurmSystem, slurm_metadata: SlurmSystemMetadata) -> None:
328+
run_dir = slurm_system.output_path / "run_dir"
329+
(run_dir / "metadata").mkdir(parents=True, exist_ok=True)
330+
with open(run_dir / "metadata" / "node-0.toml", "w") as f:
331+
toml.dump(slurm_metadata.model_dump(), f)
332+
333+
meta = SlurmReportItem.get_metadata(run_dir, slurm_system.output_path)
334+
assert meta is not None
335+
assert meta.slurm.node_list == slurm_metadata.slurm.node_list
336+
337+
def test_metadata_for_single_sbatch(self, slurm_system: SlurmSystem, slurm_metadata: SlurmSystemMetadata) -> None:
338+
run_dir = slurm_system.output_path / "run_dir"
339+
run_dir.mkdir(parents=True, exist_ok=True)
340+
(slurm_system.output_path / "metadata").mkdir(parents=True, exist_ok=True)
341+
with open(slurm_system.output_path / "metadata" / "node-0.toml", "w") as f:
342+
toml.dump(slurm_metadata.model_dump(), f)
343+
344+
meta = SlurmReportItem.get_metadata(run_dir, slurm_system.output_path)
345+
assert meta is not None
346+
assert meta.slurm.node_list == slurm_metadata.slurm.node_list

0 commit comments

Comments
 (0)