Commit b5c2187

Fix success condition

1 parent cb28dde commit b5c2187

8 files changed, +31 -59 lines changed

conf/common/test/osu_test.toml

Lines changed: 1 addition & 1 deletion
@@ -21,6 +21,6 @@ description = "OSU Benchmark example"
 [cmd_args]
 "docker_image_url" = "artifactory.nvidia.com/sw-nbu-swx-hpcx-docker-local/dlfw/pytorch:25.06-hpcx-v2.26-gcc-ubuntu24.04-cuda12-x86_64-latest-nightly"
 "benchmarks_dir" = "/opt/hpcx/ompi/tests/osu-micro-benchmarks"
-"benchmark" = ["osu_allreduce", "osu_allgather"]
+"benchmark" = "osu_allreduce"
 "iterations" = 10
 "message_size" = "1024"

conf/common/test_scenario/osu_test.toml

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@
 # limitations under the License.

 name = "osu_test_scenario"
+job_status_check = true

 [[Tests]]
 id = "Tests.1"

conf/osu/funk.toml

Lines changed: 0 additions & 15 deletions
This file was deleted.

doc/workloads/index.md

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/cloudai/workloads/osu_bench/osu_bench.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult:
             ),
         )

-        if "# Size Avg Latency(us) Min Latency(us) Max Latency(us) Iterations" not in content:
+        if "# Size" not in content:
             return JobStatusResult(
                 is_successful=False,
                 error_message=(
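
Possible rationale, inferred rather than stated in the commit: OSU micro-benchmarks print different result headers depending on the benchmark and on whether full statistics are requested, so matching only the common "# Size" prefix is more tolerant. The header strings below are illustrative assumptions, except the first, which is the exact string removed above:

# Assumed header variants; only the first is taken verbatim from the diff.
sample_headers = [
    "# Size Avg Latency(us) Min Latency(us) Max Latency(us) Iterations",  # osu_allreduce with full statistics
    "# Size       Avg Latency(us)",                                       # typical latency benchmark
    "# Size      Bandwidth (MB/s)",                                       # typical bandwidth benchmark
]
assert all("# Size" in h for h in sample_headers)  # the relaxed check accepts all of them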

src/cloudai/workloads/osu_bench/slurm_command_gen_strategy.py

Lines changed: 23 additions & 2 deletions
@@ -21,6 +21,16 @@
 from .osu_bench import OSUBenchCmdArgs, OSUBenchTestDefinition


+FULL_FLAG_UNSUPPORTED = [
+    "osu_latency", "osu_latency_mt", "osu_latency_mp", "osu_bw",
+    "osu_bibw", "osu_latency_persistent", "osu_bw_persistent",
+    "osu_bibw_persistent", "osu_multi_lat", "osu_mbw_mr",
+    "osu_put_latency", "osu_get_latency", "osu_acc_latency",
+    "osu_get_acc_latency", "osu_cas_latency", "osu_fop_latency",
+    "osu_put_bw", "osu_get_bw", "osu_put_bibw", "osu_init", "osu_hello"
+]
+
+
 class OSUBenchSlurmCommandGenStrategy(SlurmCommandGenStrategy):
     """Command generation strategy for OSU Benchmark test on Slurm systems."""

@@ -37,14 +47,25 @@ def generate_test_command(self) -> List[str]:
         binary = f"{args.benchmarks_dir}/{args.benchmark}"
         srun_command_parts = [binary]

-        general = {"docker_image_url", "location", "benchmark"}
+        general = {"docker_image_url", "benchmarks_dir", "benchmark"}

         for name, value in args.model_dump(exclude=general).items():
             if value is None:
                 continue

             flag = f"--{name.replace('_', '-')}"
-            srun_command_parts.append(f"{flag} {value}")
+
+            if isinstance(value, bool) and value:
+                argument = flag
+            else:
+                argument = f"{flag} {value}"
+
+            # Some benchmarks don't support the full flag; suppress it
+            # to avoid errors.
+            if name == "full" and args.benchmark in FULL_FLAG_UNSUPPORTED:
+                continue
+
+            srun_command_parts.append(argument)

         if self.test_run.test.extra_cmd_args:
             srun_command_parts.append(self.test_run.test.extra_args_str)
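
For illustration (not part of the commit), a standalone sketch of the flag rendering added above, using a hypothetical render_flags helper and a plain dict in place of the Pydantic model; the example values come from this commit's config and expected sbatch output:

def render_flags(cmd_args: dict, benchmark: str, full_unsupported: list) -> list:
    # Mirrors the loop above: true booleans become bare flags, other values
    # become "--flag value", and --full is dropped for benchmarks that reject it.
    parts = []
    for name, value in cmd_args.items():
        if value is None:
            continue
        if name == "full" and benchmark in full_unsupported:
            continue
        flag = f"--{name.replace('_', '-')}"
        parts.append(flag if isinstance(value, bool) and value else f"{flag} {value}")
    return parts


no_full = ["osu_latency", "osu_bw"]  # subset of FULL_FLAG_UNSUPPORTED above, for brevity

print(render_flags({"message_size": "1024", "iterations": 10, "full": True}, "osu_allreduce", no_full))
# ['--message-size 1024', '--iterations 10', '--full']  (matches the updated osu-bench.sbatch below)

print(render_flags({"message_size": "1024", "full": True}, "osu_bw", no_full))
# ['--message-size 1024']  (--full suppressed)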

tests/ref_data/osu-bench.sbatch

Lines changed: 3 additions & 3 deletions
@@ -10,8 +10,8 @@

 export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)

-srun --export=ALL --mpi=pmix -N1 --container-image=artifactory.nvidia.com/sw-nbu-swx-hpcx-docker-local/dlfw/pytorch:25.06-hpcx-v2.26-gcc-ubuntu24.04-cuda12-x86_64-latest-nightly --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
+srun --export=ALL --mpi=pmix --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."

-srun --export=ALL --mpi=pmix -N1 --container-image=artifactory.nvidia.com/sw-nbu-swx-hpcx-docker-local/dlfw/pytorch:25.06-hpcx-v2.26-gcc-ubuntu24.04-cuda12-x86_64-latest-nightly --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
+srun --export=ALL --mpi=pmix --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh

-srun --export=ALL --mpi=pmix -N1 --container-image=artifactory.nvidia.com/sw-nbu-swx-hpcx-docker-local/dlfw/pytorch:25.06-hpcx-v2.26-gcc-ubuntu24.04-cuda12-x86_64-latest-nightly --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce -m 1024 -i 10 -f"
+srun --export=ALL --mpi=pmix --container-image=nvcr.io#nvidia/pytorch:24.02-py3 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; /opt/hpcx/ompi/tests/osu-micro-benchmarks/osu_allreduce --message-size 1024 --iterations 10 --full"

tests/test_acceptance.py

Lines changed: 2 additions & 2 deletions
@@ -314,8 +314,8 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
             description="osu-bench",
             test_template_name="osu-bench",
             cmd_args=OSUBenchCmdArgs(
-                docker_image_url="artifactory.nvidia.com/sw-nbu-swx-hpcx-docker-local/dlfw/pytorch:25.06-hpcx-v2.26-gcc-ubuntu24.04-cuda12-x86_64-latest-nightly",
-                location="/opt/hpcx/ompi/tests/osu-micro-benchmarks",
+                docker_image_url="nvcr.io#nvidia/pytorch:24.02-py3",
+                benchmarks_dir="/opt/hpcx/ompi/tests/osu-micro-benchmarks",
                 benchmark="osu_allreduce",
                 iterations=10,
                 message_size="1024",
