Skip to content

Commit 84b7161

Browse files
authored
Merge pull request #729 from NVIDIA/am/nnodes
Always set number of nodes for srun cmd
2 parents c952278 + 531acef commit 84b7161

29 files changed

+79 additions, -75 deletions (lines changed)

src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -237,8 +237,11 @@ def _gen_srun_command(self) -> str:
237237
def image_path(self) -> Optional[str]:
238238
return None
239239

240-
def gen_srun_prefix(self, use_pretest_extras: bool = False) -> List[str]:
240+
def gen_srun_prefix(self, use_pretest_extras: bool = False, with_num_nodes: bool = True) -> List[str]:
241+
num_nodes, _ = self.get_cached_nodes_spec()
241242
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
243+
if with_num_nodes:
244+
srun_command_parts.append(f"-N{num_nodes}")
242245
if use_pretest_extras and self.test_run.pre_test:
243246
for pre_tr in self.test_run.pre_test.test_runs:
244247
srun_command_parts.extend(self._get_cmd_gen_strategy(pre_tr).pre_test_srun_extra_args(self.test_run))

src/cloudai/workloads/bash_cmd/bash_cmd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def gen_nsys_command(self) -> list[str]:
4646
"""NSYS command is generated as part of the test command and disabled here."""
4747
return []
4848

49-
def gen_srun_prefix(self, use_pretest_extras: bool = False) -> list[str]: # noqa: Vulture
49+
def gen_srun_prefix(self, use_pretest_extras: bool = False, with_num_nodes: bool = True) -> list[str]: # noqa: Vulture
5050
return []
5151

5252
def generate_test_command(self) -> list[str]:

src/cloudai/workloads/common/nixl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def gen_etcd_srun_command(self, etcd_path: str) -> list[str]:
5353
"--initial-cluster-state=new",
5454
]
5555
cmd = [
56-
*self.gen_srun_prefix(),
56+
*self.gen_srun_prefix(with_num_nodes=False),
5757
f"--output={self.test_run.output_path.absolute() / 'etcd.log'}",
5858
"--overlap",
5959
"--ntasks-per-node=1",
@@ -93,7 +93,7 @@ def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]:
9393
return cmd
9494

9595
def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list[list[str]]:
96-
prefix_part = self.gen_srun_prefix()
96+
prefix_part = self.gen_srun_prefix(with_num_nodes=False)
9797
bash_part = [
9898
"bash",
9999
"-c",

src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def _gen_srun_command(self) -> str:
6161

6262
def gen_matrix_gen_srun_command(self) -> list[str]:
6363
cmd = [
64-
*self.gen_srun_prefix(),
64+
*self.gen_srun_prefix(with_num_nodes=False),
6565
"--ntasks-per-node=1",
6666
"--ntasks=1",
6767
"-N1",

src/cloudai/workloads/slurm_container/slurm_command_gen_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def image_path(self) -> str | None:
3535
tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, self.test_run.test)
3636
return str(tdef.docker_image.installed_path)
3737

38-
def gen_srun_prefix(self, use_pretest_extras: bool = False) -> list[str]:
39-
cmd = super().gen_srun_prefix()
38+
def gen_srun_prefix(self, use_pretest_extras: bool = False, with_num_nodes: bool = True) -> list[str]:
39+
cmd = super().gen_srun_prefix(use_pretest_extras, with_num_nodes)
4040
tdef: SlurmContainerTestDefinition = cast(SlurmContainerTestDefinition, self.test_run.test)
4141
return [*cmd, *tdef.extra_srun_args]
4242

src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def image_path(self) -> str | None:
9494
def _build_server_srun(self, num_server_nodes: int) -> str:
9595
test_definition = cast(TritonInferenceTestDefinition, self.test_run.test)
9696
self._current_container_image = str(test_definition.server_docker_image.installed_path)
97-
srun_prefix = self.gen_srun_prefix()
97+
srun_prefix = self.gen_srun_prefix(with_num_nodes=False)
9898
self._current_container_image = None
9999

100100
srun_prefix.append(f"--nodes={num_server_nodes}")
@@ -107,7 +107,7 @@ def _build_server_srun(self, num_server_nodes: int) -> str:
107107
def _build_client_srun(self, num_client_nodes: int) -> str:
108108
test_definition = cast(TritonInferenceTestDefinition, self.test_run.test)
109109
self._current_container_image = str(test_definition.client_docker_image.installed_path)
110-
srun_prefix = self.gen_srun_prefix()
110+
srun_prefix = self.gen_srun_prefix(with_num_nodes=False)
111111
self._current_container_image = None
112112

113113
srun_prefix.append(f"--nodes={num_client_nodes}")

tests/ref_data/ai-dynamo.sbatch

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010

1111
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
1212

13-
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
13+
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
1414

15-
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
15+
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1616

1717
num_retries=${DYNAMO_NUM_RETRY_ON_FAILURE:-0}
1818
for try in $(seq 0 $num_retries); do
@@ -21,6 +21,7 @@ for try in $(seq 0 $num_retries); do
2121
srun \
2222
--export=ALL \
2323
--mpi=pmix \
24+
-N2 \
2425
--container-image=nvcr.io/nvidia/ai-dynamo:24.09 \
2526
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh \
2627
--nodes=2 \

tests/ref_data/ddlb.sbatch

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
1212

13-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
13+
srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
1414

15-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
15+
srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1616

17-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before"
17+
srun --export=ALL --mpi=pmix -N1 --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before"

tests/ref_data/deepep-benchmark.sbatch

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ echo Num Nodes: ${#nodes[@]}
2020
echo Head Node IP: $head_node_ip
2121

2222

23-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
23+
srun --export=ALL --mpi=pmix -N2 --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
2424

25-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
25+
srun --export=ALL --mpi=pmix -N2 --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
2626

27-
srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results bash -c "source __OUTPUT_DIR__/output/env_vars.sh; torchrun --nnodes=2 --nproc_per_node=1 --rdzv_id=$RANDOM --rdzv_backend=c10d --rdzv_endpoint=$head_node_ip:29500 /workspace/dp-benchmark/benchmark/benchmark.py __OUTPUT_DIR__/output/config.yaml"
27+
srun --export=ALL --mpi=pmix -N2 --container-image=gitlab-master.nvidia.com/ybenabou/warehouse/deepep:dp-benchmark --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/workspace/dp-benchmark/results bash -c "source __OUTPUT_DIR__/output/env_vars.sh; torchrun --nnodes=2 --nproc_per_node=1 --rdzv_id=$RANDOM --rdzv_backend=c10d --rdzv_endpoint=$head_node_ip:29500 /workspace/dp-benchmark/benchmark/benchmark.py __OUTPUT_DIR__/output/config.yaml"

tests/ref_data/gpt-no-hook.sbatch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head
1212
export COMBINE_THRESHOLD=1
1313
export PER_GPU_COMBINE_THRESHOLD=0
1414
export XLA_FLAGS="--xla_gpu_all_gather_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_all_reduce_combine_threshold_bytes=$COMBINE_THRESHOLD --xla_gpu_reduce_scatter_combine_threshold_bytes=$PER_GPU_COMBINE_THRESHOLD"
15-
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/opt/paxml/workspace/ --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
15+
srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/opt/paxml/workspace/ --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
1616

17-
srun --export=ALL --mpi=pmix --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/opt/paxml/workspace/ --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
17+
srun --export=ALL --mpi=pmix -N1 --container-image=https://docker/url --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:/opt/paxml/workspace/ --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1818

1919
echo "Loading container with srun command"
2020
srun --mpi=none --container-image=https://docker/url --container-name=cont true

0 commit comments

Comments
 (0)