Skip to content

Commit db9de5d

Browse files
committed
Fix inconsistent nodes settings
1 parent 85402bb commit db9de5d

File tree

9 files changed

+20
-18
lines changed

9 files changed

+20
-18
lines changed

src/cloudai/systems/slurm/slurm_command_gen_strategy.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -237,9 +237,11 @@ def _gen_srun_command(self) -> str:
237237
def image_path(self) -> Optional[str]:
238238
return None
239239

240-
def gen_srun_prefix(self, use_pretest_extras: bool = False) -> List[str]:
240+
def gen_srun_prefix(self, use_pretest_extras: bool = False, with_num_nodes: bool = True) -> List[str]:
241241
num_nodes, _ = self.get_cached_nodes_spec()
242-
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}", f"-N{num_nodes}"]
242+
srun_command_parts = ["srun", "--export=ALL", f"--mpi={self.system.mpi}"]
243+
if with_num_nodes:
244+
srun_command_parts.append(f"-N{num_nodes}")
243245
if use_pretest_extras and self.test_run.pre_test:
244246
for pre_tr in self.test_run.pre_test.test_runs:
245247
srun_command_parts.extend(self._get_cmd_gen_strategy(pre_tr).pre_test_srun_extra_args(self.test_run))

src/cloudai/workloads/common/nixl.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def gen_etcd_srun_command(self, etcd_path: str) -> list[str]:
5353
"--initial-cluster-state=new",
5454
]
5555
cmd = [
56-
*self.gen_srun_prefix(),
56+
*self.gen_srun_prefix(with_num_nodes=False),
5757
f"--output={self.test_run.output_path.absolute() / 'etcd.log'}",
5858
"--overlap",
5959
"--ntasks-per-node=1",
@@ -93,7 +93,7 @@ def gen_kill_and_wait_cmd(self, pid_var: str, timeout: int = 60) -> list[str]:
9393
return cmd
9494

9595
def gen_nixlbench_srun_commands(self, test_cmd: list[str], backend: str) -> list[list[str]]:
96-
prefix_part = self.gen_srun_prefix()
96+
prefix_part = self.gen_srun_prefix(with_num_nodes=False)
9797
bash_part = [
9898
"bash",
9999
"-c",

src/cloudai/workloads/nixl_perftest/slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def _gen_srun_command(self) -> str:
6161

6262
def gen_matrix_gen_srun_command(self) -> list[str]:
6363
cmd = [
64-
*self.gen_srun_prefix(),
64+
*self.gen_srun_prefix(with_num_nodes=False),
6565
"--ntasks-per-node=1",
6666
"--ntasks=1",
6767
"-N1",

src/cloudai/workloads/triton_inference/slurm_command_gen_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ def image_path(self) -> str | None:
9494
def _build_server_srun(self, num_server_nodes: int) -> str:
9595
test_definition = cast(TritonInferenceTestDefinition, self.test_run.test)
9696
self._current_container_image = str(test_definition.server_docker_image.installed_path)
97-
srun_prefix = self.gen_srun_prefix()
97+
srun_prefix = self.gen_srun_prefix(with_num_nodes=False)
9898
self._current_container_image = None
9999

100100
srun_prefix.append(f"--nodes={num_server_nodes}")
@@ -107,7 +107,7 @@ def _build_server_srun(self, num_server_nodes: int) -> str:
107107
def _build_client_srun(self, num_client_nodes: int) -> str:
108108
test_definition = cast(TritonInferenceTestDefinition, self.test_run.test)
109109
self._current_container_image = str(test_definition.client_docker_image.installed_path)
110-
srun_prefix = self.gen_srun_prefix()
110+
srun_prefix = self.gen_srun_prefix(with_num_nodes=False)
111111
self._current_container_image = None
112112

113113
srun_prefix.append(f"--nodes={num_client_nodes}")

tests/ref_data/nixl-kvbench.sbatch

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,15 @@ srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:tag --containe
1616

1717
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1818

19-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
19+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
2020
etcd_pid=$!
2121
timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; do sleep 1; done" || {
2222
echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds";
2323
exit 1
2424
}
25-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" &
25+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS" &
2626
sleep 15
27-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS"
27+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; path/to/python path/to/kvbench_script.sh profile --backend UCX --etcd_endpoints http://$NIXL_ETCD_ENDPOINTS"
2828
kill -9 $etcd_pid
2929
timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || {
3030
echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds";

tests/ref_data/nixl-perftest.sbatch

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,8 @@ srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --containe
1616

1717
srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1818

19-
srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 -N1 bash -c "python /workspace/nixl/benchmark/kvbench/test/inference_workload_matgen.py generate --num-user-requests=2 --batch-size=1 --num-prefill-nodes=1 --num-decode-nodes=1 --results-dir=__OUTPUT_DIR__/output/matrices --prefill-tp=1 --prefill-pp=1 --prefill-cp=1 --decode-tp=1 --decode-pp=1 --decode-cp=1 --model=model-name"
20-
srun --export=ALL --mpi=pmix -N1 --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
19+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 -N1 bash -c "python /workspace/nixl/benchmark/kvbench/test/inference_workload_matgen.py generate --num-user-requests=2 --batch-size=1 --num-prefill-nodes=1 --num-decode-nodes=1 --results-dir=__OUTPUT_DIR__/output/matrices --prefill-tp=1 --prefill-pp=1 --prefill-cp=1 --decode-tp=1 --decode-pp=1 --decode-cp=1 --model=model-name"
20+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:tag --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
2121
etcd_pid=$!
2222
timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; do sleep 1; done" || {
2323
echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds";

tests/ref_data/nixl_bench.sbatch

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,15 @@ srun --export=ALL --mpi=pmix -N2 --output=__OUTPUT_DIR__/output/mapping-stdout.t
1616

1717
srun --export=ALL --mpi=pmix -N2 --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __INSTALL_DIR__/slurm-metadata.sh
1818

19-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
19+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/etcd.log --overlap --ntasks-per-node=1 --ntasks=1 --nodelist=$SLURM_JOB_MASTER_NODE -N1 etcd --listen-client-urls=http://0.0.0.0:2379 --advertise-client-urls=http://$SLURM_JOB_MASTER_NODE:2379 --listen-peer-urls=http://0.0.0.0:2380 --initial-advertise-peer-urls=http://$SLURM_JOB_MASTER_NODE:2380 --initial-cluster="default=http://$SLURM_JOB_MASTER_NODE:2380" --initial-cluster-state=new &
2020
etcd_pid=$!
2121
timeout 60 bash -c "until curl -s $NIXL_ETCD_ENDPOINTS/health > /dev/null 2>&1; do sleep 1; done" || {
2222
echo "ETCD ($NIXL_ETCD_ENDPOINTS) was unreachable after 60 seconds";
2323
exit 1
2424
}
25-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" &
25+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=0 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX" &
2626
sleep 15
27-
srun --export=ALL --mpi=pmix -N2 --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX"
27+
srun --export=ALL --mpi=pmix --container-image=url.com/docker:2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__INSTALL_DIR__:/cloudai_install,__OUTPUT_DIR__/output --overlap --relative=1 --ntasks-per-node=1 --ntasks=1 -N1 bash -c "source __OUTPUT_DIR__/output/env_vars.sh; ./nixlbench --etcd-endpoints http://$NIXL_ETCD_ENDPOINTS --backend UCX"
2828
kill -9 $etcd_pid
2929
timeout 60 bash -c "while kill -0 $etcd_pid 2>/dev/null; do sleep 1; done" || {
3030
echo "Failed to kill ETCD (pid=$etcd_pid) within 60 seconds";

tests/ref_data/triton-inference.sbatch

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,8 @@ srun --export=ALL --mpi=pmix -N3 --output=__OUTPUT_DIR__/output/mapping-stdout.t
1919

2020
srun --export=ALL --mpi=pmix -N3 --ntasks=3 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh
2121

22-
srun --export=ALL --mpi=pmix -N3 --container-image=nvcr.io/nim/deepseek-ai/deepseek-r1:1.7.2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:ro,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:rw,__OUTPUT_DIR__/output/start_server_wrapper.sh:/opt/nim/start_server_wrapper.sh:ro --nodes=2 --ntasks=2 --ntasks-per-node=1 /opt/nim/start_server_wrapper.sh &
22+
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nim/deepseek-ai/deepseek-r1:1.7.2 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:ro,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:rw,__OUTPUT_DIR__/output/start_server_wrapper.sh:/opt/nim/start_server_wrapper.sh:ro --nodes=2 --ntasks=2 --ntasks-per-node=1 /opt/nim/start_server_wrapper.sh &
2323

2424
sleep 3300
2525

26-
srun --export=ALL --mpi=pmix -N3 --container-image=nvcr.io/nvidia/tritonserver:25.01-py3-sdk --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:ro,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:rw,__OUTPUT_DIR__/output/start_server_wrapper.sh:/opt/nim/start_server_wrapper.sh:ro --nodes=1 --ntasks=1 genai-perf profile -m model --endpoint-type chat --service-kind openai --streaming -u $SLURM_JOB_MASTER_NODE:8000 --num-prompts 20 --synthetic-input-tokens-mean 128 --synthetic-input-tokens-stddev 0 --concurrency 1 --output-tokens-mean 128 --extra-inputs max_tokens:128 --extra-inputs min_tokens:128 --extra-inputs ignore_eos:true --artifact-dir /cloudai_run_results --tokenizer tok -- -v --max-threads 1 --request-count 20
26+
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/tritonserver:25.01-py3-sdk --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:ro,__OUTPUT_DIR__/output:__OUTPUT_DIR__/output:rw,__OUTPUT_DIR__/output/start_server_wrapper.sh:/opt/nim/start_server_wrapper.sh:ro --nodes=1 --ntasks=1 genai-perf profile -m model --endpoint-type chat --service-kind openai --streaming -u $SLURM_JOB_MASTER_NODE:8000 --num-prompts 20 --synthetic-input-tokens-mean 128 --synthetic-input-tokens-stddev 0 --concurrency 1 --output-tokens-mean 128 --extra-inputs max_tokens:128 --extra-inputs min_tokens:128 --extra-inputs ignore_eos:true --artifact-dir /cloudai_run_results --tokenizer tok -- -v --max-threads 1 --request-count 20

tests/slurm_command_gen_strategy/test_nixl_perftest_slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def test_gen_matrix_gen_srun_command(test_run: TestRun, slurm_system: SlurmSyste
6161
strategy.gen_matrix_gen_command = lambda: ["cmd"]
6262
cmd = strategy.gen_matrix_gen_srun_command()
6363
assert cmd == [
64-
*strategy.gen_srun_prefix(),
64+
*strategy.gen_srun_prefix(with_num_nodes=False),
6565
"--ntasks-per-node=1",
6666
"--ntasks=1",
6767
"-N1",

0 commit comments

Comments
 (0)