Skip to content

Commit def60ed

Browse files
authored
Merge pull request #299 from NVIDIA/am/fix-nemo-launcher
Fix Nemo Launcher cmd generation, add tests
2 parents 95b3681 + cdb76bb commit def60ed

File tree

3 files changed, with 59 additions (+59) and 6 deletions (-6).

src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -53,14 +53,14 @@ def gen_exec_command(self, tr: TestRun) -> str:
5353
f"Local clone of git repo {tdef.python_executable.git_repo} does not exist. "
5454
"Please ensure to run installation before running the test."
5555
)
56-
repo_path = Path.cwd() # dry-run compatibility
56+
repo_path = self.system.install_path / tdef.python_executable.git_repo.repo_name # dry-run compatibility
5757
venv_path = tdef.python_executable.venv_path
5858
if not venv_path:
5959
logging.warning(
6060
f"The virtual environment for git repo {tdef.python_executable.git_repo} does not exist. "
6161
"Please ensure to run installation before running the test."
6262
)
63-
venv_path = repo_path # dry-run compatibility
63+
venv_path = self.system.install_path / tdef.python_executable.venv_name # dry-run compatibility
6464
py_bin = (venv_path / "bin" / "python").absolute()
6565
self.final_cmd_args.update(
6666
{
@@ -97,7 +97,8 @@ def _prepare_environment(self, cmd_args: Dict[str, str], extra_env_vars: Dict[st
9797
self.final_env_vars = self._override_env_vars(self.system.global_env_vars, extra_env_vars)
9898

9999
overriden_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args)
100-
self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in overriden_cmd_args.items()}
100+
overriden_cmd_args.pop("launcher_script", None)
101+
self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in sorted(overriden_cmd_args.items())}
101102

102103
for key, value in self.final_env_vars.items():
103104
self.final_cmd_args[f"env_vars.{key}"] = value
tests/ref_data/nemo-launcher.sbatch

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,24 @@
1+
__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022-venv/bin/python \
2+
__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts/main.py \
3+
cluster.gpus_per_node=null \
4+
numa_mapping.enable=True \
5+
stages=["training"] \
6+
training.exp_manager.create_checkpoint_callback=False \
7+
training.model.data.data_impl=mock \
8+
training.model.data.data_prefix=[] \
9+
training.model.global_batch_size=128 \
10+
training.model.micro_batch_size=2 \
11+
training.model.pipeline_model_parallel_size=4 \
12+
training.model.tensor_model_parallel_size=4 \
13+
training.run.name=run \
14+
training.run.time_limit=3:00:00 \
15+
training.trainer.enable_checkpointing=False \
16+
training.trainer.log_every_n_steps=1 \
17+
training.trainer.max_steps=20 \
18+
training.trainer.val_check_interval=10 \
19+
training=gpt3/40b_improved \
20+
cluster.partition=main \
21+
training.trainer.num_nodes=1 \
22+
container=nvcr.io/nvidia/nemo:24.05.01 \
23+
base_results_dir=__OUTPUT_DIR__/output \
24+
launcher_scripts_path=__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts

tests/test_acceptance.py

Lines changed: 31 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,13 +27,16 @@
2727
from cloudai.schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
2828
from cloudai.schema.test_template.jax_toolbox.template import JaxToolbox
2929
from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
30+
from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
31+
from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher
3032
from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
3133
from cloudai.schema.test_template.sleep.template import Sleep
3234
from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
3335
from cloudai.systems import SlurmSystem
3436
from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition
3537
from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition
3638
from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition
39+
from cloudai.test_definitions.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
3740
from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition
3841
from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition
3942

@@ -91,7 +94,9 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]:
9194
return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path)
9295

9396

94-
@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"])
97+
@pytest.fixture(
98+
params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "nemo-launcher"]
99+
)
95100
def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]:
96101
if request.param == "ucc":
97102
tr = partial_tr(
@@ -211,6 +216,25 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
211216
tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr])
212217

213218
return (tr, f"{request.param}.sbatch", "grok.run")
219+
elif request.param == "nemo-launcher":
220+
tr = partial_tr(
221+
name="nemo-launcher",
222+
test=Test(
223+
test_definition=NeMoLauncherTestDefinition(
224+
name="nemo-launcher",
225+
description="nemo-launcher",
226+
test_template_name="nemo-launcher",
227+
cmd_args=NeMoLauncherCmdArgs(),
228+
),
229+
test_template=NeMoLauncher(slurm_system, name="nemo-launcher"),
230+
),
231+
)
232+
tr.test.test_template.command_gen_strategy = NeMoLauncherSlurmCommandGenStrategy(
233+
slurm_system, tr.test.test_definition.cmd_args_dict
234+
)
235+
tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name")
236+
237+
return (tr, "nemo-launcher.sbatch", None)
214238

215239
raise ValueError(f"Unknown test: {request.param}")
216240

@@ -221,10 +245,14 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s
221245
tr = test_req[0]
222246

223247
sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1]
248+
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
249+
if "nemo-launcher" in test_req[1]:
250+
sbatch_script = slurm_system.output_path / "generated_command.sh"
251+
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent))
252+
else:
253+
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")
224254

225255
curr = Path(sbatch_script).read_text().strip()
226-
ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
227-
ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")
228256

229257
assert curr == ref
230258

0 commit comments

Comments
 (0)