Merge pull request #299 from NVIDIA/am/fix-nemo-launcher

amaslenn · web-flow · commit def60edf1a03 · 2024-11-12T11:59:08.000+01:00
Fix NeMo Launcher command generation, add tests
diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py
@@ -53,14 +53,14 @@ def gen_exec_command(self, tr: TestRun) -> str:
                 f"Local clone of git repo {tdef.python_executable.git_repo} does not exist. "
                 "Please ensure to run installation before running the test."
             )
-            repo_path = Path.cwd()  # dry-run compatibility
+            repo_path = self.system.install_path / tdef.python_executable.git_repo.repo_name  # dry-run compatibility
         venv_path = tdef.python_executable.venv_path
         if not venv_path:
             logging.warning(
                 f"The virtual environment for git repo {tdef.python_executable.git_repo} does not exist. "
                 "Please ensure to run installation before running the test."
             )
-            venv_path = repo_path  # dry-run compatibility
+            venv_path = self.system.install_path / tdef.python_executable.venv_name  # dry-run compatibility
         py_bin = (venv_path / "bin" / "python").absolute()
         self.final_cmd_args.update(
             {
@@ -97,7 +97,8 @@ def _prepare_environment(self, cmd_args: Dict[str, str], extra_env_vars: Dict[st
         self.final_env_vars = self._override_env_vars(self.system.global_env_vars, extra_env_vars)
 
         overriden_cmd_args = self._override_cmd_args(self.default_cmd_args, cmd_args)
-        self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in overriden_cmd_args.items()}
+        overriden_cmd_args.pop("launcher_script", None)
+        self.final_cmd_args = {k: self._handle_special_keys(k, v) for k, v in sorted(overriden_cmd_args.items())}
 
         for key, value in self.final_env_vars.items():
             self.final_cmd_args[f"env_vars.{key}"] = value
diff --git a/tests/ref_data/nemo-launcher.sbatch b/tests/ref_data/nemo-launcher.sbatch
@@ -0,0 +1,24 @@
+__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022-venv/bin/python \
+ __OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts/main.py \
+ cluster.gpus_per_node=null \
+ numa_mapping.enable=True \
+ stages=["training"] \
+ training.exp_manager.create_checkpoint_callback=False \
+ training.model.data.data_impl=mock \
+ training.model.data.data_prefix=[] \
+ training.model.global_batch_size=128 \
+ training.model.micro_batch_size=2 \
+ training.model.pipeline_model_parallel_size=4 \
+ training.model.tensor_model_parallel_size=4 \
+ training.run.name=run \
+ training.run.time_limit=3:00:00 \
+ training.trainer.enable_checkpointing=False \
+ training.trainer.log_every_n_steps=1 \
+ training.trainer.max_steps=20 \
+ training.trainer.val_check_interval=10 \
+ training=gpt3/40b_improved \
+ cluster.partition=main \
+ training.trainer.num_nodes=1 \
+ container=nvcr.io/nvidia/nemo:24.05.01 \
+ base_results_dir=__OUTPUT_DIR__/output \
+ launcher_scripts_path=__OUTPUT_DIR__/install/NeMo-Framework-Launcher__599ecfcbbd64fd2de02f2cc093b1610d73854022/launcher_scripts
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
@@ -27,13 +27,16 @@
 from cloudai.schema.test_template.jax_toolbox.slurm_command_gen_strategy import JaxToolboxSlurmCommandGenStrategy
 from cloudai.schema.test_template.jax_toolbox.template import JaxToolbox
 from cloudai.schema.test_template.nccl_test.slurm_command_gen_strategy import NcclTestSlurmCommandGenStrategy
+from cloudai.schema.test_template.nemo_launcher.slurm_command_gen_strategy import NeMoLauncherSlurmCommandGenStrategy
+from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher
 from cloudai.schema.test_template.sleep.slurm_command_gen_strategy import SleepSlurmCommandGenStrategy
 from cloudai.schema.test_template.sleep.template import Sleep
 from cloudai.schema.test_template.ucc_test.slurm_command_gen_strategy import UCCTestSlurmCommandGenStrategy
 from cloudai.systems import SlurmSystem
 from cloudai.test_definitions.gpt import GPTCmdArgs, GPTTestDefinition
 from cloudai.test_definitions.grok import GrokCmdArgs, GrokTestDefinition
 from cloudai.test_definitions.nccl import NCCLCmdArgs, NCCLTestDefinition
+from cloudai.test_definitions.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
 from cloudai.test_definitions.sleep import SleepCmdArgs, SleepTestDefinition
 from cloudai.test_definitions.ucc import UCCCmdArgs, UCCTestDefinition
 
@@ -91,7 +94,9 @@ def partial_tr(slurm_system: SlurmSystem) -> partial[TestRun]:
     return partial(TestRun, num_nodes=1, nodes=[], output_path=slurm_system.output_path)
 
 
-@pytest.fixture(params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook"])
+@pytest.fixture(
+    params=["ucc", "nccl", "sleep", "gpt-pre-test", "gpt-no-hook", "grok-pre-test", "grok-no-hook", "nemo-launcher"]
+)
 def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> tuple[TestRun, str, Optional[str]]:
     if request.param == "ucc":
         tr = partial_tr(
@@ -211,6 +216,25 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
             tr.pre_test = TestScenario(name=f"{pre_test_tr.name} NCCL pre-test", test_runs=[pre_test_tr])
 
         return (tr, f"{request.param}.sbatch", "grok.run")
+    elif request.param == "nemo-launcher":
+        tr = partial_tr(
+            name="nemo-launcher",
+            test=Test(
+                test_definition=NeMoLauncherTestDefinition(
+                    name="nemo-launcher",
+                    description="nemo-launcher",
+                    test_template_name="nemo-launcher",
+                    cmd_args=NeMoLauncherCmdArgs(),
+                ),
+                test_template=NeMoLauncher(slurm_system, name="nemo-launcher"),
+            ),
+        )
+        tr.test.test_template.command_gen_strategy = NeMoLauncherSlurmCommandGenStrategy(
+            slurm_system, tr.test.test_definition.cmd_args_dict
+        )
+        tr.test.test_template.command_gen_strategy.job_name = Mock(return_value="job_name")
+
+        return (tr, "nemo-launcher.sbatch", None)
 
     raise ValueError(f"Unknown test: {request.param}")
 
@@ -221,10 +245,14 @@ def test_sbatch_generation(slurm_system: SlurmSystem, test_req: tuple[TestRun, s
     tr = test_req[0]
 
     sbatch_script = tr.test.test_template.gen_exec_command(tr).split()[-1]
+    ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
+    if "nemo-launcher" in test_req[1]:
+        sbatch_script = slurm_system.output_path / "generated_command.sh"
+        ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path.parent))
+    else:
+        ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")
 
     curr = Path(sbatch_script).read_text().strip()
-    ref = (Path(__file__).parent / "ref_data" / test_req[1]).read_text().strip()
-    ref = ref.replace("__OUTPUT_DIR__", str(slurm_system.output_path)).replace("__JOB_NAME__", "job_name")
 
     assert curr == ref