Improve tokenizer path handling in NeMo Launcher Slurm strategy

TaekyungHeo · TaekyungHeo · commit 70a62a3e0f5f · 2024-07-10T07:25:43.000-04:00
diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py
@@ -78,10 +78,18 @@ def gen_exec_command(
         full_cmd = f"python {launcher_path}/launcher_scripts/main.py {cmd_args_str}"
 
         if extra_cmd_args:
-            full_cmd += " " + extra_cmd_args
-            if "training.model.tokenizer.model" in extra_cmd_args:
-                tokenizer_path = extra_cmd_args.split("training.model.tokenizer.model=")[1].split(" ")[0]
-                full_cmd += " " + f"container_mounts=[{tokenizer_path}:{tokenizer_path}]"
+            full_cmd += f" {extra_cmd_args}"
+            tokenizer_key = "training.model.tokenizer.model="
+            if tokenizer_key in extra_cmd_args:
+                tokenizer_path = extra_cmd_args.split(tokenizer_key, 1)[1].split(" ", 1)[0]
+                if not os.path.isfile(tokenizer_path):
+                    raise ValueError(
+                        f"The provided tokenizer path '{tokenizer_path}' is not valid. "
+                        "Please review the test schema file to ensure the tokenizer path is correct. "
+                        "If it contains a placeholder value, refer to USER_GUIDE.md to download the tokenizer "
+                        "and update the schema file accordingly."
+                    )
+                full_cmd += f" container_mounts=[{tokenizer_path}:{tokenizer_path}]"
 
         env_vars_str = " ".join(f"{key}={value}" for key, value in final_env_vars.items())
         full_cmd = f"{env_vars_str} {full_cmd}" if env_vars_str else full_cmd
diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py
@@ -159,24 +159,54 @@ def test_env_var_escaping(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrateg
 
         assert "TEST_VAR=\\'value,with,commas\\'" in cmd
 
-    def test_tokenizer_handled(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
+    def test_tokenizer_handled(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy, tmp_path: Path):
         extra_env_vars = {"TEST_VAR_1": "value1"}
         cmd_args = {
             "docker_image_url": "fake",
             "repository_url": "fake",
             "repository_commit_hash": "fake",
         }
+        tokenizer_path = tmp_path / "tokenizer"
+        tokenizer_path.touch()
+
         cmd = nemo_cmd_gen.gen_exec_command(
             env_vars={},
             cmd_args=cmd_args,
             extra_env_vars=extra_env_vars,
-            extra_cmd_args="training.model.tokenizer.model=value",
+            extra_cmd_args=f"training.model.tokenizer.model={tokenizer_path}",
             output_path="",
             num_nodes=1,
             nodes=[],
         )
 
-        assert "container_mounts=[value:value]" in cmd
+        assert f"container_mounts=[{tokenizer_path}:{tokenizer_path}]" in cmd
+
+    def test_invalid_tokenizer_path(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
+        extra_env_vars = {"TEST_VAR_1": "value1"}
+        cmd_args = {
+            "docker_image_url": "fake",
+            "repository_url": "fake",
+            "repository_commit_hash": "fake",
+        }
+        invalid_tokenizer_path = "/invalid/path/to/tokenizer"
+
+        with pytest.raises(
+            ValueError,
+            match=(  # pytest applies re.search to this pattern, so literal periods are escaped
+                r"The provided tokenizer path '/invalid/path/to/tokenizer' is not valid\. Please review the test "
+                r"schema file to ensure the tokenizer path is correct\. If it contains a placeholder value, refer to "
+                r"USER_GUIDE\.md to download the tokenizer and update the schema file accordingly\."
+            ),
+        ):
+            nemo_cmd_gen.gen_exec_command(
+                env_vars={},
+                cmd_args=cmd_args,
+                extra_env_vars=extra_env_vars,
+                extra_cmd_args=f"training.model.tokenizer.model={invalid_tokenizer_path}",
+                output_path="",
+                num_nodes=1,
+                nodes=[],
+            )
 
 
 class TestWriteSbatchScript: