Skip to content

Commit 8f57e3d

Browse files
Merge remote-tracking branch 'origin/main' into disable-dset-check
2 parents 8e7a7ac + b305b86 commit 8f57e3d

File tree

4 files changed

+133
-34
lines changed

4 files changed

+133
-34
lines changed

src/cloudai/schema/test_template/nemo_launcher/slurm_command_gen_strategy.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,20 @@ def gen_exec_command(
7878
del self.final_cmd_args["repository_commit_hash"]
7979
del self.final_cmd_args["docker_image_url"]
8080

81+
if self.slurm_system.account is not None:
82+
self.final_cmd_args["cluster.account"] = self.slurm_system.account
83+
self.final_cmd_args["cluster.job_name_prefix"] = f"{self.slurm_system.account}-cloudai.nemo:"
84+
self.final_cmd_args["cluster.gpus_per_node"] = (
85+
self.slurm_system.gpus_per_node if self.slurm_system.gpus_per_node is not None else "null"
86+
)
87+
88+
if ("data_dir" in self.final_cmd_args) and (self.final_cmd_args["data_dir"] == "DATA_DIR"):
89+
raise ValueError(
90+
"The 'data_dir' field of the NeMo launcher test contains the placeholder 'DATA_DIR'. "
91+
"Please update the test schema TOML file with a valid path to the dataset. "
92+
"The 'data_dir' field must point to an actual dataset location, not a placeholder."
93+
)
94+
8195
cmd_args_str = self._generate_cmd_args_str(self.final_cmd_args, nodes)
8296

8397
full_cmd = f"python {launcher_path}/launcher_scripts/main.py {cmd_args_str}"

src/cloudai/util/docker_image_cache_manager.py

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def __init__(self, success: bool, docker_image_path: Optional[Path] = None, mess
8282
message (str): A message providing additional information about the result.
8383
"""
8484
self.success = success
85-
self._docker_image_path = docker_image_path or Path()
85+
self.docker_image_path = docker_image_path
8686
self.message = message
8787

8888
def __bool__(self):
@@ -103,26 +103,6 @@ def __str__(self):
103103
"""
104104
return self.message
105105

106-
@property
107-
def docker_image_path(self) -> Path:
108-
"""
109-
Get the path to the Docker image.
110-
111-
Returns
112-
Path: Absolute path to the Docker image.
113-
"""
114-
return self._docker_image_path.absolute()
115-
116-
@docker_image_path.setter
117-
def docker_image_path(self, value: Path) -> None:
118-
"""
119-
Set the path to the Docker image.
120-
121-
Args:
122-
value (Path): The path to the Docker image.
123-
"""
124-
self._docker_image_path = value
125-
126106

127107
class DockerImageCacheManager:
128108
"""
@@ -189,7 +169,7 @@ def check_docker_image_exists(
189169
docker_image_path = Path(docker_image_url)
190170
if docker_image_path.is_file() and docker_image_path.exists():
191171
return DockerImageCacheResult(
192-
True, docker_image_path, f"Docker image file path is valid: {docker_image_url}."
172+
True, docker_image_path.absolute(), f"Docker image file path is valid: {docker_image_url}."
193173
)
194174

195175
# Check if the cache file exists
@@ -208,7 +188,7 @@ def check_docker_image_exists(
208188
if docker_image_path.is_file() and docker_image_path.exists():
209189
message = f"Cached Docker image already exists at {docker_image_path}."
210190
logging.debug(message)
211-
return DockerImageCacheResult(True, docker_image_path, message)
191+
return DockerImageCacheResult(True, docker_image_path.absolute(), message)
212192

213193
message = f"Docker image does not exist at the specified path: {docker_image_path}."
214194
logging.debug(message)
@@ -234,7 +214,7 @@ def cache_docker_image(
234214
if docker_image_path.is_file():
235215
success_message = f"Cached Docker image already exists at {docker_image_path}."
236216
logging.info(success_message)
237-
return DockerImageCacheResult(True, docker_image_path, success_message)
217+
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)
238218

239219
prerequisite_check = self._check_prerequisites(docker_image_url)
240220
if not prerequisite_check:
@@ -281,7 +261,7 @@ def cache_docker_image(
281261
success_message = f"Docker image cached successfully at {docker_image_path}."
282262
logging.debug(success_message)
283263
logging.debug(f"Command used: {enroot_import_cmd}, stdout: {p.stdout}, stderr: {p.stderr}")
284-
return DockerImageCacheResult(True, docker_image_path, success_message)
264+
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)
285265
except subprocess.CalledProcessError as e:
286266
error_message = (
287267
f"Failed to import Docker image from {docker_image_url}. Command: {enroot_import_cmd}. Error: {e}"
@@ -416,14 +396,14 @@ def uninstall_cached_image(self, subdir_name: str, docker_image_filename: str) -
416396
subdir_path.rmdir()
417397
success_message = f"Subdirectory removed successfully: {subdir_path}."
418398
logging.info(success_message)
419-
return DockerImageCacheResult(True, subdir_path, success_message)
399+
return DockerImageCacheResult(True, subdir_path.absolute(), success_message)
420400
except OSError as e:
421401
error_message = f"Failed to remove subdirectory {subdir_path}. Error: {e}"
422402
logging.error(error_message)
423403
return DockerImageCacheResult(False, subdir_path, error_message)
424404
success_message = f"Cached Docker image uninstalled successfully from {subdir_path}."
425405
logging.info(success_message)
426-
return DockerImageCacheResult(True, subdir_path, success_message)
406+
return DockerImageCacheResult(True, subdir_path.absolute(), success_message)
427407

428408
def remove_cached_image(self, subdir_name: str, docker_image_filename: str) -> DockerImageCacheResult:
429409
"""
@@ -442,11 +422,11 @@ def remove_cached_image(self, subdir_name: str, docker_image_filename: str) -> D
442422
docker_image_path.unlink()
443423
success_message = f"Cached Docker image removed successfully from {docker_image_path}."
444424
logging.info(success_message)
445-
return DockerImageCacheResult(True, docker_image_path, success_message)
425+
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)
446426
except OSError as e:
447427
error_message = f"Failed to remove cached Docker image at {docker_image_path}. Error: {e}"
448428
logging.error(error_message)
449429
return DockerImageCacheResult(False, docker_image_path, error_message)
450430
success_message = f"No cached Docker image found to remove at {docker_image_path}."
451431
logging.info(success_message)
452-
return DockerImageCacheResult(True, docker_image_path, success_message)
432+
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)

tests/test_docker_image_cache_manager.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_access, moc
7171
mock_is_file.return_value = True
7272
result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz")
7373
assert result.success
74+
assert result.docker_image_path == Path("/fake/install/path/subdir/image.tar.gz")
7475
assert result.message == "Cached Docker image already exists at /fake/install/path/subdir/image.tar.gz."
7576

7677
# Test creating subdirectory when it doesn't exist
@@ -301,9 +302,18 @@ def test_check_docker_image_accessibility_with_enroot(mock_tempdir, mock_which,
301302
assert "docker://docker.io/hello-world" in command
302303

303304

304-
def test_docker_image_path_is_always_absolute():
305-
r = DockerImageCacheResult(True, Path("relative/path"), "message")
306-
assert r.docker_image_path.is_absolute()
305+
@patch("pathlib.Path.is_file")
306+
@patch("pathlib.Path.exists")
307+
def test_ensure_docker_image_no_local_cache(mock_exists, mock_is_file):
308+
manager = DockerImageCacheManager(Path("/fake/install/path"), False, "default")
307309

308-
r.docker_image_path = Path("another/relative/path")
309-
assert r.docker_image_path.is_absolute()
310+
mock_is_file.return_value = False
311+
mock_exists.return_value = True
312+
313+
result = manager.ensure_docker_image("docker.io/hello-world", "subdir", "docker_image.sqsh")
314+
315+
assert result.success
316+
assert result.docker_image_path is not None
317+
assert str(result.docker_image_path) == "docker.io/hello-world"
318+
assert not result.docker_image_path.is_absolute()
319+
assert result.message == ""

tests/test_slurm_command_gen_strategy.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,101 @@ def test_invalid_tokenizer_path(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenS
298298
nodes=[],
299299
)
300300

301+
def test_account_in_command(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
302+
extra_env_vars = {"TEST_VAR_1": "value1"}
303+
cmd_args = {
304+
"docker_image_url": "fake",
305+
"repository_url": "fake",
306+
"repository_commit_hash": "fake",
307+
}
308+
309+
nemo_cmd_gen.slurm_system.account = "test_account"
310+
cmd = nemo_cmd_gen.gen_exec_command(
311+
cmd_args=cmd_args,
312+
extra_env_vars=extra_env_vars,
313+
extra_cmd_args="",
314+
output_path=Path(""),
315+
num_nodes=1,
316+
nodes=[],
317+
)
318+
319+
assert "cluster.account=test_account" in cmd
320+
assert "cluster.job_name_prefix=test_account-cloudai.nemo:" in cmd
321+
322+
def test_no_account_in_command(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
323+
extra_env_vars = {"TEST_VAR_1": "value1"}
324+
cmd_args = {
325+
"docker_image_url": "fake",
326+
"repository_url": "fake",
327+
"repository_commit_hash": "fake",
328+
}
329+
330+
nemo_cmd_gen.slurm_system.account = None
331+
cmd = nemo_cmd_gen.gen_exec_command(
332+
cmd_args=cmd_args,
333+
extra_env_vars=extra_env_vars,
334+
extra_cmd_args="",
335+
output_path=Path(""),
336+
num_nodes=1,
337+
nodes=[],
338+
)
339+
340+
assert "cluster.account" not in cmd
341+
assert "cluster.job_name_prefix" not in cmd
342+
343+
def test_gpus_per_node_value(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
344+
extra_env_vars = {"TEST_VAR_1": "value1"}
345+
cmd_args = {
346+
"docker_image_url": "fake",
347+
"repository_url": "fake",
348+
"repository_commit_hash": "fake",
349+
}
350+
351+
nemo_cmd_gen.slurm_system.gpus_per_node = 4
352+
cmd = nemo_cmd_gen.gen_exec_command(
353+
cmd_args=cmd_args,
354+
extra_env_vars=extra_env_vars,
355+
extra_cmd_args="",
356+
output_path=Path(""),
357+
num_nodes=1,
358+
nodes=[],
359+
)
360+
361+
assert "cluster.gpus_per_node=4" in cmd
362+
363+
nemo_cmd_gen.slurm_system.gpus_per_node = None
364+
cmd = nemo_cmd_gen.gen_exec_command(
365+
cmd_args=cmd_args,
366+
extra_env_vars=extra_env_vars,
367+
extra_cmd_args="",
368+
output_path=Path(""),
369+
num_nodes=1,
370+
nodes=[],
371+
)
372+
373+
assert "cluster.gpus_per_node=null" in cmd
374+
375+
def test_data_dir_validation(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
376+
extra_env_vars = {"TEST_VAR_1": "value1"}
377+
cmd_args = {
378+
"docker_image_url": "fake",
379+
"repository_url": "fake",
380+
"repository_commit_hash": "fake",
381+
"data_dir": "DATA_DIR", # Invalid placeholder
382+
}
383+
384+
with pytest.raises(
385+
ValueError, match="The 'data_dir' field of the NeMo launcher test contains the placeholder 'DATA_DIR'."
386+
):
387+
nemo_cmd_gen.gen_exec_command(
388+
cmd_args=cmd_args,
389+
extra_env_vars=extra_env_vars,
390+
extra_cmd_args="",
391+
output_path=Path(""),
392+
num_nodes=1,
393+
nodes=[],
394+
)
395+
301396

302397
class TestWriteSbatchScript:
303398
MANDATORY_ARGS = {

0 commit comments

Comments
 (0)