Skip to content

Commit 0ce5c57

Browse files
authored
Merge pull request #394 from NVIDIA/am/no-cache-check
Allow skipping cache validation
2 parents 56190f7 + f3df9e8 commit 0ce5c57

File tree

11 files changed

+124
-4
lines changed

11 files changed

+124
-4
lines changed

USER_GUIDE.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,14 @@ CloudAI runs all slurm jobs using containers. To simplify file system related ta
438438
These mounts are not verified for validity and do not override default mounts.
439439
1. Test-specific mounts can be added in code.
440440

441+
### Head node without shared storage available on compute nodes
442+
When compute nodes do not share a file system with the head node, pass `--enable-cache-without-check` to `run` or `dry-run`. This option skips the actual check for cache existence but still builds all paths correctly. The workflow is as follows:
443+
444+
1. _[on the head node]_ run `cloudai install`
445+
1. _[on the head node]_ copy the cache to the compute nodes
446+
1. Modify `system.toml` to set the compute nodes' installation root
447+
1. Run `cloudai run --enable-cache-without-check ...`
448+
441449
#### Dev details
442450
`SlurmCommandGenStrategy` defines an abstract method `_container_mounts(tr: TestRun)` that must be implemented by every subclass. This method is used in `SlurmCommandGenStrategy.container_mounts(tr: TestRun)` (defined as `@final`), where mounts like `/cloudai_run_results` (default mount), `TestDefinition.extra_container_mounts` (from Test TOML), and test-specific mounts (defined in-code) are added.
443451

src/cloudai/_core/base_installer.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,23 @@ def uninstall(self, items: Iterable[Installable]) -> InstallStatusResult:
187187
nfailed = len([result for result in uninstall_results.values() if result != "Success"])
188188
return InstallStatusResult(False, f"{nfailed} item(s) failed to uninstall.", uninstall_results)
189189

190+
@final
191+
def mark_as_installed(self, items: Iterable[Installable]) -> InstallStatusResult:
192+
"""
193+
Mark the installable items as installed.
194+
195+
Args:
196+
items (Iterable[Installable]): Items to mark as installed.
197+
198+
Returns:
199+
InstallStatusResult: Result containing the status and error message if any.
200+
"""
201+
install_results = {}
202+
for item in items:
203+
self.mark_as_installed_one(item)
204+
205+
return InstallStatusResult(True, "All items marked as installed successfully.", install_results)
206+
190207
@abstractmethod
191208
def install_one(self, item: Installable) -> InstallStatusResult: ...
192209

@@ -195,3 +212,6 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult: ...
195212

196213
@abstractmethod
197214
def is_installed_one(self, item: Installable) -> InstallStatusResult: ...
215+
216+
@abstractmethod
217+
def mark_as_installed_one(self, item: Installable) -> InstallStatusResult: ...

src/cloudai/cli/cli.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ def add_run_and_dry_run(self):
124124
desc = "Execute the test scenarios."
125125
if mode == "dry-run":
126126
desc = "Perform a dry-run of the test scenarios without executing them."
127-
self.add_command(
127+
p = self.add_command(
128128
mode,
129129
desc,
130130
handle_dry_run_and_run,
@@ -133,6 +133,12 @@ def add_run_and_dry_run(self):
133133
test_scenario=True,
134134
output_dir=False,
135135
)
136+
p.add_argument(
137+
"--enable-cache-without-check",
138+
action="store_true",
139+
help="Enable cache without checking.",
140+
default=False,
141+
)
136142

137143
def add_install_and_uninstall(self):
138144
for mode in {"install", "uninstall"}:

src/cloudai/cli/handlers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,10 @@ def handle_dry_run_and_run(args: argparse.Namespace) -> int:
191191
raise NotImplementedError(f"No installer available for scheduler: {system.scheduler}")
192192
installer = installer_class(system)
193193

194-
result = installer.is_installed(installables)
194+
if args.enable_cache_without_check:
195+
result = installer.mark_as_installed(installables)
196+
else:
197+
result = installer.is_installed(installables)
195198

196199
if args.mode == "run" and not result.success:
197200
logging.error("CloudAI has not been installed. Please run install mode first.")

src/cloudai/installer/slurm_installer.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,23 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult:
177177

178178
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
179179

180+
def mark_as_installed_one(self, item: Installable) -> InstallStatusResult:
181+
if isinstance(item, DockerImage):
182+
item.installed_path = self.system.install_path / item.cache_filename
183+
return InstallStatusResult(True)
184+
elif isinstance(item, GitRepo):
185+
item.installed_path = self.system.install_path / item.repo_name
186+
return InstallStatusResult(True)
187+
elif isinstance(item, PythonExecutable):
188+
item.git_repo.installed_path = self.system.install_path / item.git_repo.repo_name
189+
item.venv_path = self.system.install_path / item.venv_name
190+
return InstallStatusResult(True)
191+
elif isinstance(item, File):
192+
item.installed_path = self.system.install_path / item.src.name
193+
return InstallStatusResult(True)
194+
195+
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
196+
180197
def _install_docker_image(self, item: DockerImage) -> DockerImageCacheResult:
181198
res = self.docker_image_cache_manager.ensure_docker_image(item.url, item.cache_filename)
182199
if res.success and res.docker_image_path:

src/cloudai/installer/standalone_installer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,6 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult:
4848

4949
def is_installed_one(self, item: Installable) -> InstallStatusResult:
5050
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
51+
52+
def mark_as_installed_one(self, item: Installable) -> InstallStatusResult:
53+
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")

tests/test_acceptance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def test_slurm(tmp_path: Path, scenario: Dict):
7979
hook_dir=Path("conf/common/hook"),
8080
test_scenario=test_scenario_path,
8181
output_dir=tmp_path,
82+
enable_cache_without_check=False,
8283
)
8384
with (
8485
patch("asyncio.sleep", return_value=None),

tests/test_base_installer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult:
4343
def is_installed_one(self, item: Installable) -> InstallStatusResult:
4444
return InstallStatusResult(success=True)
4545

46+
def mark_as_installed_one(self, item: Installable) -> InstallStatusResult:
47+
return InstallStatusResult(success=True)
48+
4649

4750
@pytest.fixture
4851
def docker_image() -> DockerImage:

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ def test_run_dry_run_modes(self, cli: CloudAICLI):
290290
tests_dir=Path("tests_dir"),
291291
test_scenario=Path("test_scenario"),
292292
output_dir=None,
293+
enable_cache_without_check=False,
293294
)
294295

295296
@pytest.mark.parametrize(

tests/test_slurm_installer.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
from pathlib import Path
1818
from subprocess import CompletedProcess
19+
from typing import cast
1920
from unittest.mock import Mock, patch
2021

2122
import pytest
@@ -24,6 +25,7 @@
2425
from cloudai.installer.slurm_installer import SlurmInstaller
2526
from cloudai.systems.slurm.slurm_system import SlurmSystem
2627
from cloudai.util.docker_image_cache_manager import DockerImageCacheResult
28+
from cloudai.workloads.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
2729

2830

2931
@pytest.fixture
@@ -348,7 +350,7 @@ def test_check_supported(slurm_system: SlurmSystem):
348350
installer._is_python_executable_installed = lambda item: InstallStatusResult(True)
349351
installer.docker_image_cache_manager.check_docker_image_exists = Mock(return_value=DockerImageCacheResult(True))
350352

351-
git = GitRepo(url="git_url", commit="commit_hash")
353+
git = GitRepo(url="./git_url", commit="commit_hash")
352354
items = [DockerImage("fake_url/img"), PythonExecutable(git), File(Path(__file__))]
353355
for item in items:
354356
res = installer.install_one(item)
@@ -360,6 +362,9 @@ def test_check_supported(slurm_system: SlurmSystem):
360362
res = installer.uninstall_one(item)
361363
assert res.success
362364

365+
res = installer.mark_as_installed_one(item)
366+
assert res.success
367+
363368
class MyInstallable(Installable):
364369
def __eq__(self, other: object) -> bool:
365370
return True
@@ -368,7 +373,35 @@ def __hash__(self) -> int:
368373
return hash("MyInstallable")
369374

370375
unsupported = MyInstallable()
371-
for func in [installer.install_one, installer.uninstall_one, installer.is_installed_one]:
376+
for func in [
377+
installer.install_one,
378+
installer.uninstall_one,
379+
installer.is_installed_one,
380+
installer.mark_as_installed_one,
381+
]:
372382
res = func(unsupported)
373383
assert not res.success
374384
assert res.message == f"Unsupported item type: {type(unsupported)}"
385+
386+
387+
def test_git_repo():
388+
git = GitRepo(url="./git_url", commit="commit_hash")
389+
assert git.container_mount == f"/git/{git.repo_name}"
390+
391+
git.mount_as = "/my_mount"
392+
assert git.container_mount == git.mount_as
393+
394+
395+
def test_mark_as_installed(slurm_system: SlurmSystem):
396+
tdef = NeMoLauncherTestDefinition(
397+
name="name", description="desc", test_template_name="tt", cmd_args=NeMoLauncherCmdArgs()
398+
)
399+
docker = cast(DockerImage, tdef.installables[0])
400+
py_script = cast(PythonExecutable, tdef.installables[1])
401+
402+
installer = SlurmInstaller(slurm_system)
403+
res = installer.mark_as_installed(tdef.installables)
404+
405+
assert res.success
406+
assert docker.installed_path == slurm_system.install_path / docker.cache_filename
407+
assert py_script.git_repo.installed_path == slurm_system.install_path / py_script.git_repo.repo_name

0 commit comments

Comments
 (0)