Skip to content

Commit 5e1a1cf

Browse files
authored
Merge pull request #737 from NVIDIA/am/hf-model-k8s
Add support for HF model in K8s
2 parents be304b7 + 849967b commit 5e1a1cf

File tree

4 files changed

+104
-53
lines changed

4 files changed

+104
-53
lines changed

src/cloudai/systems/kubernetes/kubernetes_installer.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,28 @@
2222
from pathlib import Path
2323
from shutil import rmtree
2424

25-
from cloudai.core import BaseInstaller, DockerImage, File, GitRepo, Installable, InstallStatusResult, PythonExecutable
25+
from cloudai.core import (
26+
BaseInstaller,
27+
DockerImage,
28+
File,
29+
GitRepo,
30+
HFModel,
31+
Installable,
32+
InstallStatusResult,
33+
PythonExecutable,
34+
System,
35+
)
36+
from cloudai.util.hf_model_manager import HFModelManager
2637
from cloudai.util.lazy_imports import lazy
2738

2839

2940
class KubernetesInstaller(BaseInstaller):
3041
"""Installer for Kubernetes systems."""
3142

43+
def __init__(self, system: System) -> None:
    """
    Initialize the Kubernetes installer.

    Args:
        system (System): The system to install onto. Its ``hf_home_path`` is passed to the HF model manager.
    """
    super().__init__(system)
    hf_home = system.hf_home_path
    self.hf_model_manager = HFModelManager(hf_home)
46+
3247
def _check_prerequisites(self) -> InstallStatusResult:
3348
"""
3449
Check for the presence of required binaries and Kubernetes configurations.
@@ -70,6 +85,8 @@ def install_one(self, item: Installable) -> InstallStatusResult:
7085
item.installed_path = self.system.install_path / item.src.name
7186
shutil.copyfile(item.src, item.installed_path, follow_symlinks=False)
7287
return InstallStatusResult(True)
88+
elif isinstance(item, HFModel):
89+
return self.hf_model_manager.download_model(item)
7390
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
7491

7592
def uninstall_one(self, item: Installable) -> InstallStatusResult:
@@ -86,6 +103,8 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult:
86103
return InstallStatusResult(True)
87104
logging.debug(f"File {item.installed_path} does not exist.")
88105
return InstallStatusResult(True)
106+
elif isinstance(item, HFModel):
107+
return self.hf_model_manager.remove_model(item)
89108
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
90109

91110
def is_installed_one(self, item: Installable) -> InstallStatusResult:
@@ -99,6 +118,8 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult:
99118
return InstallStatusResult(False, f"Git repository {item.url} not cloned")
100119
elif isinstance(item, PythonExecutable):
101120
return self._is_python_executable_installed(item)
121+
elif isinstance(item, HFModel):
122+
return self.hf_model_manager.is_model_downloaded(item)
102123
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
103124

104125
def mark_as_installed_one(self, item: Installable) -> InstallStatusResult:
@@ -111,6 +132,9 @@ def mark_as_installed_one(self, item: Installable) -> InstallStatusResult:
111132
item.git_repo.installed_path = self.system.install_path / item.git_repo.repo_name
112133
item.venv_path = self.system.install_path / item.venv_name
113134
return InstallStatusResult(True)
135+
elif isinstance(item, HFModel):
136+
item.installed_path = self.system.hf_home_path # fake path is OK here as the whole HF home will be mounted
137+
return InstallStatusResult(True)
114138
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
115139

116140
def _install_one_git_repo(self, item: GitRepo) -> InstallStatusResult:

src/cloudai/systems/slurm/slurm_installer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def __init__(self, system: SlurmSystem):
5252
super().__init__(system)
5353
self.system = system
5454
self.docker_image_cache_manager = DockerImageCacheManager(system)
55-
self.hf_model_downloader = HFModelManager(system.hf_home_path)
55+
self.hf_model_manager = HFModelManager(system.hf_home_path)
5656

5757
def _check_prerequisites(self) -> InstallStatusResult:
5858
base_prerequisites_result = super()._check_prerequisites()
@@ -102,7 +102,7 @@ def install_one(self, item: Installable) -> InstallStatusResult:
102102
shutil.copyfile(item.src, item.installed_path, follow_symlinks=False)
103103
return InstallStatusResult(True)
104104
elif isinstance(item, HFModel):
105-
return self.hf_model_downloader.download_model(item)
105+
return self.hf_model_manager.download_model(item)
106106

107107
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
108108

@@ -123,7 +123,7 @@ def uninstall_one(self, item: Installable) -> InstallStatusResult:
123123
logging.debug(f"File {item.installed_path} does not exist.")
124124
return InstallStatusResult(True)
125125
elif isinstance(item, HFModel):
126-
return self.hf_model_downloader.remove_model(item)
126+
return self.hf_model_manager.remove_model(item)
127127

128128
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
129129

@@ -149,7 +149,7 @@ def is_installed_one(self, item: Installable) -> InstallStatusResult:
149149
return InstallStatusResult(True)
150150
return InstallStatusResult(False, f"File {item.installed_path} does not exist")
151151
elif isinstance(item, HFModel):
152-
return self.hf_model_downloader.is_model_downloaded(item)
152+
return self.hf_model_manager.is_model_downloaded(item)
153153

154154
return InstallStatusResult(False, f"Unsupported item type: {type(item)}")
155155

tests/test_base_installer.py

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,25 @@
1717
import shutil
1818
from concurrent.futures import Future
1919
from pathlib import Path
20-
from typing import Generator, cast
20+
from typing import Any, Generator, cast
2121
from unittest.mock import Mock, patch
2222

2323
import pytest
2424

25-
from cloudai.core import BaseInstaller, DockerImage, File, GitRepo, Installable, InstallStatusResult
25+
from cloudai.core import (
26+
BaseInstaller,
27+
DockerImage,
28+
File,
29+
GitRepo,
30+
HFModel,
31+
Installable,
32+
InstallStatusResult,
33+
PythonExecutable,
34+
)
35+
from cloudai.systems.kubernetes.kubernetes_installer import KubernetesInstaller
36+
from cloudai.systems.kubernetes.kubernetes_system import KubernetesSystem
2637
from cloudai.systems.slurm import SlurmInstaller, SlurmSystem
38+
from cloudai.systems.slurm.docker_image_cache_manager import DockerImageCacheResult
2739
from cloudai.util import prepare_output_dir
2840

2941

@@ -280,3 +292,63 @@ def test_order_of_items_does_not_matter(self, installer: SlurmInstaller):
280292
"First ('self', present in the statuses) file was not marked as installed"
281293
)
282294
assert f2._installed_path is not None, "Second file was not marked as installed"
295+
296+
297+
@pytest.fixture(params=["k8s", "slurm"])
def installer(
    request: Any, kubernetes_system: KubernetesSystem, slurm_system: SlurmSystem
) -> KubernetesInstaller | SlurmInstaller:
    """
    Provide an installer for each supported system type.

    The fixture is parametrized over "k8s" and "slurm", so a test that requests it runs once with a
    KubernetesInstaller and once with a SlurmInstaller.
    """
    installer = KubernetesInstaller(kubernetes_system) if request.param == "k8s" else SlurmInstaller(slurm_system)

    # exist_ok=True keeps the fixture from failing with FileExistsError when the install path is already present.
    installer.system.install_path.mkdir(parents=True, exist_ok=True)
    # Stub the low-thread-environment check so that it always returns False.
    installer._check_low_thread_environment = lambda threshold=None: False
    return installer
306+
307+
308+
def test_check_supported(installer: KubernetesInstaller | SlurmInstaller):
    """Check that each supported item type passes install, is-installed, uninstall and mark-as-installed."""
    if isinstance(installer, SlurmInstaller):
        # Docker image helpers are stubbed for the Slurm installer only.
        installer._install_docker_image = lambda item: DockerImageCacheResult(True)
        installer._uninstall_docker_image = lambda item: DockerImageCacheResult(True)
        installer.docker_image_cache_manager.check_docker_image_exists = Mock(return_value=DockerImageCacheResult(True))
    installer._install_python_executable = lambda item: InstallStatusResult(True)
    installer._uninstall_python_executable = lambda item: InstallStatusResult(True)
    installer._is_python_executable_installed = lambda item: InstallStatusResult(True)

    # A bare Mock() would return truthy Mock objects from every method, which makes the
    # `res.success` assertions below pass vacuously for HFModel. Return real results instead.
    hf_model_manager = Mock()
    hf_model_manager.download_model.return_value = InstallStatusResult(True)
    hf_model_manager.is_model_downloaded.return_value = InstallStatusResult(True)
    hf_model_manager.remove_model.return_value = InstallStatusResult(True)
    installer.hf_model_manager = hf_model_manager

    git = GitRepo(url="./git_url", commit="commit_hash")
    items = [DockerImage("fake_url/img"), PythonExecutable(git), HFModel("model_name")]
    if isinstance(installer, SlurmInstaller):
        items.append(File(Path(__file__)))
    for item in items:
        res = installer.install_one(item)
        assert res.success, f"Failed to install {item} for {installer.__class__.__name__=} {res.message=}"

        res = installer.is_installed_one(item)
        assert res.success, f"Failed to check installation of {item} for {installer.__class__.__name__=} {res.message=}"

        res = installer.uninstall_one(item)
        assert res.success, f"Failed to uninstall {item} for {installer.__class__.__name__=} {res.message=}"

        res = installer.mark_as_installed_one(item)
        assert res.success, f"Failed to mark as installed {item} for {installer.__class__.__name__=} {res.message=}"
334+
335+
336+
class MyInstallable(Installable):
    """Bare Installable subclass; compares equal to any object and hashes to a fixed value."""

    def __hash__(self) -> int:
        return hash("MyInstallable")

    def __eq__(self, other: object) -> bool:
        return True
342+
343+
344+
def test_check_unsupported(installer: KubernetesInstaller | SlurmInstaller):
    """Check that every per-item operation fails with the "Unsupported item type" message for an unknown type."""
    unsupported = MyInstallable()
    expected_message = f"Unsupported item type: {type(unsupported)}"
    operations = (
        installer.install_one,
        installer.uninstall_one,
        installer.is_installed_one,
        installer.mark_as_installed_one,
    )
    for operation in operations:
        outcome = operation(unsupported)
        assert not outcome.success
        assert outcome.message == expected_message

tests/test_slurm_installer.py

Lines changed: 1 addition & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121

2222
import pytest
2323

24-
from cloudai.core import DockerImage, File, GitRepo, Installable, InstallStatusResult, PythonExecutable
25-
from cloudai.systems.slurm.docker_image_cache_manager import DockerImageCacheResult
24+
from cloudai.core import DockerImage, File, GitRepo, InstallStatusResult, PythonExecutable
2625
from cloudai.systems.slurm.slurm_installer import SlurmInstaller
2726
from cloudai.systems.slurm.slurm_system import SlurmSystem
2827
from cloudai.workloads.nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
@@ -323,50 +322,6 @@ def test_is_installed_checks_content(self, installer: SlurmInstaller, f: File):
323322
assert not res.success
324323

325324

326-
def test_check_supported(slurm_system: SlurmSystem):
327-
slurm_system.install_path.mkdir()
328-
installer = SlurmInstaller(slurm_system)
329-
installer._install_docker_image = lambda item: DockerImageCacheResult(True)
330-
installer._install_python_executable = lambda item: InstallStatusResult(True)
331-
installer._uninstall_docker_image = lambda item: DockerImageCacheResult(True)
332-
installer._uninstall_python_executable = lambda item: InstallStatusResult(True)
333-
installer._is_python_executable_installed = lambda item: InstallStatusResult(True)
334-
installer.docker_image_cache_manager.check_docker_image_exists = Mock(return_value=DockerImageCacheResult(True))
335-
336-
git = GitRepo(url="./git_url", commit="commit_hash")
337-
items = [DockerImage("fake_url/img"), PythonExecutable(git), File(Path(__file__))]
338-
for item in items:
339-
res = installer.install_one(item)
340-
assert res.success
341-
342-
res = installer.is_installed_one(item)
343-
assert res.success
344-
345-
res = installer.uninstall_one(item)
346-
assert res.success
347-
348-
res = installer.mark_as_installed_one(item)
349-
assert res.success
350-
351-
class MyInstallable(Installable):
352-
def __eq__(self, other: object) -> bool:
353-
return True
354-
355-
def __hash__(self) -> int:
356-
return hash("MyInstallable")
357-
358-
unsupported = MyInstallable()
359-
for func in [
360-
installer.install_one,
361-
installer.uninstall_one,
362-
installer.is_installed_one,
363-
installer.mark_as_installed_one,
364-
]:
365-
res = func(unsupported)
366-
assert not res.success
367-
assert res.message == f"Unsupported item type: {type(unsupported)}"
368-
369-
370325
def test_mark_as_installed(slurm_system: SlurmSystem):
371326
tdef = NeMoLauncherTestDefinition(
372327
name="name", description="desc", test_template_name="tt", cmd_args=NeMoLauncherCmdArgs()

0 commit comments

Comments
 (0)