Skip to content

Commit 7523d60

Browse files
committed
Add HF home dir property inside System model
By default, <install_path>/huggingface is used. Users can override it by specifying a custom value. AI Dynamo uses this value to mount the folder into the container. Going forward, CloudAI can take care of downloading models using the HF CLI.
1 parent e244c0d commit 7523d60

File tree

8 files changed

+30
-33
lines changed

8 files changed

+30
-33
lines changed

src/cloudai/_core/base_installer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ def is_installed(self, items: Iterable[Installable]) -> InstallStatusResult:
142142
"""
143143
if not prepare_output_dir(self.system.install_path):
144144
return InstallStatusResult(False, f"Error preparing install dir '{self.system.install_path.absolute()}'")
145+
elif not prepare_output_dir(self.system.hf_home_path):
146+
return InstallStatusResult(False, f"Error preparing hf home dir '{self.system.hf_home_path.absolute()}'")
145147

146148
install_results: dict[Installable, InstallStatusResult] = {}
147149
for item in self.all_items(items):
@@ -180,6 +182,8 @@ def install(self, items: Iterable[Installable]) -> InstallStatusResult:
180182

181183
if not prepare_output_dir(self.system.install_path):
182184
return InstallStatusResult(False, f"Error preparing install dir '{self.system.install_path.absolute()}'")
185+
elif not prepare_output_dir(self.system.hf_home_path):
186+
return InstallStatusResult(False, f"Error preparing hf home dir '{self.system.hf_home_path.absolute()}'")
183187

184188
logging.debug(f"Going to install {len(set(items))} uniq item(s) (total is {len(list(items))})")
185189

src/cloudai/_core/system.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class System(ABC, BaseModel):
3737
scheduler: str
3838
install_path: Path
3939
output_path: Path
40+
hf_home_path: Path = Field(default_factory=lambda data: data["install_path"] / "huggingface")
4041
global_env_vars: dict[str, Any] = Field(default_factory=dict)
4142
monitor_interval: int = 1
4243

src/cloudai/workloads/ai_dynamo/ai_dynamo.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ class AIDynamoCmdArgs(CmdArgs):
6464
"""Arguments for AI Dynamo."""
6565

6666
docker_image_url: str
67-
huggingface_home_host_path: Path = Path.home() / ".cache/huggingface"
6867
huggingface_home_container_path: Path = Path("/root/.cache/huggingface")
6968
dynamo: AIDynamoArgs
7069
genai_perf: GenAIPerfArgs
@@ -103,13 +102,6 @@ def docker_image(self) -> DockerImage:
103102
def installables(self) -> list[Installable]:
104103
return [self.docker_image, self.script, self.dynamo_repo, self.python_executable]
105104

106-
@property
107-
def huggingface_home_host_path(self) -> Path:
108-
path = Path(self.cmd_args.huggingface_home_host_path)
109-
if not path.is_dir():
110-
raise FileNotFoundError(f"HuggingFace home path not found at {path}")
111-
return path
112-
113105
@property
114106
def python_executable(self) -> PythonExecutable:
115107
if not self._python_executable:

src/cloudai/workloads/ai_dynamo/slurm_command_gen_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def _container_mounts(self) -> list[str]:
3535

3636
mounts = [
3737
f"{dynamo_repo_path}:{dynamo_repo_path}",
38-
f"{td.cmd_args.huggingface_home_host_path}:{td.cmd_args.huggingface_home_container_path}",
38+
f"{self.system.hf_home_path.absolute()}:{td.cmd_args.huggingface_home_container_path}",
3939
f"{td.script.installed_path.absolute()!s}:{td.script.installed_path.absolute()!s}",
4040
]
4141

tests/ref_data/ai-dynamo.sbatch

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010

1111
export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
1212

13-
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
13+
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__INSTALL_DIR__/huggingface:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}."
1414

15-
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
15+
srun --export=ALL --mpi=pmix -N2 --container-image=nvcr.io/nvidia/ai-dynamo:24.09 --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__INSTALL_DIR__/huggingface:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh --ntasks=2 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh
1616

1717
num_retries=${DYNAMO_NUM_RETRY_ON_FAILURE:-0}
1818
for try in $(seq 0 $num_retries); do
@@ -23,7 +23,7 @@ for try in $(seq 0 $num_retries); do
2323
--mpi=pmix \
2424
-N2 \
2525
--container-image=nvcr.io/nvidia/ai-dynamo:24.09 \
26-
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__OUTPUT_DIR__/output/hf_home:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh \
26+
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__:__INSTALL_DIR__,__INSTALL_DIR__/huggingface:/root/.cache/huggingface,__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh:__CLOUDAI_DIR__/src/cloudai/workloads/ai_dynamo/ai_dynamo.sh \
2727
--nodes=2 \
2828
--ntasks=2 \
2929
--ntasks-per-node=1 \

tests/slurm_command_gen_strategy/test_ai_dynamo_slurm_command_gen_strategy.py

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
def cmd_args() -> AIDynamoCmdArgs:
3737
return AIDynamoCmdArgs(
3838
docker_image_url="url",
39-
huggingface_home_host_path=Path.home() / ".cache/huggingface",
4039
huggingface_home_container_path=Path("/root/.cache/huggingface"),
4140
dynamo=AIDynamoArgs(
4241
prefill_worker=PrefillWorkerArgs(
@@ -76,10 +75,6 @@ def cmd_args() -> AIDynamoCmdArgs:
7675

7776
@pytest.fixture
7877
def test_run(tmp_path: Path, cmd_args: AIDynamoCmdArgs) -> TestRun:
79-
hf_home = tmp_path / "huggingface"
80-
hf_home.mkdir()
81-
cmd_args.huggingface_home_host_path = hf_home
82-
8378
dynamo_repo_path = tmp_path / "dynamo_repo"
8479
dynamo_repo_path.mkdir()
8580

@@ -99,20 +94,6 @@ def strategy(slurm_system: SlurmSystem, test_run: TestRun) -> AIDynamoSlurmComma
9994
return AIDynamoSlurmCommandGenStrategy(slurm_system, test_run)
10095

10196

102-
def test_hugging_face_home_path_valid(test_run: TestRun) -> None:
103-
td = cast(AIDynamoTestDefinition, test_run.test)
104-
path = td.huggingface_home_host_path
105-
assert path.exists()
106-
assert path.is_dir()
107-
108-
109-
def test_hugging_face_home_path_missing(test_run: TestRun) -> None:
110-
td = cast(AIDynamoTestDefinition, test_run.test)
111-
td.cmd_args.huggingface_home_host_path = Path("/nonexistent")
112-
with pytest.raises(FileNotFoundError):
113-
_ = td.huggingface_home_host_path
114-
115-
11697
def test_container_mounts(strategy: AIDynamoSlurmCommandGenStrategy, test_run: TestRun) -> None:
11798
mounts = strategy._container_mounts()
11899
td = cast(AIDynamoTestDefinition, test_run.test)
@@ -121,7 +102,7 @@ def test_container_mounts(strategy: AIDynamoSlurmCommandGenStrategy, test_run: T
121102

122103
assert mounts == [
123104
f"{dynamo_repo_path!s}:{dynamo_repo_path!s}",
124-
f"{td.cmd_args.huggingface_home_host_path!s}:{td.cmd_args.huggingface_home_container_path!s}",
105+
f"{strategy.system.hf_home_path.absolute()!s}:{td.cmd_args.huggingface_home_container_path!s}",
125106
f"{td.script.installed_path.absolute()!s}:{td.script.installed_path.absolute()!s}",
126107
]
127108

tests/test_acceptance.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
431431
),
432432
cmd_args=AIDynamoCmdArgs(
433433
docker_image_url="nvcr.io/nvidia/ai-dynamo:24.09",
434-
huggingface_home_host_path=Path.home() / ".cache/huggingface",
435434
dynamo=AIDynamoArgs(
436435
backend="vllm",
437436
prefill_worker=PrefillWorkerArgs(

tests/test_slurm_system.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,3 +657,23 @@ def test_update(
657657

658658
all_nodes_set = set([node for p in slurm_system.partitions for node in p.slurm_nodes])
659659
assert all_nodes_set == set(expected_nodes)
660+
661+
662+
class TestHfHomePath:
663+
@pytest.fixture
664+
def system_args(self, tmp_path: Path) -> dict:
665+
return {
666+
"name": "test_system",
667+
"install_path": tmp_path / "install",
668+
"output_path": tmp_path / "output",
669+
"partitions": [],
670+
"default_partition": "main",
671+
}
672+
673+
def test_default(self, system_args: dict):
674+
system = SlurmSystem(**system_args)
675+
assert system.hf_home_path == system_args["install_path"] / "huggingface"
676+
677+
def test_custom(self, system_args: dict):
678+
system = SlurmSystem(**system_args, hf_home_path=system_args["output_path"] / "custom")
679+
assert system.hf_home_path == system_args["output_path"] / "custom"

0 commit comments

Comments
 (0)