Skip to content

Commit 1f5b36e

Browse files
TaekyungHeoamaslenn
andcommitted
Reflect Andrei's comments
Co-authored-by: Andrei Maslennikov <andreyma@nvidia.com>
1 parent f842597 commit 1f5b36e

File tree

11 files changed

+75
-80
lines changed

11 files changed

+75
-80
lines changed

src/cloudai/_core/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ def parse(
6565
if not test_path.exists():
6666
raise FileNotFoundError(f"Test path '{test_path}' not found.")
6767

68-
system_parser = SystemParser(str(self.system_config_path))
68+
system_parser = SystemParser(self.system_config_path)
6969
system = system_parser.parse()
7070
logging.debug("Parsed system config")
7171

src/cloudai/_core/system_parser.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
import os
17+
from pathlib import Path
1818

1919
import toml
2020

@@ -29,19 +29,19 @@ class SystemParser:
2929
Attributes
3030
_parsers (Dict[str, Type[BaseSystemParser]]): A mapping from system types to their corresponding parser
3131
classes.
32-
file_path (str): The file path to the system configuration file.
32+
file_path (Path): The file path to the system configuration file.
3333
"""
3434

3535
_parsers = {}
3636

37-
def __init__(self, file_path: str):
37+
def __init__(self, file_path: Path):
3838
"""
3939
Initialize a SystemParser instance.
4040
4141
Args:
42-
file_path (str): The file path to the system configuration file.
42+
file_path (Path): The file path to the system configuration file.
4343
"""
44-
self.file_path: str = file_path
44+
self.file_path: Path = file_path
4545

4646
def parse(self) -> System:
4747
"""
@@ -55,10 +55,10 @@ def parse(self) -> System:
5555
Returns
5656
System: The parsed system object.
5757
"""
58-
if not os.path.isfile(self.file_path):
58+
if not self.file_path.is_file():
5959
raise FileNotFoundError(f"The file '{self.file_path}' does not exist.")
6060

61-
with open(self.file_path, "r") as file:
61+
with self.file_path.open("r") as file:
6262
data = toml.load(file)
6363
scheduler = data.get("scheduler", "").lower()
6464
registry = Registry()

src/cloudai/installer/slurm_installer.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import contextlib
1818
import os
1919
import subprocess
20+
from pathlib import Path
2021
from typing import Dict, Iterable, cast
2122

2223
import toml
@@ -35,9 +36,9 @@ class SlurmInstaller(BaseInstaller):
3536
CONFIG_FILE_NAME (str): The name of the configuration file.
3637
PREREQUISITES (List[str]): A list of required binaries for the installer.
3738
REQUIRED_SRUN_OPTIONS (List[str]): A list of required srun options to check.
38-
install_path (str): Path where the benchmarks are to be installed. This is optional since uninstallation does
39+
install_path (Path): Path where the benchmarks are to be installed. This is optional since uninstallation does
3940
not require it.
40-
config_path (str): Path to the installation configuration file.
41+
config_path (Path): Path to the installation configuration file.
4142
"""
4243

4344
CONFIG_FILE_NAME = ".cloudai.toml"
@@ -60,7 +61,7 @@ def __init__(self, system: System):
6061
super().__init__(system)
6162
slurm_system = cast(SlurmSystem, self.system)
6263
self.install_path = slurm_system.install_path
63-
self.config_path = os.path.join(os.path.expanduser("~"), self.CONFIG_FILE_NAME)
64+
self.config_path = Path.home() / self.CONFIG_FILE_NAME
6465

6566
def _check_prerequisites(self) -> InstallStatusResult:
6667
"""
@@ -108,16 +109,16 @@ def _check_srun_options(self) -> None:
108109

109110
def _write_config(self) -> InstallStatusResult:
110111
"""Write the installation configuration to a TOML file atomically."""
111-
absolute_install_path = os.path.abspath(self.install_path)
112-
config_data: Dict[str, str] = {"install_path": absolute_install_path}
112+
absolute_install_path = self.install_path.resolve()
113+
config_data: Dict[str, str] = {"install_path": str(absolute_install_path)}
113114

114115
try:
115-
with open(self.config_path, "w") as file:
116+
with self.config_path.open("w") as file:
116117
toml.dump(config_data, file)
117118
return InstallStatusResult(True)
118119
except Exception as e:
119120
with contextlib.suppress(OSError):
120-
os.remove(self.config_path)
121+
self.config_path.unlink()
121122
return InstallStatusResult(False, str(e))
122123

123124
def _read_config(self) -> Dict[str, str]:
@@ -128,7 +129,7 @@ def _read_config(self) -> Dict[str, str]:
128129
Dict[str, str]: Configuration, including installation path.
129130
"""
130131
try:
131-
with open(self.config_path, "r") as file:
132+
with self.config_path.open("r") as file:
132133
return toml.load(file)
133134
except FileNotFoundError as e:
134135
raise FileNotFoundError(
@@ -138,8 +139,8 @@ def _read_config(self) -> Dict[str, str]:
138139

139140
def _remove_config(self) -> None:
140141
"""Remove the installation configuration file."""
141-
if os.path.exists(self.config_path):
142-
os.remove(self.config_path)
142+
if self.config_path.exists():
143+
self.config_path.unlink()
143144

144145
def is_installed(self, test_templates: Iterable[TestTemplate]) -> InstallStatusResult:
145146
"""
@@ -153,7 +154,7 @@ def is_installed(self, test_templates: Iterable[TestTemplate]) -> InstallStatusR
153154
Returns:
154155
InstallStatusResult: Result containing the installation status and error message if not installed.
155156
"""
156-
if not os.path.exists(self.config_path):
157+
if not self.config_path.exists():
157158
return InstallStatusResult(
158159
False,
159160
f"Configuration file does not exist at {self.config_path}. "
@@ -189,11 +190,11 @@ def install(self, test_templates: Iterable[TestTemplate]) -> InstallStatusResult
189190
return prerequisites_result
190191

191192
try:
192-
os.makedirs(self.install_path, exist_ok=True)
193+
self.install_path.mkdir(parents=True, exist_ok=True)
193194
except OSError as e:
194195
return InstallStatusResult(False, f"Failed to create installation directory at {self.install_path}: {e}")
195196

196-
if not os.access(self.install_path, os.W_OK):
197+
if not self.install_path.is_dir() or not os.access(self.install_path, os.W_OK):
197198
return InstallStatusResult(False, f"The installation path {self.install_path} is not writable.")
198199

199200
install_result = super().install(test_templates)

src/cloudai/schema/test_template/jax_toolbox/slurm_command_gen_strategy.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -173,21 +173,20 @@ def _parse_slurm_args(
173173
JaxToolboxSlurmInstallStrategy.DOCKER_IMAGE_FILENAME,
174174
).docker_image_path
175175

176-
local_workspace_dir = os.path.abspath(cmd_args["output_path"])
177-
# Use the dynamic key_prefix for accessing docker_workspace_dir
176+
local_workspace_dir = Path(cmd_args["output_path"]).resolve()
178177
docker_workspace_dir = cmd_args[f"{key_prefix}.setup_flags.docker_workspace_dir"]
179178
container_mounts = f"{local_workspace_dir}:{docker_workspace_dir}"
180179

181180
if "pgo_nsys_converter.profile_path" in cmd_args:
182-
profile_path = cmd_args["pgo_nsys_converter.profile_path"]
181+
profile_path = Path(cmd_args["pgo_nsys_converter.profile_path"]).resolve()
183182
container_mounts += f",{profile_path}:{profile_path}"
184183

185184
base_args.update({"image_path": image_path, "container_mounts": container_mounts})
186185

187-
output_path = os.path.abspath(cmd_args["output_path"])
186+
output_path = Path(cmd_args["output_path"]).resolve()
188187
output_suffix = "-%j.txt" if env_vars.get("UNIFIED_STDOUT_STDERR") == "1" else "-%j-%n-%t.txt"
189-
base_args["output"] = os.path.join(output_path, f"output{output_suffix}")
190-
base_args["error"] = os.path.join(output_path, f"error{output_suffix}")
188+
base_args["output"] = str(output_path / f"output{output_suffix}")
189+
base_args["error"] = str(output_path / f"error{output_suffix}")
191190

192191
return base_args
193192

src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
import os
1817

1918
from cloudai import InstallStatusResult
2019
from cloudai.systems.slurm.strategy import SlurmInstallStrategy
@@ -34,8 +33,8 @@ def is_installed(self) -> InstallStatusResult:
3433
return InstallStatusResult(success=True)
3534
else:
3635
if self.docker_image_cache_manager.cache_docker_images_locally:
37-
expected_docker_image_path = os.path.join(
38-
self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
36+
expected_docker_image_path = (
37+
self.docker_image_cache_manager.install_path / self.SUBDIR_PATH / self.DOCKER_IMAGE_FILENAME
3938
)
4039
return InstallStatusResult(
4140
success=False,

src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
import os
1817
from pathlib import Path
1918
from typing import Any, Dict, List
2019

@@ -73,7 +72,7 @@ def _parse_slurm_args(
7372

7473
container_mounts = ""
7574
if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars:
76-
nccl_graph_path = os.path.abspath(env_vars["NCCL_TOPO_FILE"])
75+
nccl_graph_path = Path(env_vars["NCCL_TOPO_FILE"]).resolve()
7776
nccl_graph_file = env_vars["DOCKER_NCCL_TOPO_FILE"]
7877
container_mounts = f"{nccl_graph_path}:{nccl_graph_file}"
7978
elif "NCCL_TOPO_FILE" in env_vars:

src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
import os
1817

1918
from cloudai import InstallStatusResult
2019
from cloudai.systems.slurm.strategy import SlurmInstallStrategy
@@ -40,8 +39,8 @@ def is_installed(self) -> InstallStatusResult:
4039
return InstallStatusResult(success=True)
4140
else:
4241
if self.docker_image_cache_manager.cache_docker_images_locally:
43-
expected_docker_image_path = os.path.join(
44-
self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
42+
expected_docker_image_path = (
43+
self.docker_image_cache_manager.install_path / self.SUBDIR_PATH / self.DOCKER_IMAGE_FILENAME
4544
)
4645
return InstallStatusResult(
4746
success=False,
@@ -55,7 +54,7 @@ def is_installed(self) -> InstallStatusResult:
5554
return InstallStatusResult(
5655
success=False,
5756
message=(
58-
f"Docker image for NCCL test is not accessible.\n" f" - Error: {docker_image_result.message}"
57+
f"Docker image for NCCL test is not accessible.\n - Error: {docker_image_result.message}"
5958
),
6059
)
6160

src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import os
1919
import subprocess
2020
from concurrent.futures import ThreadPoolExecutor, as_completed
21+
from pathlib import Path
2122
from typing import Any, Dict, List
2223

2324
from cloudai import InstallStatusResult, System
@@ -89,15 +90,15 @@ def _validate_cmd_arg(self, cmd_args: Dict[str, Any], arg_name: str) -> str:
8990
return arg_value
9091

9192
def is_installed(self) -> InstallStatusResult:
92-
subdir_path = os.path.join(self.install_path, self.SUBDIR_PATH)
93-
repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME)
94-
repo_installed = os.path.isdir(repo_path)
93+
subdir_path = self.install_path / self.SUBDIR_PATH
94+
repo_path = subdir_path / self.REPOSITORY_NAME
95+
repo_installed = repo_path.is_dir()
9596

9697
docker_image_installed = self.docker_image_cache_manager.check_docker_image_exists(
9798
self.docker_image_url, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
9899
).success
99100

100-
data_dir_path = self.default_cmd_args["data_dir"]
101+
data_dir_path = Path(self.default_cmd_args["data_dir"])
101102
datasets_check_result = self._check_datasets_on_nodes(data_dir_path)
102103
if not datasets_check_result.success:
103104
return InstallStatusResult(
@@ -121,8 +122,8 @@ def is_installed(self) -> InstallStatusResult:
121122
f"with commit hash {self.repository_commit_hash}"
122123
)
123124
if not docker_image_installed:
124-
docker_image_path = os.path.join(subdir_path, self.DOCKER_IMAGE_FILENAME)
125-
missing_components.append(f"Docker image at {docker_image_path} " f"from URL {self.docker_image_url}")
125+
docker_image_path = subdir_path / self.DOCKER_IMAGE_FILENAME
126+
missing_components.append(f"Docker image at {docker_image_path} from URL {self.docker_image_url}")
126127
if not datasets_check_result.success:
127128
missing_components.append(f"Datasets in {data_dir_path} on some nodes")
128129
return InstallStatusResult(
@@ -141,10 +142,10 @@ def install(self) -> InstallStatusResult:
141142
except PermissionError as e:
142143
return InstallStatusResult(success=False, message=str(e))
143144

144-
subdir_path = os.path.join(self.install_path, self.SUBDIR_PATH)
145-
os.makedirs(subdir_path, exist_ok=True)
145+
subdir_path = self.install_path / self.SUBDIR_PATH
146+
subdir_path.mkdir(parents=True, exist_ok=True)
146147

147-
data_dir_path = self.default_cmd_args["data_dir"]
148+
data_dir_path = Path(self.default_cmd_args["data_dir"])
148149
datasets_check_result = self._check_datasets_on_nodes(data_dir_path)
149150
if not datasets_check_result.success:
150151
return InstallStatusResult(
@@ -195,12 +196,12 @@ def _check_install_path_access(self):
195196
PermissionError: If the install path does not exist or if there is no permission to create directories and
196197
files.
197198
"""
198-
if not os.path.exists(self.install_path):
199+
if not self.install_path.exists():
199200
raise PermissionError(f"Install path {self.install_path} does not exist.")
200-
if not os.access(self.install_path, os.W_OK):
201+
if not self.install_path.is_dir() or not os.access(self.install_path, os.W_OK):
201202
raise PermissionError(f"No permission to write in install path {self.install_path}.")
202203

203-
def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult:
204+
def _check_datasets_on_nodes(self, data_dir_path: Path) -> DatasetCheckResult:
204205
"""
205206
Verify the presence of specified dataset files and directories on all idle compute nodes.
206207
@@ -210,7 +211,7 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult:
210211
for systems with multiple nodes.
211212
212213
Args:
213-
data_dir_path (str): Path where dataset files and directories are stored.
214+
data_dir_path (Path): Path where dataset files and directories are stored.
214215
215216
Returns:
216217
DatasetCheckResult: Result object containing success status and nodes without datasets.
@@ -248,22 +249,20 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> DatasetCheckResult:
248249

249250
return DatasetCheckResult(success=not nodes_without_datasets, nodes_without_datasets=nodes_without_datasets)
250251

251-
def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: List[str]) -> bool:
252+
def _check_dataset_on_node(self, node: str, data_dir_path: Path, dataset_items: List[str]) -> bool:
252253
"""
253254
Check if dataset files and directories exist on a single compute node.
254255
255256
Args:
256257
node (str): The name of the compute node.
257-
data_dir_path (str): Path to the data directory.
258+
data_dir_path (Path): Path to the data directory.
258259
dataset_items (List[str]): List of dataset file and directory names to check.
259260
260261
Returns:
261262
bool: True if all dataset files and directories exist on the node, False otherwise.
262263
"""
263264
python_check_script = (
264-
f"import os;print(all(os.path.isfile(os.path.join('{data_dir_path}', "
265-
f"item)) or os.path.isdir(os.path.join('{data_dir_path}', item)) "
266-
f"for item in {dataset_items}))"
265+
f"import os;print(all(Path('{data_dir_path}') / item).exists() for item in {dataset_items})"
267266
)
268267
cmd = (
269268
f"srun --nodes=1 --nodelist={node} "
@@ -273,43 +272,43 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L
273272
result = subprocess.run(cmd, shell=True, check=False, capture_output=True, text=True)
274273
return result.returncode == 0 and result.stdout.strip() == "True"
275274

276-
def _clone_repository(self, subdir_path: str) -> None:
275+
def _clone_repository(self, subdir_path: Path) -> None:
277276
"""
278277
Clones NeMo-Launcher repository into specified path if it does not already exist.
279278
280279
Args:
281-
subdir_path (str): Subdirectory path for installation.
280+
subdir_path (Path): Subdirectory path for installation.
282281
"""
283-
repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME)
282+
repo_path = subdir_path / self.REPOSITORY_NAME
284283

285-
if os.path.exists(repo_path):
284+
if repo_path.exists():
286285
logging.warning("Repository already exists at %s, clone skipped", repo_path)
287286
else:
288287
logging.debug("Cloning NeMo-Launcher repository into %s", repo_path)
289-
clone_cmd = ["git", "clone", self.repository_url, repo_path]
288+
clone_cmd = ["git", "clone", self.repository_url, str(repo_path)]
290289
result = subprocess.run(clone_cmd, capture_output=True, text=True)
291290
if result.returncode != 0:
292291
raise RuntimeError(f"Failed to clone repository: {result.stderr}")
293292

294293
logging.debug("Checking out specific commit %s in repository", self.repository_commit_hash)
295294
checkout_cmd = ["git", "checkout", self.repository_commit_hash]
296-
result = subprocess.run(checkout_cmd, cwd=repo_path, capture_output=True, text=True)
295+
result = subprocess.run(checkout_cmd, cwd=str(repo_path), capture_output=True, text=True)
297296
if result.returncode != 0:
298297
raise RuntimeError(f"Failed to checkout commit: {result.stderr}")
299298

300-
def _install_requirements(self, subdir_path: str) -> None:
299+
def _install_requirements(self, subdir_path: Path) -> None:
301300
"""
302301
Installs the required Python packages from the requirements.txt file in the cloned repository.
303302
304303
Args:
305-
subdir_path (str): Subdirectory path for installation.
304+
subdir_path (Path): Subdirectory path for installation.
306305
"""
307-
repo_path = os.path.join(subdir_path, self.REPOSITORY_NAME)
308-
requirements_file = os.path.join(repo_path, "requirements.txt")
306+
repo_path = subdir_path / self.REPOSITORY_NAME
307+
requirements_file = repo_path / "requirements.txt"
309308

310-
if os.path.isfile(requirements_file):
309+
if requirements_file.is_file():
311310
logging.debug("Installing requirements from %s", requirements_file)
312-
install_cmd = ["pip", "install", "-r", requirements_file]
311+
install_cmd = ["pip", "install", "-r", str(requirements_file)]
313312
result = subprocess.run(install_cmd, capture_output=True, text=True)
314313
if result.returncode != 0:
315314
raise RuntimeError(f"Failed to install requirements: {result.stderr}")

0 commit comments

Comments
 (0)