-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathrun_with_submitit.py
More file actions
69 lines (54 loc) · 1.65 KB
/
run_with_submitit.py
File metadata and controls
69 lines (54 loc) · 1.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import shutil
import tempfile
import uuid
from pathlib import Path
import submitit
import sys
from omegaconf import OmegaConf
import protoclr_obow
# Identifier generated once, when this module is imported; main() passes the
# same value on to the submitted job.
UUID = uuid.uuid4()
# Register an OmegaConf resolver named "uuid" that returns the identifier as a
# string, so a config loaded in this process can interpolate it (e.g. ${uuid:}).
OmegaConf.register_new_resolver("uuid", lambda: str(UUID))
def make_conf_file_copy(path):
    """
    Copy a config file to a freshly created temporary file.

    The copy is kept for logging purposes and to prevent conflicts with
    multiple concurrent runs: later edits to the original file do not affect
    a run that was already started from the copy.

    :param path: path of the config file to copy
    :return: str, path of the temporary copy; the file is not removed
        automatically, so the caller owns it
    """
    # delete=False keeps the file on disk once the handle is closed. Leaving
    # the ``with`` block closes the handle immediately, so no file descriptor
    # is leaked (a bare mkstemp() hands back an open descriptor that must be
    # closed by hand).
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        tmp_path = tmp_file.name
    # Copy only after the handle is closed; copy2 also carries over metadata.
    shutil.copy2(path, tmp_path)
    return tmp_path
def parse_args():
    """
    Load the yaml config named on the command line.

    The file path is read from ``sys.argv[2]``. The file is first copied to a
    temporary location with ``make_conf_file_copy`` and the copy, not the
    original, is what gets loaded.

    :return: tuple of (config loaded by OmegaConf, path of the temporary copy)
    """
    # TODO make this nicer somehow
    snapshot_path = make_conf_file_copy(sys.argv[2])
    return OmegaConf.load(snapshot_path), snapshot_path
def _gpus_from_gres(gres):
    """Return the GPU count stored in the last ':'-separated field of a gres string."""
    count_field = str(gres).rsplit(":", 1)[-1]
    try:
        return int(count_field)
    except ValueError as err:
        raise ValueError(
            f"cannot read a GPU count from gres={gres!r}; "
            "expected a trailing ':<count>' such as 'gpu:4'"
        ) from err


def main():
    """
    Submit ``protoclr_obow.slurm_main`` as a job through submitit.

    Reads the config returned by ``parse_args``; uses ``cfg["job_name"]`` and
    the ``cfg["slurm"]`` section (keys ``mem_gb``, ``nodes``, ``timeout``,
    ``partition``, ``constraint``, ``comment`` and
    ``slurm_additional_parameters`` with a ``gres`` entry). Job files are
    written under ``./expts/<job_name>`` and the submitted job id is printed.

    :raises ValueError: if no GPU count can be read from the ``gres`` entry
    """
    cfg, cfg_path = parse_args()
    args = cfg["slurm"]

    # Parse the whole count field rather than only the last character, so
    # multi-digit counts such as "gpu:16" are not truncated to 6.
    ngpus = _gpus_from_gres(args["slurm_additional_parameters"]["gres"])

    exp_dir = Path(f'./expts/{cfg["job_name"]}')
    exp_dir.mkdir(parents=True, exist_ok=True)

    executor = submitit.AutoExecutor(folder=exp_dir)
    executor.update_parameters(
        name=cfg["job_name"],
        # GPUs are requested through the "gres" entry forwarded in
        # slurm_additional_parameters, not through gpus_per_node.
        # mem_gb in the config is scaled by the GPU count
        # (presumably a per-GPU amount -- confirm against the config files).
        mem_gb=args["mem_gb"] * ngpus,
        # One task per GPU.
        tasks_per_node=ngpus,
        cpus_per_task=8,
        nodes=args["nodes"],
        # The config value is multiplied by 60 to obtain minutes.
        timeout_min=args["timeout"] * 60,
        slurm_partition=args["partition"],
        slurm_signal_delay_s=120,
        slurm_constraint=args["constraint"],
        slurm_comment=args["comment"],
        slurm_additional_parameters=args["slurm_additional_parameters"],
    )

    # The job receives the path of the config copy and this process's UUID.
    job = executor.submit(protoclr_obow.slurm_main, cfg_path, UUID)
    print("Submitted job_id:", job.job_id)
# Run the submission only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()