-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvscode_slurm.py
More file actions
150 lines (132 loc) · 3.94 KB
/
Copy pathvscode_slurm.py
File metadata and controls
150 lines (132 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from argparse import ArgumentParser
import subprocess
# Slurm job name given to the tunnel job at submission (sbatch --job-name)
# and used to find it again when querying the queue (squeue --name).
JOB_NAME = "VScode_tunnel"
def get_parser():
    """Build the command-line parser of the tunnel launcher.

    The parser takes one positional argument, ``mode`` (``"start"`` or
    ``"stop"``), plus options describing how to reach the cluster and which
    resources to request for the job.

    Returns:
        ArgumentParser: the configured parser.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "mode",
        help='modes: use "start" to open the tunnel and "stop" to close the '
        'tunnel',
        type=str,
        # Reject anything else at parse time, with a usage message, instead
        # of failing later once remote commands have already been issued.
        choices=("start", "stop"),
    )
    parser.add_argument(
        "--ssh_command",
        dest="ssh_command",
        default="ssh candide.iap.fr",
        type=str,
        help="command used to ssh the cluster",
    )
    parser.add_argument(
        "--slurm_dir",
        dest="slurm_dir",
        default="/usr/local/slurm/latest/bin/",
        type=str,
        help="slurm directory",
    )
    parser.add_argument(
        "--n_cpu",
        dest="n_cpu",
        default=1,
        type=int,
        help="how many cores to use. Default 1",
    )
    parser.add_argument(
        "--mem",
        dest="mem",
        default=8,
        type=int,
        help="how much memory to allocate per cores (in Go). Default 8",
    )
    parser.add_argument(
        "--time",
        dest="time",
        default="4:00:00",
        type=str,
        help="walltime for the job. Default 4h",
    )
    parser.add_argument(
        "--log",
        dest="log",
        # The "$" is backslash-escaped so that the local shell leaves it
        # alone when the value is embedded in a double-quoted ssh command.
        default=r"\$HOME/vscode_tunnel.log",
        type=str,
        help='where to save the log (on the cluster). Default '
        '"$HOME/vscode_tunnel.log"',
    )
    parser.add_argument(
        "--node",
        dest="node",
        default=None,
        type=str,
        help="on which node we run VScode. Default let the cluster decide",
    )
    parser.add_argument(
        "--partition",
        dest="partition",
        default=None,
        type=str,
        help="which partition to use to submit the job. Default let the "
        "cluster decide",
    )
    parser.add_argument(
        "--exclusive",
        dest="exclusive",
        action="store_true",
        help="specify if the job should be exclusive",
    )
    return parser
def check_running(args):
    """Return the JobID of the running tunnel job, or "" if there is none.

    Runs ``squeue`` through ``args.ssh_command`` and asks for the current
    user's jobs named ``JOB_NAME`` that are in the running state
    (``--states=R``), printing only the JobID column without header.

    Args:
        args: parsed command-line namespace; ``ssh_command`` and
            ``slurm_dir`` are read.

    Returns:
        str: the JobID as a string, or an empty string when squeue prints
        nothing.

    Raises:
        Exception: if the command exits with a non-zero status, or if its
            standard output is not a single integer JobID.
    """
    cmd = (
        f"{args.ssh_command} "
        f"{args.slurm_dir}/squeue --me --name={JOB_NAME} "
        "--states=R -h -O JobID"
    )
    # Keep stdout and stderr apart: anything ssh writes on stderr (warnings,
    # notices) must not be mixed into the text parsed as a JobID.
    # shell=True is kept because ssh_command is a free-form shell string.
    proc = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    if proc.returncode != 0:
        details = proc.stderr.strip() or proc.stdout.strip()
        raise Exception(
            f"Something went wrong while requesting JobID: {details}"
        )

    res = proc.stdout.strip()
    if res == "":
        return res

    # Anything that is not a plain integer (several jobs, unexpected text)
    # is reported as an error rather than returned.
    try:
        int(res)
    except ValueError as err:
        raise Exception(
            f"Something went wrong while requesting JobID: {res}"
        ) from err
    return res
def main():
    """Command-line entry point: open or close the VScode tunnel job.

    In ``stop`` mode the running job named ``JOB_NAME`` is cancelled with
    ``scancel``; in ``start`` mode a new job wrapping ``code tunnel`` is
    submitted with ``sbatch``. In both modes the queue is queried before and
    after the command to decide what to do and to report the outcome.

    Raises:
        ValueError: if the mode is neither "start" nor "stop".
        Exception: if there is no tunnel to stop, if a tunnel is already
            open when starting, or if the check after the command shows
            that the tunnel did not close / did not open.
    """
    parser = get_parser()
    args = parser.parse_args()

    # Validate the mode first so that a typo fails immediately, without
    # paying for a remote queue query.
    if args.mode not in ("start", "stop"):
        raise ValueError(
            f'mode must be in ["start", "stop"] got: {args.mode}'
        )

    jobid = check_running(args)

    if args.mode == "stop":
        if jobid == "":
            raise Exception("No tunnel open")
        # "\$" keeps the local shell from expanding $(...), so the squeue
        # substitution is evaluated by the shell on the cluster side.
        cmd = (
            f'{args.ssh_command} "{args.slurm_dir}/scancel '
            rf' \$({args.slurm_dir}/squeue --me --name={JOB_NAME} '
            f'--states=R -h -O JobID)"'
        )
    else:
        if jobid != "":
            raise Exception(f"Tunnel already open. JobID: {jobid}")
        cmd = (
            f'{args.ssh_command} "{args.slurm_dir}/sbatch '
            f'--output={args.log} --job-name={JOB_NAME} --time={args.time} '
            f'--cpus-per-task={args.n_cpu} --mem-per-cpu={args.mem}G'
        )
        if args.exclusive:
            cmd += " --exclusive"
        if args.node is not None:
            cmd += f' --nodelist={args.node}'
        if args.partition is not None:
            cmd += f' --partition {args.partition}'
        # Inner quotes are escaped so they survive the outer double quotes
        # of the ssh command; the closing '"' ends the remote command.
        cmd += r' --wrap \"code tunnel\""'

    result = subprocess.getoutput(cmd)

    # NOTE(review): the queue is queried right after the command and only
    # for running jobs; a job that was submitted but has not started yet
    # would presumably be reported as a failure here — confirm whether a
    # short wait/poll is wanted.
    new_jobid = check_running(args)

    if args.mode == "stop":
        if new_jobid != "":
            raise Exception(
                f"Something went wrong while closing: {new_jobid}"
            )
        print(f"tunnel with JobID: {jobid} closed")
    else:
        if new_jobid == "":
            raise Exception(f"Tunnel failed to open: {result.strip()}")
        print(f"Tunnel with JobID: {new_jobid} open")