Skip to content

Commit 57a7b2c

Browse files
authored
Transition mila code to RemoteV2 [MT-80] (#105)
* Add new version of `code` command Signed-off-by: Fabrice Normandin <[email protected]> * Move the `check_disk_quota` fn to a new module Signed-off-by: Fabrice Normandin <[email protected]> * Update vscode utils and progress bar Signed-off-by: Fabrice Normandin <[email protected]> * Fix bug in _find_allocation (mila code --node) Signed-off-by: Fabrice Normandin <[email protected]> * Tweak logging of warnings and run commands Signed-off-by: Fabrice Normandin <[email protected]> * Fix big stacktrace printing on KeyboardInterrupt Signed-off-by: Fabrice Normandin <[email protected]> * Make job-name optional when fetching test jobs Signed-off-by: Fabrice Normandin <[email protected]> * Add tests for `internet_on_compute_nodes` Signed-off-by: Fabrice Normandin <[email protected]> * Rename test file, adapt test slightly Signed-off-by: Fabrice Normandin <[email protected]> * Add tests for the new `code` implementation Signed-off-by: Fabrice Normandin <[email protected]> * Explicitly set the job name when running tests Signed-off-by: Fabrice Normandin <[email protected]> * Fixup for test_code_v1 Signed-off-by: Fabrice Normandin <[email protected]> * Apply suggestion from code review Signed-off-by: Fabrice Normandin <[email protected]> * Use only `command` instead of `code_command` Signed-off-by: Fabrice Normandin <[email protected]> * Broaden the try-except in vscode loop Signed-off-by: Fabrice Normandin <[email protected]> * Remove check for in a test in parallel_progress.py Signed-off-by: Fabrice Normandin <[email protected]> * Remove _get_runner_and_hostname function Signed-off-by: Fabrice Normandin <[email protected]> * Fix doctest for async progress bar Signed-off-by: Fabrice Normandin <[email protected]> * Simplify test code for async progress bar Signed-off-by: Fabrice Normandin <[email protected]> * Simplify `test_code.py`'s `existing_job` fixture Signed-off-by: Fabrice Normandin <[email protected]> * Fix `parse_args` and `mila` in commands.py Signed-off-by: Fabrice Normandin <[email protected]> * Show a hint about relative paths Signed-off-by: Fabrice Normandin <[email protected]> * Don't cancel persistent job when using --job/node Signed-off-by: Fabrice Normandin <[email protected]> * Fix integration test for `code` with existing job Signed-off-by: Fabrice Normandin <[email protected]> * Fix test for install_vscode_extensions task fn Signed-off-by: Fabrice Normandin <[email protected]> * Add filter for line that may vary in salloc output Signed-off-by: Fabrice Normandin <[email protected]> --------- Signed-off-by: Fabrice Normandin <[email protected]>
1 parent bf82ca9 commit 57a7b2c

30 files changed

+1796
-791
lines changed

milatools/cli/code.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
from __future__ import annotations
2+
3+
import asyncio
4+
import shutil
5+
from logging import getLogger as get_logger
6+
from pathlib import PurePosixPath
7+
from typing import Awaitable
8+
9+
from milatools.cli import console
10+
from milatools.cli.init_command import DRAC_CLUSTERS
11+
from milatools.cli.utils import (
12+
CommandNotFoundError,
13+
MilatoolsUserError,
14+
currently_in_a_test,
15+
internet_on_compute_nodes,
16+
)
17+
from milatools.utils.compute_node import ComputeNode, salloc, sbatch
18+
from milatools.utils.disk_quota import check_disk_quota
19+
from milatools.utils.local_v2 import LocalV2
20+
from milatools.utils.remote_v2 import RemoteV2
21+
from milatools.utils.vscode_utils import sync_vscode_extensions
22+
23+
logger = get_logger(__name__)
24+
25+
26+
async def code(
27+
path: str,
28+
command: str,
29+
persist: bool,
30+
job: int | None,
31+
node: str | None,
32+
alloc: list[str],
33+
cluster: str = "mila",
34+
) -> ComputeNode | int:
35+
"""Open a remote VSCode session on a compute node.
36+
37+
Arguments:
38+
path: Path to open on the remote machine
39+
command: Command to use to start vscode (defaults to "code" or the value of \
40+
$MILATOOLS_CODE_COMMAND)
41+
persist: Whether the server should persist or not after exiting the terminal.
42+
job: ID of the job to connect to
43+
node: Name of the node to connect to
44+
alloc: Extra options to pass to slurm
45+
"""
46+
# Check that the `code` command is in the $PATH so that we can use just `code` as
47+
# the command.
48+
if not shutil.which(command):
49+
raise CommandNotFoundError(command)
50+
51+
if (job or node) and not persist:
52+
logger.warning("Assuming persist=True since a job or node was specified.")
53+
persist = True
54+
55+
# Connect to the cluster's login node.
56+
login_node = await RemoteV2.connect(cluster)
57+
58+
relative_path: PurePosixPath | None = None
59+
# Get $HOME because we have to give the full path to the folder to the code command.
60+
home = PurePosixPath(
61+
await login_node.get_output_async("echo $HOME", display=False, hide=True)
62+
)
63+
if not path.startswith("/"):
64+
relative_path = PurePosixPath(path)
65+
path = str(home if path == "." else home / path)
66+
elif (_path := PurePosixPath(path)).is_relative_to(home):
67+
relative_path = _path.relative_to(home)
68+
console.log(
69+
f"Hint: you can use a path relative to your $HOME instead of an absolute path.\n"
70+
f"For example, `mila code {path}` is the same as `mila code {relative_path}`.",
71+
highlight=True,
72+
markup=True,
73+
)
74+
75+
try:
76+
await check_disk_quota(login_node)
77+
except MilatoolsUserError:
78+
# Raise errors that are meant to be shown to the user (disk quota is reached).
79+
raise
80+
except Exception as exc:
81+
logger.warning(
82+
f"Unable to check the disk-quota on the {cluster} cluster: {exc}"
83+
)
84+
85+
# NOTE: Perhaps we could eventually do this check dynamically, if the cluster is an
86+
# unknown cluster?
87+
sync_vscode_extensions_task = None
88+
if not internet_on_compute_nodes(cluster):
89+
# Sync the VsCode extensions from the local machine over to the target cluster.
90+
console.log(
91+
f"Installing VSCode extensions that are on the local machine on "
92+
f"{cluster}.",
93+
style="cyan",
94+
)
95+
# todo: use the mila or the local machine as the reference for vscode
96+
# extensions?
97+
# TODO: If the remote is a cluster that doesn't yet have `vscode-server`, we
98+
# could launch vscode at the same time (or before) syncing the vscode extensions?
99+
sync_vscode_extensions_task = sync_vscode_extensions(
100+
LocalV2(),
101+
[login_node],
102+
)
103+
104+
compute_node_task: Awaitable[ComputeNode]
105+
if job or node:
106+
if job and node:
107+
logger.warning(
108+
"Both job ID and node name were specified. Ignoring the node name and "
109+
"only using the job id."
110+
)
111+
job_id_or_node = job or node
112+
assert job_id_or_node is not None
113+
compute_node_task = ComputeNode.connect(
114+
login_node=login_node, job_id_or_node_name=job_id_or_node
115+
)
116+
else:
117+
if cluster in DRAC_CLUSTERS and not any("--account" in flag for flag in alloc):
118+
logger.warning(
119+
"Warning: When using the DRAC clusters, you usually need to "
120+
"specify the account to use when submitting a job. You can specify "
121+
"this in the job resources with `--alloc`, like so: "
122+
"`--alloc --account=<account_to_use>`, for example:\n"
123+
f"mila code some_path --cluster {cluster} --alloc "
124+
f"--account=your-account-here"
125+
)
126+
# Set the job name to `mila-code`. This should not be changed by the user
127+
# ideally, so we can collect some simple stats about the use of `milatools` on
128+
# the clusters.
129+
if any(flag.split("=")[0] in ("-J", "--job-name") for flag in alloc):
130+
raise MilatoolsUserError(
131+
"The job name flag (--job-name or -J) should be left unset for now "
132+
"because we use the job name to measure how many people use `mila "
133+
"code` on the various clusters. We also make use of the job name when "
134+
"the call to `salloc` is interrupted before we have a chance to know "
135+
"the job id."
136+
)
137+
job_name = "mila-code"
138+
alloc = alloc + [f"--job-name={job_name}"]
139+
140+
if persist:
141+
compute_node_task = sbatch(
142+
login_node, sbatch_flags=alloc, job_name=job_name
143+
)
144+
else:
145+
# NOTE: Here we actually need the job name to be known, so that we can
146+
# scancel jobs if the call is interrupted.
147+
compute_node_task = salloc(
148+
login_node, salloc_flags=alloc, job_name=job_name
149+
)
150+
151+
if sync_vscode_extensions_task is not None:
152+
# Sync the vscode extensions at the same time as waiting for the job.
153+
# Wait until all extensions are done syncing before launching vscode.
154+
# If any of the tasks failed, we want to raise the exception.
155+
# NOTE: Not using this at the moment because when interrupted, the job request
156+
# isn't cancelled properly.
157+
compute_node, _ = await asyncio.gather(
158+
compute_node_task,
159+
sync_vscode_extensions_task,
160+
)
161+
else:
162+
compute_node = await compute_node_task
163+
164+
await launch_vscode_loop(command, compute_node, path)
165+
166+
if not persist and not (job or node):
167+
# Cancel the job if it was not persistent.
168+
# (--job and --node are used to connect to persistent jobs)
169+
await compute_node.close_async()
170+
console.print(f"Ended session on '{compute_node.hostname}'")
171+
return compute_node.job_id
172+
173+
console.print("This allocation is persistent and is still active.")
174+
console.print("To reconnect to this job, run the following:")
175+
console.print(
176+
f" mila code {relative_path or path} "
177+
+ (f"--cluster {cluster} " if cluster != "mila" else "")
178+
+ f"--job {compute_node.job_id}",
179+
style="bold",
180+
)
181+
console.print("To kill this allocation:")
182+
console.print(f" ssh {cluster} scancel {compute_node.job_id}", style="bold")
183+
return compute_node
184+
185+
186+
async def launch_vscode_loop(code_command: str, compute_node: ComputeNode, path: str):
187+
while True:
188+
code_command_to_run = (
189+
code_command,
190+
"--new-window",
191+
"--wait",
192+
"--remote",
193+
f"ssh-remote+{compute_node.hostname}",
194+
path,
195+
)
196+
await LocalV2.run_async(code_command_to_run, display=True)
197+
# TODO: BUG: This now requires two Ctrl+C's instead of one!
198+
console.print(
199+
"The editor was closed. Reopen it with <Enter> or terminate the "
200+
"process with <Ctrl+C> (maybe twice)."
201+
)
202+
if currently_in_a_test():
203+
# NOTE: This early exit kills the job when it is not persistent.
204+
break
205+
try:
206+
input()
207+
except KeyboardInterrupt:
208+
break
209+
except asyncio.CancelledError:
210+
raise
211+
except Exception as exc:
212+
logger.error(f"Error while waiting for user input: {exc}")
213+
break

0 commit comments

Comments
 (0)