From fbfbba4552c0bcb87722f8ba941cbf1e46c30bdb Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 15 May 2026 11:13:18 -0400 Subject: [PATCH 1/9] Add --show argument to status command for customizable output --- cluv/__main__.py | 8 ++++++-- cluv/cli/status.py | 15 ++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cluv/__main__.py b/cluv/__main__.py index 1c71424..ba519d4 100644 --- a/cluv/__main__.py +++ b/cluv/__main__.py @@ -149,8 +149,12 @@ def add_status_args(subparsers: Subparsers) -> argparse.ArgumentParser: metavar="", help=("Cluster(s) to query. Leave empty to query all clusters with an active connection."), ) - # TODO: Add sub-commands to query the status with respect to different things, GPUs, storage, jobs, etc? - # Or just display everything? + status_parser.add_argument( + "--show", + choices=["clusters", "jobs", "all"], + default="all", + help="Which table to display: cluster overview, jobs overview, or both (default: all).", + ) status_parser.set_defaults(func=status) return status_parser diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 2c9ef21..10917a4 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -381,7 +381,7 @@ def _build_legend() -> Panel: return Panel(legend, title="Legend", border_style="dim", padding=(0, 1)) -async def status(clusters: list[str] | None = None): +async def status(clusters: list[str] | None = None, show: str = "all"): """Gets the status of available clusters. - Gives you an overview of the state of each cluster, and displays an overview of the state of your jobs across the clusters. 
- Displays the number of idle nodes, or the number of idle GPUs, or something similar, for each cluster @@ -411,9 +411,10 @@ async def status(clusters: list[str] | None = None): console.rule("[bold cyan]cluv status[/bold cyan]") console.print() - console.print(_build_cluster_table(data)) - console.print() - console.print(_build_my_jobs_table(data)) - console.print() - console.print(_build_legend()) - console.print() + if show in ("clusters", "all"): + console.print(_build_cluster_table(data)) + console.print(_build_legend()) + console.print() + if show in ("jobs", "all"): + console.print(_build_my_jobs_table(data)) + console.print() From f2a79fa8c9908cb7ea2415040579446652d9d8b2 Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 15 May 2026 11:18:22 -0400 Subject: [PATCH 2/9] Rework tables title --- cluv/cli/status.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 10917a4..7e32760 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -274,11 +274,11 @@ def _gpu_bar(idle: int, total: int, width: int = 10) -> Text: def _build_cluster_table(data: list[ClusterStatus]) -> Table: table = Table( - title="[bold cyan]Cluster Overview[/bold cyan]", + title="Cluster Overview", box=box.ROUNDED, show_lines=True, header_style="bold white on #1a1a2e", - title_style="bold", + title_style="bold cyan", expand=True, ) @@ -323,9 +323,10 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table: def _build_my_jobs_table(data: list[ClusterStatus]) -> Table: table = Table( - title="[bold cyan]Your Jobs Summary[/bold cyan]", + title="Jobs Overview", box=box.SIMPLE_HEAVY, header_style="bold white on #1a1a2e", + title_style="bold cyan", expand=True, ) table.add_column("Cluster", style="bold magenta") From 01e5a79da60a66e11b1dfe6d0284d442eba9231f Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 09:52:50 -0400 Subject: [PATCH 3/9] Refactor status command to use a 
table argument for customizable output --- cluv/__main__.py | 13 ++++--------- cluv/cli/status.py | 7 ++++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cluv/__main__.py b/cluv/__main__.py index ba519d4..8223b14 100644 --- a/cluv/__main__.py +++ b/cluv/__main__.py @@ -139,20 +139,15 @@ def add_submit_args( def add_status_args(subparsers: Subparsers) -> argparse.ArgumentParser: status_parser = subparsers.add_parser( "status", - help="Get the status of available clusters.", + help="Get the status of clusters and jobs.", formatter_class=rich_argparse.RichHelpFormatter, ) status_parser.add_argument( - "clusters", - nargs="*", - default=None, - metavar="", - help=("Cluster(s) to query. Leave empty to query all clusters with an active connection."), - ) - status_parser.add_argument( - "--show", + "table", + nargs="?", choices=["clusters", "jobs", "all"], default="all", + metavar="", help="Which table to display: cluster overview, jobs overview, or both (default: all).", ) status_parser.set_defaults(func=status) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 7e32760..5dcc45e 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -382,12 +382,13 @@ def _build_legend() -> Panel: return Panel(legend, title="Legend", border_style="dim", padding=(0, 1)) -async def status(clusters: list[str] | None = None, show: str = "all"): +async def status(table: str) -> None: """Gets the status of available clusters. - Gives you an overview of the state of each cluster, and displays an overview of the state of your jobs across the clusters. 
- Displays the number of idle nodes, or the number of idle GPUs, or something similar, for each cluster """ console = Console() + clusters = get_config().clusters_names clusters = list(clusters or []) if clusters: @@ -412,10 +413,10 @@ async def status(clusters: list[str] | None = None, show: str = "all"): console.rule("[bold cyan]cluv status[/bold cyan]") console.print() - if show in ("clusters", "all"): + if table in ("clusters", "all"): console.print(_build_cluster_table(data)) console.print(_build_legend()) console.print() - if show in ("jobs", "all"): + if table in ("jobs", "all"): console.print(_build_my_jobs_table(data)) console.print() From 49325507a8ddfea406b0ec7e244de983c098ff65 Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 11:59:40 -0400 Subject: [PATCH 4/9] Move StorageStats --- cluv/cli/status.py | 24 ++++++++---------------- cluv/slurm.py | 10 +++++++++- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 5dcc45e..02e0099 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -13,6 +13,14 @@ from cluv.cli.login import get_remote_without_2fa_prompt from cluv.config import get_config from cluv.remote import Remote +from cluv.slurm import ( + StorageStats, + parse_disk_quota, + parse_diskusage_report, + parse_partition_stats, + parse_savail, + parse_sinfo_nodes, +) logger = logging.getLogger(__name__) __all__ = ["status"] @@ -30,15 +38,6 @@ class JobStats: my_completed: int | None = None # recently completed jobs for the current user -@dataclass -class StorageStats: - """Disk usage as (used_gib, quota_gib) for $HOME and $SCRATCH.""" - home_used: float - home_quota: float - scratch_used: float - scratch_quota: float - - @dataclass class ClusterStatus: name: str @@ -97,13 +96,6 @@ async def get_real_cluster_status(remote: Remote) -> ClusterStatus: Uses a single SSH round-trip. Falls back gracefully when commands are unavailable (e.g. 
partition-stats is DRAC-only). """ - from cluv.slurm import ( - parse_disk_quota, - parse_diskusage_report, - parse_partition_stats, - parse_savail, - parse_sinfo_nodes, - ) cluster = remote.hostname script = _REMOTE_SCRIPT_MILA if cluster in _MILA_CLUSTERS else _REMOTE_SCRIPT_DRAC diff --git a/cluv/slurm.py b/cluv/slurm.py index 7988d58..934c994 100644 --- a/cluv/slurm.py +++ b/cluv/slurm.py @@ -6,8 +6,16 @@ from __future__ import annotations import re +from dataclasses import dataclass -from cluv.cli.status import StorageStats + +@dataclass +class StorageStats: + """Disk usage as (used_gib, quota_gib) for $HOME and $SCRATCH.""" + home_used: float + home_quota: float + scratch_used: float + scratch_quota: float # --------------------------------------------------------------------------- From 11e4c2867e57e85b96039ce2128bd93328bacb6b Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 12:25:09 -0400 Subject: [PATCH 5/9] Refactor cluster status retrieval --- cluv/cli/status.py | 88 ++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 57 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 02e0099..7a0f324 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -12,7 +12,6 @@ from cluv.cli.login import get_remote_without_2fa_prompt from cluv.config import get_config -from cluv.remote import Remote from cluv.slurm import ( StorageStats, parse_disk_quota, @@ -49,6 +48,18 @@ class ClusterStatus: storage: StorageStats +def get_default_cluster_status(cluster: str) -> ClusterStatus: + return ClusterStatus( + name=cluster, + online=False, + gpu_idle=0, + gpu_total=0, + gpu_model="?", + jobs=JobStats(running=0, pending=0, my_running=0, my_pending=0), + storage=StorageStats(home_used=0, home_quota=0, scratch_used=0, scratch_quota=0), + ) + + # --------------------------------------------------------------------------- # Real data layer # 
--------------------------------------------------------------------------- @@ -90,14 +101,20 @@ class ClusterStatus: _MILA_CLUSTERS = {"mila"} -async def get_real_cluster_status(remote: Remote) -> ClusterStatus: +async def get_real_cluster_status(cluster: str) -> ClusterStatus: """Fetch live Slurm data from a remote cluster and return a ClusterStatus. Uses a single SSH round-trip. Falls back gracefully when commands are unavailable (e.g. partition-stats is DRAC-only). """ - cluster = remote.hostname + # Use get_remote_without_2fa_prompt directly so we never filter out the + # "current" cluster the way login() does. A working socket for mila is + # perfectly usable even when /home/mila is mounted locally. + remote = await get_remote_without_2fa_prompt(cluster) + if remote is None: + return get_default_cluster_status(cluster) + script = _REMOTE_SCRIPT_MILA if cluster in _MILA_CLUSTERS else _REMOTE_SCRIPT_DRAC try: @@ -109,15 +126,7 @@ async def get_real_cluster_status(remote: Remote) -> ClusterStatus: ) except Exception as exc: logger.warning(f"[red]Could not reach {cluster}: {exc}[/red]") - return ClusterStatus( - name=cluster, - online=False, - gpu_idle=0, - gpu_total=0, - gpu_model="?", - jobs=JobStats(running=0, pending=0, my_running=0, my_pending=0), - storage=StorageStats(home_used=0, home_quota=0, scratch_used=0, scratch_quota=0), - ) + return get_default_cluster_status(cluster) parts = raw.split(_SEP) # Pad in case some sections are missing @@ -197,33 +206,6 @@ async def get_real_cluster_status(remote: Remote) -> ClusterStatus: ) -async def get_all_cluster_statuses( - remotes: list[Remote] | None = None, -) -> tuple[list[ClusterStatus], bool]: - """Query clusters in parallel. - - If *remotes* is provided, query exactly those connections. - Otherwise, query all clusters that already have an active SSH connection - (never blocks on 2FA). - - Returns (statuses, any_live) where any_live is False when no cluster - was reachable. 
- """ - if remotes is None: - clusters = get_config().clusters - remotes = [ - r - for r in await asyncio.gather(*(get_remote_without_2fa_prompt(c) for c in clusters)) - if r is not None - ] - - if not remotes: - return [], False - - statuses = list(await asyncio.gather(*(get_real_cluster_status(r) for r in remotes))) - return statuses, True - - # --------------------------------------------------------------------------- # UI helpers # --------------------------------------------------------------------------- @@ -285,9 +267,9 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table: for c in data: if not c.online: - status_cell = Text("⚠ offline", style="bold red") + status_cell = Text("⚠ disconnected", style="bold red") else: - status_cell = Text("● online", style="bold green") + status_cell = Text("● connected", style="bold green") my_jobs = Text(f"{c.jobs.my_running} / {c.jobs.my_pending}", style="cyan") all_jobs = Text(f"{c.jobs.running} / {c.jobs.pending}", style="white") @@ -381,24 +363,16 @@ async def status(table: str) -> None: """ console = Console() clusters = get_config().clusters_names - clusters = list(clusters or []) - - if clusters: - # Use get_remote_without_2fa_prompt directly so we never filter out the - # "current" cluster the way login() does. A working socket for mila is - # perfectly usable even when /home/mila is mounted locally. - remotes = [ - r - for r in await asyncio.gather(*(get_remote_without_2fa_prompt(c) for c in clusters)) - if r is not None - ] - data, is_live = await get_all_cluster_statuses(remotes=remotes) - else: - data, is_live = await get_all_cluster_statuses() - if not is_live: + # Query clusters in parallel + data: list[ClusterStatus] = [ + d for d in await asyncio.gather(*(get_real_cluster_status(c) for c in clusters)) + ] + + # Show a tip message if all clusters are offline, which likely means the user hasn't logged in yet (no control sockets). 
+ if all(not c.online for c in data): console.print( - "[yellow]No active cluster connections found. Run [bold]cluv login[/bold] first.[/yellow]" + "[yellow]No active connections to any clusters found. Run [bold]cluv login[/bold] first.[/yellow]" ) console.print() From a9387536e53f53f42d96e922780bea66af3bdb14 Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 13:19:34 -0400 Subject: [PATCH 6/9] Rework legend --- cluv/cli/status.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 7a0f324..685ea2c 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -347,11 +347,11 @@ def _build_my_jobs_table(data: list[ClusterStatus]) -> Table: def _build_legend() -> Panel: legend = ( + "[green]●[/green] connected " + "[red]⚠[/red] disconnected " "[green]▰[/green] free GPU " "[red]▱[/red] busy GPU " "[green]█[/green]/[yellow]█[/yellow]/[red]█[/red] disk usage (low/med/high) " - "[green]●[/green] online " - "[red]⚠[/red] offline" ) return Panel(legend, title="Legend", border_style="dim", padding=(0, 1)) From 70461a14e57a2e6c69625afbbaa9f3214e55f3bc Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 13:19:40 -0400 Subject: [PATCH 7/9] Add spinner --- cluv/cli/status.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 685ea2c..42eb56f 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -365,9 +365,10 @@ async def status(table: str) -> None: clusters = get_config().clusters_names # Query clusters in parallel - data: list[ClusterStatus] = [ - d for d in await asyncio.gather(*(get_real_cluster_status(c) for c in clusters)) - ] + with console.status("Fetching cluster status...",): + data: list[ClusterStatus] = [ + d for d in await asyncio.gather(*(get_real_cluster_status(c) for c in clusters)) + ] # Show a tip message if all clusters are offline, which likely means the user hasn't
logged in yet (no control sockets). if all(not c.online for c in data): From a8b931c51c29ff9dcb2c8c0890442e070d5ca16f Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 13:24:50 -0400 Subject: [PATCH 8/9] Update status indicators from "connected/disconnected" to "online/offline" --- cluv/cli/status.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index 42eb56f..b10b323 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ -267,9 +267,9 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table: for c in data: if not c.online: - status_cell = Text("⚠ disconnected", style="bold red") + status_cell = Text("⚠ offline", style="bold red") else: - status_cell = Text("● connected", style="bold green") + status_cell = Text("● online", style="bold green") my_jobs = Text(f"{c.jobs.my_running} / {c.jobs.my_pending}", style="cyan") all_jobs = Text(f"{c.jobs.running} / {c.jobs.pending}", style="white") @@ -347,11 +347,11 @@ def _build_my_jobs_table(data: list[ClusterStatus]) -> Table: def _build_legend() -> Panel: legend = ( - "[green]●[/green] connected " - "[red]⚠[/red] disconnected " + "[green]●[/green] online " + "[red]⚠[/red] offline " "[green]▰[/green] free GPU " "[red]▱[/red] busy GPU " - "[green]█[/green]/[yellow]█[/yellow]/[red]█[/red] disk usage (low/med/high) " + "[green]█[/green]/[yellow]█[/yellow]/[red]█[/red] disk usage (low/med/high)" ) return Panel(legend, title="Legend", border_style="dim", padding=(0, 1)) From e03564fea7fc3e3abc65646e1470029870fd72d1 Mon Sep 17 00:00:00 2001 From: Hugo Vandenbroucke-Menu Date: Fri, 29 May 2026 14:49:41 -0400 Subject: [PATCH 9/9] Refactor cluster status table display and update status indicators --- cluv/cli/status.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/cluv/cli/status.py b/cluv/cli/status.py index b10b323..a1b6794 100644 --- a/cluv/cli/status.py +++ b/cluv/cli/status.py @@ 
-257,20 +257,15 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table: ) table.add_column("Cluster", style="bold", ratio=1) - table.add_column("Status", justify="center", ratio=1) - table.add_column("GPU model", justify="center", ratio=1) - table.add_column("Free GPUs", justify="left", ratio=2) + table.add_column("GPU model", justify="center", ratio=2) + table.add_column("Free GPUs", justify="left", ratio=1) table.add_column("My jobs\nrun/pend", justify="center", ratio=1) table.add_column("All jobs\nrun/pend", justify="center", ratio=1) table.add_column("$HOME", justify="left", ratio=2) table.add_column("$SCRATCH", justify="left", ratio=2) for c in data: - if not c.online: - status_cell = Text("⚠ offline", style="bold red") - else: - status_cell = Text("● online", style="bold green") - + status_cell = Text("● ", style="bold green") if c.online else Text("⚠ ", style="bold red") my_jobs = Text(f"{c.jobs.my_running} / {c.jobs.my_pending}", style="cyan") all_jobs = Text(f"{c.jobs.running} / {c.jobs.pending}", style="white") @@ -281,8 +276,7 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table: row_style = "dim" if not c.online else "" table.add_row( - Text(c.name, style="bold magenta" if c.online else "dim"), - status_cell, + status_cell + Text(c.name, style="bold magenta" if c.online else "bold bright_black"), Text(c.gpu_model, style="bright_blue"), _gpu_bar(c.gpu_idle, c.gpu_total), my_jobs, @@ -365,7 +359,7 @@ async def status(table: str) -> None: clusters = get_config().clusters_names # Query clusters in parallel - with console.status("Fetching cluster status...",): + with console.status("Fetching clusters status..."): data: list[ClusterStatus] = [ d for d in await asyncio.gather(*(get_real_cluster_status(c) for c in clusters)) ]