22
33import asyncio
44import logging
5- import random
65from dataclasses import dataclass
76
87from rich import box
1817logger = logging .getLogger (__name__ )
1918__all__ = ["status" ]
2019
# ---------------------------------------------------------------------------
# Data layer – replace these with real implementations later
# ---------------------------------------------------------------------------

# Cluster names; each one has an entry in both lookup tables below.
CLUSTERS = (
    "mila",
    "narval",
    "tamia",
    "rorqual",
    "fir",
    "nibi",
    "killarney",
    "vulcan",
    "trillium",
)

MOCK_DATA_SEED = 42  # deterministic seed so the display is reproducible
OFFLINE_PROBABILITY = 0.08  # ~8 % chance of a cluster being down/maintenance


# Rough GPU pool sizes per cluster (total GPUs available on the cluster).
_GPU_TOTALS: dict[str, int] = {
    "mila": 2048,
    "narval": 1024,
    "tamia": 512,
    "rorqual": 768,
    "fir": 640,
    "nibi": 256,
    "killarney": 384,
    "vulcan": 512,
    "trillium": 1280,
}

# Storage quota in GiB (home, scratch)
_STORAGE_QUOTAS: dict[str, tuple[int, int]] = {
    "mila": (50, 5000),
    "narval": (50, 10000),
    "tamia": (100, 8000),
    "rorqual": (100, 12000),
    "fir": (50, 6000),
    "nibi": (50, 4000),
    "killarney": (100, 7500),
    "vulcan": (100, 9000),
    "trillium": (50, 15000),
}
66-
6720
6821@dataclass
6922class JobStats :
@@ -80,7 +33,6 @@ class JobStats:
8033@dataclass
8134class StorageStats :
8235 """Disk usage as (used_gib, quota_gib) for $HOME and $SCRATCH."""
83-
8436 home_used : float
8537 home_quota : float
8638 scratch_used : float
@@ -96,8 +48,6 @@ class ClusterStatus:
9648 gpu_model : str
9749 jobs : JobStats
9850 storage : StorageStats
99- avg_wait_min : int | None = None # estimated queue wait time in minutes
100- avg_gpu_util_pct : float | None = None # average GPU utilisation across running jobs
10151
10252
10353# ---------------------------------------------------------------------------
@@ -282,69 +232,6 @@ async def get_all_cluster_statuses(
282232 return statuses , True
283233
284234
def get_mock_cluster_status(username: str = "you") -> list[ClusterStatus]:
    """Return fake but plausible status data for every configured cluster.

    This function is intentionally free of any UI logic so it can be swapped
    out for a real implementation that queries Slurm / the cluster APIs.

    Args:
        username: Currently unused; the generated figures do not depend on it.

    Returns:
        One ``ClusterStatus`` per entry of ``get_config().clusters``, in that
        order. All values are drawn from a ``random.Random`` seeded with
        ``MOCK_DATA_SEED``, so repeated calls with the same configuration
        produce the same data.

    Raises:
        KeyError: If a configured cluster has no entry in ``_GPU_TOTALS`` or
            ``_STORAGE_QUOTAS``.
    """
    # A fresh, seeded generator per call keeps the output reproducible.
    # NOTE: the order of the rng draws below determines the generated values;
    # do not reorder them.
    rng = random.Random(MOCK_DATA_SEED)

    gpu_models = ["A100", "H100", "V100", "A40", "RTX 8000"]

    results: list[ClusterStatus] = []
    for cluster in get_config().clusters:
        gpu_total = _GPU_TOTALS[cluster]
        # Simulate varying load – some clusters busier than others
        load_factor = rng.uniform(0.55, 0.98)
        gpu_busy = int(gpu_total * load_factor)
        gpu_idle = gpu_total - gpu_busy

        total_jobs = int(gpu_busy * rng.uniform(0.8, 1.4))
        pending = int(total_jobs * rng.uniform(0.1, 0.4))
        running = total_jobs - pending
        cancelled = int(total_jobs * rng.uniform(0.01, 0.05))
        completed = int(total_jobs * rng.uniform(0.5, 2.0))

        # The user's share is capped at 8 running / 4 pending jobs.
        my_running = rng.randint(0, min(8, running))
        my_pending = rng.randint(0, min(4, pending))
        # Scale completed jobs by the user's share of running jobs;
        # max(..., 1) guards against division by zero.
        my_completed = completed * my_running // max(running, 1)

        home_quota, scratch_quota = _STORAGE_QUOTAS[cluster]
        home_used = round(rng.uniform(5, home_quota * 0.90), 1)
        # The lower bound for scratch usage is the home quota value.
        scratch_used = round(rng.uniform(home_quota, scratch_quota * 0.95), 1)

        online = rng.random() > OFFLINE_PROBABILITY

        results.append(
            ClusterStatus(
                name=cluster,
                online=online,
                gpu_idle=gpu_idle,
                gpu_total=gpu_total,
                gpu_model=rng.choice(gpu_models),
                jobs=JobStats(
                    running=running,
                    pending=pending,
                    my_running=my_running,
                    my_pending=my_pending,
                    cancelled=cancelled,
                    completed=completed,
                    my_completed=my_completed,
                ),
                storage=StorageStats(
                    home_used=home_used,
                    home_quota=home_quota,
                    scratch_used=scratch_used,
                    scratch_quota=scratch_quota,
                ),
                avg_wait_min=rng.randint(2, 240),
                avg_gpu_util_pct=round(rng.uniform(40, 99), 1),
            )
        )
    return results
346-
347-
348235# ---------------------------------------------------------------------------
349236# UI helpers
350237# ---------------------------------------------------------------------------
@@ -380,27 +267,6 @@ def _gpu_bar(idle: int, total: int, width: int = 10) -> Text:
380267 return Text (f"{ bar_str } { idle :>5} /{ total } " , style = colour )
381268
382269
def _wait_text(minutes: int) -> Text:
    """Format an estimated queue wait time as a colour-coded ``Text``.

    Args:
        minutes: Estimated wait in minutes.

    Returns:
        ``~<N>m`` in green when under 15 minutes, ``~<N>m`` in yellow when
        under an hour, otherwise ``~<H>h<MM>m`` in red.
    """
    if minutes < 15:
        return Text(f"~{minutes}m", style="green")
    if minutes < 60:
        return Text(f"~{minutes}m", style="yellow")
    # An hour or more: split into hours and zero-padded minutes.
    hours, remainder = divmod(minutes, 60)
    return Text(f"~{hours}h{remainder:02d}m", style="red")
392-
393-
def _util_text(pct: float) -> Text:
    """Format a GPU utilisation percentage as a colour-coded ``Text``.

    Args:
        pct: Utilisation in percent.

    Returns:
        The value rounded to a whole percent (e.g. ``87%``), styled green at
        80 and above, yellow from 55 up to (but excluding) 80, red below 55.
    """
    if pct >= 80:
        colour = "green"
    elif pct >= 55:
        colour = "yellow"
    else:
        colour = "red"
    return Text(f"{pct:.0f}%", style=colour)
402-
403-
404270# ---------------------------------------------------------------------------
405271# Main display
406272# ---------------------------------------------------------------------------
@@ -422,8 +288,6 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table:
422288 table .add_column ("Free GPUs" , justify = "left" , ratio = 2 )
423289 table .add_column ("My jobs\n run/pend" , justify = "center" , ratio = 1 )
424290 table .add_column ("All jobs\n run/pend" , justify = "center" , ratio = 1 )
425- table .add_column ("Avg wait" , justify = "center" , ratio = 1 )
426- table .add_column ("GPU util" , justify = "center" , ratio = 1 )
427291 table .add_column ("$HOME" , justify = "left" , ratio = 2 )
428292 table .add_column ("$SCRATCH" , justify = "left" , ratio = 2 )
429293
@@ -449,10 +313,6 @@ def _build_cluster_table(data: list[ClusterStatus]) -> Table:
449313 _gpu_bar (c .gpu_idle , c .gpu_total ),
450314 my_jobs ,
451315 all_jobs ,
452- Text ("—" , style = "dim" ) if c .avg_wait_min is None else _wait_text (c .avg_wait_min ),
453- Text ("—" , style = "dim" )
454- if c .avg_gpu_util_pct is None
455- else _util_text (c .avg_gpu_util_pct ),
456316 home_bar ,
457317 scratch_bar ,
458318 style = row_style ,
@@ -544,18 +404,11 @@ async def status(clusters: list[str] | None = None):
544404
545405 if not is_live :
546406 console .print (
547- "[yellow]No active cluster connections found. "
548- "Run [bold]cluv login[/bold] first, or showing mock data.[/yellow]\n "
407+ "[yellow]No active cluster connections found. Run [bold]cluv login[/bold] first.[/yellow]"
549408 )
550- mock = get_mock_cluster_status ()
551- # When specific clusters were requested, only show mock rows for those.
552- data = [c for c in mock if not clusters or c .name in clusters ]
553- label = "[dim](mock data)[/dim]"
554- else :
555- label = "[dim](live data)[/dim]"
556409
557410 console .print ()
558- console .rule (f "[bold cyan]cluv status[/bold cyan] { label } " )
411+ console .rule ("[bold cyan]cluv status[/bold cyan]" )
559412 console .print ()
560413
561414 console .print (_build_cluster_table (data ))
0 commit comments