Skip to content

Commit dbd8a9c

Browse files
yonglimeta and yongl user authored
[GCM] Add schedule_exit and bf_exit sdiag counters to slurm monitor (#142)
* [GCM] Add schedule_exit and bf_exit sdiag counters to slurm monitor

  Summary: Extend the slurm monitor sdiag telemetry with the schedule_exit and bf_exit sub-section counters surfaced by sdiag --json. These counters expose why the main scheduler and the backfill scheduler stopped each cycle (e.g., end of job queue, max time, max job start, max RPC count), which gives much better visibility into scheduler tuning and saturation than the existing aggregate cycle stats alone.

  Test Plan: Updated and ran the existing sdiag JSON parsing tests against the checked-in sample-sdiag-output.json, including the missing-fields case. Ran 'pytest gcm/tests/test_slurm.py gcm/tests/test_slurm_rest_client.py' - all 25 tests passed.

* [GCM] Add schedule_exit and bf_exit sdiag counters to slurm monitor

  Summary: Extend the slurm monitor sdiag telemetry with the schedule_exit and bf_exit sub-section counters surfaced by sdiag --json. These counters expose why the main scheduler and the backfill scheduler stopped each cycle (e.g., end of job queue, max time, max job start, max RPC count), which gives much better visibility into scheduler tuning and saturation than the existing aggregate cycle stats alone.

  Test Plan: Updated and ran the existing sdiag JSON parsing tests against the checked-in sample-sdiag-output.json, including the missing-fields case. Ran 'pytest gcm/tests/test_slurm.py gcm/tests/test_slurm_rest_client.py' - all 25 tests passed.

---------

Co-authored-by: yongl user <yongl@yongl-login-0.yongl-login.tenant-slurm.svc.cluster.local>
1 parent d40284e commit dbd8a9c

4 files changed

Lines changed: 70 additions & 0 deletions

File tree

gcm/monitoring/slurm/client.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,8 @@ def sdiag_structured(self) -> Sdiag:
199199
subprocess.check_output(["sdiag", "--all", "--json"], text=True)
200200
)
201201
stats = sdiag_output["statistics"]
202+
schedule_exit = stats.get("schedule_exit") or {}
203+
bf_exit = stats.get("bf_exit") or {}
202204

203205
result = Sdiag(
204206
server_thread_count=stats.get("server_thread_count"),
@@ -224,6 +226,20 @@ def sdiag_structured(self) -> Sdiag:
224226
bf_cycle_sum=stats.get("bf_cycle_sum"),
225227
bf_cycle_max=stats.get("bf_cycle_max"),
226228
bf_queue_len=stats.get("bf_queue_len"),
229+
schedule_exit_end_job_queue=schedule_exit.get("end_job_queue"),
230+
schedule_exit_default_queue_depth=schedule_exit.get(
231+
"default_queue_depth"
232+
),
233+
schedule_exit_max_job_start=schedule_exit.get("max_job_start"),
234+
schedule_exit_max_rpc_cnt=schedule_exit.get("max_rpc_cnt"),
235+
schedule_exit_max_sched_time=schedule_exit.get("max_sched_time"),
236+
schedule_exit_licenses=schedule_exit.get("licenses"),
237+
bf_exit_end_job_queue=bf_exit.get("end_job_queue"),
238+
bf_exit_max_job_start=bf_exit.get("bf_max_job_start"),
239+
bf_exit_max_job_test=bf_exit.get("bf_max_job_test"),
240+
bf_exit_max_time=bf_exit.get("bf_max_time"),
241+
bf_exit_node_space_size=bf_exit.get("bf_node_space_size"),
242+
bf_exit_state_changed=bf_exit.get("state_changed"),
227243
)
228244

229245
# Reset sdiag counters after collection

gcm/monitoring/slurm/rest_client.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,8 @@ def sinfo(self) -> Iterable[str]:
113113
def sdiag_structured(self) -> Sdiag:
114114
data = self._get(f"/slurm/{self.api_version}/diag")
115115
stats = data.get("statistics", {})
116+
schedule_exit = stats.get("schedule_exit") or {}
117+
bf_exit = stats.get("bf_exit") or {}
116118
result = Sdiag(
117119
server_thread_count=stats.get("server_thread_count"),
118120
agent_queue_size=stats.get("agent_queue_size"),
@@ -137,6 +139,18 @@ def sdiag_structured(self) -> Sdiag:
137139
bf_cycle_sum=stats.get("bf_cycle_sum"),
138140
bf_cycle_max=stats.get("bf_cycle_max"),
139141
bf_queue_len=stats.get("bf_queue_len"),
142+
schedule_exit_end_job_queue=schedule_exit.get("end_job_queue"),
143+
schedule_exit_default_queue_depth=schedule_exit.get("default_queue_depth"),
144+
schedule_exit_max_job_start=schedule_exit.get("max_job_start"),
145+
schedule_exit_max_rpc_cnt=schedule_exit.get("max_rpc_cnt"),
146+
schedule_exit_max_sched_time=schedule_exit.get("max_sched_time"),
147+
schedule_exit_licenses=schedule_exit.get("licenses"),
148+
bf_exit_end_job_queue=bf_exit.get("end_job_queue"),
149+
bf_exit_max_job_start=bf_exit.get("bf_max_job_start"),
150+
bf_exit_max_job_test=bf_exit.get("bf_max_job_test"),
151+
bf_exit_max_time=bf_exit.get("bf_max_time"),
152+
bf_exit_node_space_size=bf_exit.get("bf_node_space_size"),
153+
bf_exit_state_changed=bf_exit.get("state_changed"),
140154
)
141155
return result
142156

gcm/schemas/slurm/sdiag.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,3 +35,19 @@ class Sdiag:
3535
bf_cycle_sum: Optional[int] = None
3636
bf_cycle_max: Optional[int] = None
3737
bf_queue_len: Optional[int] = None
38+
39+
# Schedule exit statistics
40+
schedule_exit_end_job_queue: Optional[int] = None
41+
schedule_exit_default_queue_depth: Optional[int] = None
42+
schedule_exit_max_job_start: Optional[int] = None
43+
schedule_exit_max_rpc_cnt: Optional[int] = None
44+
schedule_exit_max_sched_time: Optional[int] = None
45+
schedule_exit_licenses: Optional[int] = None
46+
47+
# Backfill exit statistics
48+
bf_exit_end_job_queue: Optional[int] = None
49+
bf_exit_max_job_start: Optional[int] = None
50+
bf_exit_max_job_test: Optional[int] = None
51+
bf_exit_max_time: Optional[int] = None
52+
bf_exit_node_space_size: Optional[int] = None
53+
bf_exit_state_changed: Optional[int] = None

gcm/tests/test_slurm.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -513,6 +513,18 @@ def test_parse_sdiag_json(
513513
bf_cycle_sum=371434634,
514514
bf_cycle_max=47125449,
515515
bf_queue_len=411,
516+
schedule_exit_end_job_queue=54,
517+
schedule_exit_default_queue_depth=0,
518+
schedule_exit_max_job_start=0,
519+
schedule_exit_max_rpc_cnt=0,
520+
schedule_exit_max_sched_time=281,
521+
schedule_exit_licenses=0,
522+
bf_exit_end_job_queue=10,
523+
bf_exit_max_job_start=0,
524+
bf_exit_max_job_test=0,
525+
bf_exit_max_time=0,
526+
bf_exit_node_space_size=0,
527+
bf_exit_state_changed=0,
516528
)
517529

518530
assert result == expected
@@ -572,6 +584,18 @@ def test_parse_sdiag_json_with_missing_fields(
572584
bf_cycle_sum=None,
573585
bf_cycle_max=None,
574586
bf_queue_len=None,
587+
schedule_exit_end_job_queue=None,
588+
schedule_exit_default_queue_depth=None,
589+
schedule_exit_max_job_start=None,
590+
schedule_exit_max_rpc_cnt=None,
591+
schedule_exit_max_sched_time=None,
592+
schedule_exit_licenses=None,
593+
bf_exit_end_job_queue=None,
594+
bf_exit_max_job_start=None,
595+
bf_exit_max_job_test=None,
596+
bf_exit_max_time=None,
597+
bf_exit_node_space_size=None,
598+
bf_exit_state_changed=None,
575599
)
576600

577601
assert result == expected

0 commit comments

Comments
 (0)