Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions gcm/monitoring/slurm/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,3 +278,50 @@ def parse_scontrol_maxnodes(v: str) -> int:
def parse_job_ids(s: str) -> list[str]:
    """Split a comma separated string of job ids into a list of job ids.

    An empty input string yields an empty list rather than a list holding a
    single empty string.
    """
    if not s:
        return []
    return s.split(",")


def parse_gres_gpu_indices(v: str) -> str | None:
"""Parse gres_detail to extract GPU indices for single-node jobs.

The input is a comma-joined string of gres_detail entries from the SLURM REST
API (joined by _map_job_fields). Each entry looks like "gpu:ampere:1(IDX:7)"
or "gpu:ampere:4(IDX:0-3)".

Returns a comma-separated string of GPU indices (e.g., "7" or "0,1,2,3") for
single-node jobs. Returns None for multi-node jobs (multiple IDX entries) or
parse failures.

Examples:

>>> parse_gres_gpu_indices("gpu:ampere:1(IDX:7)")
'7'
>>> parse_gres_gpu_indices("gpu:ampere:3(IDX:0,3,5)")
'0,3,5'
>>> parse_gres_gpu_indices("gpu:ampere:4(IDX:0-3)")
'0,1,2,3'
>>> parse_gres_gpu_indices("gpu:ampere:8(IDX:0-7)")
'0,1,2,3,4,5,6,7'
>>> parse_gres_gpu_indices("gpu:ampere:8(IDX:0-7),gpu:ampere:8(IDX:0-7)")
>>> parse_gres_gpu_indices("")
>>> parse_gres_gpu_indices("(null)")
"""
if not v or v in {"N/A", "(null)", "[]"}:
return None

idx_matches = re.findall(r"IDX:([0-9,\-]+)", v)
if len(idx_matches) != 1:
# Multi-node (multiple IDX entries) or no IDX found
return None

indices: list[int] = []
for part in idx_matches[0].split(","):
if "-" in part:
start_s, end_s = part.split("-", 1)
indices.extend(range(int(start_s), int(end_s) + 1))
else:
indices.append(int(part))

if not indices:
return None

return ",".join(str(i) for i in sorted(indices))
12 changes: 11 additions & 1 deletion gcm/schemas/slurm/squeue.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
from dataclasses import dataclass, fields
from dataclasses import dataclass, field, fields

from gcm.monitoring.clock import time_to_time_aware
from gcm.monitoring.coerce import maybe_float, maybe_int
from gcm.monitoring.slurm.nodelist_parsers import nodelist
from gcm.monitoring.slurm.parsing import (
maybe_parse_memory_to_bytes,
parse_gres_gpu_indices,
parse_gres_or_tres,
parse_value_from_tres,
)
Expand Down Expand Up @@ -75,6 +76,14 @@ class JobData(DerivedCluster):
FEATURE: str = parsed_field(parser=str)
RESTARTCNT: int = parsed_field(parser=int)
SCHEDNODES: list[str] | None = parsed_field(parser=lambda s: nodelist()(s)[0])
GRES_GPU_INDICES: str | None = field(
default=None,
metadata={
"parser": parse_gres_gpu_indices,
"field_name": "GRES_DETAIL",
"slurm_field": False,
},
)
Comment thread
lushengt-meta marked this conversation as resolved.


JOB_DATA_SLURM_FIELDS = list(
Expand Down Expand Up @@ -125,4 +134,5 @@ class JobData(DerivedCluster):
"features": "FEATURE",
"restart_cnt": "RESTARTCNT",
"scheduled_nodes": "SCHEDNODES",
"gres_detail": "GRES_DETAIL",
}
49 changes: 49 additions & 0 deletions gcm/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
maybe_parse_memory_to_bytes,
mb_to_bytes,
parse_gres,
parse_gres_gpu_indices,
parse_memory_to_bytes,
parse_tres,
parse_value_from_tres,
Expand Down Expand Up @@ -549,6 +550,54 @@ def test_parse_gpu_from_tres_bad(s: str, exc: Type[Exception]) -> None:
parse_value_from_tres(s, "gres/gpu")


@pytest.mark.parametrize(
    "s, expected",
    [
        # Single GPU (1-GPU job)
        ("gpu:ampere:1(IDX:7)", "7"),
        # Multiple specific GPUs
        ("gpu:ampere:3(IDX:0,3,5)", "0,3,5"),
        # Range notation
        ("gpu:ampere:4(IDX:0-3)", "0,1,2,3"),
        # Mixed range and specific
        ("gpu:ampere:5(IDX:0-2,5,7)", "0,1,2,5,7"),
        # Full node (8 GPUs) — still returns indices (caller decides whether to filter)
        ("gpu:ampere:8(IDX:0-7)", "0,1,2,3,4,5,6,7"),
        # Multi-node (multiple IDX entries) — returns None, unsupported
        (
            "gpu:ampere:8(IDX:0-7),gpu:ampere:8(IDX:0-7)",
            None,
        ),
        # Multi-node partial GPUs — returns None, unsupported.
        # Also covers the comma-join ambiguity: IDX commas inside () are safely
        # delimited by ), so the two entries are still seen as two IDX groups.
        (
            "gpu:ampere:3(IDX:0,3,5),gpu:ampere:3(IDX:1,4,7)",
            None,
        ),
        # Empty string
        ("", None),
        # SLURM null values
        ("(null)", None),
        ("N/A", None),
        # Empty array representation
        ("[]", None),
        # No IDX in the string
        ("gpu:ampere:8", None),
        # 16-GPU node — partial allocation (works for nodes with >8 GPUs)
        ("gpu:ampere:10(IDX:0-9)", "0,1,2,3,4,5,6,7,8,9"),
        # 16-GPU node — full allocation
        ("gpu:ampere:16(IDX:0-15)", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15"),
    ],
)
@typechecked
def test_parse_gres_gpu_indices(s: str, expected: str | None) -> None:
    """Check parse_gres_gpu_indices against single-node, multi-node and null inputs."""
    assert parse_gres_gpu_indices(s) == expected


@pytest.mark.parametrize(
"value, expected",
[
Expand Down
Loading