Skip to content

Commit 21d87ac

Browse files
committed
Improve handling of unexpected values for upcomingMaintenance
* Log VM name for cases where maintenance cannot be handled; * Be aware of the maintenance `type` (can be different from `SCHEDULED`).
1 parent aa542c3 commit 21d87ac

File tree

3 files changed

+38
-11
lines changed

3 files changed

+38
-11
lines changed

community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -477,8 +477,17 @@ def get_upcoming_maintenance(lkp: util.Lookup) -> Dict[str, Tuple[str, datetime]
477477
upc_maint_map = {}
478478

479479
for node, inst in lkp.instances().items():
480-
if inst.resource_status.upcoming_maintenance:
481-
upc_maint_map[node + "_maintenance"] = (node, inst.resource_status.upcoming_maintenance.window_start_time)
480+
um = inst.resource_status.upcoming_maintenance
481+
if not um:
482+
continue
483+
if um.type != "SCHEDULED":
484+
log.warning(f"Maintenance event: can not handle non-scheduled maintenance of type {um.type} for node {node=}, skipping")
485+
continue
486+
if not um.window_start_time:
487+
log.error(f"Maintenance event: {node=} upcoming scheduled maintenance doesn't have start time, skipping")
488+
continue
489+
490+
upc_maint_map[node + "_maintenance"] = (node, um.window_start_time)
482491

483492
return upc_maint_map
484493

community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -429,20 +429,33 @@ def test_parse_gcp_timestamp(got: str, want: datetime):
429429
[
430430
(None, None),
431431
(dict(
432+
type="Might",
432433
windowStartTime="2025-01-15T00:00:00Z",
433434
somethingToIgnore="past failures",
434-
), UpcomingMaintenance(window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
435+
), UpcomingMaintenance(
436+
type="Might",
437+
window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
435438
(dict(
439+
type="And",
436440
startTimeWindow=dict(
437441
earliest="2025-01-15T00:00:00Z"),
438442
somethingToIgnore="past failures",
439-
), UpcomingMaintenance(window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
443+
), UpcomingMaintenance(
444+
type="And",
445+
window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
440446
(dict(
447+
type="Magic",
441448
windowStartTime="2025-01-15T00:00:00Z",
442449
startTimeWindow=dict(
443450
earliest="2025-01-25T00:00:00Z"), # ignored
444451
somethingToIgnore="past failures",
445-
), UpcomingMaintenance(window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
452+
), UpcomingMaintenance(
453+
type="Magic",
454+
window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC))),
455+
(
456+
dict(type="III"),
457+
UpcomingMaintenance(type="III", window_start_time=None),
458+
),
446459
])
447460
def tests_parse_UpcomingMaintenance_OK(got: dict, want: Optional[UpcomingMaintenance]):
448461
assert UpcomingMaintenance.from_json(got) == want
@@ -453,8 +466,8 @@ def tests_parse_UpcomingMaintenance_OK(got: dict, want: Optional[UpcomingMainten
453466
[
454467
{},
455468
dict(
456-
windowStartTime=dict(
457-
earliest="2025-01-15T00:00:00Z")),
469+
# no type,
470+
windowStartTime=dict(earliest="2025-01-15T00:00:00Z")),
458471
])
459472
def tests_parse_UpcomingMaintenance_FAIL(got: dict):
460473
with pytest.raises(ValueError):
@@ -483,10 +496,13 @@ def tests_parse_UpcomingMaintenance_FAIL(got: dict):
483496
upcoming_maintenance=None)),
484497
(dict(
485498
physicalHost="/aaa/bbb/ccc",
486-
upcomingMaintenance=dict(windowStartTime="2025-01-15T00:00:00Z")),
499+
upcomingMaintenance=dict(
500+
type="Lilac",
501+
windowStartTime="2025-01-15T00:00:00Z")),
487502
InstanceResourceStatus(
488503
physical_host="/aaa/bbb/ccc",
489504
upcoming_maintenance=UpcomingMaintenance(
505+
type="Lilac",
490506
window_start_time=datetime(2025, 1, 15, 0, 0, tzinfo=UTC)))),
491507
])
492508
def test_parse_InstanceResourceStatus(got: dict, want: Optional[InstanceResourceStatus]):

community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,22 +181,24 @@ def sockets(self) -> int:
181181

182182
@dataclass(frozen=True)
183183
class UpcomingMaintenance:
184-
window_start_time: datetime
184+
type: str
185+
window_start_time: Optional[datetime]
185186

186187
@classmethod
187188
def from_json(cls, jo: Optional[dict]) -> Optional["UpcomingMaintenance"]:
188189
if jo is None:
189190
return None
190191
try:
192+
type = jo["type"]
191193
if "windowStartTime" in jo:
192194
ts = parse_gcp_timestamp(jo["windowStartTime"])
193195
elif "startTimeWindow" in jo:
194196
ts = parse_gcp_timestamp(jo["startTimeWindow"]["earliest"])
195197
else:
196-
raise Exception("Neither windowStartTime nor startTimeWindow are found")
198+
ts = None
197199
except BaseException as e:
198200
raise ValueError(f"Unexpected format for upcomingMaintenance: {jo}") from e
199-
return cls(window_start_time=ts)
201+
return cls(type=type, window_start_time=ts)
200202

201203
@dataclass(frozen=True)
202204
class InstanceResourceStatus:

0 commit comments

Comments
 (0)