Skip to content

Commit 01e7151

Browse files
authored
fix: prevent DCGM handle/connection leak on connectivity failure (#1078) (#1089)
1 parent e62efb0 commit 01e7151

File tree

2 files changed

+92
-16
lines changed
  • health-monitors/gpu-health-monitor/gpu_health_monitor

2 files changed

+92
-16
lines changed

health-monitors/gpu-health-monitor/gpu_health_monitor/dcgm_watcher/dcgm.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -259,33 +259,48 @@ def _initialize_dcgm_monitoring(self, dcgm_handle: pydcgm.DcgmHandle) -> tuple:
259259
260260
Returns:
261261
A tuple of (dcgm_group, gpu_ids, gpu_serials)
262+
263+
If any step after group creation fails the group is deleted before the
264+
exception propagates so that it does not leak on the DCGM server.
262265
"""
263266
dcgm_group = self._create_dcgm_group_with_all_entities(dcgm_handle)
264-
with metrics.dcgm_api_latency.labels("group_health_set").time():
265-
dcgm_group.health.Set(dcgm_structs.DCGM_HEALTH_WATCH_ALL)
267+
try:
268+
with metrics.dcgm_api_latency.labels("group_health_set").time():
269+
dcgm_group.health.Set(dcgm_structs.DCGM_HEALTH_WATCH_ALL)
266270

267-
gpu_ids = dcgm_group.GetGpuIds()
268-
gpu_serials = self._get_gpu_serial_numbers(dcgm_handle)
269-
log.info(f"dcgm gpu_id are {gpu_ids}")
271+
gpu_ids = dcgm_group.GetGpuIds()
272+
gpu_serials = self._get_gpu_serial_numbers(dcgm_handle)
273+
log.info(f"dcgm gpu_id are {gpu_ids}")
270274

271-
return dcgm_group, gpu_ids, gpu_serials
275+
return dcgm_group, gpu_ids, gpu_serials
276+
except Exception as e:
277+
log.warning(f"DCGM monitoring initialization failed, rolling back group: {e}")
278+
try:
279+
dcgm_group.Delete()
280+
except Exception as del_err:
281+
log.warning(f"Failed to delete DCGM group during init rollback: {del_err}")
282+
metrics.dcgm_api_failures.labels("init_group_rollback").inc()
283+
raise
272284

273285
def _cleanup_dcgm_resources(
274286
self,
275287
dcgm_group: pydcgm.DcgmGroup,
276288
dcgm_handle: pydcgm.DcgmHandle,
277289
):
278-
"""Clean up DCGM resources safely."""
279-
try:
280-
if dcgm_group:
290+
"""Clean up DCGM resources safely.
291+
292+
Group deletion and handle shutdown are in separate try blocks so that
293+
a failure in Delete() does not prevent Shutdown() from running.
294+
"""
295+
if dcgm_group:
296+
try:
281297
dcgm_group.Delete()
282-
dcgm_group = None
283-
if dcgm_handle:
284-
# Clean up the handle
285-
dcgm_handle.Shutdown()
286-
del dcgm_handle
287-
except Exception as e:
288-
log.error(f"Error cleaning up DCGM handle: {e}")
298+
except Exception as e:
299+
log.warning(f"Error deleting DCGM group (will still shut down handle): {e}")
300+
metrics.dcgm_api_failures.labels("group_delete_error").inc()
301+
302+
if dcgm_handle:
303+
dcgm_handle.Shutdown()
289304

290305
def start(self, fields_to_monitor: list[str], exit: Event) -> None:
291306
dcgm_handle = None

health-monitors/gpu-health-monitor/gpu_health_monitor/tests/test_dcgm_watcher/test_dcgm.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import dcgm_structs, dcgm_errors, dcgm_fields, dcgm_field_helpers
1818
from threading import Event, Thread
1919
from ctypes import pointer
20+
import pytest
2021

2122

2223
class FakeEventProcessorInTest(dcgm.types.CallbackInterface):
@@ -505,3 +506,63 @@ def test_initialize_dcgm_monitoring(self, mock_dcgm_group, mock_dcgm_handle):
505506
assert len(gpu_serials) == 4
506507
# Verify that health.Set was called on the actual group object
507508
group.health.Set.assert_called_once()
509+
510+
511+
class TestDCGMHandleLeakFix:
512+
"""Tests for the DCGM handle/connection leak fix (issue #1078).
513+
514+
Covers: split try-blocks in cleanup and init rollback.
515+
"""
516+
517+
def _make_watcher(self):
518+
return dcgm.DCGMWatcher(
519+
addr="localhost:5555",
520+
poll_interval_seconds=10,
521+
callbacks=[],
522+
dcgm_k8s_service_enabled=False,
523+
)
524+
525+
@pytest.mark.parametrize(
526+
"group_is_none, delete_raises",
527+
[
528+
(False, True),
529+
(True, False),
530+
],
531+
ids=["delete_throws", "none_group"],
532+
)
533+
def test_cleanup_dcgm_resources(self, group_is_none, delete_raises):
534+
"""Shutdown() always runs regardless of Delete() outcome."""
535+
watcher = self._make_watcher()
536+
dcgm_handle_mock = MagicMock()
537+
538+
dcgm_group_mock = None
539+
if not group_is_none:
540+
dcgm_group_mock = MagicMock()
541+
if delete_raises:
542+
dcgm_group_mock.Delete.side_effect = Exception("Delete failed")
543+
544+
watcher._cleanup_dcgm_resources(dcgm_group_mock, dcgm_handle_mock)
545+
546+
dcgm_handle_mock.Shutdown.assert_called_once()
547+
if not group_is_none:
548+
dcgm_group_mock.Delete.assert_called_once()
549+
550+
@patch("pydcgm.DcgmGroup.__new__")
551+
def test_init_monitoring_rolls_back_group_on_failure(self, mock_dcgm_group):
552+
"""Group must be deleted if initialization fails after group creation."""
553+
watcher = self._make_watcher()
554+
dcgm_handle_mock = MagicMock()
555+
dcgm_group_mock = MagicMock()
556+
mock_dcgm_group.return_value = dcgm_group_mock
557+
558+
dcgm_system_mock = MagicMock()
559+
dcgm_system_mock.discovery.GetEntityGroupEntities.return_value = [0, 1]
560+
dcgm_handle_mock.GetSystem.return_value = dcgm_system_mock
561+
562+
# health.Set() fails after group is created
563+
dcgm_group_mock.health.Set.side_effect = Exception("DCGM connection lost")
564+
565+
with pytest.raises(Exception, match="DCGM connection lost"):
566+
watcher._initialize_dcgm_monitoring(dcgm_handle_mock)
567+
568+
dcgm_group_mock.Delete.assert_called_once()

0 commit comments

Comments
 (0)